6395371 ASSERT in dmu_tx_count_free: blkid + i < dn->dn_phys->dn_nblkptr
6396359 infinite loop due to dangling dbufs (hang on unmount)
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/lib/libzpool/common/sys/zfs_context.h Fri Mar 10 16:27:46 2006 -0800
@@ -164,6 +164,11 @@
#define DTRACE_PROBE2(a, b, c, d, e) ((void)0)
#endif /* DTRACE_PROBE2 */
+#ifdef DTRACE_PROBE3
+#undef DTRACE_PROBE3
+#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void)0)
+#endif /* DTRACE_PROBE3 */
+
/*
* Threads
*/
--- a/usr/src/uts/common/fs/zfs/arc.c Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/arc.c Fri Mar 10 16:27:46 2006 -0800
@@ -1859,7 +1859,8 @@
mutex_exit(hash_lock);
ASSERT3U(hdr->b_size, ==, size);
- DTRACE_PROBE2(arc__miss, blkptr_t *, bp, uint64_t, size);
+ DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
+ zbookmark_t *, zb);
atomic_add_64(&arc.misses, 1);
rzio = zio_read(pio, spa, bp, buf->b_data, size,
--- a/usr/src/uts/common/fs/zfs/dbuf.c Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dbuf.c Fri Mar 10 16:27:46 2006 -0800
@@ -761,14 +761,8 @@
mutex_exit(&db->db_mtx);
continue;
}
- if (db->db_state == DB_READ) {
- /* this will be handled in dbuf_read_done() */
- db->db_d.db_freed_in_flight = TRUE;
- mutex_exit(&db->db_mtx);
- continue;
- }
- if (db->db_state == DB_FILL) {
- /* this will be handled in dbuf_rele() */
+ if (db->db_state == DB_READ || db->db_state == DB_FILL) {
+ /* will be handled in dbuf_read_done or dbuf_rele */
db->db_d.db_freed_in_flight = TRUE;
mutex_exit(&db->db_mtx);
continue;
@@ -844,16 +838,12 @@
/* XXX does *this* func really need the lock? */
ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
- if (osize == size)
- return;
-
/*
* This call to dbuf_will_dirty() with the dn_struct_rwlock held
* is OK, because there can be no other references to the db
* when we are changing its size, so no concurrent DB_FILL can
* be happening.
*/
- /* Make a copy of the data if necessary */
/*
* XXX we should be doing a dbuf_read, checking the return
* value and returning that up to our callers
@@ -871,12 +861,10 @@
bzero((uint8_t *)buf->b_data + osize, size - osize);
mutex_enter(&db->db_mtx);
- /* ASSERT3U(refcount_count(&db->db_holds), ==, 1); */
dbuf_set_data(db, buf);
VERIFY(arc_buf_remove_ref(obuf, db) == 1);
db->db.db_size = size;
- /* fix up the dirty info */
if (db->db_level == 0)
db->db_d.db_data_old[tx->tx_txg&TXG_MASK] = buf;
mutex_exit(&db->db_mtx);
@@ -1234,6 +1222,7 @@
{
dnode_t *dn = db->db_dnode;
dmu_buf_impl_t *parent = db->db_parent;
+ dmu_buf_impl_t *dndb = dn->dn_dbuf;
int dbuf_gone = FALSE;
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1270,7 +1259,7 @@
* If this dbuf is referened from an indirect dbuf,
* decrement the ref count on the indirect dbuf.
*/
- if (parent && parent != dn->dn_dbuf)
+ if (parent && parent != dndb)
dbuf_rele(parent, db);
}
@@ -1305,17 +1294,23 @@
return (err);
err = dbuf_read(*parentp, NULL,
(DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
- if (err == 0) {
- *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
- (blkid & ((1ULL << epbs) - 1));
+ if (err) {
+ dbuf_rele(*parentp, NULL);
+ *parentp = NULL;
+ return (err);
}
- return (err);
+ *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
+ (blkid & ((1ULL << epbs) - 1));
+ return (0);
} else {
/* the block is referenced from the dnode */
ASSERT3U(level, ==, nlevels-1);
ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
blkid < dn->dn_phys->dn_nblkptr);
- *parentp = dn->dn_dbuf;
+ if (dn->dn_dbuf) {
+ dbuf_add_ref(dn->dn_dbuf, NULL);
+ *parentp = dn->dn_dbuf;
+ }
*bpp = &dn->dn_phys->dn_blkptr[blkid];
return (0);
}
@@ -1427,18 +1422,9 @@
* remove it from that list.
*/
if (list_link_active(&db->db_link)) {
- int need_mutex;
-
- ASSERT(!MUTEX_HELD(&dn->dn_dbufs_mtx));
- need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);
- if (need_mutex)
- mutex_enter(&dn->dn_dbufs_mtx);
-
- /* remove from dn_dbufs */
+ mutex_enter(&dn->dn_dbufs_mtx);
list_remove(&dn->dn_dbufs, db);
-
- if (need_mutex)
- mutex_exit(&dn->dn_dbufs_mtx);
+ mutex_exit(&dn->dn_dbufs_mtx);
dnode_rele(dn, db);
}
@@ -1494,7 +1480,7 @@
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
(ARC_NOWAIT | ARC_PREFETCH), &zb);
}
- if (parent && parent != dn->dn_dbuf)
+ if (parent)
dbuf_rele(parent, NULL);
}
}
@@ -1522,12 +1508,13 @@
blkptr_t *bp = NULL;
int err;
+ ASSERT3P(parent, ==, NULL);
err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
if (fail_sparse) {
if (err == 0 && bp && BP_IS_HOLE(bp))
err = ENOENT;
if (err) {
- if (parent && parent != dn->dn_dbuf)
+ if (parent)
dbuf_rele(parent, NULL);
return (err);
}
@@ -1541,6 +1528,10 @@
arc_buf_add_ref(db->db_buf, db);
if (db->db_buf->b_data == NULL) {
dbuf_clear(db);
+ if (parent) {
+ dbuf_rele(parent, NULL);
+ parent = NULL;
+ }
goto top;
}
ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
@@ -1572,7 +1563,7 @@
mutex_exit(&db->db_mtx);
/* NOTE: we can't rele the parent until after we drop the db_mtx */
- if (parent && parent != dn->dn_dbuf)
+ if (parent)
dbuf_rele(parent, NULL);
ASSERT3P(db->db_dnode, ==, dn);
--- a/usr/src/uts/common/fs/zfs/dmu.c Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu.c Fri Mar 10 16:27:46 2006 -0800
@@ -1638,7 +1638,7 @@
* we go trundling through the block pointers.
*/
for (i = 0; i < TXG_SIZE; i++) {
- if (dn->dn_dirtyblksz[i])
+ if (list_link_active(&dn->dn_dirty_link[i]))
break;
}
if (i != TXG_SIZE) {
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c Fri Mar 10 16:27:46 2006 -0800
@@ -279,41 +279,43 @@
dmu_objset_evict_dbufs(objset_t *os)
{
objset_impl_t *osi = os->os;
- dnode_t *mdn = osi->os_meta_dnode;
dnode_t *dn;
- int allzero = B_TRUE;
+
+ mutex_enter(&osi->os_lock);
+
+ /* process the mdn last, since the other dnodes have holds on it */
+ list_remove(&osi->os_dnodes, osi->os_meta_dnode);
+ list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode);
/*
- * Each time we process an entry on the list, we first move it
- * to the tail so that we don't process it over and over again.
- * We use the meta-dnode as a marker: if we make a complete pass
- * over the list without finding any work to do, we're done.
- * This ensures that we complete in linear time rather than
- * quadratic time, as described in detail in bug 1182169.
+ * Find the first dnode with holds. We have to do this dance
+ * because dnode_add_ref() only works if you already have a
+ * hold. If there are no holds then it has no dbufs so OK to
+ * skip.
*/
- mutex_enter(&osi->os_lock);
- list_remove(&osi->os_dnodes, mdn);
- list_insert_tail(&osi->os_dnodes, mdn);
- while ((dn = list_head(&osi->os_dnodes)) != NULL) {
- list_remove(&osi->os_dnodes, dn);
- list_insert_tail(&osi->os_dnodes, dn);
- if (dn == mdn) {
- if (allzero)
- break;
- allzero = B_TRUE;
- continue;
- }
- if (!refcount_is_zero(&dn->dn_holds)) {
- allzero = B_FALSE;
- dnode_add_ref(dn, FTAG);
- mutex_exit(&osi->os_lock);
- dnode_evict_dbufs(dn);
- dnode_rele(dn, FTAG);
- mutex_enter(&osi->os_lock);
- }
+ for (dn = list_head(&osi->os_dnodes);
+ dn && refcount_is_zero(&dn->dn_holds);
+ dn = list_next(&osi->os_dnodes, dn))
+ continue;
+ if (dn)
+ dnode_add_ref(dn, FTAG);
+
+ while (dn) {
+ dnode_t *next_dn = dn;
+
+ do {
+ next_dn = list_next(&osi->os_dnodes, next_dn);
+ } while (next_dn && refcount_is_zero(&next_dn->dn_holds));
+ if (next_dn)
+ dnode_add_ref(next_dn, FTAG);
+
+ mutex_exit(&osi->os_lock);
+ dnode_evict_dbufs(dn);
+ dnode_rele(dn, FTAG);
+ mutex_enter(&osi->os_lock);
+ dn = next_dn;
}
mutex_exit(&osi->os_lock);
- dnode_evict_dbufs(mdn);
}
void
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c Fri Mar 10 16:27:46 2006 -0800
@@ -331,26 +331,53 @@
uint64_t space = 0;
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
- if (dn->dn_datablkshift == 0)
- return;
/*
- * not that the dnode can change, since it isn't dirty, but
- * dbuf_hold_impl() wants us to have the struct_rwlock.
- * also need it to protect dn_maxblkid.
+ * We don't use any locking to check for dirtiness because it's
+ * OK if we get stale data -- the dnode may become dirty
+ * immediately after our check anyway. This is just a means to
+ * avoid the expensive count when we aren't sure we need it. We
+ * need to be able to deal with a dirty dnode.
+ */
+ if ((uintptr_t)dn->dn_assigned_tx |
+ list_link_active(&dn->dn_dirty_link[0]) |
+ list_link_active(&dn->dn_dirty_link[1]) |
+ list_link_active(&dn->dn_dirty_link[2]) |
+ list_link_active(&dn->dn_dirty_link[3]))
+ return;
+
+ /*
+ * the struct_rwlock protects us against dn_phys->dn_nlevels
+ * changing, in case (against all odds) we manage to dirty &
+ * sync out the changes after we check for being dirty.
+ * also, dbuf_hold_impl() wants us to have the struct_rwlock.
+ *
+ * It's fine to use dn_datablkshift rather than the dn_phys
+ * equivalent because if it is changing, maxblkid==0 and we will
+ * bail.
*/
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- blkid = off >> dn->dn_datablkshift;
- nblks = (off + len) >> dn->dn_datablkshift;
+ if (dn->dn_phys->dn_maxblkid == 0) {
+ if (off == 0 && len >= dn->dn_datablksz) {
+ blkid = 0;
+ nblks = 1;
+ } else {
+ rw_exit(&dn->dn_struct_rwlock);
+ return;
+ }
+ } else {
+ blkid = off >> dn->dn_datablkshift;
+ nblks = (off + len) >> dn->dn_datablkshift;
- if (blkid >= dn->dn_maxblkid) {
- rw_exit(&dn->dn_struct_rwlock);
- return;
+ if (blkid >= dn->dn_phys->dn_maxblkid) {
+ rw_exit(&dn->dn_struct_rwlock);
+ return;
+ }
+ if (blkid + nblks > dn->dn_phys->dn_maxblkid)
+ nblks = dn->dn_phys->dn_maxblkid - blkid;
+
+ /* don't bother after 128K (131,072) blocks */
+ nblks = MIN(nblks, 128*1024);
}
- if (blkid + nblks > dn->dn_maxblkid)
- nblks = dn->dn_maxblkid - blkid;
-
- /* don't bother after the 100,000 blocks */
- nblks = MIN(nblks, 128*1024);
if (dn->dn_phys->dn_nlevels == 1) {
int i;
@@ -399,9 +426,10 @@
}
}
dbuf_rele(dbuf, FTAG);
- } else {
- /* the indirect block is sparse */
- ASSERT(err == ENOENT);
+ }
+ if (err != 0 && err != ENOENT) {
+ tx->tx_err = err;
+ break;
}
blkid += tochk;
@@ -416,7 +444,7 @@
dmu_tx_hold_free_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
{
uint64_t start, end, i;
- int dirty, err, shift;
+ int err, shift;
zio_t *zio;
/* first block */
@@ -465,12 +493,7 @@
}
dmu_tx_count_dnode(tx, dn);
-
- /* XXX locking */
- dirty = dn->dn_dirtyblksz[0] | dn->dn_dirtyblksz[1] |
- dn->dn_dirtyblksz[2] | dn->dn_dirtyblksz[3];
- if (dn->dn_assigned_tx != NULL && !dirty)
- dmu_tx_count_free(tx, dn, off, len);
+ dmu_tx_count_free(tx, dn, off, len);
}
void
--- a/usr/src/uts/common/fs/zfs/dnode.c Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dnode.c Fri Mar 10 16:27:46 2006 -0800
@@ -356,7 +356,8 @@
for (i = 0; i < TXG_SIZE; i++) {
ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
- ASSERT3U(dn->dn_dirtyblksz[i], ==, 0);
+ ASSERT3U(dn->dn_next_blksz[i], ==, 0);
+ ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(list_head(&dn->dn_dirty_dbufs[i]), ==, NULL);
ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0);
}
@@ -386,6 +387,8 @@
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
+ int i;
+
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0);
@@ -395,10 +398,9 @@
(bonustype != DMU_OT_NONE && bonuslen != 0));
ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
- ASSERT(dn->dn_dirtyblksz[0] == 0);
- ASSERT(dn->dn_dirtyblksz[1] == 0);
- ASSERT(dn->dn_dirtyblksz[2] == 0);
- ASSERT(dn->dn_dirtyblksz[3] == 0);
+
+ for (i = 0; i < TXG_SIZE; i++)
+ ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
/* clean up any unreferenced dbufs */
dnode_evict_dbufs(dn);
@@ -418,9 +420,7 @@
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
dnode_setdblksz(dn, blocksize);
dnode_setdirty(dn, tx);
- /* don't need dd_dirty_mtx, dnode is already dirty */
- ASSERT(dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] != 0);
- dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = blocksize;
+ dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
rw_exit(&dn->dn_struct_rwlock);
/* change type */
@@ -508,7 +508,7 @@
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
for (n = 0; n < TXG_SIZE; n++)
- ASSERT(dn->dn_dirtyblksz[n] == 0);
+ ASSERT(!list_link_active(&dn->dn_dirty_link[n]));
#endif
children_dnodes[i] = NULL;
dnode_destroy(dn);
@@ -660,14 +660,13 @@
/*
* If we are already marked dirty, we're done.
*/
- if (dn->dn_dirtyblksz[txg&TXG_MASK] > 0) {
+ if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
mutex_exit(&os->os_lock);
return;
}
ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
ASSERT(dn->dn_datablksz != 0);
- dn->dn_dirtyblksz[txg&TXG_MASK] = dn->dn_datablksz;
dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
dn->dn_object, txg);
@@ -699,6 +698,8 @@
void
dnode_free(dnode_t *dn, dmu_tx_t *tx)
{
+ int txgoff = tx->tx_txg & TXG_MASK;
+
dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
/* we should be the only holder... hopefully */
@@ -717,11 +718,9 @@
* the dirty list to the free list.
*/
mutex_enter(&dn->dn_objset->os_lock);
- if (dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] > 0) {
- list_remove(
- &dn->dn_objset->os_dirty_dnodes[tx->tx_txg&TXG_MASK], dn);
- list_insert_tail(
- &dn->dn_objset->os_free_dnodes[tx->tx_txg&TXG_MASK], dn);
+ if (list_link_active(&dn->dn_dirty_link[txgoff])) {
+ list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn);
+ list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn);
mutex_exit(&dn->dn_objset->os_lock);
} else {
mutex_exit(&dn->dn_objset->os_lock);
@@ -760,16 +759,7 @@
if (dn->dn_phys->dn_maxblkid != 0)
goto end;
- /*
- * Any buffers allocated for blocks beyond the first
- * must be evictable/evicted, because they're the wrong size.
- */
mutex_enter(&dn->dn_dbufs_mtx);
- /*
- * Since we have the dn_dbufs_mtx, nothing can be
- * removed from dn_dbufs. Since we have dn_struct_rwlock/w,
- * nothing can be added to dn_dbufs.
- */
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
@@ -782,29 +772,21 @@
}
mutex_exit(&dn->dn_dbufs_mtx);
- /* Fast-track if there is no data in the file */
- if (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) && !have_db0) {
- dnode_setdblksz(dn, size);
- dn->dn_indblkshift = ibs;
- dnode_setdirty(dn, tx);
- /* don't need dd_dirty_mtx, dnode is already dirty */
- dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = size;
- dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
- rw_exit(&dn->dn_struct_rwlock);
- return (0);
+ db = NULL;
+ if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || have_db0) {
+ /* obtain the old block */
+ db = dbuf_hold(dn, 0, FTAG);
+ dbuf_new_size(db, size, tx);
}
- /* obtain the old block */
- db = dbuf_hold(dn, 0, FTAG);
-
- dbuf_new_size(db, size, tx);
-
dnode_setdblksz(dn, size);
dn->dn_indblkshift = ibs;
- /* don't need dd_dirty_mtx, dnode is already dirty */
- dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = size;
+ dnode_setdirty(dn, tx);
+ dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
- dbuf_rele(db, FTAG);
+
+ if (db)
+ dbuf_rele(db, FTAG);
err = 0;
end:
@@ -827,71 +809,45 @@
{
uint64_t txgoff = tx->tx_txg & TXG_MASK;
int drop_struct_lock = FALSE;
- int epbs, old_nlevels, new_nlevels;
+ int epbs, new_nlevels;
uint64_t sz;
- if (blkid == DB_BONUS_BLKID)
- return;
+ ASSERT(blkid != DB_BONUS_BLKID);
if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
drop_struct_lock = TRUE;
}
- if (blkid > dn->dn_maxblkid)
- dn->dn_maxblkid = blkid;
+ if (blkid <= dn->dn_maxblkid)
+ goto out;
+
+ dn->dn_maxblkid = blkid;
/*
- * Compute the number of levels necessary to support the
- * new blkid.
+ * Compute the number of levels necessary to support the new maxblkid.
*/
new_nlevels = 1;
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- for (sz = dn->dn_nblkptr; sz <= blkid && sz >= dn->dn_nblkptr;
- sz <<= epbs)
+ for (sz = dn->dn_nblkptr;
+ sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
new_nlevels++;
- old_nlevels = dn->dn_nlevels;
- if (new_nlevels > dn->dn_next_nlevels[txgoff])
+ if (new_nlevels > dn->dn_nlevels) {
+ int old_nlevels = dn->dn_nlevels;
+ dmu_buf_impl_t *db;
+
+ dn->dn_nlevels = new_nlevels;
+
+ ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
dn->dn_next_nlevels[txgoff] = new_nlevels;
- if (new_nlevels > old_nlevels) {
- dprintf("dn %p increasing nlevels from %u to %u\n",
- dn, dn->dn_nlevels, new_nlevels);
- dn->dn_nlevels = new_nlevels;
- }
-
- /*
- * Dirty the left indirects.
- * Note: the caller should have just dnode_use_space()'d one
- * data block's worth, so we could subtract that out of
- * dn_inflight_data to determine if there is any dirty data
- * besides this block.
- * We don't strictly need to dirty them unless there's
- * *something* in the object (eg. on disk or dirty)...
- */
- if (new_nlevels > old_nlevels) {
- dmu_buf_impl_t *db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
- dprintf("dn %p dirtying left indirects\n", dn);
+ /* Dirty the left indirects. */
+ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
dbuf_dirty(db, tx);
dbuf_rele(db, FTAG);
+
}
-#ifdef ZFS_DEBUG
- else if (old_nlevels > 1 && new_nlevels > old_nlevels) {
- dmu_buf_impl_t *db;
- int i;
-
- for (i = 0; i < dn->dn_nblkptr; i++) {
- db = dbuf_hold_level(dn, old_nlevels-1, i, FTAG);
- ASSERT(!
- list_link_active(&db->db_dirty_node[txgoff]));
- dbuf_rele(db, FTAG);
- }
- }
-#endif
-
- dprintf("dn %p done\n", dn);
out:
if (drop_struct_lock)
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c Fri Mar 10 16:27:46 2006 -0800
@@ -346,33 +346,58 @@
void
dnode_evict_dbufs(dnode_t *dn)
{
- dmu_buf_impl_t *db;
+ int progress;
+ int pass = 0;
+
+ do {
+ dmu_buf_impl_t *db, *db_next;
+ int evicting = FALSE;
- mutex_enter(&dn->dn_dbufs_mtx);
- while (db = list_head(&dn->dn_dbufs)) {
- int progress = 0;
- for (; db; db = list_next(&dn->dn_dbufs, db)) {
+ progress = FALSE;
+ mutex_enter(&dn->dn_dbufs_mtx);
+ for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+ /* dbuf_clear() may remove db from this list */
+ db_next = list_next(&dn->dn_dbufs, db);
+
mutex_enter(&db->db_mtx);
- if (db->db_state != DB_EVICTING &&
- refcount_is_zero(&db->db_holds))
- break;
- else if (db->db_state == DB_EVICTING)
- progress = 1;
- mutex_exit(&db->db_mtx);
+ if (db->db_state == DB_EVICTING) {
+ progress = TRUE;
+ evicting = TRUE;
+ mutex_exit(&db->db_mtx);
+ } else if (refcount_is_zero(&db->db_holds)) {
+ progress = TRUE;
+ ASSERT(!arc_released(db->db_buf));
+ dbuf_clear(db); /* exits db_mtx for us */
+ } else {
+ mutex_exit(&db->db_mtx);
+ }
+
}
- if (db) {
- ASSERT(!arc_released(db->db_buf));
- dbuf_clear(db);
- mutex_exit(&dn->dn_dbufs_mtx);
- progress = 1;
- } else {
- if (progress == 0)
- break;
- mutex_exit(&dn->dn_dbufs_mtx);
- }
- mutex_enter(&dn->dn_dbufs_mtx);
+ /*
+ * NB: we need to drop dn_dbufs_mtx between passes so
+ * that any DB_EVICTING dbufs can make progress.
+ * Ideally, we would have some cv we could wait on, but
+ * since we don't, just wait a bit to give the other
+ * thread a chance to run.
+ */
+ mutex_exit(&dn->dn_dbufs_mtx);
+ if (evicting)
+ delay(1);
+ pass++;
+ ASSERT(pass < 100); /* sanity check */
+ } while (progress);
+
+ /*
+ * This function works fine even if it can't evict everything,
+ * but all of our callers need this assertion, so let's put it
+ * here (for now). Perhaps in the future there will be a try vs
+ * doall flag.
+ */
+ if (list_head(&dn->dn_dbufs) != NULL) {
+ panic("dangling dbufs (dn=%p, dbuf=%p)\n",
+ dn, list_head(&dn->dn_dbufs));
}
- mutex_exit(&dn->dn_dbufs_mtx);
+
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
mutex_enter(&dn->dn_bonus->db_mtx);
@@ -380,6 +405,7 @@
dn->dn_bonus = NULL;
}
rw_exit(&dn->dn_struct_rwlock);
+
}
static int
@@ -420,6 +446,7 @@
/* Undirty next bits */
dn->dn_next_nlevels[txgoff] = 0;
dn->dn_next_indblkshift[txgoff] = 0;
+ dn->dn_next_blksz[txgoff] = 0;
/* free up all the blocks in the file. */
dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx);
@@ -436,7 +463,6 @@
mutex_enter(&dn->dn_mtx);
dn->dn_type = DMU_OT_NONE;
- dn->dn_dirtyblksz[txgoff] = 0;
dn->dn_maxblkid = 0;
dn->dn_allocated_txg = 0;
mutex_exit(&dn->dn_mtx);
@@ -465,13 +491,10 @@
int txgoff = tx->tx_txg & TXG_MASK;
dnode_phys_t *dnp = dn->dn_phys;
- /* ASSERT(dn->dn_objset->dd_snapshot == NULL); */
ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
- dn->dn_dirtyblksz[txgoff] > 0);
-
ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
DNODE_VERIFY(dn);
+
/*
* Make sure the dbuf for the dn_phys is released before we modify it.
*/
@@ -502,11 +525,12 @@
dnp->dn_nblkptr = dn->dn_nblkptr;
}
- if (dn->dn_dirtyblksz[txgoff]) {
- ASSERT(P2PHASE(dn->dn_dirtyblksz[txgoff],
+ if (dn->dn_next_blksz[txgoff]) {
+ ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
SPA_MINBLOCKSIZE) == 0);
dnp->dn_datablkszsec =
- dn->dn_dirtyblksz[txgoff] >> SPA_MINBLOCKSHIFT;
+ dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
+ dn->dn_next_blksz[txgoff] = 0;
}
if (dn->dn_next_indblkshift[txgoff]) {
@@ -577,9 +601,6 @@
dnode_next_offset(dn, FALSE, &off, 1, 1) == ESRCH))
panic("data after EOF: off=%llu\n", (u_longlong_t)off);
- dn->dn_dirtyblksz[txgoff] = 0;
-
-
if (dn->dn_object != DMU_META_DNODE_OBJECT) {
dbuf_will_dirty(dn->dn_dbuf, tx);
dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h Fri Mar 10 16:27:46 2006 -0800
@@ -112,36 +112,9 @@
typedef struct dnode {
/*
- * lock ordering:
- *
- * db_mtx > dn_dirty_mtx
- * dbuf_syncdone
- *
- * dn_struct_rwlock/r > dn_dirty_mtx
- * dmu_object_info
- *
- * dn_struct_rwlock/r > db_mtx > dn_dirty_mtx
- * dbuf_dirty
- * dbuf_setdirty
- *
- * dn_struct_rwlock/w > db_mtx > dn_mtx
- * dnode_increase_indirection -> dbuf_find
- * dbuf_hold_impl
- * dnode_set_bonus
- *
- * dn_struct_rwlock/w > dn_mtx
- * dnode_increase_indirection
- *
- * dn_dirty_mtx > dn_mtx
- * dnode_buf_pageout
- *
- * db_mtx > dn_mtx
- * dbuf_create
- */
-
- /*
- * dn_struct_rwlock protects the structure of the dnode.
- * In particular, it protects the number of levels of indirection.
+ * dn_struct_rwlock protects the structure of the dnode,
+ * including the number of levels of indirection (dn_nlevels),
+ * dn_maxblkid, and dn_next_*
*/
krwlock_t dn_struct_rwlock;
@@ -158,38 +131,32 @@
dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
/*
- * Copies of stuff in dn_phys. They're valid here even before
- * the dnode is first synced.
+ * Copies of stuff in dn_phys. They're valid in the open
+ * context (eg. even before the dnode is first synced).
+ * Where necessary, these are protected by dn_struct_rwlock.
*/
- dmu_object_type_t dn_type; /* object type (immutable) */
- uint8_t dn_bonustype; /* bonus type (immutable) */
- uint16_t dn_bonuslen; /* bonus length (immutable) */
+ dmu_object_type_t dn_type; /* object type */
+ uint16_t dn_bonuslen; /* bonus length */
+ uint8_t dn_bonustype; /* bonus type */
uint8_t dn_nblkptr; /* number of blkptrs (immutable) */
- uint8_t dn_datablkshift; /* zero if blksz not power of 2! */
- uint32_t dn_datablksz; /* in bytes */
- uint16_t dn_datablkszsec; /* in 512b sectors */
-
uint8_t dn_checksum; /* ZIO_CHECKSUM type */
uint8_t dn_compress; /* ZIO_COMPRESS type */
-
- /*
- * The following are kept up-to-date in the *open* context, the syncing
- * context should only pay attention to the dn_next_* values.
- */
uint8_t dn_nlevels;
uint8_t dn_indblkshift;
-
+ uint8_t dn_datablkshift; /* zero if blksz not power of 2! */
+ uint16_t dn_datablkszsec; /* in 512b sectors */
+ uint32_t dn_datablksz; /* in bytes */
+ uint64_t dn_maxblkid;
uint8_t dn_next_nlevels[TXG_SIZE];
uint8_t dn_next_indblkshift[TXG_SIZE];
+ uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */
/* protected by os_lock: */
- uint32_t dn_dirtyblksz[TXG_SIZE]; /* dirty block size in bytes */
list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
/* protected by dn_mtx: */
kmutex_t dn_mtx;
list_t dn_dirty_dbufs[TXG_SIZE];
- uint64_t dn_maxblkid;
avl_tree_t dn_ranges[TXG_SIZE];
uint64_t dn_allocated_txg;
uint64_t dn_free_txg;