--- a/usr/src/cmd/zdb/zdb.c Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/cmd/zdb/zdb.c Fri Feb 02 15:36:58 2007 -0800
@@ -1017,21 +1017,21 @@
if (dds.dds_type == DMU_OST_META) {
dds.dds_creation_txg = TXG_INITIAL;
- usedobjs = os->os->os_rootbp.blk_fill;
+ usedobjs = os->os->os_rootbp->blk_fill;
refdbytes =
os->os->os_spa->spa_dsl_pool->dp_mos_dir->dd_used_bytes;
} else {
dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
}
- ASSERT3U(usedobjs, ==, os->os->os_rootbp.blk_fill);
+ ASSERT3U(usedobjs, ==, os->os->os_rootbp->blk_fill);
nicenum(refdbytes, numbuf);
if (verbosity >= 4) {
(void) strcpy(blkbuf, ", rootbp ");
sprintf_blkptr(blkbuf + strlen(blkbuf),
- BP_SPRINTF_LEN - strlen(blkbuf), &os->os->os_rootbp);
+ BP_SPRINTF_LEN - strlen(blkbuf), os->os->os_rootbp);
} else {
blkbuf[0] = '\0';
}
--- a/usr/src/uts/common/fs/zfs/arc.c Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/arc.c Fri Feb 02 15:36:58 2007 -0800
@@ -315,14 +315,23 @@
typedef struct arc_callback arc_callback_t;
struct arc_callback {
+ void *acb_private;
arc_done_func_t *acb_done;
- void *acb_private;
arc_byteswap_func_t *acb_byteswap;
arc_buf_t *acb_buf;
zio_t *acb_zio_dummy;
arc_callback_t *acb_next;
};
+typedef struct arc_write_callback arc_write_callback_t;
+
+struct arc_write_callback {
+ void *awcb_private;
+ arc_done_func_t *awcb_ready;
+ arc_done_func_t *awcb_done;
+ arc_buf_t *awcb_buf;
+};
+
struct arc_buf_hdr {
/* protected by hash lock */
dva_t b_dva;
@@ -2357,6 +2366,7 @@
atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size);
}
hdr->b_datacnt -= 1;
+ arc_cksum_verify(buf);
mutex_exit(hash_lock);
@@ -2369,11 +2379,7 @@
nhdr->b_arc_access = 0;
nhdr->b_flags = 0;
nhdr->b_datacnt = 1;
- if (hdr->b_freeze_cksum != NULL) {
- nhdr->b_freeze_cksum =
- kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
- *nhdr->b_freeze_cksum = *hdr->b_freeze_cksum;
- }
+ nhdr->b_freeze_cksum = NULL;
buf->b_hdr = nhdr;
buf->b_next = NULL;
(void) refcount_add(&nhdr->b_refcnt, tag);
@@ -2390,10 +2396,10 @@
bzero(&hdr->b_dva, sizeof (dva_t));
hdr->b_birth = 0;
hdr->b_cksum0 = 0;
+ arc_buf_thaw(buf);
}
buf->b_efunc = NULL;
buf->b_private = NULL;
- arc_buf_thaw(buf);
}
int
@@ -2417,17 +2423,26 @@
#endif
static void
+arc_write_ready(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+
+ if (callback->awcb_ready) {
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
+ callback->awcb_ready(zio, buf, callback->awcb_private);
+ }
+ arc_cksum_compute(buf);
+}
+
+static void
arc_write_done(zio_t *zio)
{
- arc_buf_t *buf;
- arc_buf_hdr_t *hdr;
- arc_callback_t *acb;
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
- buf = zio->io_private;
- hdr = buf->b_hdr;
- acb = hdr->b_acb;
hdr->b_acb = NULL;
- ASSERT(acb != NULL);
/* this buffer is on no lists and is not in the hash table */
ASSERT3P(hdr->b_state, ==, arc_anon);
@@ -2469,7 +2484,7 @@
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
- } else if (acb->acb_done == NULL) {
+ } else if (callback->awcb_done == NULL) {
int destroy_hdr;
/*
* This is an anonymous buffer with no user callback,
@@ -2485,23 +2500,23 @@
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
}
- if (acb->acb_done) {
+ if (callback->awcb_done) {
ASSERT(!refcount_is_zero(&hdr->b_refcnt));
- acb->acb_done(zio, buf, acb->acb_private);
+ callback->awcb_done(zio, buf, callback->awcb_private);
}
- kmem_free(acb, sizeof (arc_callback_t));
+ kmem_free(callback, sizeof (arc_write_callback_t));
}
-int
+zio_t *
arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
- arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t arc_flags, zbookmark_t *zb)
+ arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
+ int flags, zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- arc_callback_t *acb;
- zio_t *rzio;
+ arc_write_callback_t *callback;
+ zio_t *zio;
/* this is a private buffer - no locking required */
ASSERT3P(hdr->b_state, ==, arc_anon);
@@ -2509,23 +2524,17 @@
ASSERT(!HDR_IO_ERROR(hdr));
ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
ASSERT(hdr->b_acb == 0);
- acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
- acb->acb_done = done;
- acb->acb_private = private;
- acb->acb_byteswap = (arc_byteswap_func_t *)-1;
- hdr->b_acb = acb;
+ callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
+ callback->awcb_ready = ready;
+ callback->awcb_done = done;
+ callback->awcb_private = private;
+ callback->awcb_buf = buf;
hdr->b_flags |= ARC_IO_IN_PROGRESS;
- arc_cksum_compute(buf);
- rzio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
- buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb);
+ zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
+ buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback,
+ priority, flags, zb);
- if (arc_flags & ARC_WAIT)
- return (zio_wait(rzio));
-
- ASSERT(arc_flags & ARC_NOWAIT);
- zio_nowait(rzio);
-
- return (0);
+ return (zio);
}
int
--- a/usr/src/uts/common/fs/zfs/dbuf.c Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/dbuf.c Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -39,6 +39,9 @@
static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
+ int compress, dmu_tx_t *tx);
+static arc_done_func_t dbuf_write_ready;
static arc_done_func_t dbuf_write_done;
int zfs_mdcomp_disable = 0;
@@ -46,7 +49,6 @@
/*
* Global data structures and functions for the dbuf cache.
*/
-taskq_t *dbuf_tq;
static kmem_cache_t *dbuf_cache;
/* ARGSUSED */
@@ -210,31 +212,24 @@
{
ASSERT(MUTEX_HELD(&db->db_mtx));
- if (db->db_level != 0 || db->db_d.db_evict_func == NULL)
+ if (db->db_level != 0 || db->db_evict_func == NULL)
return;
- if (db->db_d.db_user_data_ptr_ptr)
- *db->db_d.db_user_data_ptr_ptr = db->db.db_data;
- db->db_d.db_evict_func(&db->db, db->db_d.db_user_ptr);
- db->db_d.db_user_ptr = NULL;
- db->db_d.db_user_data_ptr_ptr = NULL;
- db->db_d.db_evict_func = NULL;
+ if (db->db_user_data_ptr_ptr)
+ *db->db_user_data_ptr_ptr = db->db.db_data;
+ db->db_evict_func(&db->db, db->db_user_ptr);
+ db->db_user_ptr = NULL;
+ db->db_user_data_ptr_ptr = NULL;
+ db->db_evict_func = NULL;
}
void
dbuf_evict(dmu_buf_impl_t *db)
{
- int i;
-
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(db->db_buf == NULL);
+ ASSERT(db->db_data_pending == NULL);
-#ifdef ZFS_DEBUG
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT(!list_link_active(&db->db_dirty_node[i]));
- ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL);
- }
-#endif
dbuf_clear(db);
dbuf_destroy(db);
}
@@ -267,8 +262,6 @@
dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
sizeof (dmu_buf_impl_t),
0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
- dbuf_tq = taskq_create("dbuf_tq", 8, maxclsyspri, 50, INT_MAX,
- TASKQ_PREPOPULATE);
for (i = 0; i < DBUF_MUTEXES; i++)
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
@@ -280,9 +273,6 @@
dbuf_hash_table_t *h = &dbuf_hash_table;
int i;
- taskq_destroy(dbuf_tq);
- dbuf_tq = NULL;
-
for (i = 0; i < DBUF_MUTEXES; i++)
mutex_destroy(&h->hash_mutexes[i]);
kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
@@ -297,7 +287,6 @@
static void
dbuf_verify(dmu_buf_impl_t *db)
{
- int i;
dnode_t *dn = db->db_dnode;
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -330,15 +319,13 @@
ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
}
if (db->db.db_object == DMU_META_DNODE_OBJECT) {
- for (i = 0; i < TXG_SIZE; i++) {
- /*
- * it should only be modified in syncing
- * context, so make sure we only have
- * one copy of the data.
- */
- ASSERT(db->db_d.db_data_old[i] == NULL ||
- db->db_d.db_data_old[i] == db->db_buf);
- }
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+ /*
+ * it should only be modified in syncing
+ * context, so make sure we only have
+ * one copy of the data.
+ */
+ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
}
}
@@ -395,9 +382,9 @@
dbuf_update_data(dmu_buf_impl_t *db)
{
ASSERT(MUTEX_HELD(&db->db_mtx));
- if (db->db_level == 0 && db->db_d.db_user_data_ptr_ptr) {
+ if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
ASSERT(!refcount_is_zero(&db->db_holds));
- *db->db_d.db_user_data_ptr_ptr = db->db.db_data;
+ *db->db_user_data_ptr_ptr = db->db.db_data;
}
}
@@ -444,12 +431,12 @@
ASSERT(refcount_count(&db->db_holds) > 0);
ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
- if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
+ if (db->db_level == 0 && db->db_freed_in_flight) {
/* we were freed in flight; disregard any error */
arc_release(buf, db);
bzero(buf->b_data, db->db.db_size);
arc_buf_freeze(buf);
- db->db_d.db_freed_in_flight = FALSE;
+ db->db_freed_in_flight = FALSE;
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else if (zio == NULL || zio->io_error == 0) {
@@ -646,120 +633,69 @@
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
- arc_buf_t **quiescing, **syncing;
- arc_buf_contents_t type;
+ dbuf_dirty_record_t *dr = db->db_last_dirty;
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(db->db.db_data != NULL);
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
-
- quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
- syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];
+ ASSERT(db->db_level == 0);
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
- /*
- * If this buffer is referenced from the current quiescing
- * transaction group: either make a copy and reset the reference
- * to point to the copy, or (if there a no active holders) just
- * null out the current db_data pointer.
- */
- if (*quiescing == db->db_buf) {
- /*
- * If the quiescing txg is "dirty", then we better not
- * be referencing the same buffer from the syncing txg.
- */
- ASSERT(*syncing != db->db_buf);
- if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
- int size = db->db.db_size;
- type = DBUF_GET_BUFC_TYPE(db);
- *quiescing = arc_buf_alloc(
- db->db_dnode->dn_objset->os_spa, size, db, type);
- bcopy(db->db.db_data, (*quiescing)->b_data, size);
- } else {
- dbuf_set_data(db, NULL);
- }
+ if (dr == NULL || dr->dt.dl.dr_data != db->db_buf)
return;
- }
/*
- * If this buffer is referenced from the current syncing
- * transaction group: either
- * 1 - make a copy and reset the reference, or
- * 2 - if there are no holders, just null the current db_data.
+ * If the last dirty record for this dbuf has not yet synced
+ * and its referencing the dbuf data, either:
+ * reset the reference to point to a new copy,
+ * or (if there a no active holders)
+ * just null out the current db_data pointer.
*/
- if (*syncing == db->db_buf) {
- ASSERT3P(*quiescing, ==, NULL);
- ASSERT3U(db->db_dirtycnt, ==, 1);
- if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
- int size = db->db.db_size;
- type = DBUF_GET_BUFC_TYPE(db);
- /* we can't copy if we have already started a write */
- ASSERT(*syncing != db->db_data_pending);
- *syncing = arc_buf_alloc(
- db->db_dnode->dn_objset->os_spa, size, db, type);
- bcopy(db->db.db_data, (*syncing)->b_data, size);
- } else {
- dbuf_set_data(db, NULL);
- }
- }
-}
-
-/*
- * This is the "bonus buffer" version of the above routine
- */
-static void
-dbuf_fix_old_bonus_data(dmu_buf_impl_t *db, uint64_t txg)
-{
- arc_buf_t **quiescing, **syncing;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db.db_data != NULL);
- ASSERT(db->db_blkid == DB_BONUS_BLKID);
-
- quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
- syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];
-
- if (*quiescing == db->db.db_data) {
- ASSERT(*syncing != db->db.db_data);
- *quiescing = zio_buf_alloc(DN_MAX_BONUSLEN);
- bcopy(db->db.db_data, *quiescing, DN_MAX_BONUSLEN);
- } else if (*syncing == db->db.db_data) {
- ASSERT3P(*quiescing, ==, NULL);
- ASSERT3U(db->db_dirtycnt, ==, 1);
- *syncing = zio_buf_alloc(DN_MAX_BONUSLEN);
- bcopy(db->db.db_data, *syncing, DN_MAX_BONUSLEN);
+ ASSERT(dr->dr_txg >= txg - 2);
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ /* Note that the data bufs here are zio_bufs */
+ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
+ } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ int size = db->db.db_size;
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ dr->dt.dl.dr_data = arc_buf_alloc(
+ db->db_dnode->dn_objset->os_spa, size, db, type);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
+ } else {
+ dbuf_set_data(db, NULL);
}
}
void
-dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg)
+dbuf_unoverride(dbuf_dirty_record_t *dr)
{
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ uint64_t txg = dr->dr_txg;
+
ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC);
+ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
+ ASSERT(db->db_level == 0);
+
+ if (db->db_blkid == DB_BONUS_BLKID ||
+ dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
+ return;
- if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
- /* free this block */
- ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]) ||
- db->db_dnode->dn_free_txg == txg);
- if (!BP_IS_HOLE(db->db_d.db_overridden_by[txg&TXG_MASK])) {
- /* XXX can get silent EIO here */
- (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
- txg, db->db_d.db_overridden_by[txg&TXG_MASK],
- NULL, NULL, ARC_WAIT);
- }
- kmem_free(db->db_d.db_overridden_by[txg&TXG_MASK],
- sizeof (blkptr_t));
- db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
- /*
- * Release the already-written buffer, so we leave it in
- * a consistent dirty state. Note that all callers are
- * modifying the buffer, so they will immediately do
- * another (redundant) arc_release(). Therefore, leave
- * the buf thawed to save the effort of freezing &
- * immediately re-thawing it.
- */
- arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
+ /* free this block */
+ if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
+ /* XXX can get silent EIO here */
+ (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
+ txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
}
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ /*
+ * Release the already-written buffer, so we leave it in
+ * a consistent dirty state. Note that all callers are
+ * modifying the buffer, so they will immediately do
+ * another (redundant) arc_release(). Therefore, leave
+ * the buf thawed to save the effort of freezing &
+ * immediately re-thawing it.
+ */
+ arc_release(dr->dt.dl.dr_data, db);
}
void
@@ -793,7 +729,7 @@
}
if (db->db_state == DB_READ || db->db_state == DB_FILL) {
/* will be handled in dbuf_read_done or dbuf_rele */
- db->db_d.db_freed_in_flight = TRUE;
+ db->db_freed_in_flight = TRUE;
mutex_exit(&db->db_mtx);
continue;
}
@@ -802,26 +738,31 @@
dbuf_clear(db);
continue;
}
- /* The dbuf is CACHED and referenced */
+ /* The dbuf is referenced */
+
+ if (db->db_last_dirty != NULL) {
+ dbuf_dirty_record_t *dr = db->db_last_dirty;
- if (!list_link_active(&db->db_dirty_node[txg & TXG_MASK])) {
- /*
- * This dbuf is not currently dirty. Either
- * uncache it (if its not referenced in the open
- * context) or reset its contents to empty.
- */
- dbuf_fix_old_data(db, txg);
- } else {
- if (db->db_d.db_overridden_by[txg & TXG_MASK] != NULL) {
+ if (dr->dr_txg == txg) {
/*
- * This dbuf is overridden. Clear that state.
+ * This buffer is "in-use", re-adjust the file
+ * size to reflect that this buffer may
+ * contain new data when we sync.
*/
- dbuf_unoverride(db, txg);
+ if (db->db_blkid > dn->dn_maxblkid)
+ dn->dn_maxblkid = db->db_blkid;
+ dbuf_unoverride(dr);
+ } else {
+ /*
+ * This dbuf is not dirty in the open context.
+ * Either uncache it (if its not referenced in
+ * the open context) or reset its contents to
+ * empty.
+ */
+ dbuf_fix_old_data(db, txg);
}
- if (db->db_blkid > dn->dn_maxblkid)
- dn->dn_maxblkid = db->db_blkid;
}
- /* fill in with appropriate data */
+ /* clear the contents if its cached */
if (db->db_state == DB_CACHED) {
ASSERT(db->db.db_data != NULL);
arc_release(db->db_buf, db);
@@ -846,13 +787,13 @@
/*
* We don't need any locking to protect db_blkptr:
- * If it's syncing, then db_dirtied will be set so we'll
- * ignore db_blkptr.
+ * If it's syncing, then db_last_dirty will be set
+ * so we'll ignore db_blkptr.
*/
- ASSERT(MUTEX_HELD(&db->db_mtx)); /* XXX strictly necessary? */
+ ASSERT(MUTEX_HELD(&db->db_mtx));
/* If we have been dirtied since the last snapshot, its not new */
- if (db->db_dirtied)
- birth_txg = db->db_dirtied;
+ if (db->db_last_dirty)
+ birth_txg = db->db_last_dirty->dr_txg;
else if (db->db_blkptr)
birth_txg = db->db_blkptr->blk_birth;
@@ -901,18 +842,21 @@
VERIFY(arc_buf_remove_ref(obuf, db) == 1);
db->db.db_size = size;
- if (db->db_level == 0)
- db->db_d.db_data_old[tx->tx_txg&TXG_MASK] = buf;
+ if (db->db_level == 0) {
+ ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+ db->db_last_dirty->dt.dl.dr_data = buf;
+ }
mutex_exit(&db->db_mtx);
dnode_willuse_space(db->db_dnode, size-osize, tx);
}
-void
+dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
dnode_t *dn = db->db_dnode;
objset_impl_t *os = dn->dn_objset;
+ dbuf_dirty_record_t **drp, *dr;
int drop_struct_lock = FALSE;
int txgoff = tx->tx_txg & TXG_MASK;
@@ -927,12 +871,11 @@
* XXX We may want to prohibit dirtying in syncing context even
* if they did pre-dirty.
*/
- ASSERT(!(dmu_tx_is_syncing(tx) &&
- !BP_IS_HOLE(&dn->dn_objset->os_rootbp) &&
- dn->dn_object != DMU_META_DNODE_OBJECT &&
- dn->dn_objset->os_dsl_dataset != NULL &&
- !dsl_dir_is_private(
- dn->dn_objset->os_dsl_dataset->ds_dir)));
+ ASSERT(!dmu_tx_is_syncing(tx) ||
+ BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
+ dn->dn_object == DMU_META_DNODE_OBJECT ||
+ dn->dn_objset->os_dsl_dataset == NULL ||
+ dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));
/*
* We make this assert for private objects as well, but after we
@@ -940,23 +883,17 @@
* in syncing context.
*/
ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
- dn->dn_dirtyctx == DN_UNDIRTIED ||
- dn->dn_dirtyctx ==
+ dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
mutex_enter(&db->db_mtx);
- /* XXX make this true for indirects too? */
- ASSERT(db->db_level != 0 || db->db_state == DB_CACHED ||
- db->db_state == DB_FILL);
-
/*
- * If this buffer is currently part of an "overridden" region,
- * we now need to remove it from that region.
+ * XXX make this true for indirects too? The problem is that
+ * transactions created with dmu_tx_create_assigned() from
+ * syncing context don't bother holding ahead.
*/
- if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
- db->db_d.db_overridden_by[txgoff] != NULL) {
- dbuf_unoverride(db, tx->tx_txg);
- }
+ ASSERT(db->db_level != 0 ||
+ db->db_state == DB_CACHED || db->db_state == DB_FILL);
mutex_enter(&dn->dn_mtx);
/*
@@ -964,7 +901,7 @@
* initialize the objset.
*/
if (dn->dn_dirtyctx == DN_UNDIRTIED &&
- !BP_IS_HOLE(&dn->dn_objset->os_rootbp)) {
+ !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
dn->dn_dirtyctx =
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
ASSERT(dn->dn_dirtyctx_firstset == NULL);
@@ -975,13 +912,23 @@
/*
* If this buffer is already dirty, we're done.
*/
- if (list_link_active(&db->db_dirty_node[txgoff])) {
- if (db->db_blkid != DB_BONUS_BLKID && db->db_level == 0 &&
- db->db.db_object != DMU_META_DNODE_OBJECT)
- arc_buf_thaw(db->db_buf);
-
+ drp = &db->db_last_dirty;
+ ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
+ db->db.db_object == DMU_META_DNODE_OBJECT);
+ while (*drp && (*drp)->dr_txg > tx->tx_txg)
+ drp = &(*drp)->dr_next;
+ if (*drp && (*drp)->dr_txg == tx->tx_txg) {
+ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+ /*
+ * If this buffer has already been written out,
+ * we now need to reset its state.
+ */
+ dbuf_unoverride(*drp);
+ if (db->db.db_object != DMU_META_DNODE_OBJECT)
+ arc_buf_thaw(db->db_buf);
+ }
mutex_exit(&db->db_mtx);
- return;
+ return (*drp);
}
/*
@@ -1007,7 +954,7 @@
ASSERT(!dmu_tx_is_syncing(tx) ||
os->os_dsl_dataset == NULL ||
!dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
- !BP_IS_HOLE(&os->os_rootbp));
+ !BP_IS_HOLE(os->os_rootbp));
ASSERT(db->db.db_size != 0);
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
@@ -1017,44 +964,50 @@
* to make a copy of it so that the changes we make in this
* transaction group won't leak out when we sync the older txg.
*/
- if (db->db_blkid == DB_BONUS_BLKID) {
- ASSERT(db->db.db_data != NULL);
- ASSERT(db->db_d.db_data_old[txgoff] == NULL);
- dbuf_fix_old_bonus_data(db, tx->tx_txg);
- db->db_d.db_data_old[txgoff] = db->db.db_data;
- } else if (db->db_level == 0) {
- /*
- * Release the data buffer from the cache so that we
- * can modify it without impacting possible other users
- * of this cached data block. Note that indirect blocks
- * and private objects are not released until the syncing
- * state (since they are only modified then).
- */
- ASSERT(db->db_buf != NULL);
- ASSERT(db->db_d.db_data_old[txgoff] == NULL);
- if (db->db.db_object != DMU_META_DNODE_OBJECT) {
+ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
+ if (db->db_level == 0) {
+ void *data_old = db->db_buf;
+
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db.db_data;
+ } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
+ /*
+ * Release the data buffer from the cache so that we
+ * can modify it without impacting possible other users
+ * of this cached data block. Note that indirect
+ * blocks and private objects are not released until the
+ * syncing state (since they are only modified then).
+ */
arc_release(db->db_buf, db);
dbuf_fix_old_data(db, tx->tx_txg);
- ASSERT(db->db_buf != NULL);
+ data_old = db->db_buf;
}
- db->db_d.db_data_old[txgoff] = db->db_buf;
+ ASSERT(data_old != NULL);
+ dr->dt.dl.dr_data = data_old;
+ } else {
+ mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&dr->dt.di.dr_children,
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
}
+ dr->dr_dbuf = db;
+ dr->dr_txg = tx->tx_txg;
+ dr->dr_next = *drp;
+ *drp = dr;
- mutex_enter(&dn->dn_mtx);
/*
* We could have been freed_in_flight between the dbuf_noread
* and dbuf_dirty. We win, as though the dbuf_noread() had
* happened after the free.
*/
if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+ mutex_enter(&dn->dn_mtx);
dnode_clear_range(dn, db->db_blkid, 1, tx);
- db->db_d.db_freed_in_flight = FALSE;
+ mutex_exit(&dn->dn_mtx);
+ db->db_freed_in_flight = FALSE;
}
- db->db_dirtied = tx->tx_txg;
- list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db);
- mutex_exit(&dn->dn_mtx);
-
if (db->db_blkid != DB_BONUS_BLKID) {
/*
* Update the accounting.
@@ -1084,8 +1037,12 @@
mutex_exit(&db->db_mtx);
if (db->db_blkid == DB_BONUS_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
dnode_setdirty(dn, tx);
- return;
+ return (dr);
}
if (db->db_level == 0) {
@@ -1099,30 +1056,61 @@
}
if (db->db_level+1 < dn->dn_nlevels) {
- int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- dmu_buf_impl_t *parent;
- parent = dbuf_hold_level(dn, db->db_level+1,
- db->db_blkid >> epbs, FTAG);
+ dmu_buf_impl_t *parent = db->db_parent;
+ dbuf_dirty_record_t *di;
+ int parent_held = FALSE;
+
+ if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ parent = dbuf_hold_level(dn, db->db_level+1,
+ db->db_blkid >> epbs, FTAG);
+ parent_held = TRUE;
+ }
if (drop_struct_lock)
rw_exit(&dn->dn_struct_rwlock);
- dbuf_dirty(parent, tx);
- dbuf_rele(parent, FTAG);
+ ASSERT3U(db->db_level+1, ==, parent->db_level);
+ di = dbuf_dirty(parent, tx);
+ if (parent_held)
+ dbuf_rele(parent, FTAG);
+
+ mutex_enter(&db->db_mtx);
+ /* possible race with dbuf_undirty() */
+ if (db->db_last_dirty == dr ||
+ dn->dn_object == DMU_META_DNODE_OBJECT) {
+ mutex_enter(&di->dt.di.dr_mtx);
+ ASSERT3U(di->dr_txg, ==, tx->tx_txg);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&di->dt.di.dr_children, dr);
+ mutex_exit(&di->dt.di.dr_mtx);
+ dr->dr_parent = di;
+ }
+ mutex_exit(&db->db_mtx);
} else {
+ ASSERT(db->db_level+1 == dn->dn_nlevels);
+ ASSERT(db->db_blkid < dn->dn_nblkptr);
+ ASSERT(db->db_parent == NULL ||
+ db->db_parent == db->db_dnode->dn_dbuf);
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
if (drop_struct_lock)
rw_exit(&dn->dn_struct_rwlock);
}
dnode_setdirty(dn, tx);
+ return (dr);
}
static int
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
dnode_t *dn = db->db_dnode;
- int txgoff = tx->tx_txg & TXG_MASK;
- int64_t holds;
+ uint64_t txg = tx->tx_txg;
+ dbuf_dirty_record_t *dr;
- ASSERT(tx->tx_txg != 0);
+ ASSERT(txg != 0);
ASSERT(db->db_blkid != DB_BONUS_BLKID);
mutex_enter(&db->db_mtx);
@@ -1130,10 +1118,14 @@
/*
* If this buffer is not dirty, we're done.
*/
- if (!list_link_active(&db->db_dirty_node[txgoff])) {
+ for (dr = db->db_last_dirty; dr; dr = dr->dr_next)
+ if (dr->dr_txg <= txg)
+ break;
+ if (dr == NULL || dr->dr_txg < txg) {
mutex_exit(&db->db_mtx);
return (0);
}
+ ASSERT(dr->dr_txg == txg);
/*
* If this buffer is currently held, we cannot undirty
@@ -1152,31 +1144,41 @@
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
- dbuf_unoverride(db, tx->tx_txg);
+ ASSERT(db->db.db_size != 0);
+
+ /* XXX would be nice to fix up dn_towrite_space[] */
+
+ db->db_last_dirty = dr->dr_next;
- ASSERT(db->db.db_size != 0);
- if (db->db_level == 0) {
- ASSERT(db->db_buf != NULL);
- ASSERT(db->db_d.db_data_old[txgoff] != NULL);
- if (db->db_d.db_data_old[txgoff] != db->db_buf)
- VERIFY(arc_buf_remove_ref(
- db->db_d.db_data_old[txgoff], db) == 1);
- db->db_d.db_data_old[txgoff] = NULL;
+ if (dr->dr_parent) {
+ mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
+ list_remove(&dr->dr_parent->dt.di.dr_children, dr);
+ mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
+ } else if (db->db_level+1 == dn->dn_nlevels) {
+ ASSERT3P(db->db_parent, ==, dn->dn_dbuf);
+ mutex_enter(&dn->dn_mtx);
+ list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
+ mutex_exit(&dn->dn_mtx);
}
- /* XXX would be nice to fix up dn_towrite_space[] */
- /* XXX undo db_dirtied? but how? */
- /* db->db_dirtied = tx->tx_txg; */
+ if (db->db_level == 0) {
+ dbuf_unoverride(dr);
- mutex_enter(&dn->dn_mtx);
- list_remove(&dn->dn_dirty_dbufs[txgoff], db);
- mutex_exit(&dn->dn_mtx);
+ ASSERT(db->db_buf != NULL);
+ ASSERT(dr->dt.dl.dr_data != NULL);
+ if (dr->dt.dl.dr_data != db->db_buf)
+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
+ } else {
+ ASSERT(db->db_buf != NULL);
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ /* XXX - mutex and list destroy? */
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
- if ((holds = refcount_remove(&db->db_holds,
- (void *)(uintptr_t)tx->tx_txg)) == 0) {
+ if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
arc_buf_t *buf = db->db_buf;
ASSERT(arc_released(buf));
@@ -1185,7 +1187,6 @@
dbuf_evict(db);
return (1);
}
- ASSERT(holds > 0);
mutex_exit(&db->db_mtx);
return (0);
@@ -1203,7 +1204,7 @@
if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
rf |= DB_RF_HAVESTRUCT;
(void) dbuf_read(db, NULL, rf);
- dbuf_dirty(db, tx);
+ (void) dbuf_dirty(db, tx);
}
void
@@ -1220,7 +1221,7 @@
dmu_tx_private_ok(tx));
dbuf_noread(db);
- dbuf_dirty(db, tx);
+ (void) dbuf_dirty(db, tx);
}
#pragma weak dmu_buf_fill_done = dbuf_fill_done
@@ -1232,12 +1233,12 @@
DBUF_VERIFY(db);
if (db->db_state == DB_FILL) {
- if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
+ if (db->db_level == 0 && db->db_freed_in_flight) {
ASSERT(db->db_blkid != DB_BONUS_BLKID);
/* we were freed while filling */
/* XXX dbuf_undirty? */
bzero(db->db.db_data, db->db.db_size);
- db->db_d.db_freed_in_flight = FALSE;
+ db->db_freed_in_flight = FALSE;
}
db->db_state = DB_CACHED;
cv_broadcast(&db->db_changed);
@@ -1374,13 +1375,17 @@
db->db.db_object = dn->dn_object;
db->db_level = level;
db->db_blkid = blkid;
- db->db_dirtied = 0;
+ db->db_last_dirty = NULL;
db->db_dirtycnt = 0;
db->db_dnode = dn;
db->db_parent = parent;
db->db_blkptr = blkptr;
- bzero(&db->db_d, sizeof (db->db_d));
+ db->db_user_ptr = NULL;
+ db->db_user_data_ptr_ptr = NULL;
+ db->db_evict_func = NULL;
+ db->db_immediate_evict = 0;
+ db->db_freed_in_flight = 0;
if (blkid == DB_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
@@ -1586,22 +1591,24 @@
ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
/*
- * If this buffer is currently syncing out, and we are
- * are still referencing it from db_data, we need to make
- * a copy of it in case we decide we want to dirty it
- * again in this txg.
+ * If this buffer is currently syncing out, and we are are
+ * still referencing it from db_data, we need to make a copy
+ * of it in case we decide we want to dirty it again in this txg.
*/
- if (db->db_level == 0 && db->db_state == DB_CACHED &&
+ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
dn->dn_object != DMU_META_DNODE_OBJECT &&
- db->db_data_pending == db->db_buf) {
- int size = (db->db_blkid == DB_BONUS_BLKID) ?
- DN_MAX_BONUSLEN : db->db.db_size;
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ db->db_state == DB_CACHED && db->db_data_pending) {
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+
+ if (dr->dt.dl.dr_data == db->db_buf) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- size, db, type));
- bcopy(db->db_data_pending->b_data, db->db.db_data,
- db->db.db_size);
+ dbuf_set_data(db,
+ arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+ db->db.db_size, db, type));
+ bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
+ db->db.db_size);
+ }
}
(void) refcount_add(&db->db_holds, tag);
@@ -1669,11 +1676,15 @@
holds = refcount_remove(&db->db_holds, tag);
ASSERT(holds >= 0);
- if (db->db_buf && holds == db->db_dirtycnt)
+ /*
+ * We can't freeze indirects if there is a possibility that they
+ * may be modified in the current syncing context.
+ */
+ if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
arc_buf_freeze(db->db_buf);
if (holds == db->db_dirtycnt &&
- db->db_level == 0 && db->db_d.db_immediate_evict)
+ db->db_level == 0 && db->db_immediate_evict)
dbuf_evict_user(db);
if (holds == 0) {
@@ -1725,7 +1736,7 @@
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- db->db_d.db_immediate_evict = TRUE;
+ db->db_immediate_evict = TRUE;
return (dmu_buf_update_user(db_fake, NULL, user_ptr,
user_data_ptr_ptr, evict_func));
}
@@ -1741,14 +1752,14 @@
mutex_enter(&db->db_mtx);
- if (db->db_d.db_user_ptr == old_user_ptr) {
- db->db_d.db_user_ptr = user_ptr;
- db->db_d.db_user_data_ptr_ptr = user_data_ptr_ptr;
- db->db_d.db_evict_func = evict_func;
+ if (db->db_user_ptr == old_user_ptr) {
+ db->db_user_ptr = user_ptr;
+ db->db_user_data_ptr_ptr = user_data_ptr_ptr;
+ db->db_evict_func = evict_func;
dbuf_update_data(db);
} else {
- old_user_ptr = db->db_d.db_user_ptr;
+ old_user_ptr = db->db_user_ptr;
}
mutex_exit(&db->db_mtx);
@@ -1761,21 +1772,106 @@
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
ASSERT(!refcount_is_zero(&db->db_holds));
- return (db->db_d.db_user_ptr);
+ return (db->db_user_ptr);
+}
+
+static void
+dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
+{
+ /* ASSERT(dmu_tx_is_syncing(tx) */
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (db->db_blkptr != NULL)
+ return;
+
+ if (db->db_level == dn->dn_phys->dn_nlevels-1) {
+ /*
+ * This buffer was allocated at a time when there was
+ * no available blkptrs from the dnode, or it was
+ * inappropriate to hook it in (i.e., nlevels mis-match).
+ */
+ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
+ ASSERT(db->db_parent == NULL);
+ db->db_parent = dn->dn_dbuf;
+ db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
+ DBUF_VERIFY(db);
+ } else {
+ dmu_buf_impl_t *parent = db->db_parent;
+ int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ ASSERT(dn->dn_phys->dn_nlevels > 1);
+ if (parent == NULL) {
+ mutex_exit(&db->db_mtx);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ (void) dbuf_hold_impl(dn, db->db_level+1,
+ db->db_blkid >> epbs, FALSE, db, &parent);
+ rw_exit(&dn->dn_struct_rwlock);
+ mutex_enter(&db->db_mtx);
+ db->db_parent = parent;
+ }
+ db->db_blkptr = (blkptr_t *)parent->db.db_data +
+ (db->db_blkid & ((1ULL << epbs) - 1));
+ DBUF_VERIFY(db);
+ }
}
-void
-dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
+static void
+dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
- arc_buf_t **data;
- uint64_t txg = tx->tx_txg;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = db->db_dnode;
+ zio_t *zio;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+ mutex_enter(&db->db_mtx);
+
+ ASSERT(db->db_level > 0);
+ DBUF_VERIFY(db);
+
+ if (db->db_buf == NULL) {
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+ mutex_enter(&db->db_mtx);
+ }
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ ASSERT(db->db_buf != NULL);
+
+ dbuf_check_blkptr(dn, db);
+
+ db->db_data_pending = dr;
+ mutex_exit(&db->db_mtx);
+
+ arc_release(db->db_buf, db);
+
+ /*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4,
+ zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx);
+
+ zio = dr->dr_zio;
+ mutex_enter(&dr->dt.di.dr_mtx);
+ dbuf_sync_list(&dr->dt.di.dr_children, tx);
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ mutex_exit(&dr->dt.di.dr_mtx);
+ zio_nowait(zio);
+}
+
+static void
+dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ arc_buf_t **datap = &dr->dt.dl.dr_data;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
dnode_t *dn = db->db_dnode;
objset_impl_t *os = dn->dn_objset;
- int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ uint64_t txg = tx->tx_txg;
int checksum, compress;
- zbookmark_t zb;
int blksz;
- arc_buf_contents_t type;
ASSERT(dmu_tx_is_syncing(tx));
@@ -1791,25 +1887,20 @@
ASSERT(db->db.db_data == NULL);
} else if (db->db_state == DB_FILL) {
/* This buffer was freed and is now being re-filled */
- ASSERT(db->db.db_data != db->db_d.db_data_old[txg&TXG_MASK]);
+ ASSERT(db->db.db_data != dr->dt.dl.dr_data);
} else {
ASSERT3U(db->db_state, ==, DB_CACHED);
}
DBUF_VERIFY(db);
/*
- * Don't need a lock on db_dirty (dn_mtx), because it can't
- * be modified yet.
+ * If this is a bonus buffer, simply copy the bonus data into the
+ * dnode. It will be written out when the dnode is synced (and it
+ * will be synced, since it must have been dirty for dbuf_sync to
+ * be called).
*/
-
if (db->db_blkid == DB_BONUS_BLKID) {
- arc_buf_t **datap = &db->db_d.db_data_old[txg&TXG_MASK];
- /*
- * Simply copy the bonus data into the dnode. It will
- * be written out when the dnode is synced (and it will
- * be synced, since it must have been dirty for dbuf_sync
- * to be called).
- */
+ dbuf_dirty_record_t **drp;
/*
* Use dn_phys->dn_bonuslen since db.db_size is the length
* of the bonus buffer in the open transaction rather than
@@ -1821,10 +1912,13 @@
bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
if (*datap != db->db.db_data)
zio_buf_free(*datap, DN_MAX_BONUSLEN);
- db->db_d.db_data_old[txg&TXG_MASK] = NULL;
db->db_data_pending = NULL;
- if (db->db_dirtied == txg)
- db->db_dirtied = 0;
+ drp = &db->db_last_dirty;
+ while (*drp != dr)
+ drp = &(*drp)->dr_next;
+ ASSERT((*drp)->dr_next == NULL);
+ *drp = NULL;
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
mutex_exit(&db->db_mtx);
@@ -1832,20 +1926,51 @@
return;
}
- if (db->db_level == 0) {
- type = DBUF_GET_BUFC_TYPE(db);
- data = &db->db_d.db_data_old[txg&TXG_MASK];
- blksz = arc_buf_size(*data);
+ /*
+ * If this buffer is in the middle of an immdiate write,
+ * wait for the synchronous IO to complete.
+ */
+ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
+ }
+
+ dbuf_check_blkptr(dn, db);
+
+ /*
+ * If this dbuf has already been written out via an immediate write,
+ * just complete the write by copying over the new block pointer and
+ * updating the accounting via the write-completion functions.
+ */
+ if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ zio_t zio_fake;
- /*
- * This buffer is in the middle of an immdiate write.
- * Wait for the synchronous IO to complete.
- */
- while (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) {
- ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
- cv_wait(&db->db_changed, &db->db_mtx);
- ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK]);
- }
+ zio_fake.io_private = &db;
+ zio_fake.io_error = 0;
+ zio_fake.io_bp = db->db_blkptr;
+ zio_fake.io_bp_orig = *db->db_blkptr;
+ zio_fake.io_txg = txg;
+
+ *db->db_blkptr = dr->dt.dl.dr_overridden_by;
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ db->db_data_pending = dr;
+ dr->dr_zio = &zio_fake;
+ mutex_exit(&db->db_mtx);
+
+ if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
+ dsl_dataset_block_kill(os->os_dsl_dataset,
+ &zio_fake.io_bp_orig, dn->dn_zio, tx);
+
+ dbuf_write_ready(&zio_fake, db->db_buf, db);
+ dbuf_write_done(&zio_fake, db->db_buf, db);
+
+ return;
+ }
+
+ blksz = arc_buf_size(*datap);
+
+ if (dn->dn_object != DMU_META_DNODE_OBJECT) {
/*
* If this buffer is currently "in use" (i.e., there are
* active holds and db_data still references it), then make
@@ -1853,326 +1978,154 @@
* from the open txg will not leak into this write.
*
* NOTE: this copy does not need to be made for objects only
- * modified in the syncing context (e.g. DNONE_DNODE blocks)
- * or if there is no actual write involved (bonus blocks).
+ * modified in the syncing context (e.g. DNONE_DNODE blocks).
*/
- if (dn->dn_object != DMU_META_DNODE_OBJECT &&
- db->db_d.db_overridden_by[txg&TXG_MASK] == NULL) {
- if (refcount_count(&db->db_holds) > 1 &&
- *data == db->db_buf) {
- *data = arc_buf_alloc(os->os_spa, blksz, db,
- type);
- bcopy(db->db.db_data, (*data)->b_data, blksz);
- }
- db->db_data_pending = *data;
- } else if (dn->dn_object == DMU_META_DNODE_OBJECT) {
- /*
- * Private object buffers are released here rather
- * than in dbuf_dirty() since they are only modified
- * in the syncing context and we don't want the
- * overhead of making multiple copies of the data.
- */
- arc_release(db->db_buf, db);
+ if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
+ bcopy(db->db.db_data, (*datap)->b_data, blksz);
}
} else {
- data = &db->db_buf;
- if (*data == NULL) {
- /*
- * This can happen if we dirty and then free
- * the level-0 data blocks in the same txg. So
- * this indirect remains unchanged.
- */
- if (db->db_dirtied == txg)
- db->db_dirtied = 0;
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
- mutex_exit(&db->db_mtx);
- dbuf_rele(db, (void *)(uintptr_t)txg);
- return;
- }
- blksz = db->db.db_size;
- ASSERT3U(blksz, ==, 1<<dn->dn_phys->dn_indblkshift);
+ /*
+ * Private object buffers are released here rather
+ * than in dbuf_dirty() since they are only modified
+ * in the syncing context and we don't want the
+ * overhead of making multiple copies of the data.
+ */
+ arc_release(db->db_buf, db);
+ }
+
+ ASSERT(*datap != NULL);
+ db->db_data_pending = dr;
+
+ mutex_exit(&db->db_mtx);
+
+ /*
+ * Allow dnode settings to override objset settings,
+ * except for metadata checksums.
+ */
+ if (dmu_ot[dn->dn_type].ot_metadata) {
+ checksum = os->os_md_checksum;
+ compress = zio_compress_select(dn->dn_compress,
+ os->os_md_compress);
+ } else {
+ checksum = zio_checksum_select(dn->dn_checksum,
+ os->os_checksum);
+ compress = zio_compress_select(dn->dn_compress,
+ os->os_compress);
}
- ASSERT(*data != NULL);
+ dbuf_write(dr, *datap, checksum, compress, tx);
- if (db->db_level > 0 && !arc_released(db->db_buf)) {
- /*
- * This indirect buffer was marked dirty, but
- * never modified (if it had been modified, then
- * we would have released the buffer). There is
- * no reason to write anything.
- */
- db->db_data_pending = NULL;
- if (db->db_dirtied == txg)
- db->db_dirtied = 0;
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
- mutex_exit(&db->db_mtx);
- dbuf_rele(db, (void *)(uintptr_t)txg);
- return;
- } else if (db->db_blkptr == NULL &&
- db->db_level == dn->dn_phys->dn_nlevels-1 &&
- db->db_blkid < dn->dn_phys->dn_nblkptr) {
- /*
- * This buffer was allocated at a time when there was
- * no available blkptrs from the dnode, or it was
- * inappropriate to hook it in (i.e., nlevels mis-match).
- */
- ASSERT(db->db_blkptr == NULL);
- ASSERT(db->db_parent == NULL);
- db->db_parent = dn->dn_dbuf;
- db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
- DBUF_VERIFY(db);
- mutex_exit(&db->db_mtx);
- } else if (db->db_blkptr == NULL) {
- dmu_buf_impl_t *parent = db->db_parent;
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ if (dn->dn_object == DMU_META_DNODE_OBJECT)
+ list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
+ else
+ zio_nowait(dr->dr_zio);
+}
- mutex_exit(&db->db_mtx);
- ASSERT(dn->dn_phys->dn_nlevels > 1);
- if (parent == NULL) {
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- (void) dbuf_hold_impl(dn, db->db_level+1,
- db->db_blkid >> epbs, FALSE, FTAG, &parent);
- rw_exit(&dn->dn_struct_rwlock);
- dbuf_add_ref(parent, db);
- db->db_parent = parent;
- dbuf_rele(parent, FTAG);
- }
- (void) dbuf_read(parent, NULL, DB_RF_MUST_SUCCEED);
- } else {
- mutex_exit(&db->db_mtx);
- }
-
- ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || db->db_parent != NULL);
+void
+dbuf_sync_list(list_t *list, dmu_tx_t *tx)
+{
+ dbuf_dirty_record_t *dr;
- if (db->db_level > 0 &&
- db->db_blkid > dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)) {
- /*
- * Don't write indirect blocks past EOF.
- * We get these when we truncate a file *after* dirtying
- * blocks in the truncate range (we undirty the level 0
- * blocks in dbuf_free_range(), but not the indirects).
- */
-#ifdef ZFS_DEBUG
- /*
- * Verify that this indirect block is empty.
- */
- blkptr_t *bplist;
- int i;
-
- mutex_enter(&db->db_mtx);
- bplist = db->db.db_data;
- for (i = 0; i < (1 << epbs); i++) {
- if (!BP_IS_HOLE(&bplist[i])) {
- panic("data past EOF: "
- "db=%p level=%d id=%llu i=%d\n",
- db, db->db_level,
- (u_longlong_t)db->db_blkid, i);
- }
+ while (dr = list_head(list)) {
+ if (dr->dr_zio != NULL) {
+ /*
+ * If we find an already initialized zio then we
+ * are processing the meta-dnode, and we have finished.
+ * The dbufs for all dnodes are put back on the list
+ * during processing, so that we can zio_wait()
+ * these IOs after initiating all child IOs.
+ */
+ ASSERT3U(dr->dr_dbuf->db.db_object, ==,
+ DMU_META_DNODE_OBJECT);
+ break;
}
- mutex_exit(&db->db_mtx);
-#endif
- ASSERT(db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr));
- mutex_enter(&db->db_mtx);
- db->db_dirtycnt -= 1;
- mutex_exit(&db->db_mtx);
- dbuf_rele(db, (void *)(uintptr_t)txg);
- return;
+ list_remove(list, dr);
+ if (dr->dr_dbuf->db_level > 0)
+ dbuf_sync_indirect(dr, tx);
+ else
+ dbuf_sync_leaf(dr, tx);
}
+}
- if (db->db_parent != dn->dn_dbuf) {
- dmu_buf_impl_t *parent = db->db_parent;
+static void
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
+ int compress, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = db->db_dnode;
+ objset_impl_t *os = dn->dn_objset;
+ dmu_buf_impl_t *parent = db->db_parent;
+ uint64_t txg = tx->tx_txg;
+ zbookmark_t zb;
+ zio_t *zio;
- mutex_enter(&db->db_mtx);
+ if (parent != dn->dn_dbuf) {
+ ASSERT(parent && parent->db_data_pending);
ASSERT(db->db_level == parent->db_level-1);
- ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK]));
- /*
- * We may have read this indirect block after we dirtied it,
- * so never released it from the cache.
- */
- arc_release(parent->db_buf, parent);
-
- db->db_blkptr = (blkptr_t *)parent->db.db_data +
- (db->db_blkid & ((1ULL << epbs) - 1));
- DBUF_VERIFY(db);
- mutex_exit(&db->db_mtx);
-#ifdef ZFS_DEBUG
+ ASSERT(arc_released(parent->db_buf));
+ zio = parent->db_data_pending->dr_zio;
} else {
- /*
- * We don't need to dnode_setdirty(dn) because if we got
- * here then the parent is already dirty.
- */
ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
ASSERT3P(db->db_blkptr, ==,
&dn->dn_phys->dn_blkptr[db->db_blkid]);
-#endif
- }
- ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
-
- if (db->db_level == 0 &&
- db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
- arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
- blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK];
- int old_size = bp_get_dasize(os->os_spa, db->db_blkptr);
- int new_size = bp_get_dasize(os->os_spa, *bpp);
-
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
-
- dnode_diduse_space(dn, new_size-old_size);
- mutex_enter(&dn->dn_mtx);
- if (db->db_blkid > dn->dn_phys->dn_maxblkid)
- dn->dn_phys->dn_maxblkid = db->db_blkid;
- mutex_exit(&dn->dn_mtx);
-
- dsl_dataset_block_born(os->os_dsl_dataset, *bpp, tx);
- if (!BP_IS_HOLE(db->db_blkptr))
- dsl_dataset_block_kill(os->os_dsl_dataset,
- db->db_blkptr, os->os_synctx);
-
- mutex_enter(&db->db_mtx);
- *db->db_blkptr = **bpp;
- kmem_free(*bpp, sizeof (blkptr_t));
- *bpp = NULL;
-
- if (*old != db->db_buf)
- VERIFY(arc_buf_remove_ref(*old, db) == 1);
- else if (!BP_IS_HOLE(db->db_blkptr))
- arc_set_callback(db->db_buf, dbuf_do_evict, db);
- else
- ASSERT(arc_released(db->db_buf));
- *old = NULL;
- db->db_data_pending = NULL;
-
- cv_broadcast(&db->db_changed);
-
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
- mutex_exit(&db->db_mtx);
- dbuf_rele(db, (void *)(uintptr_t)txg);
- return;
+ zio = dn->dn_zio;
}
- if (db->db_level > 0) {
- /*
- * XXX -- we should design a compression algorithm
- * that specializes in arrays of bps.
- */
- checksum = ZIO_CHECKSUM_FLETCHER_4;
- if (zfs_mdcomp_disable)
- compress = ZIO_COMPRESS_EMPTY;
- else
- compress = ZIO_COMPRESS_LZJB;
- } else {
- /*
- * Allow dnode settings to override objset settings,
- * except for metadata checksums.
- */
- if (dmu_ot[dn->dn_type].ot_metadata) {
- checksum = os->os_md_checksum;
- compress = zio_compress_select(dn->dn_compress,
- os->os_md_compress);
- } else {
- checksum = zio_checksum_select(dn->dn_checksum,
- os->os_checksum);
- compress = zio_compress_select(dn->dn_compress,
- os->os_compress);
- }
- }
-#ifdef ZFS_DEBUG
- if (db->db_parent) {
- ASSERT(list_link_active(
- &db->db_parent->db_dirty_node[txg&TXG_MASK]));
- ASSERT(db->db_parent == dn->dn_dbuf ||
- db->db_parent->db_level > 0);
- if (dn->dn_object == DMU_META_DNODE_OBJECT || db->db_level > 0)
- ASSERT(*data == db->db_buf);
- }
-#endif
- ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
+ ASSERT(db->db_level == 0 || data == db->db_buf);
+ ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+ ASSERT(zio);
+
zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
zb.zb_object = db->db.db_object;
zb.zb_level = db->db_level;
zb.zb_blkid = db->db_blkid;
- (void) arc_write(zio, os->os_spa, checksum, compress,
- dmu_get_replication_level(os->os_spa, &zb, dn->dn_type), txg,
- db->db_blkptr, *data, dbuf_write_done, db,
- ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT, &zb);
- /*
- * We can't access db after arc_write, since it could finish
- * and be freed, and we have no locks on it.
- */
-}
-
-struct dbuf_arg {
- objset_impl_t *os;
- blkptr_t bp;
-};
+ if (BP_IS_OLDER(db->db_blkptr, txg))
+ dsl_dataset_block_kill(
+ os->os_dsl_dataset, db->db_blkptr, zio, tx);
-static void
-dbuf_do_born(void *arg)
-{
- struct dbuf_arg *da = arg;
- dsl_dataset_block_born(da->os->os_dsl_dataset,
- &da->bp, da->os->os_synctx);
- kmem_free(da, sizeof (struct dbuf_arg));
-}
-
-static void
-dbuf_do_kill(void *arg)
-{
- struct dbuf_arg *da = arg;
- dsl_dataset_block_kill(da->os->os_dsl_dataset,
- &da->bp, da->os->os_synctx);
- kmem_free(da, sizeof (struct dbuf_arg));
+ dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress,
+ dmu_get_replication_level(os->os_spa, &zb, dn->dn_type), txg,
+ db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}
/* ARGSUSED */
static void
-dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
dnode_t *dn = db->db_dnode;
objset_impl_t *os = dn->dn_objset;
- uint64_t txg = zio->io_txg;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
uint64_t fill = 0;
- int i;
- int old_size, new_size;
+ int old_size, new_size, i;
- ASSERT3U(zio->io_error, ==, 0);
+ dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
- dprintf_dbuf_bp(db, &zio->io_bp_orig, "bp_orig: %s", "");
-
- old_size = bp_get_dasize(os->os_spa, &zio->io_bp_orig);
+ old_size = bp_get_dasize(os->os_spa, bp_orig);
new_size = bp_get_dasize(os->os_spa, zio->io_bp);
dnode_diduse_space(dn, new_size-old_size);
- mutex_enter(&db->db_mtx);
+ if (BP_IS_HOLE(zio->io_bp)) {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dmu_tx_t *tx = os->os_synctx;
- ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == NULL);
+ if (bp_orig->blk_birth == tx->tx_txg)
+ dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
+ ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
+ return;
+ }
- if (db->db_dirtied == txg)
- db->db_dirtied = 0;
+ mutex_enter(&db->db_mtx);
if (db->db_level == 0) {
- arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
-
- ASSERT(db->db_blkid != DB_BONUS_BLKID);
-
- if (*old != db->db_buf)
- VERIFY(arc_buf_remove_ref(*old, db) == 1);
- else if (!BP_IS_HOLE(db->db_blkptr))
- arc_set_callback(db->db_buf, dbuf_do_evict, db);
- else
- ASSERT(arc_released(db->db_buf));
- *old = NULL;
- db->db_data_pending = NULL;
-
mutex_enter(&dn->dn_mtx);
- if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
- !BP_IS_HOLE(db->db_blkptr))
+ if (db->db_blkid > dn->dn_phys->dn_maxblkid)
dn->dn_phys->dn_maxblkid = db->db_blkid;
mutex_exit(&dn->dn_mtx);
@@ -2184,22 +2137,11 @@
fill++;
}
} else {
- if (!BP_IS_HOLE(db->db_blkptr))
- fill = 1;
+ fill = 1;
}
} else {
blkptr_t *bp = db->db.db_data;
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
- if (!BP_IS_HOLE(db->db_blkptr)) {
- int epbs =
- dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
- ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, db->db.db_size);
- ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
- db->db.db_size);
- ASSERT3U(dn->dn_phys->dn_maxblkid
- >> (db->db_level * epbs), >=, db->db_blkid);
- arc_set_callback(db->db_buf, dbuf_do_evict, db);
- }
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
if (BP_IS_HOLE(bp))
continue;
@@ -2210,40 +2152,78 @@
}
}
- if (!BP_IS_HOLE(db->db_blkptr)) {
- db->db_blkptr->blk_fill = fill;
- BP_SET_TYPE(db->db_blkptr, dn->dn_type);
- BP_SET_LEVEL(db->db_blkptr, db->db_level);
+ db->db_blkptr->blk_fill = fill;
+ BP_SET_TYPE(db->db_blkptr, dn->dn_type);
+ BP_SET_LEVEL(db->db_blkptr, db->db_level);
+
+ mutex_exit(&db->db_mtx);
+
+ /* We must do this after we've set the bp's type and level */
+ if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ if (bp_orig->blk_birth == tx->tx_txg)
+ dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
+ dsl_dataset_block_born(ds, zio->io_bp, tx);
+ }
+}
+
+/* ARGSUSED */
+static void
+dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ uint64_t txg = zio->io_txg;
+ dbuf_dirty_record_t **drp, *dr;
+
+ ASSERT3U(zio->io_error, ==, 0);
+
+ mutex_enter(&db->db_mtx);
+
+ drp = &db->db_last_dirty;
+ while (*drp != db->db_data_pending)
+ drp = &(*drp)->dr_next;
+ ASSERT(!list_link_active(&(*drp)->dr_dirty_node));
+ ASSERT((*drp)->dr_txg == txg);
+ ASSERT((*drp)->dr_next == NULL);
+ dr = *drp;
+ *drp = NULL;
+
+ if (db->db_level == 0) {
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+
+ if (dr->dt.dl.dr_data != db->db_buf)
+ VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
+ else if (!BP_IS_HOLE(db->db_blkptr))
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ else
+ ASSERT(arc_released(db->db_buf));
} else {
- ASSERT3U(fill, ==, 0);
- ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
- }
+ dnode_t *dn = db->db_dnode;
- dprintf_dbuf_bp(db, db->db_blkptr,
- "wrote %llu bytes to blkptr:", zio->io_size);
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ if (!BP_IS_HOLE(db->db_blkptr)) {
+ int epbs =
+ dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+ db->db.db_size);
+ ASSERT3U(dn->dn_phys->dn_maxblkid
+ >> (db->db_level * epbs), >=, db->db_blkid);
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ }
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
- ASSERT(db->db_parent == NULL ||
- list_link_active(&db->db_parent->db_dirty_node[txg&TXG_MASK]));
cv_broadcast(&db->db_changed);
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
+ db->db_data_pending = NULL;
mutex_exit(&db->db_mtx);
- /* We must do this after we've set the bp's type and level */
- if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
- BP_IDENTITY(&zio->io_bp_orig))) {
- struct dbuf_arg *da;
- da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
- da->os = os;
- da->bp = *zio->io_bp;
- (void) taskq_dispatch(dbuf_tq, dbuf_do_born, da, 0);
- if (!BP_IS_HOLE(&zio->io_bp_orig)) {
- da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
- da->os = os;
- da->bp = zio->io_bp_orig;
- (void) taskq_dispatch(dbuf_tq, dbuf_do_kill, da, 0);
- }
- }
+ dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");
dbuf_rele(db, (void *)(uintptr_t)txg);
}
--- a/usr/src/uts/common/fs/zfs/dmu.c Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu.c Fri Feb 02 15:36:58 2007 -0800
@@ -567,27 +567,19 @@
#endif
typedef struct {
- uint64_t txg;
- dmu_buf_impl_t *db;
- dmu_sync_cb_t *done;
- void *arg;
-} dmu_sync_cbin_t;
-
-typedef union {
- dmu_sync_cbin_t data;
- blkptr_t blk;
-} dmu_sync_cbarg_t;
+ dbuf_dirty_record_t *dr;
+ dmu_sync_cb_t *done;
+ void *arg;
+} dmu_sync_arg_t;
/* ARGSUSED */
static void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
- dmu_sync_cbin_t *in = (dmu_sync_cbin_t *)varg;
- dmu_buf_impl_t *db = in->db;
- uint64_t txg = in->txg;
+ dmu_sync_arg_t *in = varg;
+ dbuf_dirty_record_t *dr = in->dr;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
dmu_sync_cb_t *done = in->done;
- void *arg = in->arg;
- blkptr_t *blk = (blkptr_t *)varg;
if (!BP_IS_HOLE(zio->io_bp)) {
zio->io_bp->blk_fill = 1;
@@ -595,16 +587,17 @@
BP_SET_LEVEL(zio->io_bp, 0);
}
- *blk = *zio->io_bp; /* structure assignment */
-
mutex_enter(&db->db_mtx);
- ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC);
- db->db_d.db_overridden_by[txg&TXG_MASK] = blk;
+ ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
+ dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
+ dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
cv_broadcast(&db->db_changed);
mutex_exit(&db->db_mtx);
if (done)
- done(&(db->db), arg);
+ done(&(db->db), in->arg);
+
+ kmem_free(in, sizeof (dmu_sync_arg_t));
}
/*
@@ -637,10 +630,10 @@
objset_impl_t *os = db->db_objset;
dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
tx_state_t *tx = &dp->dp_tx;
- dmu_sync_cbin_t *in;
- blkptr_t *blk;
+ dbuf_dirty_record_t *dr;
+ dmu_sync_arg_t *in;
zbookmark_t zb;
- uint32_t arc_flag;
+ zio_t *zio;
int err;
ASSERT(BP_IS_HOLE(bp));
@@ -674,25 +667,6 @@
mutex_enter(&db->db_mtx);
- blk = db->db_d.db_overridden_by[txg&TXG_MASK];
- if (blk == IN_DMU_SYNC) {
- /*
- * We have already issued a sync write for this buffer.
- */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (EALREADY);
- } else if (blk != NULL) {
- /*
- * This buffer had already been synced. It could not
- * have been dirtied since, or we would have cleared blk.
- */
- *bp = *blk; /* structure assignment */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (0);
- }
-
if (txg == tx->tx_syncing_txg) {
while (db->db_data_pending) {
/*
@@ -726,7 +700,10 @@
}
}
- if (db->db_d.db_data_old[txg&TXG_MASK] == NULL) {
+ dr = db->db_last_dirty;
+ while (dr && dr->dr_txg > txg)
+ dr = dr->dr_next;
+ if (dr == NULL || dr->dr_txg < txg) {
/*
* This dbuf isn't dirty, must have been free_range'd.
* There's no need to log writes to freed blocks, so we're done.
@@ -736,35 +713,52 @@
return (ENOENT);
}
- ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == NULL);
- db->db_d.db_overridden_by[txg&TXG_MASK] = IN_DMU_SYNC;
- /*
- * XXX - a little ugly to stash the blkptr in the callback
- * buffer. We always need to make sure the following is true:
- * ASSERT(sizeof(blkptr_t) >= sizeof(dmu_sync_cbin_t));
- */
- in = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
- in->db = db;
- in->txg = txg;
+ ASSERT(dr->dr_txg == txg);
+ if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+ /*
+ * We have already issued a sync write for this buffer.
+ */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (EALREADY);
+ } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ /*
+ * This buffer has already been synced. It could not
+ * have been dirtied since, or we would have cleared the state.
+ */
+ *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (0);
+ }
+
+ dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
+ in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ in->dr = dr;
in->done = done;
in->arg = arg;
mutex_exit(&db->db_mtx);
txg_resume(dp);
- arc_flag = pio == NULL ? ARC_WAIT : ARC_NOWAIT;
zb.zb_objset = os->os_dsl_dataset->ds_object;
zb.zb_object = db->db.db_object;
zb.zb_level = db->db_level;
zb.zb_blkid = db->db_blkid;
- err = arc_write(pio, os->os_spa,
+ zio = arc_write(pio, os->os_spa,
zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum),
zio_compress_select(db->db_dnode->dn_compress, os->os_compress),
dmu_get_replication_level(os->os_spa, &zb, db->db_dnode->dn_type),
- txg, bp, db->db_d.db_data_old[txg&TXG_MASK], dmu_sync_done, in,
- ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, arc_flag, &zb);
- ASSERT(err == 0);
+ txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
- return (arc_flag == ARC_NOWAIT ? EINPROGRESS : 0);
+ if (pio) {
+ zio_nowait(zio);
+ err = EINPROGRESS;
+ } else {
+ err = zio_wait(zio);
+ ASSERT(err == 0);
+ }
+ return (err);
}
int
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -139,10 +139,8 @@
osi->os.os = osi;
osi->os_dsl_dataset = ds;
osi->os_spa = spa;
- if (bp)
- osi->os_rootbp = *bp;
- osi->os_phys = zio_buf_alloc(sizeof (objset_phys_t));
- if (!BP_IS_HOLE(&osi->os_rootbp)) {
+ osi->os_rootbp = bp;
+ if (!BP_IS_HOLE(osi->os_rootbp)) {
uint32_t aflags = ARC_WAIT;
zbookmark_t zb;
zb.zb_objset = ds ? ds->ds_object : 0;
@@ -150,17 +148,21 @@
zb.zb_level = -1;
zb.zb_blkid = 0;
- dprintf_bp(&osi->os_rootbp, "reading %s", "");
- err = arc_read(NULL, spa, &osi->os_rootbp,
+ dprintf_bp(osi->os_rootbp, "reading %s", "");
+ err = arc_read(NULL, spa, osi->os_rootbp,
dmu_ot[DMU_OT_OBJSET].ot_byteswap,
- arc_bcopy_func, osi->os_phys,
+ arc_getbuf_func, &osi->os_phys_buf,
ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
if (err) {
- zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
kmem_free(osi, sizeof (objset_impl_t));
return (err);
}
+ osi->os_phys = osi->os_phys_buf->b_data;
+ arc_release(osi->os_phys_buf, &osi->os_phys_buf);
} else {
+ osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t),
+ &osi->os_phys_buf, ARC_BUFC_METADATA);
+ osi->os_phys = osi->os_phys_buf->b_data;
bzero(osi->os_phys, sizeof (objset_phys_t));
}
@@ -177,7 +179,8 @@
err = dsl_prop_register(ds, "compression",
compression_changed_cb, osi);
if (err) {
- zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
+ VERIFY(arc_buf_remove_ref(osi->os_phys_buf,
+ &osi->os_phys_buf) == 1);
kmem_free(osi, sizeof (objset_impl_t));
return (err);
}
@@ -252,11 +255,8 @@
osi = dsl_dataset_get_user_ptr(ds);
if (osi == NULL) {
- blkptr_t bp;
-
- dsl_dataset_get_blkptr(ds, &bp);
err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
- ds, &bp, &osi);
+ ds, &ds->ds_phys->ds_bp, &osi);
if (err) {
dsl_dataset_close(ds, mode, os);
kmem_free(os, sizeof (objset_t));
@@ -364,7 +364,7 @@
dnode_special_close(osi->os_meta_dnode);
zil_free(osi->os_zil);
- zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
+ VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1);
mutex_destroy(&osi->os_lock);
mutex_destroy(&osi->os_obj_lock);
kmem_free(osi, sizeof (objset_impl_t));
@@ -372,14 +372,14 @@
/* called from dsl for meta-objset */
objset_impl_t *
-dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, dmu_objset_type_t type,
- dmu_tx_t *tx)
+dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_objset_type_t type, dmu_tx_t *tx)
{
objset_impl_t *osi;
dnode_t *mdn;
ASSERT(dmu_tx_is_syncing(tx));
- VERIFY(0 == dmu_objset_open_impl(spa, ds, NULL, &osi));
+ VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi));
mdn = osi->os_meta_dnode;
dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
@@ -467,7 +467,7 @@
dsl_dir_t *dd = arg1;
struct oscarg *oa = arg2;
dsl_dataset_t *ds;
- blkptr_t bp;
+ blkptr_t *bp;
uint64_t dsobj;
ASSERT(dmu_tx_is_syncing(tx));
@@ -477,13 +477,13 @@
VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds));
- dsl_dataset_get_blkptr(ds, &bp);
- if (BP_IS_HOLE(&bp)) {
+ bp = dsl_dataset_get_blkptr(ds);
+ if (BP_IS_HOLE(bp)) {
objset_impl_t *osi;
/* This is an empty dmu_objset; not a clone. */
osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
- ds, oa->type, tx);
+ ds, bp, oa->type, tx);
if (oa->userfunc)
oa->userfunc(&osi->os, oa->userarg, tx);
@@ -660,41 +660,41 @@
}
static void
-dmu_objset_sync_dnodes(objset_impl_t *os, list_t *list, dmu_tx_t *tx)
+dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx)
{
- dnode_t *dn = list_head(list);
- int level, err;
+ dnode_t *dn;
- for (level = 0; dn = list_head(list); level++) {
- zio_t *zio;
- zio = zio_root(os->os_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-
- ASSERT3U(level, <=, DN_MAX_LEVELS);
-
- while (dn) {
- dnode_t *next = list_next(list, dn);
+ while (dn = list_head(list)) {
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+ ASSERT(dn->dn_dbuf->db_data_pending);
+ /*
+ * Initialize dn_zio outside dnode_sync()
+	 * to accommodate meta-dnode
+ */
+ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
+ ASSERT(dn->dn_zio);
- list_remove(list, dn);
- if (dnode_sync(dn, level, zio, tx) == 0) {
- /*
- * This dnode requires syncing at higher
- * levels; put it back onto the list.
- */
- if (next)
- list_insert_before(list, next, dn);
- else
- list_insert_tail(list, dn);
- }
- dn = next;
- }
+ ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
+ list_remove(list, dn);
+ dnode_sync(dn, tx);
+ }
+}
- DTRACE_PROBE1(wait__begin, zio_t *, zio);
- err = zio_wait(zio);
- DTRACE_PROBE4(wait__end, zio_t *, zio,
- uint64_t, tx->tx_txg, objset_impl_t *, os, int, level);
+/* ARGSUSED */
+static void
+ready(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ objset_impl_t *os = arg;
+ blkptr_t *bp = os->os_rootbp;
+ dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
+ int i;
- ASSERT(err == 0);
- }
+ /*
+ * Update rootbp fill count.
+ */
+ bp->blk_fill = 1; /* count the meta-dnode */
+ for (i = 0; i < dnp->dn_nblkptr; i++)
+ bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
}
/* ARGSUSED */
@@ -702,90 +702,81 @@
killer(zio_t *zio, arc_buf_t *abuf, void *arg)
{
objset_impl_t *os = arg;
- objset_phys_t *osphys = zio->io_data;
- dnode_phys_t *dnp = &osphys->os_meta_dnode;
- int i;
ASSERT3U(zio->io_error, ==, 0);
- /*
- * Update rootbp fill count.
- */
- os->os_rootbp.blk_fill = 1; /* count the meta-dnode */
- for (i = 0; i < dnp->dn_nblkptr; i++)
- os->os_rootbp.blk_fill += dnp->dn_blkptr[i].blk_fill;
-
BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET);
BP_SET_LEVEL(zio->io_bp, 0);
if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
BP_IDENTITY(&zio->io_bp_orig))) {
- dsl_dataset_block_kill(os->os_dsl_dataset, &zio->io_bp_orig,
- os->os_synctx);
+ if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
+ dsl_dataset_block_kill(os->os_dsl_dataset,
+ &zio->io_bp_orig, NULL, os->os_synctx);
dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp,
os->os_synctx);
}
+ arc_release(os->os_phys_buf, &os->os_phys_buf);
+
+ if (os->os_dsl_dataset)
+ dmu_buf_rele(os->os_dsl_dataset->ds_dbuf, os->os_dsl_dataset);
}
/* called from dsl */
void
-dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx)
+dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
{
- extern taskq_t *dbuf_tq;
int txgoff;
- list_t *dirty_list;
- int err;
zbookmark_t zb;
- arc_buf_t *abuf =
- arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG,
- ARC_BUFC_METADATA);
+ zio_t *zio;
+ list_t *list;
+ dbuf_dirty_record_t *dr;
+
+ dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(os->os_synctx == NULL);
/* XXX the write_done callback should really give us the tx... */
os->os_synctx = tx;
- dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
+ /*
+ * Create the root block IO
+ */
+ zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = 0;
+ if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg))
+ dsl_dataset_block_kill(os->os_dsl_dataset,
+ os->os_rootbp, pio, tx);
+ zio = arc_write(pio, os->os_spa, os->os_md_checksum,
+ os->os_md_compress,
+ dmu_get_replication_level(os->os_spa, &zb, DMU_OT_OBJSET),
+ tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, killer, os,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+
+ /*
+ * Sync meta-dnode - the parent IO for the sync is the root block
+ */
+ os->os_meta_dnode->dn_zio = zio;
+ dnode_sync(os->os_meta_dnode, tx);
txgoff = tx->tx_txg & TXG_MASK;
- dmu_objset_sync_dnodes(os, &os->os_free_dnodes[txgoff], tx);
- dmu_objset_sync_dnodes(os, &os->os_dirty_dnodes[txgoff], tx);
+ dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx);
+ dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx);
+ list = &os->os_meta_dnode->dn_dirty_records[txgoff];
+ while (dr = list_head(list)) {
+ ASSERT(dr->dr_dbuf->db_level == 0);
+ list_remove(list, dr);
+ if (dr->dr_zio)
+ zio_nowait(dr->dr_zio);
+ }
/*
* Free intent log blocks up to this tx.
*/
zil_sync(os->os_zil, tx);
-
- /*
- * Sync meta-dnode
- */
- dirty_list = &os->os_dirty_dnodes[txgoff];
- ASSERT(list_head(dirty_list) == NULL);
- list_insert_tail(dirty_list, os->os_meta_dnode);
- dmu_objset_sync_dnodes(os, dirty_list, tx);
-
- /*
- * Sync the root block.
- */
- bcopy(os->os_phys, abuf->b_data, sizeof (objset_phys_t));
- zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = 0;
- err = arc_write(NULL, os->os_spa, os->os_md_checksum,
- os->os_md_compress,
- dmu_get_replication_level(os->os_spa, &zb, DMU_OT_OBJSET),
- tx->tx_txg, &os->os_rootbp, abuf, killer, os,
- ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb);
- ASSERT(err == 0);
- VERIFY(arc_buf_remove_ref(abuf, FTAG) == 1);
-
- dsl_dataset_set_blkptr(os->os_dsl_dataset, &os->os_rootbp, tx);
-
- ASSERT3P(os->os_synctx, ==, tx);
- taskq_wait(dbuf_tq);
- os->os_synctx = NULL;
+ zio_nowait(zio);
}
void
--- a/usr/src/uts/common/fs/zfs/dmu_send.c Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -382,7 +382,7 @@
DS_MODE_EXCLUSIVE, FTAG, &ds));
(void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
- ds, drrb->drr_type, tx);
+ ds, &ds->ds_phys->ds_bp, drrb->drr_type, tx);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
--- a/usr/src/uts/common/fs/zfs/dnode.c Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/dnode.c Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -65,9 +65,9 @@
avl_create(&dn->dn_ranges[i], free_range_compar,
sizeof (free_range_t),
offsetof(struct free_range, fr_node));
- list_create(&dn->dn_dirty_dbufs[i],
- sizeof (dmu_buf_impl_t),
- offsetof(dmu_buf_impl_t, db_dirty_node[i]));
+ list_create(&dn->dn_dirty_records[i],
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
}
list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
@@ -91,7 +91,7 @@
for (i = 0; i < TXG_SIZE; i++) {
avl_destroy(&dn->dn_ranges[i]);
- list_destroy(&dn->dn_dirty_dbufs[i]);
+ list_destroy(&dn->dn_dirty_records[i]);
}
list_destroy(&dn->dn_dbufs);
@@ -296,7 +296,7 @@
for (i = 0; i < TXG_SIZE; i++) {
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
- ASSERT(NULL == list_head(&dn->dn_dirty_dbufs[i]));
+ ASSERT(NULL == list_head(&dn->dn_dirty_records[i]));
ASSERT(0 == avl_numnodes(&dn->dn_ranges[i]));
}
ASSERT(NULL == list_head(&dn->dn_dbufs));
@@ -362,7 +362,7 @@
ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
ASSERT3U(dn->dn_next_blksz[i], ==, 0);
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
- ASSERT3P(list_head(&dn->dn_dirty_dbufs[i]), ==, NULL);
+ ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0);
}
@@ -461,7 +461,7 @@
ASSERT(db->db.db_data != NULL);
db->db.db_size = bonuslen;
mutex_exit(&db->db_mtx);
- dbuf_dirty(db, tx);
+ (void) dbuf_dirty(db, tx);
}
/* change bonus size and type */
@@ -714,7 +714,7 @@
*/
dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg);
- dbuf_dirty(dn->dn_dbuf, tx);
+ (void) dbuf_dirty(dn->dn_dbuf, tx);
dsl_dataset_dirty(os->os_dsl_dataset, tx);
}
@@ -855,17 +855,35 @@
if (new_nlevels > dn->dn_nlevels) {
int old_nlevels = dn->dn_nlevels;
dmu_buf_impl_t *db;
+ list_t *list;
+ dbuf_dirty_record_t *new, *dr, *dr_next;
dn->dn_nlevels = new_nlevels;
ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
dn->dn_next_nlevels[txgoff] = new_nlevels;
- /* Dirty the left indirects. */
+ /* dirty the left indirects */
db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
- dbuf_dirty(db, tx);
+ new = dbuf_dirty(db, tx);
dbuf_rele(db, FTAG);
+ /* transfer the dirty records to the new indirect */
+ mutex_enter(&dn->dn_mtx);
+ mutex_enter(&new->dt.di.dr_mtx);
+ list = &dn->dn_dirty_records[txgoff];
+ for (dr = list_head(list); dr; dr = dr_next) {
+ dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
+ if (dr->dr_dbuf->db_level != new_nlevels-1 &&
+ dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) {
+ ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
+ list_remove(&dn->dn_dirty_records[txgoff], dr);
+ list_insert_tail(&new->dt.di.dr_children, dr);
+ dr->dr_parent = new;
+ }
+ }
+ mutex_exit(&new->dt.di.dr_mtx);
+ mutex_exit(&dn->dn_mtx);
}
out:
@@ -973,7 +991,7 @@
caddr_t data;
/* don't dirty if it isn't on disk and isn't dirty */
- if (db->db_dirtied ||
+ if (db->db_last_dirty ||
(db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
rw_exit(&dn->dn_struct_rwlock);
dbuf_will_dirty(db, tx);
@@ -1023,7 +1041,7 @@
if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
TRUE, FTAG, &db) == 0) {
/* don't dirty if not on disk and not dirty */
- if (db->db_dirtied ||
+ if (db->db_last_dirty ||
(db->db_blkptr &&
!BP_IS_HOLE(db->db_blkptr))) {
rw_exit(&dn->dn_struct_rwlock);
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c Fri Feb 02 15:36:58 2007 -0800
@@ -33,78 +33,81 @@
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/spa.h>
-#include <sys/zio.h>
static void
dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
{
dmu_buf_impl_t *db;
+ int txgoff = tx->tx_txg & TXG_MASK;
+ int nblkptr = dn->dn_phys->dn_nblkptr;
+ int old_toplvl = dn->dn_phys->dn_nlevels - 1;
+ int new_level = dn->dn_next_nlevels[txgoff];
int i;
- uint64_t txg = tx->tx_txg;
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+ /* this dnode can't be paged out because it's dirty */
ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
- /* this dnode can't be paged out because it's dirty */
+ ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
ASSERT(db != NULL);
- for (i = 0; i < dn->dn_phys->dn_nblkptr; i++)
- if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
- break;
- if (i != dn->dn_phys->dn_nblkptr) {
- ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]));
- (void) dbuf_read(db, NULL,
- DB_RF_HAVESTRUCT | DB_RF_MUST_SUCCEED);
- arc_release(db->db_buf, db);
- /* copy dnode's block pointers to new indirect block */
- ASSERT3U(sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr, <=,
- db->db.db_size);
- bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
- sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr);
- arc_buf_freeze(db->db_buf);
- }
-
- dn->dn_phys->dn_nlevels += 1;
+ dn->dn_phys->dn_nlevels = new_level;
dprintf("os=%p obj=%llu, increase to %d\n",
dn->dn_objset, dn->dn_object,
dn->dn_phys->dn_nlevels);
+ /* check for existing blkptrs in the dnode */
+ for (i = 0; i < nblkptr; i++)
+ if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
+ break;
+ if (i != nblkptr) {
+ /* transfer dnode's block pointers to new indirect block */
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
+ ASSERT(db->db.db_data);
+ ASSERT(arc_released(db->db_buf));
+ ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
+ bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
+ sizeof (blkptr_t) * nblkptr);
+ arc_buf_freeze(db->db_buf);
+ }
+
/* set dbuf's parent pointers to new indirect buf */
- for (i = 0; i < dn->dn_phys->dn_nblkptr; i++) {
- dmu_buf_impl_t *child =
- dbuf_find(dn, dn->dn_phys->dn_nlevels-2, i);
+ for (i = 0; i < nblkptr; i++) {
+ dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i);
+
if (child == NULL)
continue;
- if (child->db_dnode == NULL) {
+ ASSERT3P(child->db_dnode, ==, dn);
+ if (child->db_parent && child->db_parent != dn->dn_dbuf) {
+ ASSERT(child->db_parent->db_level == db->db_level);
+ ASSERT(child->db_blkptr !=
+ &dn->dn_phys->dn_blkptr[child->db_blkid]);
mutex_exit(&child->db_mtx);
continue;
}
+ ASSERT(child->db_parent == NULL ||
+ child->db_parent == dn->dn_dbuf);
- if (child->db_parent == NULL ||
- child->db_parent == dn->dn_dbuf) {
- dprintf_dbuf_bp(child, child->db_blkptr,
- "changing db_blkptr to new indirect %s", "");
- child->db_parent = db;
- dbuf_add_ref(db, child);
- if (db->db.db_data) {
- child->db_blkptr =
- (blkptr_t *)db->db.db_data + i;
- } else {
- child->db_blkptr = NULL;
- }
- dprintf_dbuf_bp(child, child->db_blkptr,
- "changed db_blkptr to new indirect %s", "");
- }
- ASSERT3P(child->db_parent, ==, db);
+ child->db_parent = db;
+ dbuf_add_ref(db, child);
+ if (db->db.db_data)
+ child->db_blkptr = (blkptr_t *)db->db.db_data + i;
+ else
+ child->db_blkptr = NULL;
+ dprintf_dbuf_bp(child, child->db_blkptr,
+ "changed db_blkptr to new indirect %s", "");
mutex_exit(&child->db_mtx);
}
- bzero(dn->dn_phys->dn_blkptr,
- sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr);
+ bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
dbuf_rele(db, FTAG);
+
+ rw_exit(&dn->dn_struct_rwlock);
}
static void
@@ -122,7 +125,8 @@
bytesfreed += bp_get_dasize(os->os_spa, bp);
ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
- dsl_dataset_block_kill(os->os_dsl_dataset, bp, tx);
+ dsl_dataset_block_kill(os->os_dsl_dataset, bp, dn->dn_zio, tx);
+ bzero(bp, sizeof (blkptr_t));
}
dnode_diduse_space(dn, -bytesfreed);
}
@@ -148,8 +152,9 @@
for (i = off; i < off+num; i++) {
uint64_t *buf;
+ dmu_buf_impl_t *child;
+ dbuf_dirty_record_t *dr;
int j;
- dmu_buf_impl_t *child;
ASSERT(db->db_level == 1);
@@ -161,11 +166,14 @@
continue;
ASSERT(err == 0);
ASSERT(child->db_level == 0);
- ASSERT(!list_link_active(&child->db_dirty_node[txg&TXG_MASK]));
+ dr = child->db_last_dirty;
+ while (dr && dr->dr_txg > txg)
+ dr = dr->dr_next;
+ ASSERT(dr == NULL || dr->dr_txg == txg);
- /* db_data_old better be zeroed */
- if (child->db_d.db_data_old[txg & TXG_MASK]) {
- buf = child->db_d.db_data_old[txg & TXG_MASK]->b_data;
+ /* data_old better be zeroed */
+ if (dr) {
+ buf = dr->dt.dl.dr_data->b_data;
for (j = 0; j < child->db.db_size >> 3; j++) {
if (buf[j] != 0) {
panic("freed data not zero: "
@@ -182,10 +190,7 @@
mutex_enter(&child->db_mtx);
buf = child->db.db_data;
if (buf != NULL && child->db_state != DB_FILL &&
- !list_link_active(&child->db_dirty_node
- [(txg+1) & TXG_MASK]) &&
- !list_link_active(&child->db_dirty_node
- [(txg+2) & TXG_MASK])) {
+ child->db_last_dirty == NULL) {
for (j = 0; j < child->db.db_size >> 3; j++) {
if (buf[j] != 0) {
panic("freed data not zero: "
@@ -210,7 +215,6 @@
dmu_buf_impl_t *subdb;
uint64_t start, end, dbstart, dbend, i;
int epbs, shift, err;
- int txgoff = tx->tx_txg & TXG_MASK;
int all = TRUE;
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
@@ -239,7 +243,7 @@
FREE_VERIFY(db, start, end, tx);
free_blocks(dn, bp, end-start+1, tx);
arc_buf_freeze(db->db_buf);
- ASSERT(all || list_link_active(&db->db_dirty_node[txgoff]));
+ ASSERT(all || db->db_last_dirty);
return (all);
}
@@ -270,7 +274,7 @@
ASSERT3U(bp->blk_birth, ==, 0);
}
#endif
- ASSERT(all || list_link_active(&db->db_dirty_node[txgoff]));
+ ASSERT(all || db->db_last_dirty);
return (all);
}
@@ -418,31 +422,43 @@
return (0);
}
-static int
+static void
+dnode_undirty_dbufs(list_t *list)
+{
+ dbuf_dirty_record_t *dr;
+
+ while (dr = list_head(list)) {
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ uint64_t txg = dr->dr_txg;
+
+ mutex_enter(&db->db_mtx);
+ /* XXX - use dbuf_undirty()? */
+ list_remove(list, dr);
+ ASSERT(db->db_last_dirty == dr);
+ db->db_last_dirty = NULL;
+ db->db_dirtycnt -= 1;
+ if (db->db_level == 0) {
+ ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+ dr->dt.dl.dr_data == db->db_buf);
+ dbuf_unoverride(dr);
+ mutex_exit(&db->db_mtx);
+ } else {
+ mutex_exit(&db->db_mtx);
+ dnode_undirty_dbufs(&dr->dt.di.dr_children);
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+ dbuf_rele(db, (void *)(uintptr_t)txg);
+ }
+}
+
+static void
dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
{
- dmu_buf_impl_t *db;
int txgoff = tx->tx_txg & TXG_MASK;
ASSERT(dmu_tx_is_syncing(tx));
- /* Undirty all buffers */
- while (db = list_head(&dn->dn_dirty_dbufs[txgoff])) {
- mutex_enter(&db->db_mtx);
- /* XXX - use dbuf_undirty()? */
- list_remove(&dn->dn_dirty_dbufs[txgoff], db);
- if (db->db_level == 0) {
- ASSERT(db->db_blkid == DB_BONUS_BLKID ||
- db->db_d.db_data_old[txgoff] == db->db_buf);
- if (db->db_d.db_overridden_by[txgoff])
- dbuf_unoverride(db, tx->tx_txg);
- db->db_d.db_data_old[txgoff] = NULL;
- }
- db->db_dirtycnt -= 1;
- mutex_exit(&db->db_mtx);
- dbuf_rele(db, (void *)(uintptr_t)tx->tx_txg);
- }
-
+ dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
(void) dnode_evict_dbufs(dn, 0);
ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
@@ -487,32 +503,27 @@
* Now that we've released our hold, the dnode may
* be evicted, so we musn't access it.
*/
- return (1);
}
/*
- * Write out the dnode's dirty buffers at the specified level.
- * This may create more dirty buffers at the next level up.
+ * Write out the dnode's dirty buffers.
*
* NOTE: The dnode is kept in memory by being dirty. Once the
* dirty bit is cleared, it may be evicted. Beware of this!
*/
-int
-dnode_sync(dnode_t *dn, int level, zio_t *zio, dmu_tx_t *tx)
+void
+dnode_sync(dnode_t *dn, dmu_tx_t *tx)
{
free_range_t *rp;
+ dnode_phys_t *dnp = dn->dn_phys;
int txgoff = tx->tx_txg & TXG_MASK;
- dnode_phys_t *dnp = dn->dn_phys;
+ list_t *list = &dn->dn_dirty_records[txgoff];
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
DNODE_VERIFY(dn);
- /*
- * Make sure the dbuf for the dn_phys is released before we modify it.
- */
- if (dn->dn_dbuf)
- arc_release(dn->dn_dbuf->db_buf, dn->dn_dbuf);
+ ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
mutex_enter(&dn->dn_mtx);
if (dn->dn_allocated_txg == tx->tx_txg) {
@@ -536,7 +547,7 @@
dnp->dn_nblkptr = dn->dn_nblkptr;
}
- ASSERT(level != 0 || dnp->dn_nlevels > 1 ||
+ ASSERT(dnp->dn_nlevels > 1 ||
BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
@@ -545,7 +556,7 @@
ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
SPA_MINBLOCKSIZE) == 0);
ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
- list_head(&dn->dn_dirty_dbufs[txgoff]) != NULL ||
+ list_head(list) != NULL ||
dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
dnp->dn_datablkszsec);
dnp->dn_datablkszsec =
@@ -586,68 +597,25 @@
mutex_exit(&dn->dn_mtx);
if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
- ASSERT3U(level, ==, 0);
- return (dnode_sync_free(dn, tx));
+ dnode_sync_free(dn, tx);
+ return;
}
if (dn->dn_next_nlevels[txgoff]) {
- int new_lvl = dn->dn_next_nlevels[txgoff];
-
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- while (new_lvl > dnp->dn_nlevels)
- dnode_increase_indirection(dn, tx);
- rw_exit(&dn->dn_struct_rwlock);
+ dnode_increase_indirection(dn, tx);
dn->dn_next_nlevels[txgoff] = 0;
}
- if (level == dnp->dn_nlevels) {
- uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
- (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
-
- /* we've already synced out all data and indirect blocks */
- /* there are no more dirty dbufs under this dnode */
- ASSERT3P(list_head(&dn->dn_dirty_dbufs[txgoff]), ==, NULL);
- ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= tx->tx_txg);
+ dbuf_sync_list(list, tx);
- /* NB: the "off < maxblkid" is to catch overflow */
- /*
- * NB: if blocksize is changing, we could get confused,
- * so only bother if there are multiple blocks and thus
- * it can't be changing.
- */
- ASSERT(off < dn->dn_phys->dn_maxblkid ||
- dn->dn_phys->dn_maxblkid == 0 ||
- dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0);
-
- ASSERT(dnp->dn_nlevels > 1 ||
- BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
- BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
- dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ if (dn->dn_object != DMU_META_DNODE_OBJECT) {
+ ASSERT3P(list_head(list), ==, NULL);
+ dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+ }
- if (dn->dn_object != DMU_META_DNODE_OBJECT) {
- dbuf_will_dirty(dn->dn_dbuf, tx);
- dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
- }
-
- /*
- * Now that we've dropped the reference, the dnode may
- * be evicted, so we musn't access it.
- */
- return (1);
- } else {
- dmu_buf_impl_t *db, *db_next;
- list_t *list = &dn->dn_dirty_dbufs[txgoff];
- /*
- * Iterate over the list, removing and sync'ing dbufs
- * which are on the level we want, and leaving others.
- */
- for (db = list_head(list); db; db = db_next) {
- db_next = list_next(list, db);
- if (db->db_level == level) {
- list_remove(list, db);
- dbuf_sync(db, zio, tx);
- }
- }
- return (0);
- }
+ /*
+ * Although we have dropped our reference to the dnode, it
+	 * can't be evicted until it's written, and we haven't yet
+ * initiated the IO for the dnode's dbuf.
+ */
}
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c Fri Feb 02 15:36:58 2007 -0800
@@ -105,26 +105,28 @@
}
void
-dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
+ dmu_tx_t *tx)
{
int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
int compressed = BP_GET_PSIZE(bp);
int uncompressed = BP_GET_UCSIZE(bp);
ASSERT(dmu_tx_is_syncing(tx));
+ /* No block pointer => nothing to free */
if (BP_IS_HOLE(bp))
return;
ASSERT(used > 0);
if (ds == NULL) {
+ int err;
/*
* Account for the meta-objset space in its placeholder
* dataset.
*/
- /* XXX this can fail, what do we do when it does? */
- (void) arc_free(NULL, tx->tx_pool->dp_spa,
- tx->tx_txg, bp, NULL, NULL, ARC_WAIT);
- bzero(bp, sizeof (blkptr_t));
+ err = arc_free(pio, tx->tx_pool->dp_spa,
+ tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
+ ASSERT(err == 0);
dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
-used, -compressed, -uncompressed, tx);
@@ -136,10 +138,12 @@
dmu_buf_will_dirty(ds->ds_dbuf, tx);
if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
+ int err;
+
dprintf_bp(bp, "freeing: %s", "");
- /* XXX check return code? */
- (void) arc_free(NULL, tx->tx_pool->dp_spa,
- tx->tx_txg, bp, NULL, NULL, ARC_WAIT);
+ err = arc_free(pio, tx->tx_pool->dp_spa,
+ tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
+ ASSERT(err == 0);
mutex_enter(&ds->ds_lock);
/* XXX unique_bytes is not accurate for head datasets */
@@ -167,7 +171,6 @@
}
}
}
- bzero(bp, sizeof (blkptr_t));
mutex_enter(&ds->ds_lock);
ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
ds->ds_phys->ds_used_bytes -= used;
@@ -539,7 +542,8 @@
VERIFY(0 ==
dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds));
- (void) dmu_objset_create_impl(dp->dp_spa, ds, DMU_OST_ZFS, tx);
+ (void) dmu_objset_create_impl(dp->dp_spa, ds,
+ &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx);
dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
}
@@ -829,10 +833,10 @@
}
-void
-dsl_dataset_get_blkptr(dsl_dataset_t *ds, blkptr_t *bp)
+blkptr_t *
+dsl_dataset_get_blkptr(dsl_dataset_t *ds)
{
- *bp = ds->ds_phys->ds_bp;
+ return (&ds->ds_phys->ds_bp);
}
void
@@ -1403,17 +1407,15 @@
}
void
-dsl_dataset_sync(dsl_dataset_t *ds, dmu_tx_t *tx)
+dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
{
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(ds->ds_user_ptr != NULL);
ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
- dmu_objset_sync(ds->ds_user_ptr, tx);
dsl_dir_dirty(ds->ds_dir, tx);
- bplist_close(&ds->ds_deadlist);
-
- dmu_buf_rele(ds->ds_dbuf, ds);
+ dmu_objset_sync(ds->ds_user_ptr, zio, tx);
+ /* Unneeded? bplist_close(&ds->ds_deadlist); */
}
void
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -33,6 +33,7 @@
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
+#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
@@ -143,7 +144,7 @@
dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
dp->dp_meta_objset = &dmu_objset_create_impl(spa,
- NULL, DMU_OST_META, tx)->os;
+ NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;
/* create the pool directory */
err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
@@ -167,36 +168,36 @@
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
+ zio_t *zio;
dmu_tx_t *tx;
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ dsl_sync_task_group_t *dstg;
objset_impl_t *mosi = dp->dp_meta_objset->os;
+ int err;
tx = dmu_tx_create_assigned(dp, txg);
- do {
- dsl_dir_t *dd;
- dsl_dataset_t *ds;
- dsl_sync_task_group_t *dstg;
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
+ if (!list_link_active(&ds->ds_synced_link))
+ list_insert_tail(&dp->dp_synced_objsets, ds);
+ dsl_dataset_sync(ds, zio, tx);
+ }
+ err = zio_wait(zio);
+ ASSERT(err == 0);
- while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
- if (!list_link_active(&ds->ds_synced_link))
- list_insert_tail(&dp->dp_synced_objsets, ds);
- dsl_dataset_sync(ds, tx);
- }
- while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
- dsl_sync_task_group_sync(dstg, tx);
- while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
- dsl_dir_sync(dd, tx);
- /*
- * We need to loop since dsl_sync_task_group_sync()
- * could create a new (dirty) objset.
- * XXX - isn't this taken care of by the spa's sync to
- * convergence loop?
- */
- } while (!txg_list_empty(&dp->dp_dirty_datasets, txg));
+ while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
+ dsl_sync_task_group_sync(dstg, tx);
+ while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
+ dsl_dir_sync(dd, tx);
if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
- dmu_objset_sync(mosi, tx);
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ dmu_objset_sync(mosi, zio, tx);
+ err = zio_wait(zio);
+ ASSERT(err == 0);
dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
}
@@ -216,18 +217,15 @@
}
}
+/*
+ * TRUE if the current thread is the tx_sync_thread or if we
+ * are being called from SPA context during pool initialization.
+ */
int
dsl_pool_sync_context(dsl_pool_t *dp)
{
- /*
- * Yeah, this is cheesy. But the SPA needs some way to let
- * the sync threads invoke spa_open() and spa_close() while
- * it holds the namespace lock. I'm certainly open to better
- * ideas for how to determine whether the current thread is
- * operating on behalf of spa_sync(). This works for now.
- */
return (curthread == dp->dp_tx.tx_sync_thread ||
- BP_IS_HOLE(&dp->dp_meta_rootbp));
+ spa_get_dsl(dp->dp_spa) == NULL);
}
uint64_t
--- a/usr/src/uts/common/fs/zfs/sys/arc.h Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -84,10 +84,10 @@
int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
arc_done_func_t *done, void *private, int priority, int flags,
uint32_t *arc_flags, zbookmark_t *zb);
-int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
- uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
- arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t arc_flags, zbookmark_t *zb);
+zio_t *arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+ int ncopies, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+ arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
+ int flags, zbookmark_t *zb);
int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_done_func_t *done, void *private, uint32_t arc_flags);
int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -41,7 +41,7 @@
#endif
#define DB_BONUS_BLKID (-1ULL)
-#define IN_DMU_SYNC ((blkptr_t *)-1)
+#define IN_DMU_SYNC 2
/*
* define flags for dbuf_read
@@ -86,6 +86,56 @@
#define LIST_LINK_INACTIVE(link) \
((link)->list_next == NULL && (link)->list_prev == NULL)
+struct dmu_buf_impl;
+
+typedef enum override_states {
+ DR_NOT_OVERRIDDEN,
+ DR_IN_DMU_SYNC,
+ DR_OVERRIDDEN
+} override_states_t;
+
+typedef struct dbuf_dirty_record {
+	/* link on our parent's dirty list */
+ list_node_t dr_dirty_node;
+
+ /* transaction group this data will sync in */
+ uint64_t dr_txg;
+
+ /* zio of outstanding write IO */
+ zio_t *dr_zio;
+
+ /* pointer back to our dbuf */
+ struct dmu_buf_impl *dr_dbuf;
+
+ /* pointer to next dirty record */
+ struct dbuf_dirty_record *dr_next;
+
+ /* pointer to parent dirty record */
+ struct dbuf_dirty_record *dr_parent;
+
+ union dirty_types {
+ struct dirty_indirect {
+
+ /* protect access to list */
+ kmutex_t dr_mtx;
+
+ /* Our list of dirty children */
+ list_t dr_children;
+ } di;
+ struct dirty_leaf {
+
+ /*
+ * dr_data is set when we dirty the buffer
+ * so that we can retain the pointer even if it
+ * gets COW'd in a subsequent transaction group.
+ */
+ arc_buf_t *dr_data;
+ blkptr_t dr_overridden_by;
+ override_states_t dr_override_state;
+ } dl;
+ } dt;
+} dbuf_dirty_record_t;
+
typedef struct dmu_buf_impl {
/*
* The following members are immutable, with the exception of
@@ -152,53 +202,28 @@
arc_buf_t *db_buf;
kcondvar_t db_changed;
- arc_buf_t *db_data_pending;
+ dbuf_dirty_record_t *db_data_pending;
+
+ /* pointer to most recent dirty record for this buffer */
+ dbuf_dirty_record_t *db_last_dirty;
/*
- * Last time (transaction group) this buffer was dirtied.
- */
- uint64_t db_dirtied;
-
- /*
- * If db_dnode != NULL, our link on the owner dnodes's dn_dbufs list.
+	 * Our link on the owner dnode's dn_dbufs list.
* Protected by its dn_dbufs_mtx.
*/
list_node_t db_link;
- /* Our link on dn_dirty_dbufs[txg] */
- list_node_t db_dirty_node[TXG_SIZE];
- uint8_t db_dirtycnt;
-
- /*
- * Data which is unique to data (leaf) blocks:
- */
- struct {
- /* stuff we store for the user (see dmu_buf_set_user) */
- void *db_user_ptr;
- void **db_user_data_ptr_ptr;
- dmu_buf_evict_func_t *db_evict_func;
- uint8_t db_immediate_evict;
- uint8_t db_freed_in_flight;
+ /* Data which is unique to data (leaf) blocks: */
- /*
- * db_data_old[txg&TXG_MASK] is set when we
- * dirty the buffer, so that we can retain the
- * pointer even if it gets COW'd in a subsequent
- * transaction group.
- *
- * If the buffer is dirty in any txg, it can't
- * be destroyed.
- */
- /*
- * XXX Protected by db_mtx and dn_dirty_mtx.
- * db_mtx must be held to read db_dirty[], and
- * both db_mtx and dn_dirty_mtx must be held to
- * modify (dirty or clean). db_mtx must be held
- * before dn_dirty_mtx.
- */
- arc_buf_t *db_data_old[TXG_SIZE];
- blkptr_t *db_overridden_by[TXG_SIZE];
- } db_d;
+ /* stuff we store for the user (see dmu_buf_set_user) */
+ void *db_user_ptr;
+ void **db_user_data_ptr_ptr;
+ dmu_buf_evict_func_t *db_evict_func;
+
+ uint8_t db_immediate_evict;
+ uint8_t db_freed_in_flight;
+
+ uint8_t db_dirtycnt;
} dmu_buf_impl_t;
/* Note: the dbuf hash table is exposed only for the mdb module */
@@ -237,14 +262,14 @@
void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
-void dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dbuf_clear(dmu_buf_impl_t *db);
void dbuf_evict(dmu_buf_impl_t *db);
void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx);
-void dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg);
+void dbuf_unoverride(dbuf_dirty_record_t *dr);
+void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks,
struct dmu_tx *);
--- a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h Fri Feb 02 15:36:58 2007 -0800
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -218,6 +217,14 @@
* held from:
* dsl_dataset_*
*
+ * dr_mtx (leaf)
+ * protects:
+ * dr_children
+ * held from:
+ * dbuf_dirty
+ * dbuf_undirty
+ * dbuf_sync_indirect
+ * dnode_new_blkid
*/
struct objset;
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -29,6 +29,7 @@
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/spa.h>
+#include <sys/arc.h>
#include <sys/txg.h>
#include <sys/zfs_context.h>
#include <sys/dnode.h>
@@ -60,6 +61,7 @@
/* Immutable: */
struct dsl_dataset *os_dsl_dataset;
spa_t *os_spa;
+ arc_buf_t *os_phys_buf;
objset_phys_t *os_phys;
dnode_t *os_meta_dnode;
zilog_t *os_zil;
@@ -71,7 +73,7 @@
/* no lock needed: */
struct dmu_tx *os_synctx; /* XXX sketchy */
- blkptr_t os_rootbp;
+ blkptr_t *os_rootbp;
/* Protected by os_obj_lock */
kmutex_t os_obj_lock;
@@ -108,9 +110,9 @@
int dmu_objset_evict_dbufs(objset_t *os, int try);
/* called from dsl */
-void dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx);
+void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx);
objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
- dmu_objset_type_t type, dmu_tx_t *tx);
+ blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
objset_impl_t **osip);
void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -32,6 +32,7 @@
#include <sys/avl.h>
#include <sys/spa.h>
#include <sys/txg.h>
+#include <sys/zio.h>
#include <sys/refcount.h>
#include <sys/dmu_zfetch.h>
@@ -162,7 +163,7 @@
/* protected by dn_mtx: */
kmutex_t dn_mtx;
- list_t dn_dirty_dbufs[TXG_SIZE];
+ list_t dn_dirty_records[TXG_SIZE];
avl_tree_t dn_ranges[TXG_SIZE];
uint64_t dn_allocated_txg;
uint64_t dn_free_txg;
@@ -179,6 +180,9 @@
list_t dn_dbufs; /* linked list of descendent dbuf_t's */
struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
+ /* parent IO for current sync write */
+ zio_t *dn_zio;
+
/* holds prefetch structure */
struct zfetch dn_zfetch;
} dnode_t;
@@ -200,7 +204,7 @@
void dnode_add_ref(dnode_t *dn, void *ref);
void dnode_rele(dnode_t *dn, void *ref);
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
-int dnode_sync(dnode_t *dn, int level, struct zio *zio, dmu_tx_t *tx);
+void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -31,6 +31,7 @@
#include <sys/dmu.h>
#include <sys/spa.h>
#include <sys/txg.h>
+#include <sys/zio.h>
#include <sys/bplist.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_context.h>
@@ -138,15 +139,16 @@
void *p, dsl_dataset_evict_func_t func);
void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds);
-void dsl_dataset_get_blkptr(dsl_dataset_t *ds, blkptr_t *bp);
+blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
-void dsl_dataset_sync(dsl_dataset_t *os, dmu_tx_t *tx);
+void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
-void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
+void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
+ dmu_tx_t *tx);
int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
--- a/usr/src/uts/common/fs/zfs/sys/spa.h Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h Fri Feb 02 15:36:58 2007 -0800
@@ -272,6 +272,7 @@
#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
+#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
#define BP_ZERO(bp) \
{ \
--- a/usr/src/uts/common/fs/zfs/sys/zio.h Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h Fri Feb 02 15:36:58 2007 -0800
@@ -207,6 +207,7 @@
zio_t *io_logical;
/* Callback info */
+ zio_done_func_t *io_ready;
zio_done_func_t *io_done;
void *io_private;
blkptr_t io_bp_orig;
@@ -262,8 +263,8 @@
extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *done, void *private, int priority, int flags,
- zbookmark_t *zb);
+ zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
+ int flags, zbookmark_t *zb);
extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
--- a/usr/src/uts/common/fs/zfs/zio.c Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/zio.c Fri Feb 02 15:36:58 2007 -0800
@@ -435,8 +435,8 @@
zio_t *
zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *done, void *private, int priority, int flags,
- zbookmark_t *zb)
+ zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
+ int flags, zbookmark_t *zb)
{
zio_t *zio;
@@ -450,6 +450,8 @@
ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
+ zio->io_ready = ready;
+
zio->io_bookmark = *zb;
zio->io_logical = zio;
@@ -810,6 +812,9 @@
{
zio_t *pio = zio->io_parent;
+ if (zio->io_ready)
+ zio->io_ready(zio);
+
if (pio != NULL)
zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
&pio->io_children_notready);