6416482 filebench oltp workload hangs in zfs
6440499 zil should avoid txg_wait_synced() and use dmu_sync() to issue parallel IOs when fsyncing
--- a/usr/src/cmd/ztest/ztest.c Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/cmd/ztest/ztest.c Mon Jun 19 19:31:35 2006 -0700
@@ -2031,22 +2031,30 @@
uint64_t blkoff;
zbookmark_t zb;
- txg_suspend(dmu_objset_pool(os));
(void) mutex_lock(lp);
- error = dmu_sync(os, ZTEST_DIROBJ, off, &blkoff, &blk,
- txg);
+ blkoff = P2ALIGN_TYPED(off, bs, uint64_t);
+ error = dmu_buf_hold(os,
+ ZTEST_DIROBJ, blkoff, FTAG, &db);
+ if (error) {
+ dprintf("dmu_buf_hold(%s, %d, %llx) = %d\n",
+ osname, ZTEST_DIROBJ, blkoff, error);
+ (void) mutex_unlock(lp);
+ continue;
+ }
+ blkoff = off - blkoff;
+ error = dmu_sync(NULL, db, &blk, txg, NULL, NULL);
+ dmu_buf_rele(db, FTAG);
(void) mutex_unlock(lp);
if (error) {
- txg_resume(dmu_objset_pool(os));
dprintf("dmu_sync(%s, %d, %llx) = %d\n",
osname, ZTEST_DIROBJ, off, error);
continue;
}
if (blk.blk_birth == 0) { /* concurrent free */
- txg_resume(dmu_objset_pool(os));
continue;
}
+ txg_suspend(dmu_objset_pool(os));
ASSERT(blk.blk_fill == 1);
ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER);
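
The ztest hunk above shows the revised dmu_sync() calling convention: the caller block-aligns the offset, holds the dbuf covering that block, and passes the dbuf itself rather than <os, object, offset>. A minimal sketch of the pattern, assuming the caller already knows the object's block size bs (all other names are caller context):

    uint64_t blkoff = P2ALIGN_TYPED(off, bs, uint64_t);
    dmu_buf_t *db;
    blkptr_t blk;

    BP_ZERO(&blk);    /* dmu_sync() asserts BP_IS_HOLE(bp) */
    if (dmu_buf_hold(os, object, blkoff, FTAG, &db) == 0) {
        /* NULL parent zio: the write completes synchronously */
        error = dmu_sync(NULL, db, &blk, txg, NULL, NULL);
        dmu_buf_rele(db, FTAG);
    }

Note that callers no longer bracket the call with txg_suspend()/txg_resume(); dmu_sync() now suspends the txg internally (see the dmu.c hunk below).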
--- a/usr/src/uts/common/fs/zfs/arc.c Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/arc.c Mon Jun 19 19:31:35 2006 -0700
@@ -2217,6 +2217,8 @@
ASSERT3P(hdr->b_state, ==, arc.anon);
ASSERT(BUF_EMPTY(hdr));
ASSERT(!HDR_IO_ERROR(hdr));
+ ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
+ ASSERT(hdr->b_acb == 0);
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
--- a/usr/src/uts/common/fs/zfs/dbuf.c Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/dbuf.c Mon Jun 19 19:31:35 2006 -0700
@@ -712,9 +712,9 @@
{
ASSERT(db->db_blkid != DB_BONUS_BLKID);
ASSERT(MUTEX_HELD(&db->db_mtx));
- if (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) {
- db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
- } else if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
+ ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC);
+
+ if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
/* free this block */
ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]) ||
db->db_dnode->dn_free_txg == txg);
@@ -1783,6 +1783,16 @@
if (db->db_level == 0) {
data = (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
blksz = arc_buf_size(*data);
+
+ /*
+ * This buffer is in the middle of an immediate write.
+ * Wait for the synchronous IO to complete.
+ */
+ while (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) {
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK]);
+ }
/*
* If this buffer is currently "in use" (i.e., there are
* active holds and db_data still references it), then make
@@ -2085,6 +2095,8 @@
mutex_enter(&db->db_mtx);
+ ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == NULL);
+
if (db->db_dirtied == txg)
db->db_dirtied = 0;
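
Together with the dmu.c changes below, these dbuf.c hunks form a handshake on db_overridden_by[txg & TXG_MASK], always under db_mtx. Condensed from this patch (no new code):

    /* dmu_sync(): claim the buffer for an in-flight sync write */
    db->db_d.db_overridden_by[txg & TXG_MASK] = IN_DMU_SYNC;

    /* dmu_sync_done(): publish the written blkptr and wake waiters */
    db->db_d.db_overridden_by[txg & TXG_MASK] = blk;
    cv_broadcast(&db->db_changed);

    /* dbuf_sync() (spa_sync path): wait out any in-flight dmu_sync() */
    while (db->db_d.db_overridden_by[txg & TXG_MASK] == IN_DMU_SYNC)
        cv_wait(&db->db_changed, &db->db_mtx);

Because the syncing path now waits for the marker to resolve, the first hunk can reduce dbuf_unoverride()'s IN_DMU_SYNC handling to a plain assertion.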
--- a/usr/src/uts/common/fs/zfs/dmu.c Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu.c Mon Jun 19 19:31:35 2006 -0700
@@ -1376,185 +1376,195 @@
return (ra.err);
}
+typedef struct {
+ uint64_t txg;
+ dmu_buf_impl_t *db;
+ dmu_sync_cb_t *done;
+ void *arg;
+} dmu_sync_cbin_t;
+
+typedef union {
+ dmu_sync_cbin_t data;
+ blkptr_t blk;
+} dmu_sync_cbarg_t;
+
+/* ARGSUSED */
+static void
+dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
+{
+ dmu_sync_cbin_t *in = (dmu_sync_cbin_t *)varg;
+ dmu_buf_impl_t *db = in->db;
+ uint64_t txg = in->txg;
+ dmu_sync_cb_t *done = in->done;
+ void *arg = in->arg;
+ blkptr_t *blk = (blkptr_t *)varg;
+
+ if (!BP_IS_HOLE(zio->io_bp)) {
+ zio->io_bp->blk_fill = 1;
+ BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type);
+ BP_SET_LEVEL(zio->io_bp, 0);
+ }
+
+ *blk = *zio->io_bp; /* structure assignment */
+
+ mutex_enter(&db->db_mtx);
+ ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC);
+ db->db_d.db_overridden_by[txg&TXG_MASK] = blk;
+ cv_broadcast(&db->db_changed);
+ mutex_exit(&db->db_mtx);
+
+ if (done)
+ done(&(db->db), arg);
+}
+
/*
- * Intent log support: sync the block at <os, object, offset> to disk.
- * N.B. and XXX: the caller is responsible for serializing dmu_sync()s
- * of the same block, and for making sure that the data isn't changing
- * while dmu_sync() is writing it.
+ * Intent log support: sync the block associated with db to disk.
+ * N.B. and XXX: the caller is responsible for making sure that the
+ * data isn't changing while dmu_sync() is writing it.
*
* Return values:
*
- * EALREADY: this txg has already been synced, so there's nothing to to.
+ * EEXIST: this txg has already been synced, so there's nothing to do.
* The caller should not log the write.
*
* ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
* The caller should not log the write.
*
- * EINPROGRESS: the block is in the process of being synced by the
- * usual mechanism (spa_sync()), so we can't sync it here.
- * The caller should txg_wait_synced() and not log the write.
- *
- * EBUSY: another thread is trying to dmu_sync() the same dbuf.
- * (This case cannot arise under the current locking rules.)
- * The caller should txg_wait_synced() and not log the write.
+ * EALREADY: this block is already in the process of being synced.
+ * The caller should track its progress (somehow).
*
- * ESTALE: the block was dirtied or freed while we were writing it,
- * so the data is no longer valid.
- * The caller should txg_wait_synced() and not log the write.
+ * EINPROGRESS: the IO has been initiated.
+ * The caller should log this blkptr in the callback.
*
- * 0: success. Sets *bp to the blkptr just written, and sets
- * *blkoff to the data's offset within that block.
- * The caller should log this blkptr/blkoff in its lr_write_t.
+ * 0: completed. Sets *bp to the blkptr just written.
+ * The caller should log this blkptr immediately.
*/
int
-dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
- blkptr_t *bp, uint64_t txg)
+dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
+ blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg)
{
- objset_impl_t *osi = os->os;
- dsl_pool_t *dp = osi->os_dsl_dataset->ds_dir->dd_pool;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ objset_impl_t *os = db->db_objset;
+ dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
tx_state_t *tx = &dp->dp_tx;
- dmu_buf_impl_t *db;
+ dmu_sync_cbin_t *in;
blkptr_t *blk;
+ zbookmark_t zb;
+ uint32_t arc_flag;
int err;
- zbookmark_t zb;
- ASSERT(RW_LOCK_HELD(&tx->tx_suspend));
ASSERT(BP_IS_HOLE(bp));
ASSERT(txg != 0);
+
dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
/*
- * XXX why is this routine using dmu_buf_*() and casting between
- * dmu_buf_impl_t and dmu_buf_t?
+ * XXX - would be nice if we could do this without suspending...
*/
+ txg_suspend(dp);
/*
* If this txg already synced, there's nothing to do.
*/
if (txg <= tx->tx_synced_txg) {
+ txg_resume(dp);
/*
* If we're running ziltest, we need the blkptr regardless.
*/
if (txg > spa_freeze_txg(dp->dp_spa)) {
- err = dmu_buf_hold(os, object, offset,
- FTAG, (dmu_buf_t **)&db);
- if (err)
- return (err);
/* if db_blkptr == NULL, this was an empty write */
if (db->db_blkptr)
*bp = *db->db_blkptr; /* structure assignment */
- else
- bzero(bp, sizeof (blkptr_t));
- *blkoff = offset - db->db.db_offset;
- ASSERT3U(*blkoff, <, db->db.db_size);
- dmu_buf_rele((dmu_buf_t *)db, FTAG);
return (0);
}
- return (EALREADY);
+ return (EEXIST);
}
- /*
- * If this txg is in the middle of syncing, just wait for it.
- */
- if (txg == tx->tx_syncing_txg) {
- ASSERT(txg != tx->tx_open_txg);
- return (EINPROGRESS);
- }
-
- err = dmu_buf_hold(os, object, offset, FTAG, (dmu_buf_t **)&db);
- if (err)
- return (err);
-
mutex_enter(&db->db_mtx);
- /*
- * If this dbuf isn't dirty, must have been free_range'd.
- * There's no need to log writes to freed blocks, so we're done.
- */
- if (!list_link_active(&db->db_dirty_node[txg&TXG_MASK])) {
+ blk = db->db_d.db_overridden_by[txg&TXG_MASK];
+ if (blk == IN_DMU_SYNC) {
+ /*
+ * We have already issued a sync write for this buffer.
+ */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (EALREADY);
+ } else if (blk != NULL) {
+ /*
+ * This buffer had already been synced. It could not
+ * have been dirtied since, or we would have cleared blk.
+ */
+ *bp = *blk; /* structure assignment */
mutex_exit(&db->db_mtx);
- dmu_buf_rele((dmu_buf_t *)db, FTAG);
+ txg_resume(dp);
+ return (0);
+ }
+
+ if (txg == tx->tx_syncing_txg) {
+ while (db->db_data_pending) {
+ /*
+ * IO is in-progress. Wait for it to finish.
+ * XXX - would be nice to be able to somehow "attach"
+ * this zio to the parent zio passed in.
+ */
+ cv_wait(&db->db_changed, &db->db_mtx);
+ ASSERT(db->db_data_pending ||
+ (db->db_blkptr && db->db_blkptr->blk_birth == txg));
+ }
+
+ if (db->db_blkptr && db->db_blkptr->blk_birth == txg) {
+ /*
+ * IO is already completed.
+ */
+ *bp = *db->db_blkptr; /* structure assignment */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
+ return (0);
+ }
+ }
+
+ if (db->db_d.db_data_old[txg&TXG_MASK] == NULL) {
+ /*
+ * This dbuf isn't dirty, must have been free_range'd.
+ * There's no need to log writes to freed blocks, so we're done.
+ */
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
return (ENOENT);
}
- blk = db->db_d.db_overridden_by[txg&TXG_MASK];
-
+ ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == NULL);
+ db->db_d.db_overridden_by[txg&TXG_MASK] = IN_DMU_SYNC;
/*
- * If we already did a dmu_sync() of this dbuf in this txg,
- * free the old block before writing the new one.
+ * XXX - a little ugly to stash the blkptr in the callback
+ * buffer. We always need to make sure the following is true:
+ * ASSERT(sizeof(blkptr_t) >= sizeof(dmu_sync_cbin_t));
*/
- if (blk != NULL) {
- ASSERT(blk != IN_DMU_SYNC);
- if (blk == IN_DMU_SYNC) {
- mutex_exit(&db->db_mtx);
- dmu_buf_rele((dmu_buf_t *)db, FTAG);
- return (EBUSY);
- }
- arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
- if (!BP_IS_HOLE(blk)) {
- (void) arc_free(NULL, osi->os_spa, txg, blk,
- NULL, NULL, ARC_WAIT);
- }
- kmem_free(blk, sizeof (blkptr_t));
- }
+ in = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+ in->db = db;
+ in->txg = txg;
+ in->done = done;
+ in->arg = arg;
+ mutex_exit(&db->db_mtx);
+ txg_resume(dp);
- db->db_d.db_overridden_by[txg&TXG_MASK] = IN_DMU_SYNC;
- mutex_exit(&db->db_mtx);
-
- blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
- blk->blk_birth = 0; /* mark as invalid */
-
- zb.zb_objset = osi->os_dsl_dataset->ds_object;
+ arc_flag = pio == NULL ? ARC_WAIT : ARC_NOWAIT;
+ zb.zb_objset = os->os_dsl_dataset->ds_object;
zb.zb_object = db->db.db_object;
zb.zb_level = db->db_level;
zb.zb_blkid = db->db_blkid;
- err = arc_write(NULL, osi->os_spa,
- zio_checksum_select(db->db_dnode->dn_checksum, osi->os_checksum),
- zio_compress_select(db->db_dnode->dn_compress, osi->os_compress),
- dmu_get_replication_level(osi->os_spa, &zb, db->db_dnode->dn_type),
- txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL,
- ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb);
+ err = arc_write(pio, os->os_spa,
+ zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum),
+ zio_compress_select(db->db_dnode->dn_compress, os->os_compress),
+ dmu_get_replication_level(os->os_spa, &zb, db->db_dnode->dn_type),
+ txg, bp, db->db_d.db_data_old[txg&TXG_MASK], dmu_sync_done, in,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, arc_flag, &zb);
ASSERT(err == 0);
- if (!BP_IS_HOLE(blk)) {
- blk->blk_fill = 1;
- BP_SET_TYPE(blk, db->db_dnode->dn_type);
- BP_SET_LEVEL(blk, 0);
- }
-
- /* copy the block pointer back to caller */
- *bp = *blk; /* structure assignment */
- *blkoff = offset - db->db.db_offset;
- ASSERT3U(*blkoff, <, db->db.db_size);
-
- mutex_enter(&db->db_mtx);
- if (db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC) {
- /* we were dirtied/freed during the sync */
- ASSERT3P(db->db_d.db_overridden_by[txg&TXG_MASK], ==, NULL);
- arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
- mutex_exit(&db->db_mtx);
- dmu_buf_rele((dmu_buf_t *)db, FTAG);
- /* Note that this block does not free on disk until txg syncs */
-
- /*
- * XXX can we use ARC_NOWAIT here?
- * XXX should we be ignoring the return code?
- */
- if (!BP_IS_HOLE(blk)) {
- (void) arc_free(NULL, osi->os_spa, txg, blk,
- NULL, NULL, ARC_WAIT);
- }
- kmem_free(blk, sizeof (blkptr_t));
- return (ESTALE);
- }
-
- db->db_d.db_overridden_by[txg&TXG_MASK] = blk;
- mutex_exit(&db->db_mtx);
- dmu_buf_rele((dmu_buf_t *)db, FTAG);
- ASSERT3U(txg, >, tx->tx_syncing_txg);
- return (0);
+ return (arc_flag == ARC_NOWAIT ? EINPROGRESS : 0);
}
uint64_t
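
The stash the XXX comment above apologizes for works because sizeof (blkptr_t) >= sizeof (dmu_sync_cbin_t): dmu_sync() allocates a single blkptr_t-sized buffer, fills it as a dmu_sync_cbin_t, and dmu_sync_done() copies the in-args out before overwriting the same memory with the written blkptr. The dmu_sync_cbarg_t union declared at the top of this hunk documents that overlay; allocating through the union, as in this hypothetical variation, would make the size requirement visible to the compiler:

    dmu_sync_cbarg_t *cb = kmem_alloc(sizeof (dmu_sync_cbarg_t), KM_SLEEP);
    cb->data.db = db;      /* in-args occupy cb->data before the IO... */
    cb->data.txg = txg;
    cb->data.done = done;
    cb->data.arg = arg;
    /* ...dmu_sync_done() later reuses the same bytes as cb->blk */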
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h Mon Jun 19 19:31:35 2006 -0700
@@ -572,13 +572,17 @@
/*
* Synchronous write.
- * On success returns 0 and fills in the blk pointed at by bp.
+ * If a parent zio is provided this function initiates a write on the
+ * provided buffer as a child of the parent zio.
+ * In the absence of a parent zio, the write is completed synchronously.
+ * At write completion, bp is filled with the blkptr of the written block.
* Note that while the data covered by this function will be on stable
- * storage when the function returns this new data does not become a
+ * storage when the write completes this new data does not become a
* permanent part of the file until the associated transaction commits.
*/
-int dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
- struct blkptr *bp, uint64_t txg);
+typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg);
+int dmu_sync(struct zio *zio, dmu_buf_t *db,
+ struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg);
/*
* Find the next hole or data block in file starting at *off
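
A caller-side sketch of the contract described above, assuming pio is an in-flight parent zio, done_cb matches dmu_sync_cb_t, and lr_blkptr is the caller's blkptr destination (all hypothetical names):

    error = dmu_sync(pio, db, &lr->lr_blkptr, txg, done_cb, arg);
    switch (error) {
    case 0:            /* write completed; log lr_blkptr immediately */
        break;
    case EINPROGRESS:  /* child IO issued; log lr_blkptr from done_cb */
        break;
    case EEXIST:       /* txg already synced; don't log the write */
    case ENOENT:       /* block was freed; don't log the write */
        break;
    case EALREADY:     /* another sync of this block is in flight */
        break;
    }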
--- a/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h Mon Jun 19 19:31:35 2006 -0700
@@ -43,6 +43,7 @@
} rl_type_t;
typedef struct rl {
+ znode_t *r_zp; /* znode this lock applies to */
avl_node_t r_node; /* avl node link */
uint64_t r_off; /* file range offset */
uint64_t r_len; /* file range length */
@@ -66,13 +67,13 @@
/*
* Unlock range and destroy range lock structure.
*/
-void zfs_range_unlock(znode_t *zp, rl_t *rl);
+void zfs_range_unlock(rl_t *rl);
/*
* Reduce range locked as RW_WRITER from whole file to specified range.
* Asserts the whole file was previously locked.
*/
-void zfs_range_reduce(znode_t *zp, rl_t *rl, uint64_t off, uint64_t len);
+void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len);
/*
* AVL comparison function used to compare range locks
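
Recording the owning znode in r_zp is what makes the one-argument forms possible: a lock can now be dropped by code that holds only the rl_t, such as an IO completion callback. The usage pattern becomes:

    rl_t *rl = zfs_range_lock(zp, off, len, RL_READER);
    /* ... read, write, or dmu_sync() the range ... */
    zfs_range_unlock(rl);    /* zp recovered from rl->r_zp */

zfs_get_done() in the zfs_vnops.c hunk below depends on this; it receives only the rl_t yet must release both the range lock and the vnode hold.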
--- a/usr/src/uts/common/fs/zfs/sys/zil.h Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h Mon Jun 19 19:31:35 2006 -0700
@@ -215,7 +215,7 @@
typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
uint64_t txg);
typedef int zil_replay_func_t();
-typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf);
+typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h Mon Jun 19 19:31:35 2006 -0700
@@ -51,6 +51,7 @@
int lwb_nused; /* # used bytes in buffer */
int lwb_sz; /* size of block and buffer */
char *lwb_buf; /* log write buffer */
+ zio_t *lwb_zio; /* zio for this buffer */
uint64_t lwb_max_txg; /* highest txg in this lwb */
uint64_t lwb_seq; /* highest log record seq number */
txg_handle_t lwb_txgh; /* txg handle for txg_exit() */
@@ -78,6 +79,7 @@
objset_t *zl_os; /* object set we're logging */
zil_get_data_t *zl_get_data; /* callback to get object content */
uint64_t zl_itx_seq; /* itx sequence number */
+ uint64_t zl_wait_seq; /* last tx write initiated */
uint64_t zl_ss_seq; /* last tx on stable storage */
uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */
--- a/usr/src/uts/common/fs/zfs/vdev.c Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev.c Mon Jun 19 19:31:35 2006 -0700
@@ -1792,7 +1792,7 @@
"deadline = pri + (lbolt >> time_shift)",
0,
63,
- 4,
+ 8,
offsetof(struct vdev, vdev_queue.vq_time_shift)
},
{
--- a/usr/src/uts/common/fs/zfs/zfs_log.c Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_log.c Mon Jun 19 19:31:35 2006 -0700
@@ -209,7 +209,7 @@
/*
* zfs_log_write() handles TX_WRITE transactions.
*/
-ssize_t zfs_immediate_write_sz = 65536;
+ssize_t zfs_immediate_write_sz = 32768;
uint64_t
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
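
Halving zfs_immediate_write_sz pushes more large writes onto the now-parallel dmu_sync() path. A sketch of the decision this tunable drives (the surrounding zfs_log_write() logic is assumed here, not quoted):

    if (len > zfs_immediate_write_sz)
        write_state = WR_INDIRECT;   /* log a blkptr filled in by dmu_sync() */
    else
        write_state = WR_NEED_COPY;  /* copy the data into the log record */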
--- a/usr/src/uts/common/fs/zfs/zfs_rlock.c Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_rlock.c Mon Jun 19 19:31:35 2006 -0700
@@ -34,8 +34,8 @@
* ---------
* Defined in zfs_rlock.h but essentially:
* rl = zfs_range_lock(zp, off, len, lock_type);
- * zfs_range_unlock(zp, rl);
- * zfs_range_reduce(zp, rl, off, len);
+ * zfs_range_unlock(rl);
+ * zfs_range_reduce(rl, off, len);
*
* AVL tree
* --------
@@ -417,6 +417,7 @@
ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
+ new->r_zp = zp;
new->r_off = off;
new->r_len = len;
new->r_cnt = 1; /* assume it's going to be in the tree */
@@ -503,8 +504,10 @@
* Unlock range and destroy range lock structure.
*/
void
-zfs_range_unlock(znode_t *zp, rl_t *rl)
+zfs_range_unlock(rl_t *rl)
{
+ znode_t *zp = rl->r_zp;
+
ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
ASSERT(!rl->r_proxy);
@@ -535,8 +538,10 @@
* entry in the tree.
*/
void
-zfs_range_reduce(znode_t *zp, rl_t *rl, uint64_t off, uint64_t len)
+zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
{
+ znode_t *zp = rl->r_zp;
+
/* Ensure there are no other locks */
ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
ASSERT(rl->r_off == 0);
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c Mon Jun 19 19:31:35 2006 -0700
@@ -487,7 +487,7 @@
dmu_buf_rele_array(dbpp, numbufs, FTAG);
}
out:
- zfs_range_unlock(zp, rl);
+ zfs_range_unlock(rl);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
ZFS_EXIT(zfsvfs);
@@ -606,10 +606,10 @@
ZFS_ENTER(zfsvfs);
/*
- * Pre-fault the initial pages to ensure slow (eg NFS) pages
+ * Pre-fault the pages to ensure slow (eg NFS) pages
* don't hold up txg.
*/
- zfs_prefault_write(MIN(start_resid, SPA_MAXBLOCKSIZE), uio);
+ zfs_prefault_write(n, uio);
/*
* If in append mode, set the io offset pointer to eof.
@@ -692,7 +692,7 @@
new_blksz = MIN(end_size, max_blksz);
}
zfs_grow_blocksize(zp, new_blksz, tx);
- zfs_range_reduce(zp, rl, woff, n);
+ zfs_range_reduce(rl, woff, n);
}
/*
@@ -766,9 +766,6 @@
ioflag, uio);
dmu_tx_commit(tx);
- /* Pre-fault the next set of pages */
- zfs_prefault_write(MIN(n, SPA_MAXBLOCKSIZE), uio);
-
/*
* Start another transaction.
*/
@@ -810,7 +807,7 @@
no_tx_done:
- zfs_range_unlock(zp, rl);
+ zfs_range_unlock(rl);
/*
* If we're in replay mode, or we made no progress, return error.
@@ -827,16 +824,28 @@
return (0);
}
+void
+zfs_get_done(dmu_buf_t *db, void *vrl)
+{
+ rl_t *rl = (rl_t *)vrl;
+ vnode_t *vp = ZTOV(rl->r_zp);
+
+ dmu_buf_rele(db, rl);
+ zfs_range_unlock(rl);
+ VN_RELE(vp);
+}
+
/*
* Get data to generate a TX_WRITE intent log record.
*/
int
-zfs_get_data(void *arg, lr_write_t *lr, char *buf)
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
zfsvfs_t *zfsvfs = arg;
objset_t *os = zfsvfs->z_os;
znode_t *zp;
uint64_t off = lr->lr_offset;
+ dmu_buf_t *db;
rl_t *rl;
int dlen = lr->lr_length; /* length of user data */
int error = 0;
@@ -861,8 +870,6 @@
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
- dmu_buf_t *db;
-
rl = zfs_range_lock(zp, off, dlen, RL_READER);
/* test for truncation needs to be done while range locked */
if (off >= zp->z_phys->zp_size) {
@@ -892,20 +899,30 @@
rl = zfs_range_lock(zp, boff, dlen, RL_READER);
if (zp->z_blksz == dlen)
break;
- zfs_range_unlock(zp, rl);
+ zfs_range_unlock(rl);
}
/* test for truncation needs to be done while range locked */
if (off >= zp->z_phys->zp_size) {
error = ENOENT;
goto out;
}
- txg_suspend(dmu_objset_pool(os));
- error = dmu_sync(os, lr->lr_foid, off, &lr->lr_blkoff,
- &lr->lr_blkptr, lr->lr_common.lrc_txg);
- txg_resume(dmu_objset_pool(os));
+ VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, rl, &db));
+ ASSERT(boff == db->db_offset);
+ lr->lr_blkoff = off - boff;
+ error = dmu_sync(zio, db, &lr->lr_blkptr,
+ lr->lr_common.lrc_txg, zio ? zfs_get_done : NULL, rl);
+ /*
+ * If we get EINPROGRESS, then we need to wait for a
+ * write IO initiated by dmu_sync() to complete before
+ * we can release this dbuf. We will finish everything
+ * up in the zfs_get_done() callback.
+ */
+ if (error == EINPROGRESS)
+ return (0);
+ dmu_buf_rele(db, rl);
}
out:
- zfs_range_unlock(zp, rl);
+ zfs_range_unlock(rl);
VN_RELE(ZTOV(zp));
return (error);
}
@@ -2785,7 +2802,7 @@
* Can't push pages past end-of-file.
*/
if (off >= zp->z_phys->zp_size) {
- zfs_range_unlock(zp, rl);
+ zfs_range_unlock(rl);
return (EIO);
}
len = MIN(PAGESIZE, zp->z_phys->zp_size - off);
@@ -2795,7 +2812,7 @@
dmu_tx_hold_bonus(tx, zp->z_id);
err = dmu_tx_assign(tx, zfsvfs->z_assign);
if (err != 0) {
- zfs_range_unlock(zp, rl);
+ zfs_range_unlock(rl);
if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
dmu_tx_wait(tx);
dmu_tx_abort(tx);
@@ -2815,7 +2832,7 @@
(void) zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0, NULL);
dmu_tx_commit(tx);
- zfs_range_unlock(zp, rl);
+ zfs_range_unlock(rl);
pvn_write_done(pp, B_WRITE | flags);
if (offp)
@@ -3155,7 +3172,7 @@
/* can't fault past EOF */
if (off >= zp->z_phys->zp_size) {
- zfs_range_unlock(zp, rl);
+ zfs_range_unlock(rl);
ZFS_EXIT(zfsvfs);
return (EFAULT);
}
@@ -3236,7 +3253,7 @@
if (need_unlock)
rw_exit(&zp->z_map_lock);
- zfs_range_unlock(zp, rl);
+ zfs_range_unlock(rl);
ZFS_EXIT(zfsvfs);
return (err);
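
The ownership rule the zfs_get_data() changes establish: the range lock doubles as the dbuf hold tag, so one callback argument carries both, and on EINPROGRESS both migrate to zfs_get_done(). Condensed from the hunks above:

    VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, rl, &db));
    error = dmu_sync(zio, db, &lr->lr_blkptr,
        lr->lr_common.lrc_txg, zio ? zfs_get_done : NULL, rl);
    if (error == EINPROGRESS)
        return (0);       /* zfs_get_done() drops db, rl, and the vnode */
    dmu_buf_rele(db, rl); /* synchronous path: clean up here instead */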
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c Mon Jun 19 19:31:35 2006 -0700
@@ -936,7 +936,7 @@
/* recheck, in case zp_size changed */
if (off + len > zp->z_phys->zp_size) {
/* lost race: file size changed, lock whole file */
- zfs_range_unlock(zp, rl);
+ zfs_range_unlock(rl);
rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
}
}
@@ -946,7 +946,7 @@
*/
size = zp->z_phys->zp_size;
if (len == 0 && size == off) {
- zfs_range_unlock(zp, rl);
+ zfs_range_unlock(rl);
return (0);
}
@@ -964,7 +964,7 @@
extent = size - off;
}
if (error = chklock(vp, FWRITE, start, extent, flag, NULL)) {
- zfs_range_unlock(zp, rl);
+ zfs_range_unlock(rl);
return (error);
}
}
@@ -996,7 +996,7 @@
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
dmu_tx_wait(tx);
dmu_tx_abort(tx);
- zfs_range_unlock(zp, rl);
+ zfs_range_unlock(rl);
return (error);
}
@@ -1022,7 +1022,7 @@
seq = zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
}
- zfs_range_unlock(zp, rl);
+ zfs_range_unlock(rl);
dmu_tx_commit(tx);
--- a/usr/src/uts/common/fs/zfs/zil.c Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zil.c Mon Jun 19 19:31:35 2006 -0700
@@ -366,6 +366,8 @@
lwb->lwb_max_txg = txg;
lwb->lwb_seq = 0;
lwb->lwb_state = UNWRITTEN;
+ lwb->lwb_zio = NULL;
+
mutex_enter(&zilog->zl_lock);
list_insert_tail(&zilog->zl_lwb_list, lwb);
mutex_exit(&zilog->zl_lock);
@@ -619,6 +621,29 @@
}
/*
+ * Initialize the io for a log block.
+ *
+ * Note, we should not initialize the IO until we are about
+ * to use it, since zio_rewrite() does a spa_config_enter().
+ */
+static void
+zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
+{
+ zbookmark_t zb;
+
+ zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+
+ ASSERT(lwb->lwb_zio == NULL);
+ lwb->lwb_zio = zio_rewrite(NULL, zilog->zl_spa,
+ ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf,
+ lwb->lwb_sz, zil_lwb_write_done, lwb,
+ ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+}
+
+/*
* Start a log block write and advance to the next log block.
* Calls are serialized.
*/
@@ -631,7 +656,6 @@
blkptr_t *bp = &ztp->zit_next_blk;
uint64_t txg;
uint64_t zil_blksz;
- zbookmark_t zb;
int error;
ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
@@ -651,7 +675,8 @@
* maximum of the previous used size, the current used size and
* the amount waiting in the queue.
*/
- zil_blksz = MAX(zilog->zl_cur_used, zilog->zl_prev_used);
+ zil_blksz = MAX(zilog->zl_prev_used,
+ zilog->zl_cur_used + sizeof (*ztp));
zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
if (zil_blksz > ZIL_MAX_BLKSZ)
@@ -692,6 +717,7 @@
nlwb->lwb_max_txg = txg;
nlwb->lwb_seq = 0;
nlwb->lwb_state = UNWRITTEN;
+ nlwb->lwb_zio = NULL;
/*
* Put new lwb at the end of the log chain,
@@ -704,16 +730,19 @@
mutex_exit(&zilog->zl_lock);
/*
- * write the old log block
+ * kick off the write for the old log block
*/
- zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
-
- zio_nowait(zio_rewrite(NULL, spa, ZIO_CHECKSUM_ZILOG, 0,
- &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb,
- ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb));
+ dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
+ if (lwb->lwb_zio == NULL) {
+ /*
+ * This can only happen if there are no log records in this
+ * block (i.e. the first record to go in was too big to fit).
+ * XXX - would be nice if we could avoid this IO
+ */
+ ASSERT(lwb->lwb_nused == 0);
+ zil_lwb_write_init(zilog, lwb);
+ }
+ zio_nowait(lwb->lwb_zio);
return (nlwb);
}
@@ -722,61 +751,21 @@
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
lr_t *lrc = &itx->itx_lr; /* common log record */
- lr_write_t *lr;
- char *dbuf;
+ lr_write_t *lr = (lr_write_t *)lrc;
uint64_t seq = lrc->lrc_seq;
uint64_t txg = lrc->lrc_txg;
uint64_t reclen = lrc->lrc_reclen;
- uint64_t dlen = 0;
- int error;
+ uint64_t dlen;
if (lwb == NULL)
return (NULL);
ASSERT(lwb->lwb_buf != NULL);
- /*
- * If it's a write, fetch the data or get its blkptr as appropriate.
- */
- if (lrc->lrc_txtype == TX_WRITE) {
- lr = (lr_write_t *)lrc;
- if (txg > spa_freeze_txg(zilog->zl_spa))
- txg_wait_synced(zilog->zl_dmu_pool, txg);
- if (itx->itx_wr_state != WR_COPIED) {
- if (itx->itx_wr_state == WR_NEED_COPY) {
- dlen = P2ROUNDUP_TYPED(lr->lr_length,
- sizeof (uint64_t), uint64_t);
- ASSERT(dlen);
- dbuf = kmem_alloc(dlen, KM_NOSLEEP);
- /* on memory shortage use dmu_sync */
- if (dbuf == NULL) {
- itx->itx_wr_state = WR_INDIRECT;
- dlen = 0;
- }
- } else {
- ASSERT(itx->itx_wr_state == WR_INDIRECT);
- dbuf = NULL;
- }
- error = zilog->zl_get_data(itx->itx_private, lr, dbuf);
- if (error) {
- if (dlen)
- kmem_free(dbuf, dlen);
- if (error != ENOENT && error != EALREADY) {
- txg_wait_synced(zilog->zl_dmu_pool,
- txg);
- mutex_enter(&zilog->zl_lock);
- zilog->zl_ss_seq =
- MAX(seq, zilog->zl_ss_seq);
- mutex_exit(&zilog->zl_lock);
- return (lwb);
- }
- mutex_enter(&zilog->zl_lock);
- zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(
- &(lr->lr_blkptr))), seq);
- mutex_exit(&zilog->zl_lock);
- return (lwb);
- }
- }
- }
+ if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
+ dlen = P2ROUNDUP_TYPED(
+ lr->lr_length, sizeof (uint64_t), uint64_t);
+ else
+ dlen = 0;
zilog->zl_cur_used += (reclen + dlen);
@@ -785,32 +774,58 @@
*/
if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
lwb = zil_lwb_write_start(zilog, lwb);
- if (lwb == NULL) {
- if (dlen)
- kmem_free(dbuf, dlen);
+ if (lwb == NULL)
return (NULL);
- }
ASSERT(lwb->lwb_nused == 0);
if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
txg_wait_synced(zilog->zl_dmu_pool, txg);
mutex_enter(&zilog->zl_lock);
zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq);
mutex_exit(&zilog->zl_lock);
- if (dlen)
- kmem_free(dbuf, dlen);
return (lwb);
}
}
- lrc->lrc_reclen += dlen;
+ if (lwb->lwb_zio == NULL)
+ zil_lwb_write_init(zilog, lwb);
+
bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);
- lwb->lwb_nused += reclen;
- if (dlen) {
- bcopy(dbuf, lwb->lwb_buf + lwb->lwb_nused, dlen);
- lwb->lwb_nused += dlen;
- kmem_free(dbuf, dlen);
- lrc->lrc_reclen -= dlen; /* for kmem_free of itx */
+
+ /*
+ * If it's a write, fetch the data or get its blkptr as appropriate.
+ */
+ if (lrc->lrc_txtype == TX_WRITE) {
+ if (txg > spa_freeze_txg(zilog->zl_spa))
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ if (itx->itx_wr_state != WR_COPIED) {
+ char *dbuf;
+ int error;
+
+ /* alignment is guaranteed */
+ lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused);
+ if (dlen) {
+ ASSERT(itx->itx_wr_state == WR_NEED_COPY);
+ dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen;
+ lr->lr_common.lrc_reclen += dlen;
+ } else {
+ ASSERT(itx->itx_wr_state == WR_INDIRECT);
+ dbuf = NULL;
+ }
+ error = zilog->zl_get_data(
+ itx->itx_private, lr, dbuf, lwb->lwb_zio);
+ if (error) {
+ ASSERT(error == ENOENT || error == EEXIST ||
+ error == EALREADY);
+ return (lwb);
+ }
+ }
}
+
+ mutex_enter(&zilog->zl_lock);
+ ASSERT(seq > zilog->zl_wait_seq);
+ zilog->zl_wait_seq = seq;
+ mutex_exit(&zilog->zl_lock);
+ lwb->lwb_nused += reclen + dlen;
lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
ASSERT3U(lwb->lwb_seq, <, seq);
lwb->lwb_seq = seq;
@@ -993,8 +1008,9 @@
/*
* Wait if necessary for our seq to be committed.
*/
- if (lwb) {
- while (zilog->zl_ss_seq < seq && zilog->zl_log_error == 0)
+ if (lwb && zilog->zl_wait_seq) {
+ while (zilog->zl_ss_seq < zilog->zl_wait_seq &&
+ zilog->zl_log_error == 0)
cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
zil_flush_vdevs(zilog, seq);
}
@@ -1009,6 +1025,7 @@
cv_broadcast(&zilog->zl_cv_seq);
}
/* wake up others waiting to start a write */
+ zilog->zl_wait_seq = 0;
zilog->zl_writer = B_FALSE;
mutex_exit(&zilog->zl_lock);
cv_broadcast(&zilog->zl_cv_write);
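
The net effect of the zil.c changes: each lwb lazily acquires its own zio (created on first use, since zio_rewrite() takes spa_config_enter()), dmu_sync() children issued through zl_get_data attach to that zio as their parent, and zil_lwb_write_start() fires block and children together. Condensed from the hunks above:

    if (lwb->lwb_zio == NULL)
        zil_lwb_write_init(zilog, lwb);   /* parent zio for this log block */
    error = zilog->zl_get_data(itx->itx_private, lr, dbuf,
        lwb->lwb_zio);                    /* dmu_sync() may add children */
    /* ... later, once the block is full ... */
    zio_nowait(lwb->lwb_zio);             /* issue block + children in parallel */

zl_wait_seq then records the highest sequence number actually handed to an lwb, so the commit path waits only on writes it really initiated rather than falling back to txg_wait_synced().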
--- a/usr/src/uts/common/fs/zfs/zvol.c Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zvol.c Mon Jun 19 19:31:35 2006 -0700
@@ -68,6 +68,7 @@
#include <sys/zfs_ioctl.h>
#include <sys/mkdev.h>
#include <sys/zil.h>
+#include <sys/refcount.h>
#include "zfs_namecheck.h"
@@ -683,7 +684,9 @@
itx_t *itx;
lr_write_t *lr;
objset_t *os;
+ dmu_buf_t *db;
uint64_t txg;
+ uint64_t boff;
int error;
uint32_t blocksize;
@@ -714,18 +717,22 @@
if (nbytes <= zvol_immediate_write_sz) {
itx = zvol_immediate_itx(off, nbytes, addr);
} else {
+ boff = P2ALIGN_TYPED(off, blocksize, uint64_t);
itx = zil_itx_create(TX_WRITE, sizeof (*lr));
lr = (lr_write_t *)&itx->itx_lr;
lr->lr_foid = ZVOL_OBJ;
lr->lr_offset = off;
lr->lr_length = nbytes;
- lr->lr_blkoff = 0;
+ lr->lr_blkoff = off - boff;
BP_ZERO(&lr->lr_blkptr);
- txg_suspend(dmu_objset_pool(os));
- error = dmu_sync(os, ZVOL_OBJ, off, &lr->lr_blkoff,
- &lr->lr_blkptr, txg);
- txg_resume(dmu_objset_pool(os));
+ /* XXX - we should do these IOs in parallel */
+ VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, boff,
+ FTAG, &db));
+ ASSERT(boff == db->db_offset);
+ error = dmu_sync(NULL, db, &lr->lr_blkptr,
+ txg, NULL, NULL);
+ dmu_buf_rele(db, FTAG);
if (error) {
kmem_free(itx, offsetof(itx_t, itx_lr));
return (error);
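
The XXX above notes that zvol still issues these dmu_sync() writes serially. One possible shape for the parallel version, purely as a hypothetical sketch (zvol_get_done would be a new callback that releases the dbuf via dmu_buf_rele(db, arg); none of this is in the patch):

    zio_t *pio = zio_root(dmu_objset_spa(os), NULL, NULL,
        ZIO_FLAG_MUSTSUCCEED);
    /* for each queued log record: */
    VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, boff, FTAG, &db));
    error = dmu_sync(pio, db, &lr->lr_blkptr, txg, zvol_get_done, FTAG);
    /* ... issue the remaining records, then reap all children at once ... */
    error = zio_wait(pio);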