6341569 zio_alloc_blk() vdev distribution performs badly
6428639 large writes to zvol synchs too much, better cut down a little
6444692 Need to flush disk write cache for dmu_sync buffers
6465634 zvol: dmu_sync() should be issued in parallel
6468731 lwb_state_t can be nuked
6470042 parallel dmu_sync() isn't being used
6471679 stash blocksize in zvol_state_t rather than reading in every zvol_log_write
6472230 ZIL vdev management is inefficient
6473775 zil_commit changes in snv_48 make it hot for O_DSYNC workloads
6478388 ZIL replay takes too long causing issues while booting
6486390 zil_commit could push more transactions
6486496 zil_replay() useful debug
--- a/usr/src/uts/common/fs/zfs/metaslab.c Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/metaslab.c Sat Nov 04 07:59:19 2006 -0800
@@ -704,7 +704,7 @@
*/
static int
metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,
- dva_t *hintdva, uint64_t txg)
+ dva_t *hintdva, uint64_t txg, boolean_t hintdva_avoid)
{
metaslab_group_t *mg, *rotor;
metaslab_class_t *mc;
@@ -725,10 +725,10 @@
* nothing actually breaks if we miss a few updates -- we just won't
* allocate quite as evenly. It all balances out over time.
*
- * If we are doing ditto blocks, try to spread them across consecutive
- * vdevs. If we're forced to reuse a vdev before we've allocated
- * all of our ditto blocks, then try and spread them out on that
- * vdev as much as possible. If it turns out to not be possible,
+ * If we are doing ditto or log blocks, try to spread them across
+ * consecutive vdevs. If we're forced to reuse a vdev before we've
+ * allocated all of our ditto blocks, then try and spread them out on
+ * that vdev as much as possible. If it turns out to not be possible,
* gradually lower our standards until anything becomes acceptable.
* Also, allocating on consecutive vdevs (as opposed to random vdevs)
* gives us hope of containing our fault domains to something we're
@@ -743,7 +743,10 @@
*/
if (hintdva) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
- mg = vd->vdev_mg;
+ if (hintdva_avoid)
+ mg = vd->vdev_mg->mg_next;
+ else
+ mg = vd->vdev_mg;
} else if (d != 0) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
mg = vd->vdev_mg->mg_next;
@@ -918,7 +921,7 @@
int
metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas,
- uint64_t txg, blkptr_t *hintbp)
+ uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid)
{
dva_t *dva = bp->blk_dva;
dva_t *hintdva = hintbp->blk_dva;
@@ -930,7 +933,8 @@
ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
for (d = 0; d < ndvas; d++) {
- error = metaslab_alloc_dva(spa, psize, dva, d, hintdva, txg);
+ error = metaslab_alloc_dva(spa, psize, dva, d, hintdva,
+ txg, hintbp_avoid);
if (error) {
for (d--; d >= 0; d--) {
metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
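The new hintbp_avoid/hintdva_avoid argument is what addresses 6341569: when the ZIL passes the previous log block as a hint, allocation now starts at the metaslab group after the hinted vdev rather than on it, so consecutive log blocks rotate across top-level vdevs. A minimal userland sketch of that rotor behavior (the types and names here are illustrative, not the kernel's):

    #include <stdio.h>

    typedef struct group {
        int          g_id;
        struct group *g_next;   /* circular list of top-level vdevs */
    } group_t;

    int
    main(void)
    {
        group_t g[3] = { { 0, &g[1] }, { 1, &g[2] }, { 2, &g[0] } };
        group_t *hint = &g[0];
        int i;

        /* avoid set: each allocation starts after the hinted group */
        for (i = 0; i < 5; i++) {
            hint = hint->g_next;
            (void) printf("log block %d allocated on vdev %d\n",
                i, hint->g_id);
        }
        return (0);
    }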
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h Sat Nov 04 07:59:19 2006 -0800
@@ -48,7 +48,7 @@
extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp,
- int ncopies, uint64_t txg, blkptr_t *hintbp);
+ int ncopies, uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid);
extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
boolean_t now);
extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h Sat Nov 04 07:59:19 2006 -0800
@@ -151,6 +151,7 @@
uint_t z_seq; /* modification sequence number */
uint64_t z_mapcnt; /* number of pages mapped to file */
uint64_t z_last_itx; /* last ZIL itx on this znode */
+ uint32_t z_sync_cnt; /* synchronous open count */
kmutex_t z_acl_lock; /* acl data lock */
list_node_t z_link_node; /* all znodes in fs link */
/*
--- a/usr/src/uts/common/fs/zfs/sys/zil.h Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h Sat Nov 04 07:59:19 2006 -0800
@@ -212,10 +212,23 @@
list_node_t itx_node; /* linkage on zl_itx_list */
void *itx_private; /* type-specific opaque data */
itx_wr_state_t itx_wr_state; /* write state */
+ uint8_t itx_sync; /* synchronous transaction */
lr_t itx_lr; /* common part of log record */
/* followed by type-specific part of lr_xx_t and its immediate data */
} itx_t;
+
+/*
+ * zgd_t is passed through dmu_sync() to the completion callback
+ * (zfs_get_done() or zvol_get_done()) to clean up the dmu_sync() buffer write
+ */
+typedef struct {
+ zilog_t *zgd_zilog; /* zilog */
+ blkptr_t *zgd_bp; /* block pointer */
+ struct rl *zgd_rl; /* range lock */
+} zgd_t;
+
typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
uint64_t txg);
typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
@@ -252,6 +265,8 @@
extern int zil_suspend(zilog_t *zilog);
extern void zil_resume(zilog_t *zilog);
+extern void zil_add_vdev(zilog_t *zilog, uint64_t vdev);
+
extern int zil_disable;
#ifdef __cplusplus
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h Sat Nov 04 07:59:19 2006 -0800
@@ -35,13 +35,6 @@
extern "C" {
#endif
-typedef enum lwb_state_type {
- UNWRITTEN, /* buffer yet to be written */
- SEQ_INCOMPLETE, /* buffer written, but there's an unwritten buffer in */
- /* the sequence before this */
- SEQ_COMPLETE, /* no unwritten buffers before this */
-} lwb_state_t;
-
/*
* Log write buffer.
*/
@@ -58,8 +51,11 @@
} lwb_t;
/*
- * vdev element for use in flushing device write caches
+ * Vdev flushing: we use a bitmap of ZIL_VDEV_BMSZ bytes (one bit per vdev).
+ * Any vdev numbers beyond that go on a linked list of zil_vdev_t structures.
*/
+
+#define ZIL_VDEV_BMSZ 16 /* 16 * 8 = 128 vdevs */
typedef struct zil_vdev {
uint64_t vdev; /* device written */
list_node_t vdev_seq_node; /* zilog->zl_vdev_list linkage */
@@ -76,8 +72,8 @@
objset_t *zl_os; /* object set we're logging */
zil_get_data_t *zl_get_data; /* callback to get object content */
zio_t *zl_root_zio; /* log writer root zio */
- uint64_t zl_itx_seq; /* itx sequence number */
- uint64_t zl_ss_seq; /* last tx on stable storage */
+ uint64_t zl_itx_seq; /* next itx sequence number */
+ uint64_t zl_commit_seq; /* committed up to this number */
uint64_t zl_lr_seq; /* log record sequence number */
uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */
@@ -96,8 +92,11 @@
uint64_t zl_prev_used; /* previous commit log size used */
list_t zl_lwb_list; /* in-flight log write list */
list_t zl_vdev_list; /* list of [vdev, seq] pairs */
+ uint8_t zl_vdev_bmap[ZIL_VDEV_BMSZ]; /* bitmap of vdevs */
taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */
avl_tree_t zl_dva_tree; /* track DVAs during log parse */
+ clock_t zl_replay_time; /* lbolt of when replay started */
+ uint64_t zl_replay_blks; /* number of log blocks replayed */
};
typedef struct zil_dva_node {
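zl_vdev_bmap makes recording a dirtied vdev lock-free in the common case: ids below ZIL_VDEV_BMSZ * 8 just set a bit (via atomic_or_8() in zil_add_vdev() below), and only larger ids fall back to the mutex-protected sorted list. A standalone sketch of the bitmap half (the constant mirrors ZIL_VDEV_BMSZ, but the code is illustrative, not the kernel's):

    #include <stdio.h>

    #define BMSZ 16     /* 16 * 8 = 128 vdevs, as in ZIL_VDEV_BMSZ */

    static unsigned char bmap[BMSZ];

    static void
    record_vdev(unsigned int vdev)
    {
        if (vdev < BMSZ * 8)
            bmap[vdev / 8] |= (unsigned char)(1 << (vdev % 8));
        /* larger ids would go to the sorted overflow list */
    }

    int
    main(void)
    {
        int i, j;

        record_vdev(3);
        record_vdev(3);     /* duplicates are harmless: same bit */
        record_vdev(70);

        /* walk and clear the set bits, as zil_flush_vdevs() does */
        for (i = 0; i < BMSZ; i++) {
            for (j = 0; j < 8; j++) {
                if (bmap[i] & (1 << j))
                    (void) printf("flush vdev %d\n", i * 8 + j);
            }
            bmap[i] = 0;
        }
        return (0);
    }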
--- a/usr/src/uts/common/fs/zfs/sys/zio.h Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h Sat Nov 04 07:59:19 2006 -0800
@@ -286,7 +286,8 @@
uint64_t size, void *data, int checksum,
zio_done_func_t *done, void *private, int priority, int flags);
-extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *bp, uint64_t txg);
+extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp,
+ blkptr_t *old_bp, uint64_t txg);
extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg);
extern int zio_wait(zio_t *zio);
--- a/usr/src/uts/common/fs/zfs/zfs_log.c Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_log.c Sat Nov 04 07:59:19 2006 -0800
@@ -273,6 +273,7 @@
itx->itx_private = zp->z_zfsvfs;
+ itx->itx_sync = (zp->z_sync_cnt != 0);
seq = zil_itx_assign(zilog, itx, tx);
zp->z_last_itx = seq;
}
@@ -297,6 +298,7 @@
lr->lr_offset = off;
lr->lr_length = len;
+ itx->itx_sync = (zp->z_sync_cnt != 0);
seq = zil_itx_assign(zilog, itx, tx);
zp->z_last_itx = seq;
}
@@ -326,6 +328,7 @@
ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
+ itx->itx_sync = (zp->z_sync_cnt != 0);
seq = zil_itx_assign(zilog, itx, tx);
zp->z_last_itx = seq;
}
@@ -350,6 +353,7 @@
lr->lr_aclcnt = (uint64_t)aclcnt;
bcopy(z_ace, (ace_t *)(lr + 1), aclcnt * sizeof (ace_t));
+ itx->itx_sync = (zp->z_sync_cnt != 0);
seq = zil_itx_assign(zilog, itx, tx);
zp->z_last_itx = seq;
}
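All four hunks above tag each itx with whether any opener of the znode requested synchronous semantics (z_sync_cnt, maintained in zfs_open()/zfs_close() below). zil_commit_writer() then uses the flag when committing a single object: sync records must always be pushed, while async records for other objects can be left behind (6473775). Roughly, as an illustrative predicate rather than the kernel's actual loop:

    #include <stdio.h>

    typedef struct itx {
        int          itx_sync;   /* from z_sync_cnt at assign time */
        unsigned int itx_foid;   /* object the record belongs to */
    } itx_t;

    /* must a commit of 'foid' push this record? */
    static int
    must_push(const itx_t *itx, unsigned int foid)
    {
        return (foid == 0 ||          /* commit all objects */
            itx->itx_sync ||          /* an O_[D]SYNC opener is present */
            itx->itx_foid == foid);   /* record is for this object */
    }

    int
    main(void)
    {
        itx_t async_other = { 0, 7 };
        itx_t sync_other = { 1, 9 };

        (void) printf("%d %d\n",
            must_push(&async_other, 8),   /* 0: may be left behind */
            must_push(&sync_other, 8));   /* 1: must be pushed */
        return (0);
    }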
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c Sat Nov 04 07:59:19 2006 -0800
@@ -154,11 +154,15 @@
* ZFS_EXIT(zfsvfs); // finished in zfs
* return (error); // done, report error
*/
-
/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr)
{
+ znode_t *zp = VTOZ(*vpp);
+
+ /* Keep a count of the synchronous opens in the znode */
+ if (flag & (FSYNC | FDSYNC))
+ atomic_inc_32(&zp->z_sync_cnt);
return (0);
}
@@ -166,6 +170,12 @@
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
{
+ znode_t *zp = VTOZ(vp);
+
+ /* Decrement the count of synchronous opens in the znode */
+ if (flag & (FSYNC | FDSYNC))
+ atomic_dec_32(&zp->z_sync_cnt);
+
/*
* Clean up any locks held by this process on the vp.
*/
@@ -827,14 +837,17 @@
}
void
-zfs_get_done(dmu_buf_t *db, void *vrl)
+zfs_get_done(dmu_buf_t *db, void *vzgd)
{
- rl_t *rl = (rl_t *)vrl;
+ zgd_t *zgd = (zgd_t *)vzgd;
+ rl_t *rl = zgd->zgd_rl;
vnode_t *vp = ZTOV(rl->r_zp);
- dmu_buf_rele(db, rl);
+ dmu_buf_rele(db, vzgd);
zfs_range_unlock(rl);
VN_RELE(vp);
+ zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
+ kmem_free(zgd, sizeof (zgd_t));
}
/*
@@ -849,9 +862,11 @@
uint64_t off = lr->lr_offset;
dmu_buf_t *db;
rl_t *rl;
+ zgd_t *zgd;
int dlen = lr->lr_length; /* length of user data */
int error = 0;
+ ASSERT(zio);
ASSERT(dlen != 0);
/*
@@ -907,11 +922,19 @@
error = ENOENT;
goto out;
}
- VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, rl, &db));
+ zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_rl = rl;
+ zgd->zgd_zilog = zfsvfs->z_log;
+ zgd->zgd_bp = &lr->lr_blkptr;
+ VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
ASSERT(boff == db->db_offset);
lr->lr_blkoff = off - boff;
error = dmu_sync(zio, db, &lr->lr_blkptr,
- lr->lr_common.lrc_txg, zio ? zfs_get_done : NULL, rl);
+ lr->lr_common.lrc_txg, zfs_get_done, zgd);
+ if (error == 0) {
+ zil_add_vdev(zfsvfs->z_log,
+ DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
+ }
/*
* If we get EINPROGRESS, then we need to wait for a
* write IO initiated by dmu_sync() to complete before
@@ -920,7 +943,8 @@
*/
if (error == EINPROGRESS)
return (0);
- dmu_buf_rele(db, rl);
+ dmu_buf_rele(db, zgd);
+ kmem_free(zgd, sizeof (zgd_t));
}
out:
zfs_range_unlock(rl);
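The zgd_t introduced here follows a strict ownership rule: if dmu_sync() returns EINPROGRESS, the completion callback zfs_get_done() releases the dbuf, drops the range lock, records the vdev, and frees the zgd_t; on any other return the caller cleans up inline, as the hunk above shows. A userland model of that rule (the names and the simulated completion are illustrative):

    #include <errno.h>
    #include <stdlib.h>

    typedef struct zgd {
        int zgd_resource;   /* stands in for the dbuf and range lock */
    } zgd_t;

    static void
    get_done(zgd_t *zgd)
    {
        /* async completion path: the callback owns all cleanup */
        free(zgd);
    }

    /* stand-in for dmu_sync(): went_async selects the EINPROGRESS path */
    static int
    sync_write(zgd_t *zgd, int went_async)
    {
        if (went_async) {
            get_done(zgd);      /* in the kernel, fired later by the zio */
            return (EINPROGRESS);
        }
        return (0);
    }

    int
    main(void)
    {
        zgd_t *zgd = calloc(1, sizeof (zgd_t));
        int error = sync_write(zgd, 0);

        if (error != EINPROGRESS)
            free(zgd);          /* sync path: the caller cleans up */
        return (error);
    }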
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c Sat Nov 04 07:59:19 2006 -0800
@@ -411,6 +411,7 @@
zp->z_id = obj_num;
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
+ zp->z_sync_cnt = 0;
mutex_enter(&zfsvfs->z_znodes_lock);
list_insert_tail(&zfsvfs->z_all_znodes, zp);
--- a/usr/src/uts/common/fs/zfs/zil.c Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/zil.c Sat Nov 04 07:59:19 2006 -0800
@@ -352,7 +352,8 @@
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
txg = dmu_tx_get_txg(tx);
- error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk, txg);
+ error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
+ NULL, txg);
if (error == 0)
zil_init_log_chain(zilog, &blk);
@@ -494,73 +495,101 @@
void
zil_add_vdev(zilog_t *zilog, uint64_t vdev)
{
- zil_vdev_t *zv;
+ zil_vdev_t *zv, *new;
+ uint64_t bmap_sz = sizeof (zilog->zl_vdev_bmap) << 3;
+ uchar_t *cp;
if (zfs_nocacheflush)
return;
- ASSERT(MUTEX_HELD(&zilog->zl_lock));
- zv = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP);
- zv->vdev = vdev;
- list_insert_tail(&zilog->zl_vdev_list, zv);
+ if (vdev < bmap_sz) {
+ cp = zilog->zl_vdev_bmap + (vdev / 8);
+ atomic_or_8(cp, 1 << (vdev % 8));
+ } else {
+ /*
+ * insert into ordered list
+ */
+ mutex_enter(&zilog->zl_lock);
+ for (zv = list_head(&zilog->zl_vdev_list); zv != NULL;
+ zv = list_next(&zilog->zl_vdev_list, zv)) {
+ if (zv->vdev == vdev) {
+ /* duplicate found - just return */
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+ if (zv->vdev > vdev) {
+ /* insert before this entry */
+ new = kmem_alloc(sizeof (zil_vdev_t),
+ KM_SLEEP);
+ new->vdev = vdev;
+ list_insert_before(&zilog->zl_vdev_list,
+ zv, new);
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+ }
+ /* ran off end of list, insert at the end */
+ ASSERT(zv == NULL);
+ new = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP);
+ new->vdev = vdev;
+ list_insert_tail(&zilog->zl_vdev_list, new);
+ mutex_exit(&zilog->zl_lock);
+ }
+}
+
+/* start an async flush of the write cache for this vdev */
+void
+zil_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio)
+{
+ vdev_t *vd;
+
+ if (*zio == NULL)
+ *zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ vd = vdev_lookup_top(spa, vdev);
+ ASSERT(vd);
+
+ (void) zio_nowait(zio_ioctl(*zio, spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
}
void
zil_flush_vdevs(zilog_t *zilog)
{
- vdev_t *vd;
- zil_vdev_t *zv, *zv2;
- zio_t *zio;
- spa_t *spa;
+ zil_vdev_t *zv;
+ zio_t *zio = NULL;
+ spa_t *spa = zilog->zl_spa;
uint64_t vdev;
+ uint8_t b;
+ int i, j;
+
+ ASSERT(zilog->zl_writer);
- if (zfs_nocacheflush)
- return;
-
- ASSERT(MUTEX_HELD(&zilog->zl_lock));
-
- spa = zilog->zl_spa;
- zio = NULL;
+ for (i = 0; i < sizeof (zilog->zl_vdev_bmap); i++) {
+ b = zilog->zl_vdev_bmap[i];
+ if (b == 0)
+ continue;
+ for (j = 0; j < 8; j++) {
+ if (b & (1 << j)) {
+ vdev = (i << 3) + j;
+ zil_flush_vdev(spa, vdev, &zio);
+ }
+ }
+ zilog->zl_vdev_bmap[i] = 0;
+ }
while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) {
- vdev = zv->vdev;
+ zil_flush_vdev(spa, zv->vdev, &zio);
list_remove(&zilog->zl_vdev_list, zv);
kmem_free(zv, sizeof (zil_vdev_t));
-
- /*
- * remove all chained entries with same vdev
- */
- zv = list_head(&zilog->zl_vdev_list);
- while (zv) {
- zv2 = list_next(&zilog->zl_vdev_list, zv);
- if (zv->vdev == vdev) {
- list_remove(&zilog->zl_vdev_list, zv);
- kmem_free(zv, sizeof (zil_vdev_t));
- }
- zv = zv2;
- }
-
- /* flush the write cache for this vdev */
- mutex_exit(&zilog->zl_lock);
- if (zio == NULL)
- zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
- vd = vdev_lookup_top(spa, vdev);
- ASSERT(vd);
- (void) zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
- NULL, NULL, ZIO_PRIORITY_NOW,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
- mutex_enter(&zilog->zl_lock);
}
-
/*
* Wait for all the flushes to complete. Not all devices actually
* support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
*/
- if (zio != NULL) {
- mutex_exit(&zilog->zl_lock);
+ if (zio)
(void) zio_wait(zio);
- mutex_enter(&zilog->zl_lock);
- }
}
/*
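zil_flush_vdev() uses the standard zio fan-out idiom: a null root zio is created lazily, each DKIOCFLUSHWRITECACHE is issued as a child with zio_nowait(), and the single zio_wait() on the root in zil_flush_vdevs() collects all of them. The same issue-everything-then-wait-once shape in an illustrative userland sketch using threads:

    #include <pthread.h>
    #include <stdio.h>

    #define NDEVS 4

    /* one cache flush per device, analogous to zio_nowait(zio_ioctl(...)) */
    static void *
    flush_dev(void *arg)
    {
        (void) printf("flushing write cache on vdev %ld\n", (long)arg);
        return (NULL);
    }

    int
    main(void)
    {
        pthread_t t[NDEVS];
        long i;

        for (i = 0; i < NDEVS; i++)
            (void) pthread_create(&t[i], NULL, flush_dev, (void *)i);
        for (i = 0; i < NDEVS; i++)     /* the zio_wait() on the root */
            (void) pthread_join(t[i], NULL);
        return (0);
    }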
@@ -610,10 +639,12 @@
zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
ZIO_FLAG_CANFAIL);
}
- lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
- ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf,
- lwb->lwb_sz, zil_lwb_write_done, lwb,
- ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ if (lwb->lwb_zio == NULL) {
+ lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
+ ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf,
+ lwb->lwb_sz, zil_lwb_write_done, lwb,
+ ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ }
}
/*
@@ -655,7 +686,9 @@
if (zil_blksz > ZIL_MAX_BLKSZ)
zil_blksz = ZIL_MAX_BLKSZ;
- error = zio_alloc_blk(spa, zil_blksz, bp, txg);
+ BP_ZERO(bp);
+ /* pass the old blkptr in order to spread log blocks across devs */
+ error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg);
if (error) {
/*
* Reinitialise the lwb.
@@ -689,20 +722,20 @@
nlwb->lwb_zio = NULL;
/*
- * Put new lwb at the end of the log chain,
- * and record the vdev for later flushing
+ * Put new lwb at the end of the log chain
*/
mutex_enter(&zilog->zl_lock);
list_insert_tail(&zilog->zl_lwb_list, nlwb);
+ mutex_exit(&zilog->zl_lock);
+
+ /* Record the vdev for later flushing */
zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(&(lwb->lwb_blk))));
- mutex_exit(&zilog->zl_lock);
/*
* kick off the write for the old log block
*/
dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
- if (lwb->lwb_zio == NULL)
- zil_lwb_write_init(zilog, lwb);
+ ASSERT(lwb->lwb_zio);
zio_nowait(lwb->lwb_zio);
return (nlwb);
@@ -729,6 +762,8 @@
zilog->zl_cur_used += (reclen + dlen);
+ zil_lwb_write_init(zilog, lwb);
+
/*
* If this record won't fit in the current log block, start a new one.
*/
@@ -736,6 +771,7 @@
lwb = zil_lwb_write_start(zilog, lwb);
if (lwb == NULL)
return (NULL);
+ zil_lwb_write_init(zilog, lwb);
ASSERT(lwb->lwb_nused == 0);
if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
txg_wait_synced(zilog->zl_dmu_pool, txg);
@@ -843,20 +879,26 @@
kmem_free(itx, offsetof(itx_t, itx_lr)
+ itx->itx_lr.lrc_reclen);
}
+ cv_broadcast(&zilog->zl_cv_writer);
mutex_exit(&zilog->zl_lock);
}
/*
- * If there are in-memory intent log transactions then
- * start up a taskq to free up any that have now been synced.
+ * If there are any in-memory intent log transactions that have now been
+ * synced, then start up a taskq to free them.
*/
void
zil_clean(zilog_t *zilog)
{
+ itx_t *itx;
+
mutex_enter(&zilog->zl_lock);
- if (list_head(&zilog->zl_itx_list) != NULL)
+ itx = list_head(&zilog->zl_itx_list);
+ if ((itx != NULL) &&
+ (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
(void) taskq_dispatch(zilog->zl_clean_taskq,
(void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP);
+ }
mutex_exit(&zilog->zl_lock);
}
@@ -865,6 +907,7 @@
{
uint64_t txg;
uint64_t reclen;
+ uint64_t commit_seq = 0;
itx_t *itx, *itx_next = (itx_t *)-1;
lwb_t *lwb;
spa_t *spa;
@@ -883,13 +926,9 @@
* dirty the fs by calling zil_create()
*/
if (list_is_empty(&zilog->zl_itx_list)) {
- /* wake up others waiting to start a write */
zilog->zl_writer = B_FALSE;
- cv_broadcast(&zilog->zl_cv_writer);
- mutex_exit(&zilog->zl_lock);
return;
}
-
mutex_exit(&zilog->zl_lock);
zil_create(zilog);
mutex_enter(&zilog->zl_lock);
@@ -897,11 +936,7 @@
}
}
- /*
- * Loop through in-memory log transactions filling log blocks,
- * until we reach the given sequence number and there's no more
- * room in the write buffer.
- */
+ /* Loop through in-memory log transactions filling log blocks. */
DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
for (;;) {
/*
@@ -917,6 +952,8 @@
for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) {
if (foid == 0) /* push all foids? */
break;
+ if (itx->itx_sync) /* push all O_[D]SYNC */
+ break;
switch (itx->itx_lr.lrc_txtype) {
case TX_SETATTR:
case TX_WRITE:
@@ -936,8 +973,9 @@
reclen = itx->itx_lr.lrc_reclen;
if ((itx->itx_lr.lrc_seq > seq) &&
((lwb == NULL) || (lwb->lwb_nused == 0) ||
- (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb))))
+ (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)))) {
break;
+ }
/*
* Save the next pointer. Even though we soon drop
@@ -947,10 +985,10 @@
*/
itx_next = list_next(&zilog->zl_itx_list, itx);
list_remove(&zilog->zl_itx_list, itx);
+ mutex_exit(&zilog->zl_lock);
txg = itx->itx_lr.lrc_txg;
ASSERT(txg);
- mutex_exit(&zilog->zl_lock);
if (txg > spa_last_synced_txg(spa) ||
txg > spa_freeze_txg(spa))
lwb = zil_lwb_commit(zilog, itx, lwb);
@@ -960,10 +998,16 @@
zilog->zl_itx_list_sz -= reclen;
}
DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
+ /* determine commit sequence number */
+ itx = list_head(&zilog->zl_itx_list);
+ if (itx)
+ commit_seq = itx->itx_lr.lrc_seq;
+ else
+ commit_seq = zilog->zl_itx_seq;
mutex_exit(&zilog->zl_lock);
/* write the last block out */
- if (lwb != NULL && lwb->lwb_nused != 0)
+ if (lwb != NULL && lwb->lwb_zio != NULL)
lwb = zil_lwb_write_start(zilog, lwb);
zilog->zl_prev_used = zilog->zl_cur_used;
@@ -972,26 +1016,24 @@
/*
* Wait if necessary for the log blocks to be on stable storage.
*/
- mutex_enter(&zilog->zl_lock);
if (zilog->zl_root_zio) {
- mutex_exit(&zilog->zl_lock);
DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
(void) zio_wait(zilog->zl_root_zio);
DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
- mutex_enter(&zilog->zl_lock);
- zil_flush_vdevs(zilog);
+ if (!zfs_nocacheflush)
+ zil_flush_vdevs(zilog);
}
if (zilog->zl_log_error || lwb == NULL) {
zilog->zl_log_error = 0;
- mutex_exit(&zilog->zl_lock);
txg_wait_synced(zilog->zl_dmu_pool, 0);
- mutex_enter(&zilog->zl_lock);
}
- /* wake up others waiting to start a write */
+
+ mutex_enter(&zilog->zl_lock);
zilog->zl_writer = B_FALSE;
- cv_broadcast(&zilog->zl_cv_writer);
- mutex_exit(&zilog->zl_lock);
+
+ ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
+ zilog->zl_commit_seq = commit_seq;
}
/*
@@ -1009,9 +1051,17 @@
seq = MIN(seq, zilog->zl_itx_seq); /* cap seq at largest itx seq */
- while (zilog->zl_writer)
+ while (zilog->zl_writer) {
cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+ if (seq < zilog->zl_commit_seq) {
+ mutex_exit(&zilog->zl_lock);
+ return;
+ }
+ }
zil_commit_writer(zilog, seq, foid); /* drops zl_lock */
+ /* wake up others waiting on the commit */
+ cv_broadcast(&zilog->zl_cv_writer);
+ mutex_exit(&zilog->zl_lock);
}
/*
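zl_commit_seq gives zil_commit() a fast path (6486390): a caller that blocked on zl_cv_writer can discover, when woken, that another writer already pushed its sequence to stable storage, and return without writing anything. In the kernel the check runs under zl_lock inside the cv_wait() loop; stripped of the locking, the shape is (illustrative):

    #include <stdio.h>

    static unsigned long long commit_seq;  /* seqs <= this are stable */

    static void
    commit_writer(unsigned long long upto)
    {
        /* ... fill and issue log blocks, flush vdev caches ... */
        (void) printf("writer pushed through seq %llu\n", upto);
        commit_seq = upto;
    }

    static void
    commit(unsigned long long seq)
    {
        if (seq <= commit_seq)  /* already on stable storage */
            return;
        commit_writer(seq);
    }

    int
    main(void)
    {
        commit(5);
        commit(3);      /* no-op: covered by the first push */
        commit(9);
        return (0);
    }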
@@ -1278,7 +1328,8 @@
const zil_header_t *zh = zilog->zl_header;
uint64_t reclen = lr->lrc_reclen;
uint64_t txtype = lr->lrc_txtype;
- int pass, error;
+ char *name;
+ int pass, error, sunk;
if (zilog->zl_stop_replay)
return;
@@ -1343,7 +1394,7 @@
* and update the log header to reflect the fact that we did so.
* We use the DMU's ability to assign into a specific txg to do this.
*/
- for (pass = 1; /* CONSTANTCONDITION */; pass++) {
+ for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) {
uint64_t replay_txg;
dmu_tx_t *replay_tx;
@@ -1378,6 +1429,24 @@
dmu_tx_commit(replay_tx);
+ if (!error)
+ return;
+
+ /*
+ * The DMU's dnode layer doesn't see removes until the txg
+ * commits, so a subsequent claim can spuriously fail with
+ * EEXIST. So if we receive any error other than ERESTART,
+ * we sync out any removes and then retry the
+ * transaction.
+ */
+ if (error != ERESTART && !sunk) {
+ if (zr->zr_rm_sync != NULL)
+ zr->zr_rm_sync(zr->zr_arg);
+ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+ sunk = B_TRUE;
+ continue; /* retry */
+ }
+
if (error != ERESTART)
break;
@@ -1388,29 +1457,21 @@
dprintf("pass %d, retrying\n", pass);
}
- if (error) {
- char *name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
- dmu_objset_name(zr->zr_os, name);
- cmn_err(CE_WARN, "ZFS replay transaction error %d, "
- "dataset %s, seq 0x%llx, txtype %llu\n",
- error, name,
- (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype);
- zilog->zl_stop_replay = 1;
- kmem_free(name, MAXNAMELEN);
- }
+ ASSERT(error && error != ERESTART);
+ name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+ dmu_objset_name(zr->zr_os, name);
+ cmn_err(CE_WARN, "ZFS replay transaction error %d, "
+ "dataset %s, seq 0x%llx, txtype %llu\n",
+ error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype);
+ zilog->zl_stop_replay = 1;
+ kmem_free(name, MAXNAMELEN);
+}
- /*
- * The DMU's dnode layer doesn't see removes until the txg commits,
- * so a subsequent claim can spuriously fail with EEXIST.
- * To prevent this, if we might have removed an object,
- * wait for the delete thread to delete it, and then
- * wait for the transaction group to sync.
- */
- if (txtype == TX_REMOVE || txtype == TX_RMDIR || txtype == TX_RENAME) {
- if (zr->zr_rm_sync != NULL)
- zr->zr_rm_sync(zr->zr_arg);
- txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
- }
+/* ARGSUSED */
+static void
+zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+ zilog->zl_replay_blks++;
}
/*
@@ -1445,7 +1506,9 @@
txg_wait_synced(zilog->zl_dmu_pool, 0);
zilog->zl_stop_replay = 0;
- (void) zil_parse(zilog, NULL, zil_replay_log_record, &zr,
+ zilog->zl_replay_time = lbolt;
+ ASSERT(zilog->zl_replay_blks == 0);
+ (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
zh->zh_claim_txg);
kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
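zl_replay_time and zl_replay_blks exist so replay progress can be quantified (6478388): zil_incr_blks() counts log blocks as zil_parse() walks the chain, and with lbolt ticking hz times per second, the elapsed time and replay rate fall out directly. The arithmetic, as an illustrative userland computation:

    #include <stdio.h>

    int
    main(void)
    {
        long hz = 100;              /* lbolt ticks per second */
        long replay_time = 12000;   /* lbolt when replay started */
        long now = 12750;           /* lbolt when replay finished */
        unsigned long long replay_blks = 5310;
        long ticks = now - replay_time;

        (void) printf("replayed %llu log blocks in %ld.%02ld s "
            "(%llu blocks/s)\n", replay_blks, ticks / hz,
            (ticks % hz) * 100 / hz,
            ticks ? replay_blks * hz / ticks : 0);
        return (0);
    }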
--- a/usr/src/uts/common/fs/zfs/zio.c Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/zio.c Sat Nov 04 07:59:19 2006 -0800
@@ -1147,7 +1147,7 @@
gsize = SPA_GANGBLOCKSIZE;
gbps_left = SPA_GBH_NBLKPTRS;
- error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL);
+ error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE);
if (error == ENOSPC)
panic("can't allocate gang block header");
ASSERT(error == 0);
@@ -1174,7 +1174,7 @@
while (resid <= maxalloc * gbps_left) {
error = metaslab_alloc(spa, maxalloc, gbp, ndvas,
- txg, bp);
+ txg, bp, B_FALSE);
if (error == 0)
break;
ASSERT3U(error, ==, ENOSPC);
@@ -1245,7 +1245,7 @@
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas,
- zio->io_txg, NULL);
+ zio->io_txg, NULL, B_FALSE);
if (error == 0) {
bp->blk_birth = zio->io_txg;
@@ -1653,25 +1653,27 @@
* Try to allocate an intent log block. Return 0 on success, errno on failure.
*/
int
-zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *bp, uint64_t txg)
+zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
+ uint64_t txg)
{
int error;
spa_config_enter(spa, RW_READER, FTAG);
- BP_ZERO(bp);
-
- error = metaslab_alloc(spa, size, bp, 1, txg, NULL);
+ /*
+ * The previous log block's DVA (if any) is passed in via old_bp.
+ */
+ error = metaslab_alloc(spa, size, new_bp, 1, txg, old_bp, B_TRUE);
if (error == 0) {
- BP_SET_LSIZE(bp, size);
- BP_SET_PSIZE(bp, size);
- BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_ZILOG);
- BP_SET_TYPE(bp, DMU_OT_INTENT_LOG);
- BP_SET_LEVEL(bp, 0);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
- bp->blk_birth = txg;
+ BP_SET_LSIZE(new_bp, size);
+ BP_SET_PSIZE(new_bp, size);
+ BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
+ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
+ BP_SET_LEVEL(new_bp, 0);
+ BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
+ new_bp->blk_birth = txg;
}
spa_config_exit(spa, FTAG);
--- a/usr/src/uts/common/fs/zfs/zvol.c Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/zvol.c Sat Nov 04 07:59:19 2006 -0800
@@ -92,6 +92,7 @@
typedef struct zvol_state {
char zv_name[MAXPATHLEN]; /* pool/dd name */
uint64_t zv_volsize; /* amount of space we advertise */
+ uint64_t zv_volblocksize; /* volume block size */
minor_t zv_minor; /* minor number */
uint8_t zv_min_bs; /* minimum addressable block shift */
uint8_t zv_readonly; /* hard readonly; like write-protect */
@@ -104,6 +105,13 @@
krwlock_t zv_dslock; /* dmu_sync() rwlock */
} zvol_state_t;
+/*
+ * zvol maximum transfer in one DMU tx.
+ */
+int zvol_maxphys = DMU_MAX_ACCESS/2;
+
+int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
+
static void
zvol_size_changed(zvol_state_t *zv, dev_t dev)
{
@@ -309,6 +317,7 @@
{
zvol_state_t *zv;
objset_t *os;
+ dmu_object_info_t doi;
uint64_t volsize;
minor_t minor = 0;
struct pathname linkpath;
@@ -428,7 +437,12 @@
zv->zv_volsize = volsize;
zv->zv_objset = os;
zv->zv_mode = ds_mode;
- zv->zv_zilog = zil_open(os, NULL);
+ zv->zv_zilog = zil_open(os, zvol_get_data);
+
+ /* get and cache the blocksize */
+ error = dmu_object_info(os, ZVOL_OBJ, &doi);
+ ASSERT(error == 0);
+ zv->zv_volblocksize = doi.doi_data_block_size;
rw_init(&zv->zv_dslock, NULL, RW_DEFAULT, NULL);
@@ -687,83 +701,111 @@
return (itx);
}
+void
+zvol_get_done(dmu_buf_t *db, void *vzgd)
+{
+ zgd_t *zgd = (zgd_t *)vzgd;
+
+ dmu_buf_rele(db, vzgd);
+ zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
+ kmem_free(zgd, sizeof (zgd_t));
+}
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+int
+zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+{
+ zvol_state_t *zv = arg;
+ objset_t *os = zv->zv_objset;
+ dmu_buf_t *db;
+ zgd_t *zgd;
+ int dlen = lr->lr_length; /* length of user data */
+ int error;
+
+ ASSERT(zio);
+ ASSERT(dlen != 0);
+ ASSERT(buf == NULL);
+
+ zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_zilog = zv->zv_zilog;
+ zgd->zgd_bp = &lr->lr_blkptr;
+
+ VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
+ /*
+ * We have to lock here so that no one can change the data
+ * while it is being written out and its checksum is being
+ * calculated.
+ */
+ rw_enter(&zv->zv_dslock, RW_READER);
+ error = dmu_sync(zio, db, &lr->lr_blkptr,
+ lr->lr_common.lrc_txg, zvol_get_done, zgd);
+ rw_exit(&zv->zv_dslock);
+ if (error == 0) {
+ zil_add_vdev(zv->zv_zilog,
+ DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
+ }
+ /*
+ * If we get EINPROGRESS, then we need to wait for a
+ * write IO initiated by dmu_sync() to complete before
+ * we can release this dbuf. We will finish everything
+ * up in the zvol_get_done() callback.
+ */
+ if (error == EINPROGRESS)
+ return (0);
+ dmu_buf_rele(db, zgd);
+ kmem_free(zgd, sizeof (zgd_t));
+ return (error);
+}
+
/*
* zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
*
* We store data in the log buffers if it's small enough.
- * Otherwise we flush the data out via dmu_sync().
+ * Otherwise we will later flush the data out via dmu_sync().
*/
-ssize_t zvol_immediate_write_sz = 65536;
+ssize_t zvol_immediate_write_sz = 32768;
-int
+void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len,
char *addr)
{
- dmu_object_info_t doi;
ssize_t nbytes;
itx_t *itx;
lr_write_t *lr;
- objset_t *os;
- dmu_buf_t *db;
- uint64_t txg;
+ zilog_t *zilog = zv->zv_zilog;
uint64_t boff;
- int error;
uint32_t blocksize;
/* handle common case */
if (len <= zvol_immediate_write_sz) {
itx = zvol_immediate_itx(off, len, addr);
- (void) zil_itx_assign(zv->zv_zilog, itx, tx);
- return (0);
+ (void) zil_itx_assign(zilog, itx, tx);
+ return;
}
- txg = dmu_tx_get_txg(tx);
- os = zv->zv_objset;
+ blocksize = zv->zv_volblocksize;
- /*
- * We need to dmu_sync() each block in the range.
- * For this we need the blocksize.
- */
- error = dmu_object_info(os, ZVOL_OBJ, &doi);
- if (error)
- return (error);
- blocksize = doi.doi_data_block_size;
-
- /*
- * We need to immediate write or dmu_sync() each block in the range.
- */
while (len) {
nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
if (nbytes <= zvol_immediate_write_sz) {
itx = zvol_immediate_itx(off, nbytes, addr);
} else {
- boff = P2ALIGN_TYPED(off, blocksize, uint64_t);
+ boff = P2ALIGN_TYPED(off, blocksize, uint64_t);
itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+ itx->itx_wr_state = WR_INDIRECT;
+ itx->itx_private = zv;
lr = (lr_write_t *)&itx->itx_lr;
lr->lr_foid = ZVOL_OBJ;
lr->lr_offset = off;
lr->lr_length = nbytes;
lr->lr_blkoff = off - boff;
BP_ZERO(&lr->lr_blkptr);
-
- /* XXX - we should do these IOs in parallel */
- VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, boff,
- FTAG, &db));
- ASSERT(boff == db->db_offset);
- error = dmu_sync(NULL, db, &lr->lr_blkptr,
- txg, NULL, NULL);
- dmu_buf_rele(db, FTAG);
- if (error) {
- kmem_free(itx, offsetof(itx_t, itx_lr));
- return (error);
- }
- itx->itx_wr_state = WR_COPIED;
}
- (void) zil_itx_assign(zv->zv_zilog, itx, tx);
+ (void) zil_itx_assign(zilog, itx, tx);
len -= nbytes;
off += nbytes;
}
- return (0);
}
int
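zvol_log_write() no longer calls dmu_sync() itself (6465634, 6470042): large writes are queued as WR_INDIRECT itxs, one per volume block, and the dmu_sync() calls are issued later, in parallel, from zvol_get_data() at commit time. The per-block split follows the P2PHASE() arithmetic above; an illustrative model:

    #include <stdio.h>

    int
    main(void)
    {
        long long blocksize = 8192;     /* stands in for zv_volblocksize */
        long long off = 5000, len = 30000, nbytes;

        while (len) {
            /* MIN(len, blocksize - P2PHASE(off, blocksize)) */
            nbytes = blocksize - (off % blocksize);
            if (nbytes > len)
                nbytes = len;
            (void) printf("itx: off %lld len %lld\n", off, nbytes);
            len -= nbytes;
            off += nbytes;
        }
        return (0);
    }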
@@ -777,7 +819,6 @@
int error = 0;
int sync;
int reading;
- int txg_sync_needed = B_FALSE;
if (zv == NULL) {
bioerror(bp, ENXIO);
@@ -822,7 +863,7 @@
while (resid != 0 && off < volsize) {
- size = MIN(resid, 1UL << 20); /* cap at 1MB per tx */
+ size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */
if (size > volsize - off) /* don't write past the end */
size = volsize - off;
@@ -837,13 +878,9 @@
dmu_tx_abort(tx);
} else {
dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
- if (sync) {
- /* use the ZIL to commit this write */
- if (zvol_log_write(zv, tx, off, size,
- addr) != 0) {
- txg_sync_needed = B_TRUE;
- }
- }
+ /* add a log write transaction */
+ if (sync)
+ zvol_log_write(zv, tx, off, size, addr);
dmu_tx_commit(tx);
}
}
@@ -860,42 +897,55 @@
biodone(bp);
- if (sync) {
- if (txg_sync_needed)
- txg_wait_synced(dmu_objset_pool(os), 0);
- else
- zil_commit(zv->zv_zilog, UINT64_MAX, 0);
- }
+ if (sync)
+ zil_commit(zv->zv_zilog, UINT64_MAX, 0);
return (0);
}
+/*
+ * Cap the buffer count at the zvol maximum transfer.
+ * Using our own routine instead of the default minphys() means that
+ * for larger writes we write bigger buffers on X86 (128K instead of
+ * 56K) and flush the disk write cache once per zvol_maxphys
+ * (currently 1MB) rather than once per minphys (currently 56K on X86
+ * and 128K on sparc).
+ */
+void
+zvol_minphys(struct buf *bp)
+{
+ if (bp->b_bcount > zvol_maxphys)
+ bp->b_bcount = zvol_maxphys;
+}
+
/*ARGSUSED*/
int
zvol_read(dev_t dev, uio_t *uiop, cred_t *cr)
{
- return (physio(zvol_strategy, NULL, dev, B_READ, minphys, uiop));
+ return (physio(zvol_strategy, NULL, dev, B_READ, zvol_minphys, uiop));
}
/*ARGSUSED*/
int
zvol_write(dev_t dev, uio_t *uiop, cred_t *cr)
{
- return (physio(zvol_strategy, NULL, dev, B_WRITE, minphys, uiop));
+ return (physio(zvol_strategy, NULL, dev, B_WRITE, zvol_minphys, uiop));
}
/*ARGSUSED*/
int
zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr)
{
- return (aphysio(zvol_strategy, anocancel, dev, B_READ, minphys, aio));
+ return (aphysio(zvol_strategy, anocancel, dev, B_READ, zvol_minphys,
+ aio));
}
/*ARGSUSED*/
int
zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr)
{
- return (aphysio(zvol_strategy, anocancel, dev, B_WRITE, minphys, aio));
+ return (aphysio(zvol_strategy, anocancel, dev, B_WRITE, zvol_minphys,
+ aio));
}
/*