6773224 zvol_log_write() is inefficient
6797713 zfs_log_write() and zvol_log_write() could share logic
6825801 itx_sync picks up garbage for zvols
--- a/usr/src/uts/common/fs/zfs/sys/zil.h Mon Apr 20 16:00:39 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h Mon Apr 20 16:11:21 2009 -0600
@@ -305,7 +305,27 @@
*/
/*
- * ZFS intent log transaction structure
+ * Writes are handled in three different ways:
+ *
+ * WR_INDIRECT:
+ * In this mode, if we need to commit the write later, then the block
+ * is immediately written into the file system (using dmu_sync),
+ * and a pointer to the block is put into the log record.
+ * When the txg commits the block is linked in.
+ * This saves additionally writing the data into the log record.
+ * There are a few requirements for this to occur:
+ * - write is greater than zfs/zvol_immediate_write_sz
+ * - not using slogs (as slogs are assumed to always be faster
+ * than writing into the main pool)
+ * - the write occupies only one block
+ * WR_COPIED:
+ * If we know we'll immediately be committing the
+ * transaction (FSYNC or FDSYNC), then we allocate a larger
+ * log record here for the data and copy the data in.
+ * WR_NEED_COPY:
+ * Otherwise we don't allocate a buffer, and *if* we need to
+ * flush the write later then a buffer is allocated and
+ * we retrieve the data using the dmu.
*/
typedef enum {
WR_INDIRECT, /* indirect - a large write (dmu_sync() data */
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h Mon Apr 20 16:00:39 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h Mon Apr 20 16:11:21 2009 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -101,6 +101,9 @@
avl_node_t zn_node;
} zil_dva_node_t;
+#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
+ sizeof (lr_write_t))
+
#ifdef __cplusplus
}
#endif
--- a/usr/src/uts/common/fs/zfs/zfs_log.c Mon Apr 20 16:00:39 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_log.c Mon Apr 20 16:11:21 2009 -0600
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -467,9 +467,6 @@
*/
ssize_t zfs_immediate_write_sz = 32768;
-#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
- sizeof (lr_write_t))
-
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, offset_t off, ssize_t resid, int ioflag)
@@ -483,29 +480,6 @@
ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
- /*
- * Writes are handled in three different ways:
- *
- * WR_INDIRECT:
- * In this mode, if we need to commit the write later, then the block
- * is immediately written into the file system (using dmu_sync),
- * and a pointer to the block is put into the log record.
- * When the txg commits the block is linked in.
- * This saves additionally writing the data into the log record.
- * There are a few requirements for this to occur:
- * - write is greater than zfs_immediate_write_sz
- * - not using slogs (as slogs are assumed to always be faster
- * than writing into the main pool)
- * - the write occupies only one block
- * WR_COPIED:
- * If we know we'll immediately be committing the
- * transaction (FSYNC or FDSYNC), the we allocate a larger
- * log record here for the data and copy the data in.
- * WR_NEED_COPY:
- * Otherwise we don't allocate a buffer, and *if* we need to
- * flush the write later then a buffer is allocated and
- * we retrieve the data using the dmu.
- */
slogging = spa_has_slogs(zilog->zl_spa);
if (resid > zfs_immediate_write_sz && !slogging && resid <= zp->z_blksz)
write_state = WR_INDIRECT;
--- a/usr/src/uts/common/fs/zfs/zvol.c Mon Apr 20 16:00:39 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zvol.c Mon Apr 20 16:11:21 2009 -0600
@@ -969,11 +969,15 @@
ssize_t zvol_immediate_write_sz = 32768;
static void
-zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
+zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
+ boolean_t sync)
{
uint32_t blocksize = zv->zv_volblocksize;
zilog_t *zilog = zv->zv_zilog;
- lr_write_t *lr;
+ boolean_t slogging;
+
+ if (zil_disable)
+ return;
if (zilog->zl_replay) {
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
@@ -982,23 +986,58 @@
return;
}
- while (len) {
- ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
- itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+ slogging = spa_has_slogs(zilog->zl_spa);
+
+ while (resid) {
+ itx_t *itx;
+ lr_write_t *lr;
+ ssize_t len;
+ itx_wr_state_t write_state;
- itx->itx_wr_state =
- len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY;
- itx->itx_private = zv;
+ /*
+ * Unlike zfs_log_write() we can be called with
+ * up to DMU_MAX_ACCESS/2 (5MB) writes.
+ */
+ if (blocksize > zvol_immediate_write_sz && !slogging &&
+ resid >= blocksize && off % blocksize == 0) {
+ write_state = WR_INDIRECT; /* uses dmu_sync */
+ len = blocksize;
+ } else if (sync) {
+ write_state = WR_COPIED;
+ len = MIN(ZIL_MAX_LOG_DATA, resid);
+ } else {
+ write_state = WR_NEED_COPY;
+ len = MIN(ZIL_MAX_LOG_DATA, resid);
+ }
+
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
+ (write_state == WR_COPIED ? len : 0));
lr = (lr_write_t *)&itx->itx_lr;
+ if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
+ ZVOL_OBJ, off, len, lr + 1) != 0) {
+ kmem_free(itx, offsetof(itx_t, itx_lr) +
+ itx->itx_lr.lrc_reclen);
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+ lr = (lr_write_t *)&itx->itx_lr;
+ write_state = WR_NEED_COPY;
+ }
+
+ itx->itx_wr_state = write_state;
+ if (write_state == WR_NEED_COPY)
+ itx->itx_sod += len;
lr->lr_foid = ZVOL_OBJ;
lr->lr_offset = off;
- lr->lr_length = nbytes;
+ lr->lr_length = len;
lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
BP_ZERO(&lr->lr_blkptr);
+ itx->itx_private = zv;
+ itx->itx_sync = sync;
+
(void) zil_itx_assign(zilog, itx, tx);
- len -= nbytes;
- off += nbytes;
+
+ off += len;
+ resid -= len;
}
}
@@ -1087,6 +1126,7 @@
int error = 0;
boolean_t doread = bp->b_flags & B_READ;
boolean_t is_dump = zv->zv_flags & ZVOL_DUMPIFIED;
+ boolean_t sync;
if (zv == NULL) {
bioerror(bp, ENXIO);
@@ -1124,6 +1164,9 @@
return (0);
}
+ sync = !(bp->b_flags & B_ASYNC) && !doread && !is_dump &&
+ !(zv->zv_flags & ZVOL_WCE) && !zil_disable;
+
/*
* There must be no buffer changes when doing a dmu_sync() because
* we can't change the data whilst calculating the checksum.
@@ -1147,7 +1190,7 @@
dmu_tx_abort(tx);
} else {
dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
- zvol_log_write(zv, tx, off, size);
+ zvol_log_write(zv, tx, off, size, sync);
dmu_tx_commit(tx);
}
}
@@ -1166,8 +1209,7 @@
if ((bp->b_resid = resid) == bp->b_bcount)
bioerror(bp, off > volsize ? EINVAL : error);
- if (!(bp->b_flags & B_ASYNC) && !doread && !zil_disable &&
- !is_dump && !(zv->zv_flags & ZVOL_WCE))
+ if (sync)
zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
biodone(bp);
@@ -1282,6 +1324,7 @@
uint64_t volsize;
rl_t *rl;
int error = 0;
+ boolean_t sync;
if (minor == 0) /* This is the control device */
return (ENXIO);
@@ -1301,6 +1344,8 @@
return (error);
}
+ sync = !(zv->zv_flags & ZVOL_WCE) && !zil_disable;
+
rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
RL_WRITER);
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
@@ -1319,14 +1364,14 @@
}
error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, tx);
if (error == 0)
- zvol_log_write(zv, tx, off, bytes);
+ zvol_log_write(zv, tx, off, bytes, sync);
dmu_tx_commit(tx);
if (error)
break;
}
zfs_range_unlock(rl);
- if (!zil_disable && !(zv->zv_flags & ZVOL_WCE))
+ if (sync)
zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
return (error);
}