6773224 zvol_log_write() is inefficient
authorNeil Perrin <Neil.Perrin@Sun.COM>
Mon, 20 Apr 2009 16:11:21 -0600
changeset 9401 afae664f76f6
parent 9400 454c9c7026bd
child 9402 ffac5760d12b
6773224 zvol_log_write() is inefficient 6797713 zfs_log_write() and zvol_log_write() could share logic 6825801 itx_sync picks up garbage for zvols
usr/src/uts/common/fs/zfs/sys/zil.h
usr/src/uts/common/fs/zfs/sys/zil_impl.h
usr/src/uts/common/fs/zfs/zfs_log.c
usr/src/uts/common/fs/zfs/zvol.c
--- a/usr/src/uts/common/fs/zfs/sys/zil.h	Mon Apr 20 16:00:39 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h	Mon Apr 20 16:11:21 2009 -0600
@@ -305,7 +305,27 @@
  */
 
 /*
- * ZFS intent log transaction structure
+ * Writes are handled in three different ways:
+ *
+ * WR_INDIRECT:
+ *    In this mode, if we need to commit the write later, then the block
+ *    is immediately written into the file system (using dmu_sync),
+ *    and a pointer to the block is put into the log record.
+ *    When the txg commits the block is linked in.
+ *    This saves additionally writing the data into the log record.
+ *    There are a few requirements for this to occur:
+ *	- write is greater than zfs/zvol_immediate_write_sz
+ *	- not using slogs (as slogs are assumed to always be faster
+ *	  than writing into the main pool)
+ *	- the write occupies only one block
+ * WR_COPIED:
+ *    If we know we'll immediately be committing the
+ *    transaction (FSYNC or FDSYNC), the we allocate a larger
+ *    log record here for the data and copy the data in.
+ * WR_NEED_COPY:
+ *    Otherwise we don't allocate a buffer, and *if* we need to
+ *    flush the write later then a buffer is allocated and
+ *    we retrieve the data using the dmu.
  */
 typedef enum {
 	WR_INDIRECT,	/* indirect - a large write (dmu_sync() data */
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h	Mon Apr 20 16:00:39 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h	Mon Apr 20 16:11:21 2009 -0600
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -101,6 +101,9 @@
 	avl_node_t	zn_node;
 } zil_dva_node_t;
 
+#define	ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
+    sizeof (lr_write_t))
+
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/common/fs/zfs/zfs_log.c	Mon Apr 20 16:00:39 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_log.c	Mon Apr 20 16:11:21 2009 -0600
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -467,9 +467,6 @@
  */
 ssize_t zfs_immediate_write_sz = 32768;
 
-#define	ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
-    sizeof (lr_write_t))
-
 void
 zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 	znode_t *zp, offset_t off, ssize_t resid, int ioflag)
@@ -483,29 +480,6 @@
 
 	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
 
-	/*
-	 * Writes are handled in three different ways:
-	 *
-	 * WR_INDIRECT:
-	 *    In this mode, if we need to commit the write later, then the block
-	 *    is immediately written into the file system (using dmu_sync),
-	 *    and a pointer to the block is put into the log record.
-	 *    When the txg commits the block is linked in.
-	 *    This saves additionally writing the data into the log record.
-	 *    There are a few requirements for this to occur:
-	 *	- write is greater than zfs_immediate_write_sz
-	 *	- not using slogs (as slogs are assumed to always be faster
-	 *	  than writing into the main pool)
-	 *	- the write occupies only one block
-	 * WR_COPIED:
-	 *    If we know we'll immediately be committing the
-	 *    transaction (FSYNC or FDSYNC), the we allocate a larger
-	 *    log record here for the data and copy the data in.
-	 * WR_NEED_COPY:
-	 *    Otherwise we don't allocate a buffer, and *if* we need to
-	 *    flush the write later then a buffer is allocated and
-	 *    we retrieve the data using the dmu.
-	 */
 	slogging = spa_has_slogs(zilog->zl_spa);
 	if (resid > zfs_immediate_write_sz && !slogging && resid <= zp->z_blksz)
 		write_state = WR_INDIRECT;
--- a/usr/src/uts/common/fs/zfs/zvol.c	Mon Apr 20 16:00:39 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zvol.c	Mon Apr 20 16:11:21 2009 -0600
@@ -969,11 +969,15 @@
 ssize_t zvol_immediate_write_sz = 32768;
 
 static void
-zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
+zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
+    boolean_t sync)
 {
 	uint32_t blocksize = zv->zv_volblocksize;
 	zilog_t *zilog = zv->zv_zilog;
-	lr_write_t *lr;
+	boolean_t slogging;
+
+	if (zil_disable)
+		return;
 
 	if (zilog->zl_replay) {
 		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
@@ -982,23 +986,58 @@
 		return;
 	}
 
-	while (len) {
-		ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
-		itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+	slogging = spa_has_slogs(zilog->zl_spa);
+
+	while (resid) {
+		itx_t *itx;
+		lr_write_t *lr;
+		ssize_t len;
+		itx_wr_state_t write_state;
 
-		itx->itx_wr_state =
-		    len > zvol_immediate_write_sz ?  WR_INDIRECT : WR_NEED_COPY;
-		itx->itx_private = zv;
+		/*
+		 * Unlike zfs_log_write() we can be called with
+		 * upto DMU_MAX_ACCESS/2 (5MB) writes.
+		 */
+		if (blocksize > zvol_immediate_write_sz && !slogging &&
+		    resid >= blocksize && off % blocksize == 0) {
+			write_state = WR_INDIRECT; /* uses dmu_sync */
+			len = blocksize;
+		} else if (sync) {
+			write_state = WR_COPIED;
+			len = MIN(ZIL_MAX_LOG_DATA, resid);
+		} else {
+			write_state = WR_NEED_COPY;
+			len = MIN(ZIL_MAX_LOG_DATA, resid);
+		}
+
+		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
+		    (write_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
+		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
+		    ZVOL_OBJ, off, len, lr + 1) != 0) {
+			kmem_free(itx, offsetof(itx_t, itx_lr) +
+			    itx->itx_lr.lrc_reclen);
+			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+			lr = (lr_write_t *)&itx->itx_lr;
+			write_state = WR_NEED_COPY;
+		}
+
+		itx->itx_wr_state = write_state;
+		if (write_state == WR_NEED_COPY)
+			itx->itx_sod += len;
 		lr->lr_foid = ZVOL_OBJ;
 		lr->lr_offset = off;
-		lr->lr_length = nbytes;
+		lr->lr_length = len;
 		lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
 		BP_ZERO(&lr->lr_blkptr);
 
+		itx->itx_private = zv;
+		itx->itx_sync = sync;
+
 		(void) zil_itx_assign(zilog, itx, tx);
-		len -= nbytes;
-		off += nbytes;
+
+		off += len;
+		resid -= len;
 	}
 }
 
@@ -1087,6 +1126,7 @@
 	int error = 0;
 	boolean_t doread = bp->b_flags & B_READ;
 	boolean_t is_dump = zv->zv_flags & ZVOL_DUMPIFIED;
+	boolean_t sync;
 
 	if (zv == NULL) {
 		bioerror(bp, ENXIO);
@@ -1124,6 +1164,9 @@
 		return (0);
 	}
 
+	sync = !(bp->b_flags & B_ASYNC) && !doread && !is_dump &&
+	    !(zv->zv_flags & ZVOL_WCE) && !zil_disable;
+
 	/*
 	 * There must be no buffer changes when doing a dmu_sync() because
 	 * we can't change the data whilst calculating the checksum.
@@ -1147,7 +1190,7 @@
 				dmu_tx_abort(tx);
 			} else {
 				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
-				zvol_log_write(zv, tx, off, size);
+				zvol_log_write(zv, tx, off, size, sync);
 				dmu_tx_commit(tx);
 			}
 		}
@@ -1166,8 +1209,7 @@
 	if ((bp->b_resid = resid) == bp->b_bcount)
 		bioerror(bp, off > volsize ? EINVAL : error);
 
-	if (!(bp->b_flags & B_ASYNC) && !doread && !zil_disable &&
-	    !is_dump && !(zv->zv_flags & ZVOL_WCE))
+	if (sync)
 		zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
 	biodone(bp);
 
@@ -1282,6 +1324,7 @@
 	uint64_t volsize;
 	rl_t *rl;
 	int error = 0;
+	boolean_t sync;
 
 	if (minor == 0)			/* This is the control device */
 		return (ENXIO);
@@ -1301,6 +1344,8 @@
 		return (error);
 	}
 
+	sync = !(zv->zv_flags & ZVOL_WCE) && !zil_disable;
+
 	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
 	    RL_WRITER);
 	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
@@ -1319,14 +1364,14 @@
 		}
 		error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, tx);
 		if (error == 0)
-			zvol_log_write(zv, tx, off, bytes);
+			zvol_log_write(zv, tx, off, bytes, sync);
 		dmu_tx_commit(tx);
 
 		if (error)
 			break;
 	}
 	zfs_range_unlock(rl);
-	if (!zil_disable && !(zv->zv_flags & ZVOL_WCE))
+	if (sync)
 		zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
 	return (error);
 }