6959659 Extending end of file *may* get lost on replay of writes.
authorNeil Perrin <Neil.Perrin@Sun.COM>
Tue, 06 Jul 2010 14:50:56 -0600
changeset 12772 71326a76a341
parent 12771 9e4f2f9983b9
child 12773 f0d486d64936
6959659 Extending end of file *may* get lost on replay of writes.
usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h
usr/src/uts/common/fs/zfs/zfs_replay.c
usr/src/uts/common/fs/zfs/zfs_vnops.c
--- a/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h	Tue Jul 06 11:05:17 2010 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h	Tue Jul 06 14:50:56 2010 -0600
@@ -79,6 +79,7 @@
 	kmutex_t	z_lock;
 	uint64_t	z_userquota_obj;
 	uint64_t	z_groupquota_obj;
+	uint64_t	z_replay_eof;	/* New end of file - replay only */
 	sa_attr_type_t	*z_attr_table;	/* SA attr mapping->id */
 #define	ZFS_OBJ_MTX_SZ	64
 	kmutex_t	z_hold_mtx[ZFS_OBJ_MTX_SZ];	/* znode hold locks */
--- a/usr/src/uts/common/fs/zfs/zfs_replay.c	Tue Jul 06 11:05:17 2010 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_replay.c	Tue Jul 06 14:50:56 2010 -0600
@@ -624,7 +624,7 @@
 	znode_t	*zp;
 	int error;
 	ssize_t resid;
-	uint64_t orig_eof, eod, offset, length;
+	uint64_t eod, offset, length;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
@@ -642,9 +642,20 @@
 
 	offset = lr->lr_offset;
 	length = lr->lr_length;
-	eod = offset + length;		/* end of data for this write */
+	eod = offset + length;	/* end of data for this write */
 
-	orig_eof = zp->z_size;
+	/*
+	 * This may be a write from a dmu_sync() for a whole block,
+	 * and may extend beyond the current end of the file.
+	 * We can't just replay what was written for this TX_WRITE as
+	 * a future TX_WRITE2 may extend the eof and the data for that
+	 * write needs to be there. So we write the whole block and
+	 * reduce the eof. This needs to be done within the single dmu
+	 * transaction created within vn_rdwr -> zfs_write. So a possible
+	 * new end of file is passed through in zfsvfs->z_replay_eof
+	 */
+
+	zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */
 
 	/* If it's a dmu_sync() block, write the whole block */
 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
@@ -653,23 +664,15 @@
 			offset -= offset % blocksize;
 			length = blocksize;
 		}
+		if (zp->z_size < eod)
+			zfsvfs->z_replay_eof = eod;
 	}
 
 	error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset,
 	    UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
 
-	/*
-	 * This may be a write from a dmu_sync() for a whole block,
-	 * and may extend beyond the current end of the file.
-	 * We can't just replay what was written for this TX_WRITE as
-	 * a future TX_WRITE2 may extend the eof and the data for that
-	 * write needs to be there. So we write the whole block and
-	 * reduce the eof.
-	 */
-	if (orig_eof < zp->z_size) /* file length grew ? */
-		zp->z_size = eod;
-
 	VN_RELE(ZTOV(zp));
+	zfsvfs->z_replay_eof = 0;	/* safety */
 
 	return (error);
 }
@@ -693,9 +696,32 @@
 	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 		return (error);
 
+top:
 	end = lr->lr_offset + lr->lr_length;
-	if (end > zp->z_size)
+	if (end > zp->z_size) {
+		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
 		zp->z_size = end;
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			VN_RELE(ZTOV(zp));
+			if (error == ERESTART) {
+				dmu_tx_wait(tx);
+				dmu_tx_abort(tx);
+				goto top;
+			}
+			dmu_tx_abort(tx);
+			return (error);
+		}
+		(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+		    (void *)&zp->z_size, sizeof (uint64_t), tx);
+
+		/* Ensure the replayed seq is updated */
+		(void) zil_replaying(zfsvfs->z_log, tx);
+
+		dmu_tx_commit(tx);
+	}
 
 	VN_RELE(ZTOV(zp));
 
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c	Tue Jul 06 11:05:17 2010 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c	Tue Jul 06 14:50:56 2010 -0600
@@ -893,6 +893,14 @@
 			    uio->uio_loffset);
 			ASSERT(error == 0);
 		}
+		/*
+		 * If we are replaying and eof is non zero then force
+		 * the file size to the specified eof. Note, there's no
+		 * concurrency during replay.
+		 */
+		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
+			zp->z_size = zfsvfs->z_replay_eof;
+
 		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 
 		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);