6959659 Extending end of file *may* get lost on replay of writes.
--- a/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h Tue Jul 06 11:05:17 2010 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h Tue Jul 06 14:50:56 2010 -0600
@@ -79,6 +79,7 @@
kmutex_t z_lock;
uint64_t z_userquota_obj;
uint64_t z_groupquota_obj;
+ uint64_t z_replay_eof; /* New end of file - replay only */
sa_attr_type_t *z_attr_table; /* SA attr mapping->id */
#define ZFS_OBJ_MTX_SZ 64
kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
--- a/usr/src/uts/common/fs/zfs/zfs_replay.c Tue Jul 06 11:05:17 2010 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_replay.c Tue Jul 06 14:50:56 2010 -0600
@@ -624,7 +624,7 @@
znode_t *zp;
int error;
ssize_t resid;
- uint64_t orig_eof, eod, offset, length;
+ uint64_t eod, offset, length;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@@ -642,9 +642,20 @@
offset = lr->lr_offset;
length = lr->lr_length;
- eod = offset + length; /* end of data for this write */
+ eod = offset + length; /* end of data for this write */
- orig_eof = zp->z_size;
+ /*
+ * This may be a write from a dmu_sync() for a whole block,
+ * and may extend beyond the current end of the file.
+ * We can't just replay what was written for this TX_WRITE as
+ * a future TX_WRITE2 may extend the eof and the data for that
+ * write needs to be there. So we write the whole block and
+ * reduce the eof. This needs to be done within the single dmu
+ * transaction created within vn_rdwr -> zfs_write. So a possible
+ * new end of file is passed through in zfsvfs->z_replay_eof
+ */
+
+ zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */
/* If it's a dmu_sync() block, write the whole block */
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
@@ -653,23 +664,15 @@
offset -= offset % blocksize;
length = blocksize;
}
+ if (zp->z_size < eod)
+ zfsvfs->z_replay_eof = eod;
}
error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset,
UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
- /*
- * This may be a write from a dmu_sync() for a whole block,
- * and may extend beyond the current end of the file.
- * We can't just replay what was written for this TX_WRITE as
- * a future TX_WRITE2 may extend the eof and the data for that
- * write needs to be there. So we write the whole block and
- * reduce the eof.
- */
- if (orig_eof < zp->z_size) /* file length grew ? */
- zp->z_size = eod;
-
VN_RELE(ZTOV(zp));
+ zfsvfs->z_replay_eof = 0; /* safety */
return (error);
}
@@ -693,9 +696,32 @@
if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
+top:
end = lr->lr_offset + lr->lr_length;
- if (end > zp->z_size)
+ if (end > zp->z_size) {
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
zp->z_size = end;
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ VN_RELE(ZTOV(zp));
+ if (error == ERESTART) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ (void *)&zp->z_size, sizeof (uint64_t), tx);
+
+ /* Ensure the replayed seq is updated */
+ (void) zil_replaying(zfsvfs->z_log, tx);
+
+ dmu_tx_commit(tx);
+ }
VN_RELE(ZTOV(zp));
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c Tue Jul 06 11:05:17 2010 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c Tue Jul 06 14:50:56 2010 -0600
@@ -893,6 +893,14 @@
uio->uio_loffset);
ASSERT(error == 0);
}
+ /*
+ * If we are replaying and eof is non zero then force
+ * the file size to the specified eof. Note, there's no
+ * concurrency during replay.
+ */
+ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
+ zp->z_size = zfsvfs->z_replay_eof;
+
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);