usr/src/uts/common/fs/zfs/zil.c
changeset 8227 f7d7be9b1f56
parent 8109 6147a1bdd359
child 8746 e1d96ca6808c
--- a/usr/src/uts/common/fs/zfs/zil.c	Tue Nov 25 09:39:25 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/zil.c	Tue Nov 25 13:18:25 2008 -0700
@@ -1220,7 +1220,7 @@
 
 	ASSERT(zilog->zl_stop_sync == 0);
 
-	zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
+	zh->zh_replay_seq = zilog->zl_replayed_seq[txg & TXG_MASK];
 
 	if (zilog->zl_destroy_txg == txg) {
 		blkptr_t blk = zh->zh_log;
@@ -1229,7 +1229,7 @@
 		ASSERT(spa_sync_pass(spa) == 1);
 
 		bzero(zh, sizeof (zil_header_t));
-		bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
+		bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
 
 		if (zilog->zl_keep_first) {
 			/*
@@ -1466,9 +1466,7 @@
 typedef struct zil_replay_arg {
 	objset_t	*zr_os;
 	zil_replay_func_t **zr_replay;
-	zil_replay_cleaner_t *zr_replay_cleaner;
 	void		*zr_arg;
-	uint64_t	*zr_txgp;
 	boolean_t	zr_byteswap;
 	char		*zr_lrbuf;
 } zil_replay_arg_t;
@@ -1481,9 +1479,9 @@
 	uint64_t reclen = lr->lrc_reclen;
 	uint64_t txtype = lr->lrc_txtype;
 	char *name;
-	int pass, error, sunk;
+	int pass, error;
 
-	if (zilog->zl_stop_replay)
+	if (!zilog->zl_replay)			/* giving up */
 		return;
 
 	if (lr->lrc_txg < claim_txg)		/* already committed */
@@ -1495,6 +1493,11 @@
 	/* Strip case-insensitive bit, still present in log record */
 	txtype &= ~TX_CI;
 
+	if (txtype == 0 || txtype >= TX_MAX_TYPE) {
+		error = EINVAL;
+		goto bad;
+	}
+
 	/*
 	 * Make a copy of the data so we can revise and extend it.
 	 */
@@ -1545,69 +1548,16 @@
 	}
 
 	/*
-	 * Replay of large truncates can end up needing additional txs
-	 * and a different txg. If they are nested within the replay tx
-	 * as below then a hang is possible. So we do the truncate here
-	 * and redo the truncate later (a no-op) and update the sequence
-	 * number whilst in the replay tx. Fortunately, it's safe to repeat
-	 * a truncate if we crash and the truncate commits. A create over
-	 * an existing file will also come in as a TX_TRUNCATE record.
-	 *
-	 * Note, remove of large files and renames over large files is
-	 * handled by putting the deleted object on a stable list
-	 * and if necessary force deleting the object outside of the replay
-	 * transaction using the zr_replay_cleaner.
-	 */
-	if (txtype == TX_TRUNCATE) {
-		*zr->zr_txgp = TXG_NOWAIT;
-		error = zr->zr_replay[TX_TRUNCATE](zr->zr_arg, zr->zr_lrbuf,
-		    zr->zr_byteswap);
-		if (error)
-			goto bad;
-		zr->zr_byteswap = 0; /* only byteswap once */
-	}
-
-	/*
 	 * We must now do two things atomically: replay this log record,
-	 * and update the log header to reflect the fact that we did so.
-	 * We use the DMU's ability to assign into a specific txg to do this.
+	 * and update the log header sequence number to reflect the fact that
+	 * we did so. At the end of each replay function the sequence number
+	 * is updated if we are in replay mode.
 	 */
-	for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) {
-		uint64_t replay_txg;
-		dmu_tx_t *replay_tx;
-
-		replay_tx = dmu_tx_create(zr->zr_os);
-		error = dmu_tx_assign(replay_tx, TXG_WAIT);
-		if (error) {
-			dmu_tx_abort(replay_tx);
-			break;
-		}
-
-		replay_txg = dmu_tx_get_txg(replay_tx);
-
-		if (txtype == 0 || txtype >= TX_MAX_TYPE) {
-			error = EINVAL;
-		} else {
-			/*
-			 * On the first pass, arrange for the replay vector
-			 * to fail its dmu_tx_assign().  That's the only way
-			 * to ensure that those code paths remain well tested.
-			 *
-			 * Only byteswap (if needed) on the 1st pass.
-			 */
-			*zr->zr_txgp = replay_txg - (pass == 1);
-			error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
-			    zr->zr_byteswap && pass == 1);
-			*zr->zr_txgp = TXG_NOWAIT;
-		}
-
-		if (error == 0) {
-			dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx);
-			zilog->zl_replay_seq[replay_txg & TXG_MASK] =
-			    lr->lrc_seq;
-		}
-
-		dmu_tx_commit(replay_tx);
+	for (pass = 1; pass <= 2; pass++) {
+		zilog->zl_replaying_seq = lr->lrc_seq;
+		/* Only byteswap (if needed) on the 1st pass. */
+		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
+		    zr->zr_byteswap && pass == 1);
 
 		if (!error)
 			return;
@@ -1615,37 +1565,22 @@
 		/*
 		 * The DMU's dnode layer doesn't see removes until the txg
 		 * commits, so a subsequent claim can spuriously fail with
-		 * EEXIST. So if we receive any error other than ERESTART
-		 * we try syncing out any removes then retrying the
-		 * transaction.
+		 * EEXIST. So if the first pass fails with any error we sync
+		 * out any removes and then retry the record exactly once.
 		 */
-		if (error != ERESTART && !sunk) {
-			if (zr->zr_replay_cleaner)
-				zr->zr_replay_cleaner(zr->zr_arg);
+		if (pass == 1)
 			txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
-			sunk = B_TRUE;
-			continue; /* retry */
-		}
-
-		if (error != ERESTART)
-			break;
-
-		if (pass != 1)
-			txg_wait_open(spa_get_dsl(zilog->zl_spa),
-			    replay_txg + 1);
-
-		dprintf("pass %d, retrying\n", pass);
 	}
 
 bad:
-	ASSERT(error && error != ERESTART);
+	ASSERT(error);
 	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
 	dmu_objset_name(zr->zr_os, name);
 	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
 	    "dataset %s, seq 0x%llx, txtype %llu %s\n",
 	    error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype,
 	    (lr->lrc_txtype & TX_CI) ? "CI" : "");
-	zilog->zl_stop_replay = 1;
+	zilog->zl_replay = B_FALSE;
 	kmem_free(name, MAXNAMELEN);
 }
 
@@ -1660,9 +1595,7 @@
  * If this dataset has a non-empty intent log, replay it and destroy it.
  */
 void
-zil_replay(objset_t *os, void *arg, uint64_t *txgp,
-	zil_replay_func_t *replay_func[TX_MAX_TYPE],
-	zil_replay_cleaner_t *replay_cleaner)
+zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
 {
 	zilog_t *zilog = dmu_objset_zil(os);
 	const zil_header_t *zh = zilog->zl_header;
@@ -1675,9 +1608,7 @@
 
 	zr.zr_os = os;
 	zr.zr_replay = replay_func;
-	zr.zr_replay_cleaner = replay_cleaner;
 	zr.zr_arg = arg;
-	zr.zr_txgp = txgp;
 	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
 	zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
 
@@ -1686,7 +1617,7 @@
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
-	zilog->zl_stop_replay = 0;
+	zilog->zl_replay = B_TRUE;
 	zilog->zl_replay_time = lbolt;
 	ASSERT(zilog->zl_replay_blks == 0);
 	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
@@ -1695,6 +1626,7 @@
 
 	zil_destroy(zilog, B_FALSE);
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+	zilog->zl_replay = B_FALSE;
 }
 
 /*