usr/src/uts/common/fs/zfs/zil.c
changeset 1807 35c8b566d7af
parent 1669 3521dbbcb2e8
child 1842 1712a484fc9d
comparison: 1806:7f3c457c93fd -> 1807:35c8b566d7af
@@ -125,124 +125,154 @@
 	avl_insert(t, zn, where);
 
 	return (0);
 }
 
+static zil_header_t *
+zil_header_in_syncing_context(zilog_t *zilog)
+{
+	return ((zil_header_t *)zilog->zl_header);
+}
+
+static void
+zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
+{
+	zio_cksum_t *zc = &bp->blk_cksum;
+
+	zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
+	zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
+	zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
+	zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
+}
+
 /*
  * Read a log block, make sure it's valid, and byteswap it if necessary.
  */
 static int
-zil_read_log_block(zilog_t *zilog, blkptr_t *bp, char *buf)
+zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
 {
-	uint64_t blksz = BP_GET_LSIZE(bp);
-	zil_trailer_t *ztp = (zil_trailer_t *)(buf + blksz) - 1;
-	zio_cksum_t cksum;
+	blkptr_t blk = *bp;
 	zbookmark_t zb;
 	int error;
 
-	zb.zb_objset = bp->blk_cksum.zc_word[2];
+	zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
 	zb.zb_object = 0;
 	zb.zb_level = -1;
-	zb.zb_blkid = bp->blk_cksum.zc_word[3];
+	zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
 
-	error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, blksz,
-	    NULL, NULL, ZIO_PRIORITY_SYNC_READ,
-	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
-	if (error) {
-		dprintf_bp(bp, "zilog %p bp %p read failed, error %d: ",
-		    zilog, bp, error);
-		return (error);
-	}
-
-	if (BP_SHOULD_BYTESWAP(bp))
-		byteswap_uint64_array(buf, blksz);
+	*abufpp = NULL;
+
+	error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array,
+	    arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
+	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, ARC_WAIT, &zb);
+
+	if (error == 0) {
+		char *data = (*abufpp)->b_data;
+		uint64_t blksz = BP_GET_LSIZE(bp);
+		zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1;
+		zio_cksum_t cksum = bp->blk_cksum;
 
-	/*
-	 * Sequence numbers should be... sequential.  The checksum verifier for
-	 * the next block should be: <logid[0], logid[1], objset id, seq + 1>.
-	 */
-	cksum = bp->blk_cksum;
-	cksum.zc_word[3]++;
-	if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)) != 0) {
-		dprintf_bp(bp, "zilog %p bp %p stale pointer: ", zilog, bp);
-		return (ESTALE);
-	}
-
-	if (BP_IS_HOLE(&ztp->zit_next_blk)) {
-		dprintf_bp(bp, "zilog %p bp %p hole: ", zilog, bp);
-		return (ENOENT);
-	}
-
-	if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t))) {
-		dprintf("zilog %p bp %p nused exceeds blksz\n", zilog, bp);
-		return (EOVERFLOW);
-	}
+		/*
+		 * Sequence numbers should be... sequential.  The checksum
+		 * verifier for the next block should be bp's checksum plus 1.
+		 */
+		cksum.zc_word[ZIL_ZC_SEQ]++;
+
+		if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, sizeof (cksum)))
+			error = ESTALE;
+		else if (BP_IS_HOLE(&ztp->zit_next_blk))
+			error = ENOENT;
+		else if (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))
+			error = EOVERFLOW;
+
+		if (error) {
+			VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1);
+			*abufpp = NULL;
+		}
+	}
 
-	dprintf_bp(bp, "zilog %p bp %p good block: ", zilog, bp);
-
-	return (0);
+	dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid);
+
+	return (error);
 }
 
 /*
  * Parse the intent log, and call parse_func for each valid record within.
- */
-void
+ * Return the highest sequence number.
+ */
+uint64_t
 zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
 {
-	blkptr_t blk;
+	const zil_header_t *zh = zilog->zl_header;
+	uint64_t claim_seq = zh->zh_claim_seq;
+	uint64_t seq = 0;
+	uint64_t max_seq = 0;
+	blkptr_t blk = zh->zh_log;
+	arc_buf_t *abuf;
 	char *lrbuf, *lrp;
 	zil_trailer_t *ztp;
 	int reclen, error;
 
-	blk = zilog->zl_header->zh_log;
 	if (BP_IS_HOLE(&blk))
-		return;
+		return (max_seq);
 
 	/*
 	 * Starting at the block pointed to by zh_log we read the log chain.
 	 * For each block in the chain we strongly check that block to
 	 * ensure its validity.  We stop when an invalid block is found.
 	 * For each block pointer in the chain we call parse_blk_func().
 	 * For each record in each valid block we call parse_lr_func().
+	 * If the log has been claimed, stop if we encounter a sequence
+	 * number greater than the highest claimed sequence number.
 	 */
 	zil_dva_tree_init(&zilog->zl_dva_tree);
-	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
 	for (;;) {
-		error = zil_read_log_block(zilog, &blk, lrbuf);
+		seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+
+		if (claim_seq != 0 && seq > claim_seq)
+			break;
+
+		ASSERT(max_seq < seq);
+		max_seq = seq;
+
+		error = zil_read_log_block(zilog, &blk, &abuf);
 
 		if (parse_blk_func != NULL)
 			parse_blk_func(zilog, &blk, arg, txg);
 
 		if (error)
			break;
 
+		lrbuf = abuf->b_data;
 		ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
 		blk = ztp->zit_next_blk;
 
-		if (parse_lr_func == NULL)
+		if (parse_lr_func == NULL) {
+			VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
 			continue;
+		}
 
 		for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
 			lr_t *lr = (lr_t *)lrp;
 			reclen = lr->lrc_reclen;
 			ASSERT3U(reclen, >=, sizeof (lr_t));
 			parse_lr_func(zilog, lr, arg, txg);
 		}
-	}
-	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
+		VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+	}
 	zil_dva_tree_fini(&zilog->zl_dva_tree);
+
+	return (max_seq);
 }
 
 /* ARGSUSED */
 static void
 zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
 {
 	spa_t *spa = zilog->zl_spa;
 	int err;
-
-	dprintf_bp(bp, "first_txg %llu: ", first_txg);
 
 	/*
 	 * Claim log block if not already committed and not already claimed.
 	 */
 	if (bp->blk_birth >= first_txg &&
@@ -289,48 +319,46 @@
  * Create an on-disk intent log.
  */
 static void
 zil_create(zilog_t *zilog)
 {
+	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb;
-	uint64_t txg;
-	dmu_tx_t *tx;
+	uint64_t txg = 0;
+	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
-	int error;
-	int no_blk;
-
-	ASSERT(zilog->zl_header->zh_claim_txg == 0);
-	ASSERT(zilog->zl_header->zh_replay_seq == 0);
-
-	/*
-	 * Initialize the log header block.
-	 */
-	tx = dmu_tx_create(zilog->zl_os);
-	(void) dmu_tx_assign(tx, TXG_WAIT);
-	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
-	txg = dmu_tx_get_txg(tx);
-
-	/*
-	 * If we don't have a log block already then
-	 * allocate the first log block and assign its checksum verifier.
-	 */
-	no_blk = BP_IS_HOLE(&zilog->zl_header->zh_log);
-	if (no_blk) {
-		error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
-		    ZIL_MIN_BLKSZ, &blk, txg);
-	} else {
-		blk = zilog->zl_header->zh_log;
-		error = 0;
+	int error = 0;
+
+	/*
+	 * Wait for any previous destroy to complete.
+	 */
+	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+
+	ASSERT(zh->zh_claim_txg == 0);
+	ASSERT(zh->zh_replay_seq == 0);
+
+	blk = zh->zh_log;
+
+	/*
+	 * If we don't already have an initial log block, allocate one now.
+	 */
+	if (BP_IS_HOLE(&blk)) {
+		tx = dmu_tx_create(zilog->zl_os);
+		(void) dmu_tx_assign(tx, TXG_WAIT);
+		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+		txg = dmu_tx_get_txg(tx);
+
+		error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk, txg);
+
+		if (error == 0)
+			zil_init_log_chain(zilog, &blk);
 	}
+
+	/*
+	 * Allocate a log write buffer (lwb) for the first log block.
+	 */
 	if (error == 0) {
-		ZIO_SET_CHECKSUM(&blk.blk_cksum,
-		    spa_get_random(-1ULL), spa_get_random(-1ULL),
-		    dmu_objset_id(zilog->zl_os), 1ULL);
-
-		/*
-		 * Allocate a log write buffer (lwb) for the first log block.
-		 */
 		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 		lwb->lwb_zilog = zilog;
 		lwb->lwb_blk = blk;
 		lwb->lwb_nused = 0;
 		lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
@@ -341,47 +369,85 @@
 		mutex_enter(&zilog->zl_lock);
 		list_insert_tail(&zilog->zl_lwb_list, lwb);
 		mutex_exit(&zilog->zl_lock);
 	}
 
-	dmu_tx_commit(tx);
-	if (no_blk)
+	/*
+	 * If we just allocated the first log block, commit our transaction
+	 * and wait for zil_sync() to stuff the block pointer into zh_log.
+	 * (zh is part of the MOS, so we cannot modify it in open context.)
+	 */
+	if (tx != NULL) {
+		dmu_tx_commit(tx);
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
+	}
+
+	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
 }
 
 /*
  * In one tx, free all log blocks and clear the log header.
+ * If keep_first is set, then we're replaying a log with no content.
+ * We want to keep the first block, however, so that the first
+ * synchronous transaction doesn't require a txg_wait_synced()
+ * in zil_create().  We don't need to txg_wait_synced() here either
+ * when keep_first is set, because both zil_create() and zil_destroy()
+ * will wait for any in-progress destroys to complete.
  */
 void
-zil_destroy(zilog_t *zilog)
+zil_destroy(zilog_t *zilog, boolean_t keep_first)
 {
+	const zil_header_t *zh = zilog->zl_header;
+	lwb_t *lwb;
 	dmu_tx_t *tx;
 	uint64_t txg;
 
-	mutex_enter(&zilog->zl_destroy_lock);
-
-	if (BP_IS_HOLE(&zilog->zl_header->zh_log)) {
-		mutex_exit(&zilog->zl_destroy_lock);
+	/*
+	 * Wait for any previous destroy to complete.
+	 */
+	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+
+	if (BP_IS_HOLE(&zh->zh_log))
 		return;
-	}
 
 	tx = dmu_tx_create(zilog->zl_os);
 	(void) dmu_tx_assign(tx, TXG_WAIT);
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);
 
-	zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx,
-	    zilog->zl_header->zh_claim_txg);
-	/*
-	 * zil_sync clears the zil header as soon as the zl_destroy_txg commits
-	 */
+	mutex_enter(&zilog->zl_lock);
+
+	ASSERT3U(zilog->zl_destroy_txg, <, txg);
 	zilog->zl_destroy_txg = txg;
+	zilog->zl_keep_first = keep_first;
+
+	if (!list_is_empty(&zilog->zl_lwb_list)) {
+		ASSERT(zh->zh_claim_txg == 0);
+		ASSERT(!keep_first);
+		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+			list_remove(&zilog->zl_lwb_list, lwb);
+			if (lwb->lwb_buf != NULL)
+				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+			zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
+			kmem_cache_free(zil_lwb_cache, lwb);
+		}
+		mutex_exit(&zilog->zl_lock);
+	} else {
+		mutex_exit(&zilog->zl_lock);
+		if (!keep_first) {
+			(void) zil_parse(zilog, zil_free_log_block,
+			    zil_free_log_record, tx, zh->zh_claim_txg);
+		}
+	}
 
 	dmu_tx_commit(tx);
+
+	if (keep_first)			/* no need to wait in this case */
+		return;
+
 	txg_wait_synced(zilog->zl_dmu_pool, txg);
-
-	mutex_exit(&zilog->zl_destroy_lock);
+	ASSERT(BP_IS_HOLE(&zh->zh_log));
 }
 
 void
 zil_claim(char *osname, void *txarg)
 {
@@ -397,22 +463,27 @@
 		cmn_err(CE_WARN, "can't process intent log for %s", osname);
 		return;
 	}
 
 	zilog = dmu_objset_zil(os);
-	zh = zilog->zl_header;
+	zh = zil_header_in_syncing_context(zilog);
 
 	/*
-	 * Claim all log blocks if we haven't already done so.
+	 * Claim all log blocks if we haven't already done so, and remember
+	 * the highest claimed sequence number.  This ensures that if we can
+	 * read only part of the log now (e.g. due to a missing device),
+	 * but we can read the entire log later, we will not try to replay
+	 * or destroy beyond the last block we successfully claimed.
	 */
 	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
 	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
 		zh->zh_claim_txg = first_txg;
-		zil_parse(zilog, zil_claim_log_block, zil_claim_log_record,
-		    tx, first_txg);
+		zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block,
+		    zil_claim_log_record, tx, first_txg);
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 	}
+
 	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
 	dmu_objset_close(os);
 }
 
 void
@@ -553,10 +624,12 @@
 static lwb_t *
 zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 {
 	lwb_t *nlwb;
 	zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
+	spa_t *spa = zilog->zl_spa;
+	blkptr_t *bp = &ztp->zit_next_blk;
 	uint64_t txg;
 	uint64_t zil_blksz;
 	zbookmark_t zb;
 	int error;
 
@@ -581,12 +654,11 @@
 	zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
 	zil_blksz = P2ROUNDUP(zil_blksz, ZIL_MIN_BLKSZ);
 	if (zil_blksz > ZIL_MAX_BLKSZ)
 		zil_blksz = ZIL_MAX_BLKSZ;
 
-	error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
-	    zil_blksz, &ztp->zit_next_blk, txg);
+	error = zio_alloc_blk(spa, zil_blksz, bp, txg);
 	if (error) {
 		/*
 		 * Reinitialise the lwb.
 		 * By returning NULL the caller will call tx_wait_synced()
 		 */
@@ -597,24 +669,24 @@
 		mutex_exit(&zilog->zl_lock);
 		txg_rele_to_sync(&lwb->lwb_txgh);
 		return (NULL);
 	}
 
-	ASSERT3U(ztp->zit_next_blk.blk_birth, ==, txg);
+	ASSERT3U(bp->blk_birth, ==, txg);
 	ztp->zit_pad = 0;
 	ztp->zit_nused = lwb->lwb_nused;
 	ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
-	ztp->zit_next_blk.blk_cksum = lwb->lwb_blk.blk_cksum;
-	ztp->zit_next_blk.blk_cksum.zc_word[3]++;
+	bp->blk_cksum = lwb->lwb_blk.blk_cksum;
+	bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
 
 	/*
 	 * Allocate a new log write buffer (lwb).
 	 */
 	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 
 	nlwb->lwb_zilog = zilog;
-	nlwb->lwb_blk = ztp->zit_next_blk;
+	nlwb->lwb_blk = *bp;
 	nlwb->lwb_nused = 0;
 	nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
 	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
 	nlwb->lwb_max_txg = txg;
 	nlwb->lwb_seq = 0;
@@ -631,18 +703,16 @@
 	mutex_exit(&zilog->zl_lock);
 
 	/*
 	 * write the old log block
 	 */
-	dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
-
-	zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[2];
+	zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
 	zb.zb_object = 0;
 	zb.zb_level = -1;
-	zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[3];
+	zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
 
-	zio_nowait(zio_rewrite(NULL, zilog->zl_spa, ZIO_CHECKSUM_ZILOG, 0,
+	zio_nowait(zio_rewrite(NULL, spa, ZIO_CHECKSUM_ZILOG, 0,
 	    &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb,
 	    ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb));
 
 	return (nlwb);
 }
@@ -947,25 +1017,44 @@
  * Called in syncing context to free committed log blocks and update log header.
  */
 void
 zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 {
+	zil_header_t *zh = zil_header_in_syncing_context(zilog);
 	uint64_t txg = dmu_tx_get_txg(tx);
 	spa_t *spa = zilog->zl_spa;
 	lwb_t *lwb;
 
+	mutex_enter(&zilog->zl_lock);
+
 	ASSERT(zilog->zl_stop_sync == 0);
 
-	zilog->zl_header->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
+	zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK];
 
 	if (zilog->zl_destroy_txg == txg) {
-		bzero(zilog->zl_header, sizeof (zil_header_t));
+		blkptr_t blk = zh->zh_log;
+
+		ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
+		ASSERT(spa_sync_pass(spa) == 1);
+
+		bzero(zh, sizeof (zil_header_t));
 		bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq));
-		zilog->zl_destroy_txg = 0;
-	}
-
-	mutex_enter(&zilog->zl_lock);
+
+		if (zilog->zl_keep_first) {
+			/*
+			 * If this block was part of log chain that couldn't
+			 * be claimed because a device was missing during
+			 * zil_claim(), but that device later returns,
+			 * then this block could erroneously appear valid.
+			 * To guard against this, assign a new GUID to the new
+			 * log chain so it doesn't matter what blk points to.
+			 */
+			zil_init_log_chain(zilog, &blk);
+			zh->zh_log = blk;
+		}
+	}
+
 	for (;;) {
 		lwb = list_head(&zilog->zl_lwb_list);
 		if (lwb == NULL) {
 			mutex_exit(&zilog->zl_lock);
 			return;
@@ -974,11 +1063,11 @@
 			break;
 		list_remove(&zilog->zl_lwb_list, lwb);
 		zio_free_blk(spa, &lwb->lwb_blk, txg);
 		kmem_cache_free(zil_lwb_cache, lwb);
 	}
-	zilog->zl_header->zh_log = lwb->lwb_blk;
+	zh->zh_log = lwb->lwb_blk;
 	mutex_exit(&zilog->zl_lock);
 }
 
 void
 zil_init(void)
@@ -1002,10 +1091,11 @@
 
 	zilog->zl_header = zh_phys;
 	zilog->zl_os = os;
 	zilog->zl_spa = dmu_objset_spa(os);
 	zilog->zl_dmu_pool = dmu_objset_pool(os);
+	zilog->zl_destroy_txg = TXG_INITIAL - 1;
 
 	list_create(&zilog->zl_itx_list, sizeof (itx_t),
 	    offsetof(itx_t, itx_node));
 
 	list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
@@ -1049,22 +1139,21 @@
  * return true if the initial log block is not valid
  */
 static int
 zil_empty(zilog_t *zilog)
 {
-	blkptr_t blk;
-	char *lrbuf;
-	int error;
-
-	blk = zilog->zl_header->zh_log;
-	if (BP_IS_HOLE(&blk))
+	const zil_header_t *zh = zilog->zl_header;
+	arc_buf_t *abuf = NULL;
+
+	if (BP_IS_HOLE(&zh->zh_log))
 		return (1);
 
-	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
-	error = zil_read_log_block(zilog, &blk, lrbuf);
-	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
-	return (error ? 1 : 0);
+	if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
+		return (1);
+
+	VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+	return (0);
 }
 
 /*
  * Open an intent log.
 */
@@ -1084,12 +1173,24 @@
  * Close an intent log.
  */
 void
 zil_close(zilog_t *zilog)
 {
-	if (!zil_is_committed(zilog))
-		txg_wait_synced(zilog->zl_dmu_pool, 0);
+	/*
+	 * If the log isn't already committed, mark the objset dirty
+	 * (so zil_sync() will be called) and wait for that txg to sync.
+	 */
+	if (!zil_is_committed(zilog)) {
+		uint64_t txg;
+		dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
+		(void) dmu_tx_assign(tx, TXG_WAIT);
+		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+		txg = dmu_tx_get_txg(tx);
+		dmu_tx_commit(tx);
+		txg_wait_synced(zilog->zl_dmu_pool, txg);
+	}
+
 	taskq_destroy(zilog->zl_clean_taskq);
 	zilog->zl_clean_taskq = NULL;
 	zilog->zl_get_data = NULL;
 
 	zil_itx_clean(zilog);
@@ -1103,42 +1204,59 @@
  * contains all the data it's supposed to, and has an empty intent log.
  */
 int
 zil_suspend(zilog_t *zilog)
 {
+	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb;
 
 	mutex_enter(&zilog->zl_lock);
-	if (zilog->zl_header->zh_claim_txg != 0) {	/* unplayed log */
+	if (zh->zh_claim_txg != 0) {		/* unplayed log */
 		mutex_exit(&zilog->zl_lock);
 		return (EBUSY);
 	}
-	zilog->zl_suspend++;
+	if (zilog->zl_suspend++ != 0) {
+		/*
+		 * Someone else already began a suspend.
+		 * Just wait for them to finish.
+		 */
+		while (zilog->zl_suspending)
+			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
+		ASSERT(BP_IS_HOLE(&zh->zh_log));
+		mutex_exit(&zilog->zl_lock);
+		return (0);
+	}
+	zilog->zl_suspending = B_TRUE;
 	mutex_exit(&zilog->zl_lock);
 
 	zil_commit(zilog, UINT64_MAX, FSYNC);
 
 	mutex_enter(&zilog->zl_lock);
-	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
-		if (lwb->lwb_buf != NULL) {
-			/*
-			 * Wait for the buffer if it's in the process of
-			 * being written.
-			 */
-			if ((lwb->lwb_seq != 0) &&
-			    (lwb->lwb_state != SEQ_COMPLETE)) {
-				cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
-				continue;
-			}
-			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
-		}
-		list_remove(&zilog->zl_lwb_list, lwb);
-		kmem_cache_free(zil_lwb_cache, lwb);
-	}
+	for (;;) {
+		/*
+		 * Wait for any in-flight log writes to complete.
+		 */
+		for (lwb = list_head(&zilog->zl_lwb_list); lwb != NULL;
+		    lwb = list_next(&zilog->zl_lwb_list, lwb))
+			if (lwb->lwb_seq != 0 && lwb->lwb_state != SEQ_COMPLETE)
+				break;
+
+		if (lwb == NULL)
+			break;
+
+		cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
+	}
+
 	mutex_exit(&zilog->zl_lock);
 
-	zil_destroy(zilog);
+	zil_destroy(zilog, B_FALSE);
+
+	mutex_enter(&zilog->zl_lock);
+	ASSERT(BP_IS_HOLE(&zh->zh_log));
+	zilog->zl_suspending = B_FALSE;
+	cv_broadcast(&zilog->zl_cv_suspend);
+	mutex_exit(&zilog->zl_lock);
 
 	return (0);
 }
 
 void
@@ -1162,11 +1280,11 @@
 
 static void
 zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
 {
 	zil_replay_arg_t *zr = zra;
-	zil_header_t *zh = zilog->zl_header;
+	const zil_header_t *zh = zilog->zl_header;
 	uint64_t reclen = lr->lrc_reclen;
 	uint64_t txtype = lr->lrc_txtype;
 	int pass, error;
 
 	if (zilog->zl_stop_replay)
@@ -1308,54 +1426,50 @@
 void
 zil_replay(objset_t *os, void *arg, uint64_t *txgp,
 	zil_replay_func_t *replay_func[TX_MAX_TYPE], void (*rm_sync)(void *arg))
 {
 	zilog_t *zilog = dmu_objset_zil(os);
-		zil_replay_arg_t zr;
+	const zil_header_t *zh = zilog->zl_header;
+	zil_replay_arg_t zr;
 
 	if (zil_empty(zilog)) {
-		/*
-		 * Initialise the log header but don't free the log block
-		 * which will get reused.
-		 */
-		zilog->zl_header->zh_claim_txg = 0;
-		zilog->zl_header->zh_replay_seq = 0;
+		zil_destroy(zilog, B_TRUE);
 		return;
 	}
 
 	zr.zr_os = os;
 	zr.zr_replay = replay_func;
 	zr.zr_arg = arg;
 	zr.zr_rm_sync = rm_sync;
 	zr.zr_txgp = txgp;
-	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zilog->zl_header->zh_log);
+	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
 	zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
 
 	/*
 	 * Wait for in-progress removes to sync before starting replay.
 	 */
 	if (rm_sync != NULL)
 		rm_sync(arg);
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	zilog->zl_stop_replay = 0;
-	zil_parse(zilog, NULL, zil_replay_log_record, &zr,
-	    zilog->zl_header->zh_claim_txg);
+	(void) zil_parse(zilog, NULL, zil_replay_log_record, &zr,
+	    zh->zh_claim_txg);
 	kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
 
-	zil_destroy(zilog);
+	zil_destroy(zilog, B_FALSE);
 }
 
 /*
  * Report whether all transactions are committed
 */
 int
 zil_is_committed(zilog_t *zilog)
 {
 	lwb_t *lwb;
 
-	if (zilog == NULL || list_head(&zilog->zl_itx_list))
+	if (!list_is_empty(&zilog->zl_itx_list))
 		return (B_FALSE);
 
 	/*
 	 * A log write buffer at the head of the list that is not UNWRITTEN
 	 * means there's a lwb yet to be freed after a txg commit