6595532 ZIL is too talkative
authorNeil Perrin <Neil.Perrin@Sun.COM>
Wed, 17 Feb 2010 15:13:37 -0700
changeset 11670 1d964fb5d948
parent 11669 573bcdd6e9c2
child 11671 3073ea7cd00e
6595532 ZIL is too talkative
6388458 zil need not inflate blocksize that much
6738159 slog can probably pack 2X more data per lwb
6897432 ziltest_noslog fails after dedup putback
6911570 Shouldn't start an ereport on speculative read failure
6920442 ZIL: Should use the stronger checksum fletcher4
usr/src/cmd/zpool/zpool_main.c
usr/src/grub/capability
usr/src/grub/grub-0.97/stage2/fsys_zfs.c
usr/src/grub/grub-0.97/stage2/fsys_zfs.h
usr/src/grub/grub-0.97/stage2/zfs-include/spa.h
usr/src/grub/grub-0.97/stage2/zfs-include/vdev_impl.h
usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h
usr/src/grub/grub-0.97/stage2/zfs-include/zio.h
usr/src/grub/grub-0.97/stage2/zfs-include/zio_checksum.h
usr/src/uts/common/fs/zfs/dmu.c
usr/src/uts/common/fs/zfs/dmu_traverse.c
usr/src/uts/common/fs/zfs/sys/spa.h
usr/src/uts/common/fs/zfs/sys/vdev_impl.h
usr/src/uts/common/fs/zfs/sys/zil.h
usr/src/uts/common/fs/zfs/sys/zil_impl.h
usr/src/uts/common/fs/zfs/sys/zio.h
usr/src/uts/common/fs/zfs/sys/zio_checksum.h
usr/src/uts/common/fs/zfs/vdev_raidz.c
usr/src/uts/common/fs/zfs/zil.c
usr/src/uts/common/fs/zfs/zio.c
usr/src/uts/common/fs/zfs/zio_checksum.c
usr/src/uts/common/sys/fs/zfs.h
--- a/usr/src/cmd/zpool/zpool_main.c	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/cmd/zpool/zpool_main.c	Wed Feb 17 15:13:37 2010 -0700
@@ -3888,6 +3888,7 @@
 		    "(zero-length encoding)\n"));
 		(void) printf(gettext(" 21  Deduplication\n"));
 		(void) printf(gettext(" 22  Received properties\n"));
+		(void) printf(gettext(" 23  Slim ZIL\n"));
 		(void) printf(gettext("\nFor more information on a particular "
 		    "version, including supported releases, see:\n\n"));
 		(void) printf("http://www.opensolaris.org/os/community/zfs/"
--- a/usr/src/grub/capability	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/grub/capability	Wed Feb 17 15:13:37 2010 -0700
@@ -18,7 +18,7 @@
 #
 # CDDL HEADER END
 #
-# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 #
@@ -40,7 +40,7 @@
 # This file and the associated version are Solaris specific and are
 # not a part of the open source distribution of GRUB.
 #
-VERSION=14
+VERSION=15
 dboot
 xVM
 zfs
--- a/usr/src/grub/grub-0.97/stage2/fsys_zfs.c	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/grub/grub-0.97/stage2/fsys_zfs.c	Wed Feb 17 15:13:37 2010 -0700
@@ -17,7 +17,7 @@
  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -123,10 +123,11 @@
 	zio_checksum_off,	zio_checksum_off,	0, 0,	"off",
 	zio_checksum_SHA256,	zio_checksum_SHA256,	1, 1,	"label",
 	zio_checksum_SHA256,	zio_checksum_SHA256,	1, 1,	"gang_header",
-	fletcher_2_native,	fletcher_2_byteswap,	0, 1,	"zilog",
+	NULL,			NULL,			0, 0,	"zilog",
 	fletcher_2_native,	fletcher_2_byteswap,	0, 0,	"fletcher2",
 	fletcher_4_native,	fletcher_4_byteswap,	1, 0,	"fletcher4",
 	zio_checksum_SHA256,	zio_checksum_SHA256,	1, 0,	"SHA256",
+	NULL,			NULL,			0, 0,	"zilog2",
 };
 
 /*
@@ -144,7 +145,7 @@
 	zio_cksum_t zc = bp->blk_cksum;
 	uint32_t checksum = BP_GET_CHECKSUM(bp);
 	int byteswap = BP_SHOULD_BYTESWAP(bp);
-	zio_block_tail_t *zbt = (zio_block_tail_t *)(data + size) - 1;
+	zio_eck_t *zec = (zio_eck_t *)(data + size) - 1;
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 	zio_cksum_t actual_cksum, expected_cksum;
 
@@ -155,11 +156,11 @@
 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
 		return (-1);
 
-	if (ci->ci_zbt) {
-		expected_cksum = zbt->zbt_cksum;
-		zbt->zbt_cksum = zc;
+	if (ci->ci_eck) {
+		expected_cksum = zec->zec_cksum;
+		zec->zec_cksum = zc;
 		ci->ci_func[0](data, size, &actual_cksum);
-		zbt->zbt_cksum = expected_cksum;
+		zec->zec_cksum = expected_cksum;
 		zc = expected_cksum;
 
 	} else {
--- a/usr/src/grub/grub-0.97/stage2/fsys_zfs.h	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/grub/grub-0.97/stage2/fsys_zfs.h	Wed Feb 17 15:13:37 2010 -0700
@@ -17,7 +17,7 @@
  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 #ifndef _FSYS_ZFS_H
@@ -107,8 +107,8 @@
 typedef struct uberblock_phys {
 	uberblock_t	ubp_uberblock;
 	char		ubp_pad[UBERBLOCK_SIZE - sizeof (uberblock_t) -
-				sizeof (zio_block_tail_t)];
-	zio_block_tail_t ubp_zbt;
+				sizeof (zio_eck_t)];
+	zio_eck_t	ubp_zec;
 } uberblock_phys_t;
 
 /*
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h	Wed Feb 17 15:13:37 2010 -0700
@@ -17,7 +17,7 @@
  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -276,6 +276,10 @@
 #define	BP_IS_GANG(bp)		DVA_GET_GANG(BP_IDENTITY(bp))
 #define	BP_IS_HOLE(bp)		((bp)->blk_birth == 0)
 
+/* BP_IS_RAIDZ(bp) assumes no block compression */
+#define	BP_IS_RAIDZ(bp)		(DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
+				BP_GET_PSIZE(bp))
+
 #define	BP_ZERO(bp)				\
 {						\
 	(bp)->blk_dva[0].dva_word[0] = 0;	\
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/vdev_impl.h	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/vdev_impl.h	Wed Feb 17 15:13:37 2010 -0700
@@ -17,7 +17,7 @@
  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -31,8 +31,8 @@
 #define	VDEV_UBERBLOCK_RING	(128 << 10)
 
 typedef struct vdev_phys {
-	char		vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
-	zio_block_tail_t vp_zbt;
+	char		vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
+	zio_eck_t	vp_zbt;
 } vdev_phys_t;
 
 typedef struct vdev_label {
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h	Wed Feb 17 15:13:37 2010 -0700
@@ -17,7 +17,7 @@
  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -27,7 +27,7 @@
 /*
  * On-disk version number.
  */
-#define	SPA_VERSION			22ULL
+#define	SPA_VERSION			23ULL
 
 /*
  * The following are configuration names used in the nvlist describing a pool's
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zio.h	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zio.h	Wed Feb 17 15:13:37 2010 -0700
@@ -17,19 +17,19 @@
  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _ZIO_H
 #define	_ZIO_H
 
-#define	ZBT_MAGIC	0x210da7ab10c7a11ULL	/* zio data bloc tail */
+#define	ZEC_MAGIC	0x210da7ab10c7a11ULL	/* zio data block tail */
 
-typedef struct zio_block_tail {
-	uint64_t	zbt_magic;	/* for validation, endianness	*/
-	zio_cksum_t	zbt_cksum;	/* 256-bit checksum		*/
-} zio_block_tail_t;
+typedef struct zio_eck {
+	uint64_t	zec_magic;	/* for validation, endianness	*/
+	zio_cksum_t	zec_cksum;	/* 256-bit checksum		*/
+} zio_eck_t;
 
 /*
  * Gang block headers are self-checksumming and contain an array
@@ -37,9 +37,9 @@
  */
 #define	SPA_GANGBLOCKSIZE	SPA_MINBLOCKSIZE
 #define	SPA_GBH_NBLKPTRS	((SPA_GANGBLOCKSIZE - \
-	sizeof (zio_block_tail_t)) / sizeof (blkptr_t))
+	sizeof (zio_eck_t)) / sizeof (blkptr_t))
 #define	SPA_GBH_FILLER		((SPA_GANGBLOCKSIZE - \
-	sizeof (zio_block_tail_t) - \
+	sizeof (zio_eck_t) - \
 	(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
 	sizeof (uint64_t))
 
@@ -50,7 +50,7 @@
 typedef struct zio_gbh {
 	blkptr_t		zg_blkptr[SPA_GBH_NBLKPTRS];
 	uint64_t		zg_filler[SPA_GBH_FILLER];
-	zio_block_tail_t	zg_tail;
+	zio_eck_t		zg_tail;
 } zio_gbh_phys_t;
 
 enum zio_checksum {
@@ -63,6 +63,7 @@
 	ZIO_CHECKSUM_FLETCHER_2,
 	ZIO_CHECKSUM_FLETCHER_4,
 	ZIO_CHECKSUM_SHA256,
+	ZIO_CHECKSUM_ZILOG2,
 	ZIO_CHECKSUM_FUNCTIONS
 };
 
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zio_checksum.h	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zio_checksum.h	Wed Feb 17 15:13:37 2010 -0700
@@ -17,15 +17,13 @@
  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_ZIO_CHECKSUM_H
 #define	_SYS_ZIO_CHECKSUM_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Signature for checksum functions.
  */
@@ -37,7 +35,7 @@
 typedef struct zio_checksum_info {
 	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
 	int		ci_correctable;	/* number of correctable bits	*/
-	int		ci_zbt;		/* uses zio block tail?	*/
+	int		ci_eck;		/* uses zio embedded checksum? */
 	char		*ci_name;	/* descriptive name */
 } zio_checksum_info_t;
 
--- a/usr/src/uts/common/fs/zfs/dmu.c	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu.c	Wed Feb 17 15:13:37 2010 -0700
@@ -1096,7 +1096,7 @@
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
-	if (dmu_tx_assign(tx, TXG_NOWAIT) != 0) {
+	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
 		dmu_tx_abort(tx);
 		return (EIO);	/* Make zl_get_data do txg_waited_synced() */
 	}
@@ -1304,7 +1304,7 @@
 		 * to fletcher4.
 		 */
 		if (zio_checksum_table[checksum].ci_correctable < 1 ||
-		    zio_checksum_table[checksum].ci_zbt)
+		    zio_checksum_table[checksum].ci_eck)
 			checksum = ZIO_CHECKSUM_FLETCHER_4;
 	} else {
 		checksum = zio_checksum_select(dn->dn_checksum, checksum);
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c	Wed Feb 17 15:13:37 2010 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -271,7 +271,8 @@
 		return (EINTR);
 
 	if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
-	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0))
+	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
+	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
 		return (0);
 
 	mutex_enter(&pfd->pd_mtx);
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Wed Feb 17 15:13:37 2010 -0700
@@ -304,6 +304,10 @@
 #define	BP_IS_GANG(bp)		DVA_GET_GANG(BP_IDENTITY(bp))
 #define	BP_IS_HOLE(bp)		((bp)->blk_birth == 0)
 
+/* BP_IS_RAIDZ(bp) assumes no block compression */
+#define	BP_IS_RAIDZ(bp)		(DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
+				BP_GET_PSIZE(bp))
+
 #define	BP_ZERO(bp)				\
 {						\
 	(bp)->blk_dva[0].dva_word[0] = 0;	\
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Wed Feb 17 15:13:37 2010 -0700
@@ -217,8 +217,8 @@
 #define	VDEV_UBERBLOCK_SIZE(vd)		(1ULL << VDEV_UBERBLOCK_SHIFT(vd))
 
 typedef struct vdev_phys {
-	char		vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
-	zio_block_tail_t vp_zbt;
+	char		vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
+	zio_eck_t	vp_zbt;
 } vdev_phys_t;
 
 typedef struct vdev_label {
--- a/usr/src/uts/common/fs/zfs/sys/zil.h	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h	Wed Feb 17 15:13:37 2010 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -68,23 +68,27 @@
 #define	ZIL_CLAIM_LR_SEQ_VALID	0x2	/* zh_claim_lr_seq field is valid */
 
 /*
- * Log block trailer - structure at the end of the header and each log block
+ * Log block chaining.
  *
- * The zit_bt contains a zbt_cksum which for the intent log is
+ * Log blocks are chained together. Originally they were chained at the
+ * end of the block. For performance reasons the chain was moved to the
+ * beginning of the block which allows writes for only the data being used.
+ * The older position is supported for backwards compatibility.
+ *
+ * The zio_eck_t contains a zec_cksum which for the intent log is
  * the sequence number of this log block. A seq of 0 is invalid.
- * The zbt_cksum is checked by the SPA against the sequence
+ * The zec_cksum is checked by the SPA against the sequence
  * number passed in the blk_cksum field of the blkptr_t
  */
-typedef struct zil_trailer {
-	uint64_t zit_pad;
-	blkptr_t zit_next_blk;	/* next block in chain */
-	uint64_t zit_nused;	/* bytes in log block used */
-	zio_block_tail_t zit_bt; /* block trailer */
-} zil_trailer_t;
+typedef struct zil_chain {
+	uint64_t zc_pad;
+	blkptr_t zc_next_blk;	/* next block in chain */
+	uint64_t zc_nused;	/* bytes in log block used */
+	zio_eck_t zc_eck;	/* block trailer */
+} zil_chain_t;
 
 #define	ZIL_MIN_BLKSZ	4096ULL
 #define	ZIL_MAX_BLKSZ	SPA_MAXBLOCKSIZE
-#define	ZIL_BLK_DATA_SZ(lwb)	((lwb)->lwb_sz - sizeof (zil_trailer_t))
 
 /*
  * The words of a log block checksum.
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h	Wed Feb 17 15:13:37 2010 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -57,6 +57,8 @@
 	avl_node_t	zv_node;	/* AVL tree linkage */
 } zil_vdev_node_t;
 
+#define	ZIL_PREV_BLKS 16
+
 /*
  * Stable storage intent log management structure.  One per dataset.
  */
@@ -101,6 +103,8 @@
 	clock_t		zl_replay_time;	/* lbolt of when replay started */
 	uint64_t	zl_replay_blks;	/* number of log blocks replayed */
 	zil_header_t	zl_old_header;	/* debugging aid */
+	uint_t		zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
+	uint_t		zl_prev_rotor;	/* rotor for zl_prev[] */
 };
 
 typedef struct zil_bp_node {
@@ -108,7 +112,7 @@
 	avl_node_t	zn_node;
 } zil_bp_node_t;
 
-#define	ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
+#define	ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
     sizeof (lr_write_t))
 
 #ifdef	__cplusplus
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Wed Feb 17 15:13:37 2010 -0700
@@ -38,12 +38,15 @@
 extern "C" {
 #endif
 
-#define	ZBT_MAGIC	0x210da7ab10c7a11ULL	/* zio data bloc tail */
+/*
+ * Embedded checksum
+ */
+#define	ZEC_MAGIC	0x210da7ab10c7a11ULL
 
-typedef struct zio_block_tail {
-	uint64_t	zbt_magic;	/* for validation, endianness	*/
-	zio_cksum_t	zbt_cksum;	/* 256-bit checksum		*/
-} zio_block_tail_t;
+typedef struct zio_eck {
+	uint64_t	zec_magic;	/* for validation, endianness	*/
+	zio_cksum_t	zec_cksum;	/* 256-bit checksum		*/
+} zio_eck_t;
 
 /*
  * Gang block headers are self-checksumming and contain an array
@@ -51,16 +54,16 @@
  */
 #define	SPA_GANGBLOCKSIZE	SPA_MINBLOCKSIZE
 #define	SPA_GBH_NBLKPTRS	((SPA_GANGBLOCKSIZE - \
-	sizeof (zio_block_tail_t)) / sizeof (blkptr_t))
+	sizeof (zio_eck_t)) / sizeof (blkptr_t))
 #define	SPA_GBH_FILLER		((SPA_GANGBLOCKSIZE - \
-	sizeof (zio_block_tail_t) - \
+	sizeof (zio_eck_t) - \
 	(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
 	sizeof (uint64_t))
 
 typedef struct zio_gbh {
 	blkptr_t		zg_blkptr[SPA_GBH_NBLKPTRS];
 	uint64_t		zg_filler[SPA_GBH_FILLER];
-	zio_block_tail_t	zg_tail;
+	zio_eck_t		zg_tail;
 } zio_gbh_phys_t;
 
 enum zio_checksum {
@@ -73,6 +76,7 @@
 	ZIO_CHECKSUM_FLETCHER_2,
 	ZIO_CHECKSUM_FLETCHER_4,
 	ZIO_CHECKSUM_SHA256,
+	ZIO_CHECKSUM_ZILOG2,
 	ZIO_CHECKSUM_FUNCTIONS
 };
 
@@ -462,6 +466,7 @@
     blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
 extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
+extern void zio_shrink(zio_t *zio, uint64_t size);
 
 extern int zio_wait(zio_t *zio);
 extern void zio_nowait(zio_t *zio);
--- a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h	Wed Feb 17 15:13:37 2010 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -44,7 +44,7 @@
 typedef struct zio_checksum_info {
 	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
 	int		ci_correctable;	/* number of correctable bits	*/
-	int		ci_zbt;		/* uses zio block tail?	*/
+	int		ci_eck;		/* uses zio embedded checksum? */
 	int		ci_dedup;	/* strong enough for dedup? */
 	char		*ci_name;	/* descriptive name */
 } zio_checksum_info_t;
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c	Wed Feb 17 15:13:37 2010 -0700
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -2073,21 +2073,25 @@
 		 * reconstruction.
 		 *
 		 * Start checksum ereports for all children which haven't
-		 * failed.
+		 * failed, and the IO wasn't speculative.
 		 */
 		zio->io_error = ECKSUM;
 
-		for (c = 0; c < rm->rm_cols; c++) {
-			rc = &rm->rm_col[c];
-			if (rc->rc_error == 0) {
-				zio_bad_cksum_t zbc;
-				zbc.zbc_has_cksum = 0;
-				zbc.zbc_injected = rm->rm_ecksuminjected;
+		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+			for (c = 0; c < rm->rm_cols; c++) {
+				rc = &rm->rm_col[c];
+				if (rc->rc_error == 0) {
+					zio_bad_cksum_t zbc;
+					zbc.zbc_has_cksum = 0;
+					zbc.zbc_injected =
+					    rm->rm_ecksuminjected;
 
-				zfs_ereport_start_checksum(
-				    zio->io_spa, vd->vdev_child[rc->rc_devidx],
-				    zio, rc->rc_offset, rc->rc_size,
-				    (void *)(uintptr_t)c, &zbc);
+					zfs_ereport_start_checksum(
+					    zio->io_spa,
+					    vd->vdev_child[rc->rc_devidx],
+					    zio, rc->rc_offset, rc->rc_size,
+					    (void *)(uintptr_t)c, &zbc);
+				}
 			}
 		}
 	}
--- a/usr/src/uts/common/fs/zfs/zil.c	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/zil.c	Wed Feb 17 15:13:37 2010 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -78,6 +78,10 @@
 
 static boolean_t zil_empty(zilog_t *zilog);
 
+#define	LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
+    sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
+
+
 static int
 zil_bp_compare(const void *x1, const void *x2)
 {
@@ -156,7 +160,8 @@
  * Read a log block and make sure it's valid.
  */
 static int
-zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst)
+zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
+    char **end)
 {
 	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
 	uint32_t aflags = ARC_WAIT;
@@ -177,14 +182,8 @@
 	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
-		char *data = abuf->b_data;
-		uint64_t size = BP_GET_LSIZE(bp);
-		zil_trailer_t *ztp = (zil_trailer_t *)(data + size) - 1;
 		zio_cksum_t cksum = bp->blk_cksum;
 
-		bcopy(data, dst, size);
-		*nbp = ztp->zit_next_blk;
-
 		/*
 		 * Validate the checksummed log block.
 		 *
@@ -195,10 +194,34 @@
 		 */
 		cksum.zc_word[ZIL_ZC_SEQ]++;
 
-		if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum,
-		    sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) ||
-		    (ztp->zit_nused > (size - sizeof (zil_trailer_t))))
-			error = ECKSUM;
+		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+			zil_chain_t *zilc = abuf->b_data;
+			char *lr = (char *)(zilc + 1);
+			uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
+
+			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
+				error = ECKSUM;
+			} else {
+				bcopy(lr, dst, len);
+				*end = (char *)dst + len;
+				*nbp = zilc->zc_next_blk;
+			}
+		} else {
+			char *lr = abuf->b_data;
+			uint64_t size = BP_GET_LSIZE(bp);
+			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
+
+			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
+			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
+				error = ECKSUM;
+			} else {
+				bcopy(lr, dst, zilc->zc_nused);
+				*end = (char *)dst + zilc->zc_nused;
+				*nbp = zilc->zc_next_blk;
+			}
+		}
 
 		VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
 	}
@@ -281,27 +304,26 @@
 	zil_bp_tree_init(zilog);
 
 	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
-		zil_trailer_t *ztp =
-		    (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
 		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
 		int reclen;
+		char *end;
 
 		if (blk_seq > claim_blk_seq)
 			break;
 		if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
 			break;
-		ASSERT(max_blk_seq < blk_seq);
+		ASSERT3U(max_blk_seq, <, blk_seq);
 		max_blk_seq = blk_seq;
 		blk_count++;
 
 		if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
 			break;
 
-		error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf);
+		error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
 		if (error)
 			break;
 
-		for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
+		for (lrp = lrbuf; lrp < end; lrp += reclen) {
 			lr_t *lr = (lr_t *)lrp;
 			reclen = lr->lrc_reclen;
 			ASSERT3U(reclen, >=, sizeof (lr_t));
@@ -309,7 +331,7 @@
 				goto done;
 			if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
 				goto done;
-			ASSERT(max_lr_seq < lr->lrc_seq);
+			ASSERT3U(max_lr_seq, <, lr->lrc_seq);
 			max_lr_seq = lr->lrc_seq;
 			lr_count++;
 		}
@@ -365,7 +387,6 @@
 	if (lr->lr_blkptr.blk_birth >= first_txg &&
 	    (error = zil_read_log_data(zilog, lr, NULL)) != 0)
 		return (error);
-
 	return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
 }
 
@@ -394,14 +415,41 @@
 	return (0);
 }
 
+static lwb_t *
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
+{
+	lwb_t *lwb;
+
+	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+	lwb->lwb_zilog = zilog;
+	lwb->lwb_blk = *bp;
+	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
+	lwb->lwb_max_txg = txg;
+	lwb->lwb_zio = NULL;
+	lwb->lwb_tx = NULL;
+	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+		lwb->lwb_nused = sizeof (zil_chain_t);
+		lwb->lwb_sz = BP_GET_LSIZE(bp);
+	} else {
+		lwb->lwb_nused = 0;
+		lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
+	}
+
+	mutex_enter(&zilog->zl_lock);
+	list_insert_tail(&zilog->zl_lwb_list, lwb);
+	mutex_exit(&zilog->zl_lock);
+
+	return (lwb);
+}
+
 /*
  * Create an on-disk intent log.
  */
-static void
+static lwb_t *
 zil_create(zilog_t *zilog)
 {
 	const zil_header_t *zh = zilog->zl_header;
-	lwb_t *lwb;
+	lwb_t *lwb = NULL;
 	uint64_t txg = 0;
 	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
@@ -418,8 +466,9 @@
 	blk = zh->zh_log;
 
 	/*
-	 * If we don't already have an initial log block or we have one
-	 * but it's the wrong endianness then allocate one.
+	 * Allocate an initial log block if:
+	 *    - there isn't one already
+	 *    - the existing block is the wrong endianness
 	 */
 	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
 		tx = dmu_tx_create(zilog->zl_os);
@@ -442,21 +491,8 @@
 	/*
 	 * Allocate a log write buffer (lwb) for the first log block.
 	 */
-	if (error == 0) {
-		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
-		lwb->lwb_zilog = zilog;
-		lwb->lwb_blk = blk;
-		lwb->lwb_nused = 0;
-		lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
-		lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
-		lwb->lwb_max_txg = txg;
-		lwb->lwb_zio = NULL;
-		lwb->lwb_tx = NULL;
-
-		mutex_enter(&zilog->zl_lock);
-		list_insert_tail(&zilog->zl_lwb_list, lwb);
-		mutex_exit(&zilog->zl_lock);
-	}
+	if (error == 0)
+		lwb = zil_alloc_lwb(zilog, &blk, txg);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
@@ -469,6 +505,8 @@
 	}
 
 	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
+
+	return (lwb);
 }
 
 /*
@@ -712,7 +750,6 @@
 	dmu_tx_t *tx = lwb->lwb_tx;
 
 	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
-	ASSERT(BP_GET_CHECKSUM(zio->io_bp) == ZIO_CHECKSUM_ZILOG);
 	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
 	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
 	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
@@ -760,13 +797,26 @@
 	}
 	if (lwb->lwb_zio == NULL) {
 		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
-		    0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz,
+		    0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
 		    zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
 	}
 }
 
 /*
+ * Define a limited set of intent log block sizes.
+ * These must be a multiple of 4KB. Note only the amount used (again
+ * aligned to 4KB) actually gets written. However, we can't always just
+ * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
+ */
+uint64_t zil_block_buckets[] = {
+    4096,		/* non TX_WRITE */
+    8192+4096,		/* data base */
+    32*1024 + 4096, 	/* NFS writes */
+    UINT64_MAX
+};
+
+/*
  * Use the slog as long as the logbias is 'latency' and the current commit size
  * is less than the limit or the total list size is less than 2X the limit.
  * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
@@ -783,16 +833,24 @@
 static lwb_t *
 zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 {
-	lwb_t *nlwb;
-	zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
+	lwb_t *nlwb = NULL;
+	zil_chain_t *zilc;
 	spa_t *spa = zilog->zl_spa;
-	blkptr_t *bp = &ztp->zit_next_blk;
+	blkptr_t *bp;
 	dmu_tx_t *tx;
 	uint64_t txg;
 	uint64_t zil_blksz;
-	int error;
+	int i, error;
 
-	ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
+	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+		zilc = (zil_chain_t *)lwb->lwb_buf;
+		bp = &zilc->zc_next_blk;
+	} else {
+		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
+		bp = &zilc->zc_next_blk;
+	}
+
+	ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
 
 	/*
 	 * Allocate the next block and save its address in this block
@@ -811,73 +869,69 @@
 	lwb->lwb_tx = tx;
 
 	/*
-	 * Pick a ZIL blocksize. We request a size that is the
-	 * maximum of the previous used size, the current used size and
-	 * the amount waiting in the queue.
+	 * Log blocks are pre-allocated. Here we select the size of the next
+	 * block, based on size used in the last block.
+	 * - first find the smallest bucket that will fit the block from a
+	 *   limited set of block sizes. This is because it's faster to write
+	 *   blocks allocated from the same metaslab as they are adjacent or
+	 *   close.
+	 * - next find the maximum from the new suggested size and an array of
+	 *   previous sizes. This lessens a picket fence effect of wrongly
+	 *   guessing the size if we have a stream of say 2k, 64k, 2k, 64k
+	 *   requests.
+	 *
+	 * Note we only write what is used, but we can't just allocate
+	 * the maximum block size because we can exhaust the available
+	 * pool log space.
 	 */
-	zil_blksz = MAX(zilog->zl_prev_used,
-	    zilog->zl_cur_used + sizeof (*ztp));
-	zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
-	zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
-	if (zil_blksz > ZIL_MAX_BLKSZ)
-		zil_blksz = ZIL_MAX_BLKSZ;
+	zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
+	for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
+		continue;
+	zil_blksz = zil_block_buckets[i];
+	if (zil_blksz == UINT64_MAX)
+		zil_blksz = SPA_MAXBLOCKSIZE;
+	zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
+	for (i = 0; i < ZIL_PREV_BLKS; i++)
+		zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
+	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
 
 	BP_ZERO(bp);
 	/* pass the old blkptr in order to spread log blocks across devs */
 	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
 	    USE_SLOG(zilog));
-	if (error) {
-		/*
-		 * Since we've just experienced an allocation failure,
-		 * terminate the current lwb and send it on its way.
-		 */
-		ztp->zit_pad = 0;
-		ztp->zit_nused = lwb->lwb_nused;
-		ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
-		zio_nowait(lwb->lwb_zio);
+	if (!error) {
+		ASSERT3U(bp->blk_birth, ==, txg);
+		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
+		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
 
 		/*
-		 * By returning NULL the caller will call tx_wait_synced()
+		 * Allocate a new log write buffer (lwb).
 		 */
-		return (NULL);
+		nlwb = zil_alloc_lwb(zilog, bp, txg);
+
+		/* Record the block for later vdev flushing */
+		zil_add_block(zilog, &lwb->lwb_blk);
 	}
 
-	ASSERT3U(bp->blk_birth, ==, txg);
-	ztp->zit_pad = 0;
-	ztp->zit_nused = lwb->lwb_nused;
-	ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
-	bp->blk_cksum = lwb->lwb_blk.blk_cksum;
-	bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
+	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+		uint64_t len;
+
+		/* For Slim ZIL only write what is used. */
+		len = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
+		ASSERT3U(len, <=, lwb->lwb_sz);
+		zio_shrink(lwb->lwb_zio, len);
+
+	}
+	zilc->zc_pad = 0;
+	zilc->zc_nused = lwb->lwb_nused;
+	zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
+
+	zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */
 
 	/*
-	 * Allocate a new log write buffer (lwb).
+	 * If there was an allocation failure then nlwb will be null which
+	 * forces a txg_wait_synced().
 	 */
-	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
-	nlwb->lwb_zilog = zilog;
-	nlwb->lwb_blk = *bp;
-	nlwb->lwb_nused = 0;
-	nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
-	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
-	nlwb->lwb_max_txg = txg;
-	nlwb->lwb_zio = NULL;
-	nlwb->lwb_tx = NULL;
-
-	/*
-	 * Put new lwb at the end of the log chain
-	 */
-	mutex_enter(&zilog->zl_lock);
-	list_insert_tail(&zilog->zl_lwb_list, nlwb);
-	mutex_exit(&zilog->zl_lock);
-
-	/* Record the block for later vdev flushing */
-	zil_add_block(zilog, &lwb->lwb_blk);
-
-	/*
-	 * kick off the write for the old log block
-	 */
-	ASSERT(lwb->lwb_zio);
-	zio_nowait(lwb->lwb_zio);
-
 	return (nlwb);
 }
 
@@ -907,13 +961,13 @@
 	/*
 	 * If this record won't fit in the current log block, start a new one.
 	 */
-	if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
+	if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
 		lwb = zil_lwb_write_start(zilog, lwb);
 		if (lwb == NULL)
 			return (NULL);
 		zil_lwb_write_init(zilog, lwb);
-		ASSERT(lwb->lwb_nused == 0);
-		if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
+		ASSERT(LWB_EMPTY(lwb));
+		if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
 			txg_wait_synced(zilog->zl_dmu_pool, txg);
 			return (lwb);
 		}
@@ -965,7 +1019,7 @@
 	lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
 	lwb->lwb_nused += reclen + dlen;
 	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
-	ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
+	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
 	ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);
 
 	return (lwb);
@@ -1100,11 +1154,11 @@
 				return;
 			}
 			mutex_exit(&zilog->zl_lock);
-			zil_create(zilog);
+			lwb = zil_create(zilog);
 			mutex_enter(&zilog->zl_lock);
-			lwb = list_tail(&zilog->zl_lwb_list);
 		}
 	}
+	ASSERT(lwb == NULL || lwb->lwb_zio == NULL);
 
 	/* Loop through in-memory log transactions filling log blocks. */
 	DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
@@ -1133,8 +1187,8 @@
 			continue; /* skip this record */
 
 		if ((itx->itx_lr.lrc_seq > seq) &&
-		    ((lwb == NULL) || (lwb->lwb_nused == 0) ||
-		    (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb))))
+		    ((lwb == NULL) || (LWB_EMPTY(lwb)) ||
+		    (lwb->lwb_nused + itx->itx_sod > lwb->lwb_sz)))
 			break;
 
 		list_remove(&zilog->zl_itx_list, itx);
--- a/usr/src/uts/common/fs/zfs/zio.c	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/zio.c	Wed Feb 17 15:13:37 2010 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -764,9 +764,9 @@
 
 	zio->io_prop.zp_checksum = checksum;
 
-	if (zio_checksum_table[checksum].ci_zbt) {
+	if (zio_checksum_table[checksum].ci_eck) {
 		/*
-		 * zbt checksums are necessarily destructive -- they modify
+		 * zec checksums are necessarily destructive -- they modify
 		 * the end of the write buffer to hold the verifier/checksum.
 		 * Therefore, we must make a local copy in case the data is
 		 * being written to multiple places in parallel.
@@ -849,6 +849,23 @@
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
 }
 
+void
+zio_shrink(zio_t *zio, uint64_t size)
+{
+	ASSERT(zio->io_executor == NULL);
+	ASSERT(zio->io_orig_size == zio->io_size);
+	ASSERT(size <= zio->io_size);
+
+	/*
+	 * We don't shrink for raidz because of problems with the
+	 * reconstruction when reading back less than the block size.
+	 * Note, BP_IS_RAIDZ() assumes no compression.
+	 */
+	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
+	if (!BP_IS_RAIDZ(zio->io_bp))
+		zio->io_orig_size = zio->io_size = size;
+}
+
 /*
  * ==========================================================================
  * Prepare to read and write logical blocks
@@ -1524,7 +1541,7 @@
 
 	ASSERT(zio->io_data == gn->gn_gbh);
 	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
-	ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
+	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
 
 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
@@ -1551,7 +1568,7 @@
 	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
 
 	if (gn != NULL) {
-		ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
+		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
 
 		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
@@ -2178,7 +2195,9 @@
 		BP_SET_LSIZE(new_bp, size);
 		BP_SET_PSIZE(new_bp, size);
 		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
-		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
+		BP_SET_CHECKSUM(new_bp,
+		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
+		    ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
 		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
 		BP_SET_LEVEL(new_bp, 0);
 		BP_SET_DEDUP(new_bp, 0);
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c	Wed Feb 17 15:13:37 2010 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -27,6 +27,7 @@
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
+#include <sys/zil.h>
 
 /*
  * Checksum vectors.
@@ -75,6 +76,7 @@
 	{{fletcher_2_native,	fletcher_2_byteswap},	0, 0, 0, "fletcher2"},
 	{{fletcher_4_native,	fletcher_4_byteswap},	1, 0, 0, "fletcher4"},
 	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 0, 1, "sha256"},
+	{{fletcher_4_native,	fletcher_4_byteswap},	0, 1, 0, "zilog2"},
 };
 
 enum zio_checksum
@@ -151,23 +153,33 @@
 {
 	blkptr_t *bp = zio->io_bp;
 	uint64_t offset = zio->io_offset;
-	zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
-	zio_cksum_t zbt_cksum;
+	zio_cksum_t cksum;
 
 	ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
 	ASSERT(ci->ci_func[0] != NULL);
 
-	if (ci->ci_zbt) {
+	if (ci->ci_eck) {
+		zio_eck_t *eck;
+
+		if (checksum == ZIO_CHECKSUM_ZILOG2) {
+			zil_chain_t *zilc = data;
+
+			size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ,
+			    uint64_t);
+			eck = &zilc->zc_eck;
+		} else {
+			eck = (zio_eck_t *)((char *)data + size) - 1;
+		}
 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
-			zio_checksum_gang_verifier(&zbt->zbt_cksum, bp);
+			zio_checksum_gang_verifier(&eck->zec_cksum, bp);
 		else if (checksum == ZIO_CHECKSUM_LABEL)
-			zio_checksum_label_verifier(&zbt->zbt_cksum, offset);
+			zio_checksum_label_verifier(&eck->zec_cksum, offset);
 		else
-			bp->blk_cksum = zbt->zbt_cksum;
-		zbt->zbt_magic = ZBT_MAGIC;
-		ci->ci_func[0](data, size, &zbt_cksum);
-		zbt->zbt_cksum = zbt_cksum;
+			bp->blk_cksum = eck->zec_cksum;
+		eck->zec_magic = ZEC_MAGIC;
+		ci->ci_func[0](data, size, &cksum);
+		eck->zec_cksum = cksum;
 	} else {
 		ci->ci_func[0](data, size, &bp->blk_cksum);
 	}
@@ -185,14 +197,35 @@
 	    (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
 	uint64_t offset = zio->io_offset;
 	void *data = zio->io_data;
-	zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 	zio_cksum_t actual_cksum, expected_cksum, verifier;
 
 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
 		return (EINVAL);
 
-	if (ci->ci_zbt) {
+	if (ci->ci_eck) {
+		zio_eck_t *eck;
+
+		if (checksum == ZIO_CHECKSUM_ZILOG2) {
+			zil_chain_t *zilc = data;
+			uint64_t nused;
+
+			eck = &zilc->zc_eck;
+			if (eck->zec_magic == ZEC_MAGIC)
+				nused = zilc->zc_nused;
+			else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC))
+				nused = BSWAP_64(zilc->zc_nused);
+			else
+				return (ECKSUM);
+
+			if (nused > size)
+				return (ECKSUM);
+
+			size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
+		} else {
+			eck = (zio_eck_t *)((char *)data + size) - 1;
+		}
+
 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
 			zio_checksum_gang_verifier(&verifier, bp);
 		else if (checksum == ZIO_CHECKSUM_LABEL)
@@ -200,15 +233,15 @@
 		else
 			verifier = bp->blk_cksum;
 
-		byteswap = (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC));
+		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
 
 		if (byteswap)
 			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
 
-		expected_cksum = zbt->zbt_cksum;
-		zbt->zbt_cksum = verifier;
+		expected_cksum = eck->zec_cksum;
+		eck->zec_cksum = verifier;
 		ci->ci_func[byteswap](data, size, &actual_cksum);
-		zbt->zbt_cksum = expected_cksum;
+		eck->zec_cksum = expected_cksum;
 
 		if (byteswap)
 			byteswap_uint64_array(&expected_cksum,
--- a/usr/src/uts/common/sys/fs/zfs.h	Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/sys/fs/zfs.h	Wed Feb 17 15:13:37 2010 -0700
@@ -324,14 +324,15 @@
 #define	SPA_VERSION_20			20ULL
 #define	SPA_VERSION_21			21ULL
 #define	SPA_VERSION_22			22ULL
+#define	SPA_VERSION_23			23ULL
 /*
  * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
  * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
  * and do the appropriate changes.  Also bump the version number in
  * usr/src/grub/capability.
  */
-#define	SPA_VERSION			SPA_VERSION_22
-#define	SPA_VERSION_STRING		"22"
+#define	SPA_VERSION			SPA_VERSION_23
+#define	SPA_VERSION_STRING		"23"
 
 /*
  * Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -375,6 +376,7 @@
 #define	SPA_VERSION_ZLE_COMPRESSION	SPA_VERSION_20
 #define	SPA_VERSION_DEDUP		SPA_VERSION_21
 #define	SPA_VERSION_RECVD_PROPS		SPA_VERSION_22
+#define	SPA_VERSION_SLIM_ZIL		SPA_VERSION_23
 
 /*
  * ZPL version - rev'd whenever an incompatible on-disk format change