6595532 ZIL is too talkative
6388458 zil need not inflate blocksize that much
6738159 slog can probably pack 2X more data per lwb
6897432 ziltest_noslog fails after dedup putback
6911570 Shouldn't start an ereport on speculative read failure
6920442 ZIL: Should use the stronger checksum fletcher4
--- a/usr/src/cmd/zpool/zpool_main.c Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/cmd/zpool/zpool_main.c Wed Feb 17 15:13:37 2010 -0700
@@ -3888,6 +3888,7 @@
"(zero-length encoding)\n"));
(void) printf(gettext(" 21 Deduplication\n"));
(void) printf(gettext(" 22 Received properties\n"));
+ (void) printf(gettext(" 23 Slim ZIL\n"));
(void) printf(gettext("\nFor more information on a particular "
"version, including supported releases, see:\n\n"));
(void) printf("http://www.opensolaris.org/os/community/zfs/"
--- a/usr/src/grub/capability Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/grub/capability Wed Feb 17 15:13:37 2010 -0700
@@ -18,7 +18,7 @@
#
# CDDL HEADER END
#
-# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2010 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
@@ -40,7 +40,7 @@
# This file and the associated version are Solaris specific and are
# not a part of the open source distribution of GRUB.
#
-VERSION=14
+VERSION=15
dboot
xVM
zfs
--- a/usr/src/grub/grub-0.97/stage2/fsys_zfs.c Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/grub/grub-0.97/stage2/fsys_zfs.c Wed Feb 17 15:13:37 2010 -0700
@@ -17,7 +17,7 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -123,10 +123,11 @@
zio_checksum_off, zio_checksum_off, 0, 0, "off",
zio_checksum_SHA256, zio_checksum_SHA256, 1, 1, "label",
zio_checksum_SHA256, zio_checksum_SHA256, 1, 1, "gang_header",
- fletcher_2_native, fletcher_2_byteswap, 0, 1, "zilog",
+ NULL, NULL, 0, 0, "zilog",
fletcher_2_native, fletcher_2_byteswap, 0, 0, "fletcher2",
fletcher_4_native, fletcher_4_byteswap, 1, 0, "fletcher4",
zio_checksum_SHA256, zio_checksum_SHA256, 1, 0, "SHA256",
+ NULL, NULL, 0, 0, "zilog2",
};
/*
@@ -144,7 +145,7 @@
zio_cksum_t zc = bp->blk_cksum;
uint32_t checksum = BP_GET_CHECKSUM(bp);
int byteswap = BP_SHOULD_BYTESWAP(bp);
- zio_block_tail_t *zbt = (zio_block_tail_t *)(data + size) - 1;
+ zio_eck_t *zec = (zio_eck_t *)(data + size) - 1;
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
zio_cksum_t actual_cksum, expected_cksum;
@@ -155,11 +156,11 @@
if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
return (-1);
- if (ci->ci_zbt) {
- expected_cksum = zbt->zbt_cksum;
- zbt->zbt_cksum = zc;
+ if (ci->ci_eck) {
+ expected_cksum = zec->zec_cksum;
+ zec->zec_cksum = zc;
ci->ci_func[0](data, size, &actual_cksum);
- zbt->zbt_cksum = expected_cksum;
+ zec->zec_cksum = expected_cksum;
zc = expected_cksum;
} else {
--- a/usr/src/grub/grub-0.97/stage2/fsys_zfs.h Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/grub/grub-0.97/stage2/fsys_zfs.h Wed Feb 17 15:13:37 2010 -0700
@@ -17,7 +17,7 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _FSYS_ZFS_H
@@ -107,8 +107,8 @@
typedef struct uberblock_phys {
uberblock_t ubp_uberblock;
char ubp_pad[UBERBLOCK_SIZE - sizeof (uberblock_t) -
- sizeof (zio_block_tail_t)];
- zio_block_tail_t ubp_zbt;
+ sizeof (zio_eck_t)];
+ zio_eck_t ubp_zec;
} uberblock_phys_t;
/*
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h Wed Feb 17 15:13:37 2010 -0700
@@ -17,7 +17,7 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -276,6 +276,10 @@
#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
+/* BP_IS_RAIDZ(bp) assumes no block compression */
+#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
+ BP_GET_PSIZE(bp))
+
#define BP_ZERO(bp) \
{ \
(bp)->blk_dva[0].dva_word[0] = 0; \
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/vdev_impl.h Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/vdev_impl.h Wed Feb 17 15:13:37 2010 -0700
@@ -17,7 +17,7 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -31,8 +31,8 @@
#define VDEV_UBERBLOCK_RING (128 << 10)
typedef struct vdev_phys {
- char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
- zio_block_tail_t vp_zbt;
+ char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
+ zio_eck_t vp_zbt;
} vdev_phys_t;
typedef struct vdev_label {
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h Wed Feb 17 15:13:37 2010 -0700
@@ -17,7 +17,7 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -27,7 +27,7 @@
/*
* On-disk version number.
*/
-#define SPA_VERSION 22ULL
+#define SPA_VERSION 23ULL
/*
* The following are configuration names used in the nvlist describing a pool's
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zio.h Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zio.h Wed Feb 17 15:13:37 2010 -0700
@@ -17,19 +17,19 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _ZIO_H
#define _ZIO_H
-#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data bloc tail */
+#define ZEC_MAGIC 0x210da7ab10c7a11ULL /* zio embedded checksum */
-typedef struct zio_block_tail {
- uint64_t zbt_magic; /* for validation, endianness */
- zio_cksum_t zbt_cksum; /* 256-bit checksum */
-} zio_block_tail_t;
+typedef struct zio_eck {
+ uint64_t zec_magic; /* for validation, endianness */
+ zio_cksum_t zec_cksum; /* 256-bit checksum */
+} zio_eck_t;
/*
* Gang block headers are self-checksumming and contain an array
@@ -37,9 +37,9 @@
*/
#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_block_tail_t)) / sizeof (blkptr_t))
+ sizeof (zio_eck_t)) / sizeof (blkptr_t))
#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_block_tail_t) - \
+ sizeof (zio_eck_t) - \
(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
sizeof (uint64_t))
@@ -50,7 +50,7 @@
typedef struct zio_gbh {
blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
uint64_t zg_filler[SPA_GBH_FILLER];
- zio_block_tail_t zg_tail;
+ zio_eck_t zg_tail;
} zio_gbh_phys_t;
enum zio_checksum {
@@ -63,6 +63,7 @@
ZIO_CHECKSUM_FLETCHER_2,
ZIO_CHECKSUM_FLETCHER_4,
ZIO_CHECKSUM_SHA256,
+ ZIO_CHECKSUM_ZILOG2,
ZIO_CHECKSUM_FUNCTIONS
};
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zio_checksum.h Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zio_checksum.h Wed Feb 17 15:13:37 2010 -0700
@@ -17,15 +17,13 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_ZIO_CHECKSUM_H
#define _SYS_ZIO_CHECKSUM_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Signature for checksum functions.
*/
@@ -37,7 +35,7 @@
typedef struct zio_checksum_info {
zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
int ci_correctable; /* number of correctable bits */
- int ci_zbt; /* uses zio block tail? */
+ int ci_eck; /* uses zio embedded checksum? */
char *ci_name; /* descriptive name */
} zio_checksum_info_t;
--- a/usr/src/uts/common/fs/zfs/dmu.c Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu.c Wed Feb 17 15:13:37 2010 -0700
@@ -1096,7 +1096,7 @@
tx = dmu_tx_create(os);
dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
- if (dmu_tx_assign(tx, TXG_NOWAIT) != 0) {
+ if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
dmu_tx_abort(tx);
return (EIO); /* Make zl_get_data do txg_waited_synced() */
}
@@ -1304,7 +1304,7 @@
* to fletcher4.
*/
if (zio_checksum_table[checksum].ci_correctable < 1 ||
- zio_checksum_table[checksum].ci_zbt)
+ zio_checksum_table[checksum].ci_eck)
checksum = ZIO_CHECKSUM_FLETCHER_4;
} else {
checksum = zio_checksum_select(dn->dn_checksum, checksum);
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c Wed Feb 17 15:13:37 2010 -0700
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -271,7 +271,8 @@
return (EINTR);
if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
- BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0))
+ BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
+ BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
return (0);
mutex_enter(&pfd->pd_mtx);
--- a/usr/src/uts/common/fs/zfs/sys/spa.h Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h Wed Feb 17 15:13:37 2010 -0700
@@ -304,6 +304,10 @@
#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
+/* BP_IS_RAIDZ(bp) assumes no block compression */
+#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
+ BP_GET_PSIZE(bp))
+
#define BP_ZERO(bp) \
{ \
(bp)->blk_dva[0].dva_word[0] = 0; \
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h Wed Feb 17 15:13:37 2010 -0700
@@ -217,8 +217,8 @@
#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
typedef struct vdev_phys {
- char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
- zio_block_tail_t vp_zbt;
+ char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
+ zio_eck_t vp_zbt;
} vdev_phys_t;
typedef struct vdev_label {
--- a/usr/src/uts/common/fs/zfs/sys/zil.h Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h Wed Feb 17 15:13:37 2010 -0700
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -68,23 +68,27 @@
#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */
/*
- * Log block trailer - structure at the end of the header and each log block
+ * Log block chaining.
*
- * The zit_bt contains a zbt_cksum which for the intent log is
+ * Log blocks are chained together. Originally they were chained at the
+ * end of the block. For performance reasons the chain was moved to the
+ * beginning of the block which allows writes for only the data being used.
+ * The older position is supported for backwards compatibility.
+ *
+ * The zio_eck_t contains a zec_cksum which for the intent log is
* the sequence number of this log block. A seq of 0 is invalid.
- * The zbt_cksum is checked by the SPA against the sequence
+ * The zec_cksum is checked by the SPA against the sequence
* number passed in the blk_cksum field of the blkptr_t
*/
-typedef struct zil_trailer {
- uint64_t zit_pad;
- blkptr_t zit_next_blk; /* next block in chain */
- uint64_t zit_nused; /* bytes in log block used */
- zio_block_tail_t zit_bt; /* block trailer */
-} zil_trailer_t;
+typedef struct zil_chain {
+ uint64_t zc_pad;
+ blkptr_t zc_next_blk; /* next block in chain */
+ uint64_t zc_nused; /* bytes in log block used */
+ zio_eck_t zc_eck; /* block trailer */
+} zil_chain_t;
#define ZIL_MIN_BLKSZ 4096ULL
#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE
-#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t))
/*
* The words of a log block checksum.
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h Wed Feb 17 15:13:37 2010 -0700
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -57,6 +57,8 @@
avl_node_t zv_node; /* AVL tree linkage */
} zil_vdev_node_t;
+#define ZIL_PREV_BLKS 16
+
/*
* Stable storage intent log management structure. One per dataset.
*/
@@ -101,6 +103,8 @@
clock_t zl_replay_time; /* lbolt of when replay started */
uint64_t zl_replay_blks; /* number of log blocks replayed */
zil_header_t zl_old_header; /* debugging aid */
+ uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
+ uint_t zl_prev_rotor; /* rotor for zl_prev[] */
};
typedef struct zil_bp_node {
@@ -108,7 +112,7 @@
avl_node_t zn_node;
} zil_bp_node_t;
-#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
+#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
sizeof (lr_write_t))
#ifdef __cplusplus
--- a/usr/src/uts/common/fs/zfs/sys/zio.h Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h Wed Feb 17 15:13:37 2010 -0700
@@ -38,12 +38,15 @@
extern "C" {
#endif
-#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data bloc tail */
+/*
+ * Embedded checksum
+ */
+#define ZEC_MAGIC 0x210da7ab10c7a11ULL
-typedef struct zio_block_tail {
- uint64_t zbt_magic; /* for validation, endianness */
- zio_cksum_t zbt_cksum; /* 256-bit checksum */
-} zio_block_tail_t;
+typedef struct zio_eck {
+ uint64_t zec_magic; /* for validation, endianness */
+ zio_cksum_t zec_cksum; /* 256-bit checksum */
+} zio_eck_t;
/*
* Gang block headers are self-checksumming and contain an array
@@ -51,16 +54,16 @@
*/
#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE
#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_block_tail_t)) / sizeof (blkptr_t))
+ sizeof (zio_eck_t)) / sizeof (blkptr_t))
#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \
- sizeof (zio_block_tail_t) - \
+ sizeof (zio_eck_t) - \
(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
sizeof (uint64_t))
typedef struct zio_gbh {
blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS];
uint64_t zg_filler[SPA_GBH_FILLER];
- zio_block_tail_t zg_tail;
+ zio_eck_t zg_tail;
} zio_gbh_phys_t;
enum zio_checksum {
@@ -73,6 +76,7 @@
ZIO_CHECKSUM_FLETCHER_2,
ZIO_CHECKSUM_FLETCHER_4,
ZIO_CHECKSUM_SHA256,
+ ZIO_CHECKSUM_ZILOG2,
ZIO_CHECKSUM_FUNCTIONS
};
@@ -462,6 +466,7 @@
blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
extern void zio_flush(zio_t *zio, vdev_t *vd);
+extern void zio_shrink(zio_t *zio, uint64_t size);
extern int zio_wait(zio_t *zio);
extern void zio_nowait(zio_t *zio);
--- a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h Wed Feb 17 15:13:37 2010 -0700
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -44,7 +44,7 @@
typedef struct zio_checksum_info {
zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
int ci_correctable; /* number of correctable bits */
- int ci_zbt; /* uses zio block tail? */
+ int ci_eck; /* uses zio embedded checksum? */
int ci_dedup; /* strong enough for dedup? */
char *ci_name; /* descriptive name */
} zio_checksum_info_t;
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c Wed Feb 17 15:13:37 2010 -0700
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -2073,21 +2073,25 @@
* reconstruction.
*
* Start checksum ereports for all children which haven't
- * failed.
+ * failed, and the IO wasn't speculative.
*/
zio->io_error = ECKSUM;
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error == 0) {
- zio_bad_cksum_t zbc;
- zbc.zbc_has_cksum = 0;
- zbc.zbc_injected = rm->rm_ecksuminjected;
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ if (rc->rc_error == 0) {
+ zio_bad_cksum_t zbc;
+ zbc.zbc_has_cksum = 0;
+ zbc.zbc_injected =
+ rm->rm_ecksuminjected;
- zfs_ereport_start_checksum(
- zio->io_spa, vd->vdev_child[rc->rc_devidx],
- zio, rc->rc_offset, rc->rc_size,
- (void *)(uintptr_t)c, &zbc);
+ zfs_ereport_start_checksum(
+ zio->io_spa,
+ vd->vdev_child[rc->rc_devidx],
+ zio, rc->rc_offset, rc->rc_size,
+ (void *)(uintptr_t)c, &zbc);
+ }
}
}
}
--- a/usr/src/uts/common/fs/zfs/zil.c Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/zil.c Wed Feb 17 15:13:37 2010 -0700
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -78,6 +78,10 @@
static boolean_t zil_empty(zilog_t *zilog);
+#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
+ sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
+
+
static int
zil_bp_compare(const void *x1, const void *x2)
{
@@ -156,7 +160,8 @@
* Read a log block and make sure it's valid.
*/
static int
-zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst)
+zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
+ char **end)
{
enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
uint32_t aflags = ARC_WAIT;
@@ -177,14 +182,8 @@
ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
if (error == 0) {
- char *data = abuf->b_data;
- uint64_t size = BP_GET_LSIZE(bp);
- zil_trailer_t *ztp = (zil_trailer_t *)(data + size) - 1;
zio_cksum_t cksum = bp->blk_cksum;
- bcopy(data, dst, size);
- *nbp = ztp->zit_next_blk;
-
/*
* Validate the checksummed log block.
*
@@ -195,10 +194,34 @@
*/
cksum.zc_word[ZIL_ZC_SEQ]++;
- if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum,
- sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) ||
- (ztp->zit_nused > (size - sizeof (zil_trailer_t))))
- error = ECKSUM;
+ if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t *zilc = abuf->b_data;
+ char *lr = (char *)(zilc + 1);
+ uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
+
+ if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+ sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
+ error = ECKSUM;
+ } else {
+ bcopy(lr, dst, len);
+ *end = (char *)dst + len;
+ *nbp = zilc->zc_next_blk;
+ }
+ } else {
+ char *lr = abuf->b_data;
+ uint64_t size = BP_GET_LSIZE(bp);
+ zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
+
+ if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+ sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
+ (zilc->zc_nused > (size - sizeof (*zilc)))) {
+ error = ECKSUM;
+ } else {
+ bcopy(lr, dst, zilc->zc_nused);
+ *end = (char *)dst + zilc->zc_nused;
+ *nbp = zilc->zc_next_blk;
+ }
+ }
VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
}
@@ -281,27 +304,26 @@
zil_bp_tree_init(zilog);
for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
- zil_trailer_t *ztp =
- (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
int reclen;
+ char *end;
if (blk_seq > claim_blk_seq)
break;
if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
break;
- ASSERT(max_blk_seq < blk_seq);
+ ASSERT3U(max_blk_seq, <, blk_seq);
max_blk_seq = blk_seq;
blk_count++;
if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
break;
- error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf);
+ error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
if (error)
break;
- for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
+ for (lrp = lrbuf; lrp < end; lrp += reclen) {
lr_t *lr = (lr_t *)lrp;
reclen = lr->lrc_reclen;
ASSERT3U(reclen, >=, sizeof (lr_t));
@@ -309,7 +331,7 @@
goto done;
if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
goto done;
- ASSERT(max_lr_seq < lr->lrc_seq);
+ ASSERT3U(max_lr_seq, <, lr->lrc_seq);
max_lr_seq = lr->lrc_seq;
lr_count++;
}
@@ -365,7 +387,6 @@
if (lr->lr_blkptr.blk_birth >= first_txg &&
(error = zil_read_log_data(zilog, lr, NULL)) != 0)
return (error);
-
return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
}
@@ -394,14 +415,41 @@
return (0);
}
+static lwb_t *
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
+{
+ lwb_t *lwb;
+
+ lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+ lwb->lwb_zilog = zilog;
+ lwb->lwb_blk = *bp;
+ lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
+ lwb->lwb_max_txg = txg;
+ lwb->lwb_zio = NULL;
+ lwb->lwb_tx = NULL;
+ if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+ lwb->lwb_nused = sizeof (zil_chain_t);
+ lwb->lwb_sz = BP_GET_LSIZE(bp);
+ } else {
+ lwb->lwb_nused = 0;
+ lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
+ }
+
+ mutex_enter(&zilog->zl_lock);
+ list_insert_tail(&zilog->zl_lwb_list, lwb);
+ mutex_exit(&zilog->zl_lock);
+
+ return (lwb);
+}
+
/*
* Create an on-disk intent log.
*/
-static void
+static lwb_t *
zil_create(zilog_t *zilog)
{
const zil_header_t *zh = zilog->zl_header;
- lwb_t *lwb;
+ lwb_t *lwb = NULL;
uint64_t txg = 0;
dmu_tx_t *tx = NULL;
blkptr_t blk;
@@ -418,8 +466,9 @@
blk = zh->zh_log;
/*
- * If we don't already have an initial log block or we have one
- * but it's the wrong endianness then allocate one.
+ * Allocate an initial log block if:
+ * - there isn't one already
+ * - the existing block is the wrong endianness
*/
if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
tx = dmu_tx_create(zilog->zl_os);
@@ -442,21 +491,8 @@
/*
* Allocate a log write buffer (lwb) for the first log block.
*/
- if (error == 0) {
- lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
- lwb->lwb_zilog = zilog;
- lwb->lwb_blk = blk;
- lwb->lwb_nused = 0;
- lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
- lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
- lwb->lwb_max_txg = txg;
- lwb->lwb_zio = NULL;
- lwb->lwb_tx = NULL;
-
- mutex_enter(&zilog->zl_lock);
- list_insert_tail(&zilog->zl_lwb_list, lwb);
- mutex_exit(&zilog->zl_lock);
- }
+ if (error == 0)
+ lwb = zil_alloc_lwb(zilog, &blk, txg);
/*
* If we just allocated the first log block, commit our transaction
@@ -469,6 +505,8 @@
}
ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
+
+ return (lwb);
}
/*
@@ -712,7 +750,6 @@
dmu_tx_t *tx = lwb->lwb_tx;
ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
- ASSERT(BP_GET_CHECKSUM(zio->io_bp) == ZIO_CHECKSUM_ZILOG);
ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
@@ -760,13 +797,26 @@
}
if (lwb->lwb_zio == NULL) {
lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
- 0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz,
+ 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
}
}
/*
+ * Define a limited set of intent log block sizes.
+ * These must be a multiple of 4KB. Note only the amount used (again
+ * aligned to 4KB) actually gets written. However, we can't always just
+ * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
+ */
+uint64_t zil_block_buckets[] = {
+ 4096, /* non TX_WRITE */
+ 8192+4096, /* data base */
+ 32*1024 + 4096, /* NFS writes */
+ UINT64_MAX
+};
+
+/*
* Use the slog as long as the logbias is 'latency' and the current commit size
* is less than the limit or the total list size is less than 2X the limit.
* Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
@@ -783,16 +833,24 @@
static lwb_t *
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
{
- lwb_t *nlwb;
- zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
+ lwb_t *nlwb = NULL;
+ zil_chain_t *zilc;
spa_t *spa = zilog->zl_spa;
- blkptr_t *bp = &ztp->zit_next_blk;
+ blkptr_t *bp;
dmu_tx_t *tx;
uint64_t txg;
uint64_t zil_blksz;
- int error;
+ int i, error;
- ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
+ if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+ zilc = (zil_chain_t *)lwb->lwb_buf;
+ bp = &zilc->zc_next_blk;
+ } else {
+ zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
+ bp = &zilc->zc_next_blk;
+ }
+
+ ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
/*
* Allocate the next block and save its address in this block
@@ -811,73 +869,69 @@
lwb->lwb_tx = tx;
/*
- * Pick a ZIL blocksize. We request a size that is the
- * maximum of the previous used size, the current used size and
- * the amount waiting in the queue.
+ * Log blocks are pre-allocated. Here we select the size of the next
+ * block, based on size used in the last block.
+ * - first find the smallest bucket that will fit the block from a
+ * limited set of block sizes. This is because it's faster to write
+ * blocks allocated from the same metaslab as they are adjacent or
+ * close.
+ * - next find the maximum from the new suggested size and an array of
+ * previous sizes. This lessens a picket fence effect of wrongly
+ * guessing the size if we have a stream of say 2k, 64k, 2k, 64k
+ * requests.
+ *
+ * Note we only write what is used, but we can't just allocate
+ * the maximum block size because we can exhaust the available
+ * pool log space.
*/
- zil_blksz = MAX(zilog->zl_prev_used,
- zilog->zl_cur_used + sizeof (*ztp));
- zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
- zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
- if (zil_blksz > ZIL_MAX_BLKSZ)
- zil_blksz = ZIL_MAX_BLKSZ;
+ zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
+ for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
+ continue;
+ zil_blksz = zil_block_buckets[i];
+ if (zil_blksz == UINT64_MAX)
+ zil_blksz = SPA_MAXBLOCKSIZE;
+ zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
+ for (i = 0; i < ZIL_PREV_BLKS; i++)
+ zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
+ zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
BP_ZERO(bp);
/* pass the old blkptr in order to spread log blocks across devs */
error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
USE_SLOG(zilog));
- if (error) {
- /*
- * Since we've just experienced an allocation failure,
- * terminate the current lwb and send it on its way.
- */
- ztp->zit_pad = 0;
- ztp->zit_nused = lwb->lwb_nused;
- ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
- zio_nowait(lwb->lwb_zio);
+ if (!error) {
+ ASSERT3U(bp->blk_birth, ==, txg);
+ bp->blk_cksum = lwb->lwb_blk.blk_cksum;
+ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
/*
- * By returning NULL the caller will call tx_wait_synced()
+ * Allocate a new log write buffer (lwb).
*/
- return (NULL);
+ nlwb = zil_alloc_lwb(zilog, bp, txg);
+
+ /* Record the block for later vdev flushing */
+ zil_add_block(zilog, &lwb->lwb_blk);
}
- ASSERT3U(bp->blk_birth, ==, txg);
- ztp->zit_pad = 0;
- ztp->zit_nused = lwb->lwb_nused;
- ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
- bp->blk_cksum = lwb->lwb_blk.blk_cksum;
- bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
+ if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+ uint64_t len;
+
+ /* For Slim ZIL only write what is used. */
+ len = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
+ ASSERT3U(len, <=, lwb->lwb_sz);
+ zio_shrink(lwb->lwb_zio, len);
+
+ }
+ zilc->zc_pad = 0;
+ zilc->zc_nused = lwb->lwb_nused;
+ zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
+
+ zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */
/*
- * Allocate a new log write buffer (lwb).
+ * If there was an allocation failure then nlwb will be null which
+ * forces a txg_wait_synced().
*/
- nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
- nlwb->lwb_zilog = zilog;
- nlwb->lwb_blk = *bp;
- nlwb->lwb_nused = 0;
- nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
- nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
- nlwb->lwb_max_txg = txg;
- nlwb->lwb_zio = NULL;
- nlwb->lwb_tx = NULL;
-
- /*
- * Put new lwb at the end of the log chain
- */
- mutex_enter(&zilog->zl_lock);
- list_insert_tail(&zilog->zl_lwb_list, nlwb);
- mutex_exit(&zilog->zl_lock);
-
- /* Record the block for later vdev flushing */
- zil_add_block(zilog, &lwb->lwb_blk);
-
- /*
- * kick off the write for the old log block
- */
- ASSERT(lwb->lwb_zio);
- zio_nowait(lwb->lwb_zio);
-
return (nlwb);
}
@@ -907,13 +961,13 @@
/*
* If this record won't fit in the current log block, start a new one.
*/
- if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
+ if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
lwb = zil_lwb_write_start(zilog, lwb);
if (lwb == NULL)
return (NULL);
zil_lwb_write_init(zilog, lwb);
- ASSERT(lwb->lwb_nused == 0);
- if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
+ ASSERT(LWB_EMPTY(lwb));
+ if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
txg_wait_synced(zilog->zl_dmu_pool, txg);
return (lwb);
}
@@ -965,7 +1019,7 @@
lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
lwb->lwb_nused += reclen + dlen;
lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
- ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
+ ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);
return (lwb);
@@ -1100,11 +1154,11 @@
return;
}
mutex_exit(&zilog->zl_lock);
- zil_create(zilog);
+ lwb = zil_create(zilog);
mutex_enter(&zilog->zl_lock);
- lwb = list_tail(&zilog->zl_lwb_list);
}
}
+ ASSERT(lwb == NULL || lwb->lwb_zio == NULL);
/* Loop through in-memory log transactions filling log blocks. */
DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
@@ -1133,8 +1187,8 @@
continue; /* skip this record */
if ((itx->itx_lr.lrc_seq > seq) &&
- ((lwb == NULL) || (lwb->lwb_nused == 0) ||
- (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb))))
+ ((lwb == NULL) || (LWB_EMPTY(lwb)) ||
+ (lwb->lwb_nused + itx->itx_sod > lwb->lwb_sz)))
break;
list_remove(&zilog->zl_itx_list, itx);
--- a/usr/src/uts/common/fs/zfs/zio.c Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/zio.c Wed Feb 17 15:13:37 2010 -0700
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -764,9 +764,9 @@
zio->io_prop.zp_checksum = checksum;
- if (zio_checksum_table[checksum].ci_zbt) {
+ if (zio_checksum_table[checksum].ci_eck) {
/*
- * zbt checksums are necessarily destructive -- they modify
+ * zec checksums are necessarily destructive -- they modify
* the end of the write buffer to hold the verifier/checksum.
* Therefore, we must make a local copy in case the data is
* being written to multiple places in parallel.
@@ -849,6 +849,23 @@
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}
+void
+zio_shrink(zio_t *zio, uint64_t size)
+{
+ ASSERT(zio->io_executor == NULL);
+ ASSERT(zio->io_orig_size == zio->io_size);
+ ASSERT(size <= zio->io_size);
+
+ /*
+ * We don't shrink for raidz because of problems with the
+ * reconstruction when reading back less than the block size.
+ * Note, BP_IS_RAIDZ() assumes no compression.
+ */
+ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
+ if (!BP_IS_RAIDZ(zio->io_bp))
+ zio->io_orig_size = zio->io_size = size;
+}
+
/*
* ==========================================================================
* Prepare to read and write logical blocks
@@ -1524,7 +1541,7 @@
ASSERT(zio->io_data == gn->gn_gbh);
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
- ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
+ ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
@@ -1551,7 +1568,7 @@
zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
if (gn != NULL) {
- ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
+ ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
@@ -2178,7 +2195,9 @@
BP_SET_LSIZE(new_bp, size);
BP_SET_PSIZE(new_bp, size);
BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
+ BP_SET_CHECKSUM(new_bp,
+ spa_version(spa) >= SPA_VERSION_SLIM_ZIL
+ ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
BP_SET_LEVEL(new_bp, 0);
BP_SET_DEDUP(new_bp, 0);
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c Wed Feb 17 15:13:37 2010 -0700
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -27,6 +27,7 @@
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
+#include <sys/zil.h>
/*
* Checksum vectors.
@@ -75,6 +76,7 @@
{{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"},
{{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"},
+ {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"},
};
enum zio_checksum
@@ -151,23 +153,33 @@
{
blkptr_t *bp = zio->io_bp;
uint64_t offset = zio->io_offset;
- zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
- zio_cksum_t zbt_cksum;
+ zio_cksum_t cksum;
ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
ASSERT(ci->ci_func[0] != NULL);
- if (ci->ci_zbt) {
+ if (ci->ci_eck) {
+ zio_eck_t *eck;
+
+ if (checksum == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t *zilc = data;
+
+ size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ,
+ uint64_t);
+ eck = &zilc->zc_eck;
+ } else {
+ eck = (zio_eck_t *)((char *)data + size) - 1;
+ }
if (checksum == ZIO_CHECKSUM_GANG_HEADER)
- zio_checksum_gang_verifier(&zbt->zbt_cksum, bp);
+ zio_checksum_gang_verifier(&eck->zec_cksum, bp);
else if (checksum == ZIO_CHECKSUM_LABEL)
- zio_checksum_label_verifier(&zbt->zbt_cksum, offset);
+ zio_checksum_label_verifier(&eck->zec_cksum, offset);
else
- bp->blk_cksum = zbt->zbt_cksum;
- zbt->zbt_magic = ZBT_MAGIC;
- ci->ci_func[0](data, size, &zbt_cksum);
- zbt->zbt_cksum = zbt_cksum;
+ bp->blk_cksum = eck->zec_cksum;
+ eck->zec_magic = ZEC_MAGIC;
+ ci->ci_func[0](data, size, &cksum);
+ eck->zec_cksum = cksum;
} else {
ci->ci_func[0](data, size, &bp->blk_cksum);
}
@@ -185,14 +197,35 @@
(BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
uint64_t offset = zio->io_offset;
void *data = zio->io_data;
- zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
zio_cksum_t actual_cksum, expected_cksum, verifier;
if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
return (EINVAL);
- if (ci->ci_zbt) {
+ if (ci->ci_eck) {
+ zio_eck_t *eck;
+
+ if (checksum == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t *zilc = data;
+ uint64_t nused;
+
+ eck = &zilc->zc_eck;
+ if (eck->zec_magic == ZEC_MAGIC)
+ nused = zilc->zc_nused;
+ else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC))
+ nused = BSWAP_64(zilc->zc_nused);
+ else
+ return (ECKSUM);
+
+ if (nused > size)
+ return (ECKSUM);
+
+ size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
+ } else {
+ eck = (zio_eck_t *)((char *)data + size) - 1;
+ }
+
if (checksum == ZIO_CHECKSUM_GANG_HEADER)
zio_checksum_gang_verifier(&verifier, bp);
else if (checksum == ZIO_CHECKSUM_LABEL)
@@ -200,15 +233,15 @@
else
verifier = bp->blk_cksum;
- byteswap = (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC));
+ byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
if (byteswap)
byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
- expected_cksum = zbt->zbt_cksum;
- zbt->zbt_cksum = verifier;
+ expected_cksum = eck->zec_cksum;
+ eck->zec_cksum = verifier;
ci->ci_func[byteswap](data, size, &actual_cksum);
- zbt->zbt_cksum = expected_cksum;
+ eck->zec_cksum = expected_cksum;
if (byteswap)
byteswap_uint64_array(&expected_cksum,
--- a/usr/src/uts/common/sys/fs/zfs.h Wed Feb 17 13:05:11 2010 -0800
+++ b/usr/src/uts/common/sys/fs/zfs.h Wed Feb 17 15:13:37 2010 -0700
@@ -324,14 +324,15 @@
#define SPA_VERSION_20 20ULL
#define SPA_VERSION_21 21ULL
#define SPA_VERSION_22 22ULL
+#define SPA_VERSION_23 23ULL
/*
* When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
* format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
* and do the appropriate changes. Also bump the version number in
* usr/src/grub/capability.
*/
-#define SPA_VERSION SPA_VERSION_22
-#define SPA_VERSION_STRING "22"
+#define SPA_VERSION SPA_VERSION_23
+#define SPA_VERSION_STRING "23"
/*
* Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -375,6 +376,7 @@
#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20
#define SPA_VERSION_DEDUP SPA_VERSION_21
#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22
+#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23
/*
* ZPL version - rev'd whenever an incompatible on-disk format change