PSARC 2009/571 ZFS Deduplication Properties
6677093 zfs should have dedup capability
--- a/usr/src/cmd/filebench/Makefile.com Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/cmd/filebench/Makefile.com Sun Nov 01 14:14:46 2009 -0800
@@ -51,9 +51,9 @@
ROOTFBBINDIR = $(ROOT)/usr/benchmarks/filebench/bin
OBJS = $(SRCS:%.c=%.o) parser_gram.o parser_lex.o
LINTFLAGS += -erroff=E_FUNC_ARG_UNUSED -erroff=E_NAME_DEF_NOT_USED2 \
- -erroff=E_NAME_USED_NOT_DEF2
+ -erroff=E_NAME_USED_NOT_DEF2 -erroff=E_INCONS_ARG_DECL2
LINTFLAGS64 += -erroff=E_FUNC_ARG_UNUSED -erroff=E_NAME_DEF_NOT_USED2 \
- -erroff=E_NAME_USED_NOT_DEF2
+ -erroff=E_NAME_USED_NOT_DEF2 -erroff=E_INCONS_ARG_DECL2
LINTFILES = $(SRCS:%.c=%.ln)
CLEANFILES += parser_gram.c parser_gram.h parser_lex.c y.tab.h y.tab.c
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c Sun Nov 01 14:14:46 2009 -0800
@@ -35,7 +35,6 @@
#include <sys/list.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
-#include <sys/zio_compress.h>
#include <ctype.h>
#ifndef _KERNEL
@@ -48,15 +47,6 @@
#define ZFS_OBJ_NAME "libzpool.so.1"
#endif
-static char *
-local_strdup(const char *s)
-{
- char *s1 = mdb_alloc(strlen(s) + 1, UM_SLEEP);
-
- (void) strcpy(s1, s);
- return (s1);
-}
-
static int
getmember(uintptr_t addr, const char *type, mdb_ctf_id_t *idp,
const char *member, int len, void *buf)
@@ -130,27 +120,6 @@
return (GETMEMBID(addr + off, &rc_id, rc_count, *rc));
}
-static int
-read_symbol(char *sym_name, void **bufp)
-{
- GElf_Sym sym;
-
- if (mdb_lookup_by_obj(MDB_TGT_OBJ_EVERY, sym_name, &sym)) {
- mdb_warn("can't find symbol %s", sym_name);
- return (DCMD_ERR);
- }
-
- *bufp = mdb_alloc(sym.st_size, UM_SLEEP);
-
- if (mdb_vread(*bufp, sym.st_size, sym.st_value) == -1) {
- mdb_warn("can't read data for symbol %s", sym_name);
- mdb_free(*bufp, sym.st_size);
- return (DCMD_ERR);
- }
-
- return (DCMD_OK);
-}
-
static int verbose;
static int
@@ -305,30 +274,6 @@
/* ARGSUSED */
static int
-zio_pipeline(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
-{
- mdb_ctf_id_t pipe_enum;
- int i;
- char stage[1024];
-
- if (mdb_ctf_lookup_by_name("enum zio_stage", &pipe_enum) == -1) {
- mdb_warn("Could not find enum zio_stage");
- return (DCMD_ERR);
- }
-
- for (i = 0; i < 32; i++) {
- if (addr & (1U << i)) {
- enum_lookup(stage, sizeof (stage), pipe_enum, i,
- "ZIO_STAGE_");
- mdb_printf(" %s\n", stage);
- }
- }
-
- return (DCMD_OK);
-}
-
-/* ARGSUSED */
-static int
zfs_params(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
{
/*
@@ -351,9 +296,8 @@
"metaslab_aliquot",
"reference_tracking_enable",
"reference_history",
- "zio_taskq_threads",
"spa_max_replication_override",
- "spa_mode",
+ "spa_mode_global",
"zfs_flags",
"zfs_txg_synctime",
"zfs_txg_timeout",
@@ -383,9 +327,8 @@
"zio_injection_enabled",
"zvol_immediate_write_sz",
};
- int i;
- for (i = 0; i < sizeof (params) / sizeof (params[0]); i++) {
+ for (int i = 0; i < sizeof (params) / sizeof (params[0]); i++) {
int sz;
uint64_t val64;
uint32_t *val32p = (uint32_t *)&val64;
@@ -407,76 +350,33 @@
static int
blkptr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
{
- blkptr_t bp;
- dmu_object_type_info_t *doti;
- zio_compress_info_t *zct;
- zio_checksum_info_t *zci;
- int i;
- char buf[MAXPATHLEN];
+ mdb_ctf_id_t type_enum, checksum_enum, compress_enum;
+ char type[80], checksum[80], compress[80];
+ blkptr_t blk, *bp = &blk;
+ char buf[BP_SPRINTF_LEN];
- if (mdb_vread(&bp, sizeof (blkptr_t), addr) == -1) {
+ if (mdb_vread(&blk, sizeof (blkptr_t), addr) == -1) {
mdb_warn("failed to read blkptr_t");
return (DCMD_ERR);
}
- if (read_symbol("dmu_ot", (void **)&doti) != DCMD_OK)
- return (DCMD_ERR);
- for (i = 0; i < DMU_OT_NUMTYPES; i++) {
- mdb_readstr(buf, sizeof (buf), (uintptr_t)doti[i].ot_name);
- doti[i].ot_name = local_strdup(buf);
- }
-
- if (read_symbol("zio_checksum_table", (void **)&zci) != DCMD_OK)
+ if (mdb_ctf_lookup_by_name("enum dmu_object_type", &type_enum) == -1 ||
+ mdb_ctf_lookup_by_name("enum zio_checksum", &checksum_enum) == -1 ||
+ mdb_ctf_lookup_by_name("enum zio_compress", &compress_enum) == -1) {
+ mdb_warn("Could not find blkptr enumerated types");
return (DCMD_ERR);
- for (i = 0; i < ZIO_CHECKSUM_FUNCTIONS; i++) {
- mdb_readstr(buf, sizeof (buf), (uintptr_t)zci[i].ci_name);
- zci[i].ci_name = local_strdup(buf);
- }
-
- if (read_symbol("zio_compress_table", (void **)&zct) != DCMD_OK)
- return (DCMD_ERR);
- for (i = 0; i < ZIO_COMPRESS_FUNCTIONS; i++) {
- mdb_readstr(buf, sizeof (buf), (uintptr_t)zct[i].ci_name);
- zct[i].ci_name = local_strdup(buf);
}
- /*
- * Super-ick warning: This code is also duplicated in
- * cmd/zdb.c . Yeah, I hate code replication, too.
- */
- for (i = 0; i < BP_GET_NDVAS(&bp); i++) {
- dva_t *dva = &bp.blk_dva[i];
+ enum_lookup(type, sizeof (type), type_enum,
+ BP_GET_TYPE(bp), "DMU_OT_");
+ enum_lookup(checksum, sizeof (checksum), checksum_enum,
+ BP_GET_CHECKSUM(bp), "ZIO_CHECKSUM_");
+ enum_lookup(compress, sizeof (compress), compress_enum,
+ BP_GET_COMPRESS(bp), "ZIO_COMPRESS_");
- mdb_printf("DVA[%d]: vdev_id %lld / %llx\n", i,
- DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva));
- mdb_printf("DVA[%d]: GANG: %-5s GRID: %04x\t"
- "ASIZE: %llx\n", i, DVA_GET_GANG(dva) ? "TRUE" : "FALSE",
- (int)DVA_GET_GRID(dva), DVA_GET_ASIZE(dva));
- mdb_printf("DVA[%d]: %llu:%llx:%llx:%s%s%s%s\n", i,
- DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), BP_GET_PSIZE(&bp),
- BP_SHOULD_BYTESWAP(&bp) ? "e" : "",
- !DVA_GET_GANG(dva) && BP_GET_LEVEL(&bp) != 0 ? "i" : "",
- DVA_GET_GANG(dva) ? "g" : "",
- BP_GET_COMPRESS(&bp) != 0 ? "d" : "");
- }
- mdb_printf("LSIZE: %-16llx\t\tPSIZE: %llx\n",
- BP_GET_LSIZE(&bp), BP_GET_PSIZE(&bp));
- mdb_printf("ENDIAN: %6s\t\t\t\t\tTYPE: %s\n",
- BP_GET_BYTEORDER(&bp) ? "LITTLE" : "BIG",
- BP_GET_TYPE(&bp) < DMU_OT_NUMTYPES ?
- doti[BP_GET_TYPE(&bp)].ot_name : "UNKNOWN");
- mdb_printf("BIRTH: %-16llx LEVEL: %-2d\tFILL: %llx\n",
- bp.blk_birth, (int)BP_GET_LEVEL(&bp), bp.blk_fill);
- mdb_printf("CKFUNC: %-16s\t\tCOMP: %s\n",
- BP_GET_CHECKSUM(&bp) < ZIO_CHECKSUM_FUNCTIONS ?
- zci[BP_GET_CHECKSUM(&bp)].ci_name : "UNKNOWN",
- BP_GET_COMPRESS(&bp) < ZIO_COMPRESS_FUNCTIONS ?
- zct[BP_GET_COMPRESS(&bp)].ci_name : "UNKNOWN");
- mdb_printf("CKSUM: %llx:%llx:%llx:%llx\n",
- bp.blk_cksum.zc_word[0],
- bp.blk_cksum.zc_word[1],
- bp.blk_cksum.zc_word[2],
- bp.blk_cksum.zc_word[3]);
+ SPRINTF_BLKPTR(mdb_snprintf, '\n', buf, bp, type, checksum, compress);
+
+ mdb_printf("%s\n", buf);
return (DCMD_OK);
}
@@ -2293,7 +2193,6 @@
"zio_t summary", zio_print },
{ "zio_state", "?", "print out all zio_t structures on system or "
"for a particular pool", zio_state },
- { "zio_pipeline", ":", "decode a zio pipeline", zio_pipeline },
{ "zfs_blkstats", ":[-v]",
"given a spa_t, print block type stats from last scrub",
zfs_blkstats },
--- a/usr/src/cmd/sgs/Makefile.var Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/cmd/sgs/Makefile.var Sun Nov 01 14:14:46 2009 -0800
@@ -75,7 +75,7 @@
# the system.
#
VAR_AVLDIR= $(SRCBASE)/common/avl
-VAR_AVLINCDIR=
+VAR_AVLINCDIR= -I $(SRCBASE)/uts/common
#
# VAR_DTRDIR - directory to find dtrace_data.c in.
--- a/usr/src/cmd/zdb/Makefile.com Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/cmd/zdb/Makefile.com Sun Nov 01 14:14:46 2009 -0800
@@ -33,6 +33,7 @@
INCS += -I../../../lib/libzpool/common
INCS += -I../../../uts/common/fs/zfs
+INCS += -I../../../common/zfs
LDLIBS += -lzpool -lumem -lnvpair -lzfs -lavl
--- a/usr/src/cmd/zdb/zdb.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/cmd/zdb/zdb.c Sun Nov 01 14:14:46 2009 -0800
@@ -51,6 +51,7 @@
#include <sys/zio_compress.h>
#include <sys/zfs_fuid.h>
#include <sys/arc.h>
+#include <sys/ddt.h>
#undef ZFS_MAXNAMELEN
#undef verify
#include <libzfs.h>
@@ -72,8 +73,6 @@
uint64_t *zopt_object = NULL;
int zopt_objects = 0;
libzfs_handle_t *g_zfs;
-boolean_t zdb_sig_user_data = B_TRUE;
-int zdb_sig_cksumalg = ZIO_CHECKSUM_SHA256;
/*
* These libumem hooks provide a reasonable set of defaults for the allocator's
@@ -121,8 +120,7 @@
(void) fprintf(stderr, " -c checksum all metadata (twice for "
"all data) blocks\n");
(void) fprintf(stderr, " -s report stats on zdb's I/O\n");
- (void) fprintf(stderr, " -S <user|all>:<cksum_alg|all> -- "
- "dump blkptr signatures\n");
+ (void) fprintf(stderr, " -S simulate dedup to measure effect\n");
(void) fprintf(stderr, " -v verbose (applies to all others)\n");
(void) fprintf(stderr, " -l dump label contents\n");
(void) fprintf(stderr, " -L disable leak tracking (do not "
@@ -540,6 +538,198 @@
}
static void
+dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
+{
+ const ddt_phys_t *ddp = dde->dde_phys;
+ const ddt_key_t *ddk = &dde->dde_key;
+ char *types[4] = { "ditto", "single", "double", "triple" };
+ char blkbuf[BP_SPRINTF_LEN];
+ blkptr_t blk;
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+ ddt_bp_create(ddt, ddk, ddp, &blk);
+ sprintf_blkptr(blkbuf, &blk);
+ (void) printf("index %llx refcnt %llu %s %s\n",
+ (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
+ types[p], blkbuf);
+ }
+}
+
+static void
+dump_dedup_ratio(const ddt_stat_t *dds)
+{
+ double rL, rP, rD, D, dedup, compress, copies;
+
+ if (dds->dds_blocks == 0)
+ return;
+
+ rL = (double)dds->dds_ref_lsize;
+ rP = (double)dds->dds_ref_psize;
+ rD = (double)dds->dds_ref_dsize;
+ D = (double)dds->dds_dsize;
+
+ dedup = rD / D;
+ compress = rL / rP;
+ copies = rD / rP;
+
+ (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
+ "dedup * compress / copies = %.2f\n\n",
+ dedup, compress, copies, dedup * compress / copies);
+}
+
+static void
+dump_ddt_stat(const ddt_stat_t *dds, int h)
+{
+ char refcnt[6];
+ char blocks[6], lsize[6], psize[6], dsize[6];
+ char ref_blocks[6], ref_lsize[6], ref_psize[6], ref_dsize[6];
+
+ if (dds->dds_blocks == 0)
+ return;
+
+ if (h == -1)
+ (void) strcpy(refcnt, "Total");
+ else
+ nicenum(1ULL << h, refcnt);
+
+ nicenum(dds->dds_blocks, blocks);
+ nicenum(dds->dds_lsize, lsize);
+ nicenum(dds->dds_psize, psize);
+ nicenum(dds->dds_dsize, dsize);
+ nicenum(dds->dds_ref_blocks, ref_blocks);
+ nicenum(dds->dds_ref_lsize, ref_lsize);
+ nicenum(dds->dds_ref_psize, ref_psize);
+ nicenum(dds->dds_ref_dsize, ref_dsize);
+
+ (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
+ refcnt,
+ blocks, lsize, psize, dsize,
+ ref_blocks, ref_lsize, ref_psize, ref_dsize);
+}
+
+static void
+dump_ddt_histogram(const ddt_histogram_t *ddh)
+{
+ ddt_stat_t dds_total = { 0 };
+
+ ddt_histogram_stat(&dds_total, ddh);
+
+ (void) printf("\n");
+
+ (void) printf("bucket "
+ " allocated "
+ " referenced \n");
+ (void) printf("______ "
+ "______________________________ "
+ "______________________________\n");
+
+ (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
+ "refcnt",
+ "blocks", "LSIZE", "PSIZE", "DSIZE",
+ "blocks", "LSIZE", "PSIZE", "DSIZE");
+
+ (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n",
+ "------",
+ "------", "-----", "-----", "-----",
+ "------", "-----", "-----", "-----");
+
+ for (int h = 0; h < 64; h++)
+ dump_ddt_stat(&ddh->ddh_stat[h], h);
+
+ dump_ddt_stat(&dds_total, -1);
+
+ (void) printf("\n");
+}
+
+static void
+dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ char name[DDT_NAMELEN];
+ ddt_entry_t dde;
+ uint64_t walk = 0;
+ dmu_object_info_t doi;
+ uint64_t count, dspace, mspace;
+ int error;
+
+ error = ddt_object_info(ddt, type, class, &doi);
+
+ if (error == ENOENT)
+ return;
+ ASSERT(error == 0);
+
+ count = ddt_object_count(ddt, type, class);
+ dspace = doi.doi_physical_blocks_512 << 9;
+ mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+ ASSERT(count != 0); /* we should have destroyed it */
+
+ ddt_object_name(ddt, type, class, name);
+
+ (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
+ name,
+ (u_longlong_t)count,
+ (u_longlong_t)(dspace / count),
+ (u_longlong_t)(mspace / count));
+
+ if (dump_opt['D'] < 3)
+ return;
+
+ dump_ddt_histogram(&ddt->ddt_histogram[type][class]);
+
+ if (dump_opt['D'] < 4)
+ return;
+
+ if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
+ return;
+
+ (void) printf("%s contents:\n\n", name);
+
+ while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0)
+ dump_dde(ddt, &dde, walk);
+
+ ASSERT(error == ENOENT);
+
+ (void) printf("\n");
+}
+
+static void
+dump_all_ddts(spa_t *spa)
+{
+ ddt_histogram_t ddh_total = { 0 };
+ ddt_stat_t dds_total = { 0 };
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_histogram_add(&ddh_total,
+ &ddt->ddt_histogram[type][class]);
+ dump_ddt(ddt, type, class);
+ }
+ }
+ }
+
+ ddt_histogram_stat(&dds_total, &ddh_total);
+
+ if (dds_total.dds_blocks == 0) {
+ (void) printf("All DDTs are empty\n");
+ return;
+ }
+
+ (void) printf("\n");
+
+ if (dump_opt['D'] > 1) {
+ (void) printf("DDT histogram (aggregated over all DDTs):\n");
+ dump_ddt_histogram(&ddh_total);
+ }
+
+ dump_dedup_ratio(&dds_total);
+}
+
+static void
dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size)
{
char *prefix = (void *)sm;
@@ -658,35 +848,48 @@
}
static uint64_t
-blkid2offset(const dnode_phys_t *dnp, int level, uint64_t blkid)
+blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb)
{
- if (level < 0)
- return (blkid);
-
- return ((blkid << (level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
+ if (dnp == NULL) {
+ ASSERT(zb->zb_level < 0);
+ if (zb->zb_object == 0)
+ return (zb->zb_blkid);
+ return (zb->zb_blkid * BP_GET_LSIZE(bp));
+ }
+
+ ASSERT(zb->zb_level >= 0);
+
+ return ((zb->zb_blkid <<
+ (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
}
static void
-sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas)
+sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp)
{
dva_t *dva = bp->blk_dva;
- int ndvas = alldvas ? BP_GET_NDVAS(bp) : 1;
- int i;
+ int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
+
+ if (dump_opt['b'] >= 5) {
+ sprintf_blkptr(blkbuf, bp);
+ return;
+ }
blkbuf[0] = '\0';
- for (i = 0; i < ndvas; i++)
+ for (int i = 0; i < ndvas; i++)
(void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ",
(u_longlong_t)DVA_GET_VDEV(&dva[i]),
(u_longlong_t)DVA_GET_OFFSET(&dva[i]),
(u_longlong_t)DVA_GET_ASIZE(&dva[i]));
- (void) sprintf(blkbuf + strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu",
+ (void) sprintf(blkbuf + strlen(blkbuf),
+ "%llxL/%llxP F=%llu B=%llu/%llu",
(u_longlong_t)BP_GET_LSIZE(bp),
(u_longlong_t)BP_GET_PSIZE(bp),
(u_longlong_t)bp->blk_fill,
- (u_longlong_t)bp->blk_birth);
+ (u_longlong_t)bp->blk_birth,
+ (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
}
static void
@@ -699,8 +902,7 @@
ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
- (void) printf("%16llx ",
- (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid));
+ (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
ASSERT(zb->zb_level >= 0);
@@ -712,18 +914,10 @@
}
}
- sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 1 : 0);
+ sprintf_blkptr_compact(blkbuf, bp);
(void) printf("%s\n", blkbuf);
}
-#define SET_BOOKMARK(zb, objset, object, level, blkid) \
-{ \
- (zb)->zb_objset = objset; \
- (zb)->zb_object = object; \
- (zb)->zb_level = level; \
- (zb)->zb_blkid = blkid; \
-}
-
static int
visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
blkptr_t *bp, const zbookmark_t *zb)
@@ -859,7 +1053,7 @@
nicenum(ds->ds_compressed_bytes, compressed);
nicenum(ds->ds_uncompressed_bytes, uncompressed);
nicenum(ds->ds_unique_bytes, unique);
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ds->ds_bp);
+ sprintf_blkptr(blkbuf, &ds->ds_bp);
(void) printf("\t\tdir_obj = %llu\n",
(u_longlong_t)ds->ds_dir_obj);
@@ -910,11 +1104,11 @@
if (dump_opt['d'] < 3)
return;
- mutex_init(&bpl.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+ bplist_init(&bpl);
VERIFY(0 == bplist_open(&bpl, mos, object));
if (bplist_empty(&bpl)) {
bplist_close(&bpl);
- mutex_destroy(&bpl.bpl_lock);
+ bplist_fini(&bpl);
return;
}
@@ -932,7 +1126,7 @@
if (dump_opt['d'] < 5) {
bplist_close(&bpl);
- mutex_destroy(&bpl.bpl_lock);
+ bplist_fini(&bpl);
return;
}
@@ -942,13 +1136,13 @@
char blkbuf[BP_SPRINTF_LEN];
ASSERT(bp->blk_birth != 0);
- sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 1 : 0);
+ sprintf_blkptr_compact(blkbuf, bp);
(void) printf("\tItem %3llu: %s\n",
(u_longlong_t)itor - 1, blkbuf);
}
bplist_close(&bpl);
- mutex_destroy(&bpl.bpl_lock);
+ bplist_fini(&bpl);
}
static avl_tree_t idx_tree;
@@ -1107,6 +1301,8 @@
dump_zap, /* ZFS user/group used */
dump_zap, /* ZFS user/group quota */
dump_zap, /* snapshot refcount tags */
+ dump_none, /* DDT ZAP object */
+ dump_zap, /* DDT statistics */
dump_unknown /* Unknown type, must be last */
};
@@ -1118,13 +1314,14 @@
dnode_t *dn;
void *bonus = NULL;
size_t bsize = 0;
- char iblk[6], dblk[6], lsize[6], asize[6], bonus_size[6], segsize[6];
+ char iblk[6], dblk[6], lsize[6], asize[6], bonus_size[6], fill[7];
char aux[50];
int error;
if (*print_header) {
- (void) printf("\n Object lvl iblk dblk lsize"
- " asize type\n");
+ (void) printf("\n%10s %3s %5s %5s %5s %5s %6s %s\n",
+ "Object", "lvl", "iblk", "dblk", "dsize", "lsize",
+ "%full", "type");
*print_header = 0;
}
@@ -1143,10 +1340,11 @@
nicenum(doi.doi_metadata_block_size, iblk);
nicenum(doi.doi_data_block_size, dblk);
- nicenum(doi.doi_data_block_size * (doi.doi_max_block_offset + 1),
- lsize);
- nicenum(doi.doi_physical_blks << 9, asize);
+ nicenum(doi.doi_max_offset, lsize);
+ nicenum(doi.doi_physical_blocks_512 << 9, asize);
nicenum(doi.doi_bonus_size, bonus_size);
+ (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
+ doi.doi_data_block_size / doi.doi_max_offset);
aux[0] = '\0';
@@ -1160,13 +1358,13 @@
ZDB_COMPRESS_NAME(doi.doi_compress));
}
- (void) printf("%10lld %3u %5s %5s %5s %5s %s%s\n",
- (u_longlong_t)object, doi.doi_indirection, iblk, dblk, lsize,
- asize, ZDB_OT_NAME(doi.doi_type), aux);
+ (void) printf("%10lld %3u %5s %5s %5s %5s %6s %s%s\n",
+ (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
+ asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
- (void) printf("%10s %3s %5s %5s %5s %5s %s\n",
- "", "", "", "", bonus_size, "bonus",
+ (void) printf("%10s %3s %5s %5s %5s %5s %6s %s\n",
+ "", "", "", "", "", bonus_size, "bonus",
ZDB_OT_NAME(doi.doi_bonus_type));
}
@@ -1203,6 +1401,7 @@
}
for (;;) {
+ char segsize[6];
error = dnode_next_offset(dn,
0, &start, minlvl, blkfill, 0);
if (error)
@@ -1261,8 +1460,7 @@
if (verbosity >= 4) {
(void) sprintf(blkbuf, ", rootbp ");
- (void) sprintf_blkptr(blkbuf + strlen(blkbuf),
- BP_SPRINTF_LEN - strlen(blkbuf), os->os_rootbp);
+ (void) sprintf_blkptr(blkbuf + strlen(blkbuf), os->os_rootbp);
} else {
blkbuf[0] = '\0';
}
@@ -1275,7 +1473,16 @@
(u_longlong_t)dds.dds_creation_txg,
numbuf, (u_longlong_t)usedobjs, blkbuf);
- dump_intent_log(dmu_objset_zil(os));
+ if (zopt_objects != 0) {
+ for (i = 0; i < zopt_objects; i++)
+ dump_object(os, zopt_object[i], verbosity,
+ &print_header);
+ (void) printf("\n");
+ return;
+ }
+
+ if (dump_opt['i'] != 0 || verbosity >= 2)
+ dump_intent_log(dmu_objset_zil(os));
if (dmu_objset_ds(os) != NULL)
dump_bplist(dmu_objset_pool(os)->dp_meta_objset,
@@ -1287,14 +1494,6 @@
if (os->os_rootbp->blk_birth == 0)
return;
- if (zopt_objects != 0) {
- for (i = 0; i < zopt_objects; i++)
- dump_object(os, zopt_object[i], verbosity,
- &print_header);
- (void) printf("\n");
- return;
- }
-
dump_object(os, 0, verbosity, &print_header);
object_count = 0;
if (os->os_userused_dnode &&
@@ -1333,7 +1532,7 @@
(u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
if (dump_opt['u'] >= 3) {
char blkbuf[BP_SPRINTF_LEN];
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ub->ub_rootbp);
+ sprintf_blkptr(blkbuf, &ub->ub_rootbp);
(void) printf("\trootbp = %s\n", blkbuf);
}
(void) printf("\n");
@@ -1466,7 +1665,7 @@
error = dmu_objset_own(dsname, DMU_OST_ANY, B_TRUE, FTAG, &os);
if (error) {
- (void) printf("Could not open %s\n", dsname);
+ (void) printf("Could not open %s, error %d\n", dsname, error);
return (0);
}
dump_dir(os);
@@ -1475,6 +1674,160 @@
return (0);
}
+/*
+ * Block statistics.
+ */
+typedef struct zdb_blkstats {
+ uint64_t zb_asize;
+ uint64_t zb_lsize;
+ uint64_t zb_psize;
+ uint64_t zb_count;
+} zdb_blkstats_t;
+
+/*
+ * Extended object types to report deferred frees and dedup auto-ditto blocks.
+ */
+#define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0)
+#define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1)
+#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 2)
+
+static char *zdb_ot_extname[] = {
+ "deferred free",
+ "dedup ditto",
+ "Total",
+};
+
+#define ZB_TOTAL DN_MAX_LEVELS
+
+typedef struct zdb_cb {
+ zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
+ uint64_t zcb_dedup_asize;
+ uint64_t zcb_dedup_blocks;
+ uint64_t zcb_errors[256];
+ int zcb_readfails;
+ int zcb_haderrors;
+} zdb_cb_t;
+
+static void
+zdb_count_block(spa_t *spa, zilog_t *zilog, zdb_cb_t *zcb, const blkptr_t *bp,
+ dmu_object_type_t type)
+{
+ uint64_t refcnt = 0;
+
+ ASSERT(type < ZDB_OT_TOTAL);
+
+ if (zilog && zil_bp_tree_add(zilog, bp) != 0)
+ return;
+
+ for (int i = 0; i < 4; i++) {
+ int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
+ int t = (i & 1) ? type : ZDB_OT_TOTAL;
+ zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
+
+ zb->zb_asize += BP_GET_ASIZE(bp);
+ zb->zb_lsize += BP_GET_LSIZE(bp);
+ zb->zb_psize += BP_GET_PSIZE(bp);
+ zb->zb_count++;
+ }
+
+ if (dump_opt['L'])
+ return;
+
+ if (BP_GET_DEDUP(bp)) {
+ ddt_t *ddt;
+ ddt_entry_t *dde;
+
+ ddt = ddt_select(spa, bp);
+ ddt_enter(ddt);
+ dde = ddt_lookup(ddt, bp, B_FALSE);
+
+ if (dde == NULL) {
+ refcnt = 0;
+ } else {
+ ddt_phys_t *ddp = ddt_phys_select(dde, bp);
+ ddt_phys_decref(ddp);
+ refcnt = ddp->ddp_refcnt;
+ if (ddt_phys_total_refcnt(dde) == 0)
+ ddt_remove(ddt, dde);
+ }
+ ddt_exit(ddt);
+ }
+
+ VERIFY3U(zio_wait(zio_claim(NULL, spa,
+ refcnt ? 0 : spa_first_txg(spa),
+ bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
+}
+
+static int
+zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ zdb_cb_t *zcb = arg;
+ char blkbuf[BP_SPRINTF_LEN];
+ dmu_object_type_t type;
+ boolean_t is_metadata;
+
+ if (bp == NULL)
+ return (0);
+
+ type = BP_GET_TYPE(bp);
+
+ zdb_count_block(spa, zilog, zcb, bp, type);
+
+ is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata);
+
+ if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) {
+ int ioerr;
+ size_t size = BP_GET_PSIZE(bp);
+ void *data = malloc(size);
+ int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
+
+ /* If it's an intent log block, failure is expected. */
+ if (zb->zb_level == ZB_ZIL_LEVEL)
+ flags |= ZIO_FLAG_SPECULATIVE;
+
+ ioerr = zio_wait(zio_read(NULL, spa, bp, data, size,
+ NULL, NULL, ZIO_PRIORITY_ASYNC_READ, flags, zb));
+
+ free(data);
+
+ if (ioerr && !(flags & ZIO_FLAG_SPECULATIVE)) {
+ zcb->zcb_haderrors = 1;
+ zcb->zcb_errors[ioerr]++;
+
+ if (dump_opt['b'] >= 2)
+ sprintf_blkptr(blkbuf, bp);
+ else
+ blkbuf[0] = '\0';
+
+ (void) printf("zdb_blkptr_cb: "
+ "Got error %d reading "
+ "<%llu, %llu, %lld, %llx> %s -- skipping\n",
+ ioerr,
+ (u_longlong_t)zb->zb_objset,
+ (u_longlong_t)zb->zb_object,
+ (u_longlong_t)zb->zb_level,
+ (u_longlong_t)zb->zb_blkid,
+ blkbuf);
+ }
+ }
+
+ zcb->zcb_readfails = 0;
+
+ if (dump_opt['b'] >= 4) {
+ sprintf_blkptr(blkbuf, bp);
+ (void) printf("objset %llu object %llu "
+ "level %lld offset 0x%llx %s\n",
+ (u_longlong_t)zb->zb_objset,
+ (u_longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (u_longlong_t)blkid2offset(dnp, bp, zb),
+ blkbuf);
+ }
+
+ return (0);
+}
+
static void
zdb_leak(space_map_t *sm, uint64_t start, uint64_t size)
{
@@ -1512,169 +1865,90 @@
};
static void
-zdb_leak_init(spa_t *spa)
+zdb_ddt_leak_init(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ zdb_cb_t *zcb)
{
- vdev_t *rvd = spa->spa_root_vdev;
-
- for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_t *vd = rvd->vdev_child[c];
- for (int m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
- mutex_enter(&msp->ms_lock);
- VERIFY(space_map_load(&msp->ms_map, &zdb_space_map_ops,
- SM_ALLOC, &msp->ms_smo, spa->spa_meta_objset) == 0);
- msp->ms_map.sm_ppd = vd;
- mutex_exit(&msp->ms_lock);
+ uint64_t walk = 0;
+ ddt_entry_t dde;
+ int error;
+
+ if (class == DDT_CLASS_UNIQUE || !ddt_object_exists(ddt, type, class))
+ return;
+
+ while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0) {
+ blkptr_t blk;
+ ddt_phys_t *ddp = dde.dde_phys;
+ ASSERT(ddt_phys_total_refcnt(&dde) > 1);
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+ ddt_bp_create(ddt, &dde.dde_key, ddp, &blk);
+ if (p == DDT_PHYS_DITTO) {
+ zdb_count_block(ddt->ddt_spa, NULL, zcb, &blk,
+ ZDB_OT_DITTO);
+ } else {
+ zcb->zcb_dedup_asize +=
+ BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
+ zcb->zcb_dedup_blocks++;
+ }
+ }
+ if (!dump_opt['L']) {
+ ddt_enter(ddt);
+ VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
+ ddt_exit(ddt);
}
}
+
+ ASSERT(error == ENOENT);
+}
+
+static void
+zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
+{
+ if (!dump_opt['L']) {
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+ mutex_enter(&msp->ms_lock);
+ space_map_unload(&msp->ms_map);
+ VERIFY(space_map_load(&msp->ms_map,
+ &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo,
+ spa->spa_meta_objset) == 0);
+ msp->ms_map.sm_ppd = vd;
+ mutex_exit(&msp->ms_lock);
+ }
+ }
+ }
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++)
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++)
+ zdb_ddt_leak_init(spa->spa_ddt[c],
+ type, class, zcb);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
}
static void
zdb_leak_fini(spa_t *spa)
{
- vdev_t *rvd = spa->spa_root_vdev;
-
- for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_t *vd = rvd->vdev_child[c];
- for (int m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
- mutex_enter(&msp->ms_lock);
- space_map_unload(&msp->ms_map);
- mutex_exit(&msp->ms_lock);
- }
- }
-}
-
-/*
- * Verify that the sum of the sizes of all blocks in the pool adds up
- * to the SPA's sa_alloc total.
- */
-typedef struct zdb_blkstats {
- uint64_t zb_asize;
- uint64_t zb_lsize;
- uint64_t zb_psize;
- uint64_t zb_count;
-} zdb_blkstats_t;
-
-#define DMU_OT_DEFERRED DMU_OT_NONE
-#define DMU_OT_TOTAL DMU_OT_NUMTYPES
-
-#define ZB_TOTAL DN_MAX_LEVELS
-
-typedef struct zdb_cb {
- zdb_blkstats_t zcb_type[ZB_TOTAL + 1][DMU_OT_TOTAL + 1];
- uint64_t zcb_errors[256];
- int zcb_readfails;
- int zcb_haderrors;
-} zdb_cb_t;
-
-static void
-zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type)
-{
- for (int i = 0; i < 4; i++) {
- int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
- int t = (i & 1) ? type : DMU_OT_TOTAL;
- zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
-
- zb->zb_asize += BP_GET_ASIZE(bp);
- zb->zb_lsize += BP_GET_LSIZE(bp);
- zb->zb_psize += BP_GET_PSIZE(bp);
- zb->zb_count++;
- }
-
- if (dump_opt['S']) {
- boolean_t print_sig;
-
- print_sig = !zdb_sig_user_data || (BP_GET_LEVEL(bp) == 0 &&
- BP_GET_TYPE(bp) == DMU_OT_PLAIN_FILE_CONTENTS);
-
- if (BP_GET_CHECKSUM(bp) < zdb_sig_cksumalg)
- print_sig = B_FALSE;
-
- if (print_sig) {
- (void) printf("%llu\t%lld\t%lld\t%s\t%s\t%s\t"
- "%llx:%llx:%llx:%llx\n",
- (u_longlong_t)BP_GET_LEVEL(bp),
- (longlong_t)BP_GET_PSIZE(bp),
- (longlong_t)BP_GET_NDVAS(bp),
- ZDB_OT_NAME(BP_GET_TYPE(bp)),
- ZDB_CHECKSUM_NAME(BP_GET_CHECKSUM(bp)),
- ZDB_COMPRESS_NAME(BP_GET_COMPRESS(bp)),
- (u_longlong_t)bp->blk_cksum.zc_word[0],
- (u_longlong_t)bp->blk_cksum.zc_word[1],
- (u_longlong_t)bp->blk_cksum.zc_word[2],
- (u_longlong_t)bp->blk_cksum.zc_word[3]);
- }
- }
-
- if (!dump_opt['L'])
- VERIFY(zio_wait(zio_claim(NULL, spa, spa_first_txg(spa), bp,
- NULL, NULL, ZIO_FLAG_MUSTSUCCEED)) == 0);
-}
-
-static int
-zdb_blkptr_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
- const dnode_phys_t *dnp, void *arg)
-{
- zdb_cb_t *zcb = arg;
- char blkbuf[BP_SPRINTF_LEN];
- dmu_object_type_t type;
- boolean_t is_metadata;
-
- if (bp == NULL)
- return (0);
-
- type = BP_GET_TYPE(bp);
-
- zdb_count_block(spa, zcb, bp, type);
-
- is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata);
-
- if (dump_opt['c'] > 1 || dump_opt['S'] ||
- (dump_opt['c'] && is_metadata)) {
- size_t size = BP_GET_PSIZE(bp);
- void *data = malloc(size);
- int ioerr = zio_wait(zio_read(NULL, spa, bp, data, size,
- NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
- free(data);
-
- /* We expect io errors on intent log */
- if (ioerr && type != DMU_OT_INTENT_LOG) {
- zcb->zcb_haderrors = 1;
- zcb->zcb_errors[ioerr]++;
-
- if (dump_opt['b'] >= 2)
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
- else
- blkbuf[0] = '\0';
-
- if (!dump_opt['S']) {
- (void) printf("zdb_blkptr_cb: "
- "Got error %d reading "
- "<%llu, %llu, %lld, %llx> %s -- skipping\n",
- ioerr,
- (u_longlong_t)zb->zb_objset,
- (u_longlong_t)zb->zb_object,
- (u_longlong_t)zb->zb_level,
- (u_longlong_t)zb->zb_blkid,
- blkbuf);
+ if (!dump_opt['L']) {
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+ mutex_enter(&msp->ms_lock);
+ space_map_unload(&msp->ms_map);
+ mutex_exit(&msp->ms_lock);
}
}
}
-
- zcb->zcb_readfails = 0;
-
- if (dump_opt['b'] >= 4) {
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
- (void) printf("objset %llu object %llu offset 0x%llx %s\n",
- (u_longlong_t)zb->zb_objset,
- (u_longlong_t)zb->zb_object,
- (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid),
- blkbuf);
- }
-
- return (0);
}
static int
@@ -1682,19 +1956,15 @@
{
zdb_cb_t zcb = { 0 };
zdb_blkstats_t *zb, *tzb;
- uint64_t alloc, space, logalloc;
- vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t norm_alloc, norm_space, total_alloc, total_found;
int leaks = 0;
- int c, e;
-
- if (!dump_opt['S']) {
- (void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
- (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
- (dump_opt['c'] == 1) ? "metadata " : "",
- dump_opt['c'] ? "checksums " : "",
- (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
- !dump_opt['L'] ? "nothing leaked " : "");
- }
+
+ (void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
+ (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
+ (dump_opt['c'] == 1) ? "metadata " : "",
+ dump_opt['c'] ? "checksums " : "",
+ (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
+ !dump_opt['L'] ? "nothing leaked " : "");
/*
* Load all space maps as SM_ALLOC maps, then traverse the pool
@@ -1704,28 +1974,27 @@
* it's not part of any space map) is a double allocation,
* reference to a freed block, or an unclaimed log block.
*/
- if (!dump_opt['L'])
- zdb_leak_init(spa);
+ zdb_leak_init(spa, &zcb);
/*
* If there's a deferred-free bplist, process that first.
*/
- if (spa->spa_sync_bplist_obj != 0) {
- bplist_t *bpl = &spa->spa_sync_bplist;
+ if (spa->spa_deferred_bplist_obj != 0) {
+ bplist_t *bpl = &spa->spa_deferred_bplist;
blkptr_t blk;
uint64_t itor = 0;
VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset,
- spa->spa_sync_bplist_obj));
+ spa->spa_deferred_bplist_obj));
while (bplist_iterate(bpl, &itor, &blk) == 0) {
if (dump_opt['b'] >= 4) {
char blkbuf[BP_SPRINTF_LEN];
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &blk);
+ sprintf_blkptr(blkbuf, &blk);
(void) printf("[%s] %s\n",
"deferred free", blkbuf);
}
- zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
+ zdb_count_block(spa, NULL, &zcb, &blk, ZDB_OT_DEFERRED);
}
bplist_close(bpl);
@@ -1733,10 +2002,10 @@
zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb, 0);
- if (zcb.zcb_haderrors && !dump_opt['S']) {
+ if (zcb.zcb_haderrors) {
(void) printf("\nError counts:\n\n");
(void) printf("\t%5s %s\n", "errno", "count");
- for (e = 0; e < 256; e++) {
+ for (int e = 0; e < 256; e++) {
if (zcb.zcb_errors[e] != 0) {
(void) printf("\t%5d %llu\n",
e, (u_longlong_t)zcb.zcb_errors[e]);
@@ -1747,43 +2016,27 @@
/*
* Report any leaked segments.
*/
- if (!dump_opt['L'])
- zdb_leak_fini(spa);
-
- /*
- * If we're interested in printing out the blkptr signatures,
- * return now as we don't print out anything else (including
- * errors and leaks).
- */
- if (dump_opt['S'])
- return (zcb.zcb_haderrors ? 3 : 0);
-
- alloc = spa_get_alloc(spa);
- space = spa_get_space(spa);
-
- /*
- * Log blocks allocated from a separate log device don't count
- * as part of the normal pool space; factor them in here.
- */
- logalloc = 0;
-
- for (c = 0; c < rvd->vdev_children; c++)
- if (rvd->vdev_child[c]->vdev_islog)
- logalloc += rvd->vdev_child[c]->vdev_stat.vs_alloc;
-
- tzb = &zcb.zcb_type[ZB_TOTAL][DMU_OT_TOTAL];
-
- if (tzb->zb_asize == alloc + logalloc) {
+ zdb_leak_fini(spa);
+
+ tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
+
+ norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+ norm_space = metaslab_class_get_space(spa_normal_class(spa));
+
+ total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
+ total_found = tzb->zb_asize - zcb.zcb_dedup_asize;
+
+ if (total_found == total_alloc) {
if (!dump_opt['L'])
(void) printf("\n\tNo leaks (block sum matches space"
" maps exactly)\n");
} else {
(void) printf("block traversal size %llu != alloc %llu "
"(%s %lld)\n",
- (u_longlong_t)tzb->zb_asize,
- (u_longlong_t)alloc + logalloc,
+ (u_longlong_t)total_found,
+ (u_longlong_t)total_alloc,
(dump_opt['L']) ? "unreachable" : "leaked",
- (longlong_t)(alloc + logalloc - tzb->zb_asize));
+ (longlong_t)(total_alloc - total_found));
leaks = 1;
}
@@ -1793,33 +2046,40 @@
(void) printf("\n");
(void) printf("\tbp count: %10llu\n",
(u_longlong_t)tzb->zb_count);
- (void) printf("\tbp logical: %10llu\t avg: %6llu\n",
+ (void) printf("\tbp logical: %10llu avg: %6llu\n",
(u_longlong_t)tzb->zb_lsize,
(u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
- (void) printf("\tbp physical: %10llu\t avg:"
- " %6llu\tcompression: %6.2f\n",
+ (void) printf("\tbp physical: %10llu avg:"
+ " %6llu compression: %6.2f\n",
(u_longlong_t)tzb->zb_psize,
(u_longlong_t)(tzb->zb_psize / tzb->zb_count),
(double)tzb->zb_lsize / tzb->zb_psize);
- (void) printf("\tbp allocated: %10llu\t avg:"
- " %6llu\tcompression: %6.2f\n",
+ (void) printf("\tbp allocated: %10llu avg:"
+ " %6llu compression: %6.2f\n",
(u_longlong_t)tzb->zb_asize,
(u_longlong_t)(tzb->zb_asize / tzb->zb_count),
(double)tzb->zb_lsize / tzb->zb_asize);
- (void) printf("\tSPA allocated: %10llu\tused: %5.2f%%\n",
- (u_longlong_t)alloc, 100.0 * alloc / space);
+ (void) printf("\tbp deduped: %10llu ref>1:"
+ " %6llu deduplication: %6.2f\n",
+ (u_longlong_t)zcb.zcb_dedup_asize,
+ (u_longlong_t)zcb.zcb_dedup_blocks,
+ (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
+ (void) printf("\tSPA allocated: %10llu used: %5.2f%%\n",
+ (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
if (dump_opt['b'] >= 2) {
int l, t, level;
(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
"\t avg\t comp\t%%Total\tType\n");
- for (t = 0; t <= DMU_OT_NUMTYPES; t++) {
+ for (t = 0; t <= ZDB_OT_TOTAL; t++) {
char csize[6], lsize[6], psize[6], asize[6], avg[6];
char *typename;
- typename = t == DMU_OT_DEFERRED ? "deferred free" :
- t == DMU_OT_TOTAL ? "Total" : dmu_ot[t].ot_name;
+ if (t < DMU_OT_NUMTYPES)
+ typename = dmu_ot[t].ot_name;
+ else
+ typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
(void) printf("%6s\t%5s\t%5s\t%5s"
@@ -1881,12 +2141,116 @@
return (0);
}
+/*
+ * Node of the AVL tree that dump_simulated_ddt() builds through
+ * zdb_ddt_add_cb().  There is one node per distinct key produced by
+ * ddt_key_fill(); the zdde_ref_* fields are running totals over every
+ * block pointer seen with that key.
+ */
+typedef struct zdb_ddt_entry {
+ ddt_key_t zdde_key; /* key filled in from the bp by ddt_key_fill() */
+ uint64_t zdde_ref_blocks; /* number of block pointers seen with this key */
+ uint64_t zdde_ref_lsize; /* sum of BP_GET_LSIZE() over those bps */
+ uint64_t zdde_ref_psize; /* sum of BP_GET_PSIZE() over those bps */
+ uint64_t zdde_ref_dsize; /* sum of bp_get_dsize_sync() over those bps */
+ avl_node_t zdde_node; /* linkage; its offset is passed to avl_create() */
+} zdb_ddt_entry_t;
+
+/*
+ * traverse_pool() callback used by dump_simulated_ddt().  'arg' is the AVL
+ * tree of zdb_ddt_entry_t nodes being built.
+ *
+ * Calls with a NULL bp are ignored, as are block pointers above level 0 and
+ * those whose object type is flagged ot_metadata.  Every other bp is looked
+ * up by the key ddt_key_fill() derives from it; a node is allocated the
+ * first time a key is seen, and the bp's count and sizes are added to the
+ * node's running totals.
+ *
+ * Always returns 0.
+ */
+/* ARGSUSED */
+static int
+zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ avl_tree_t *tree = arg;
+ zdb_ddt_entry_t probe;
+ zdb_ddt_entry_t *entry;
+ avl_index_t slot;
+
+ if (bp == NULL)
+ return (0);
+
+ /* With -S given more than once, report progress at each root bp. */
+ if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
+ (void) printf("traversing objset %llu, %llu objects, "
+ "%lu blocks so far\n",
+ (u_longlong_t)zb->zb_objset,
+ (u_longlong_t)bp->blk_fill,
+ avl_numnodes(tree));
+ }
+
+ /* Only level-0 blocks of non-metadata types are tallied. */
+ if (BP_GET_LEVEL(bp) > 0)
+ return (0);
+ if (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)
+ return (0);
+
+ /* Find the node for this key, creating it on first sight. */
+ ddt_key_fill(&probe.zdde_key, bp);
+ entry = avl_find(tree, &probe, &slot);
+ if (entry == NULL) {
+ entry = umem_zalloc(sizeof (zdb_ddt_entry_t), UMEM_NOFAIL);
+ entry->zdde_key = probe.zdde_key;
+ avl_insert(tree, entry, slot);
+ }
+
+ entry->zdde_ref_blocks++;
+ entry->zdde_ref_lsize += BP_GET_LSIZE(bp);
+ entry->zdde_ref_psize += BP_GET_PSIZE(bp);
+ entry->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
+
+ return (0);
+}
+
+/*
+ * Build and print a simulated DDT histogram for the pool (zdb -S).
+ *
+ * The pool is traversed with zdb_ddt_add_cb(), which collects one
+ * zdb_ddt_entry_t per distinct key in an AVL tree.  The tree is then torn
+ * down node by node; each node is turned into a ddt_stat_t and added to the
+ * histogram bucket selected by highbit(refcnt) - 1.  Finally the histogram
+ * and the dedup ratio computed from its totals are printed.
+ */
+static void
+dump_simulated_ddt(spa_t *spa)
+{
+ ddt_histogram_t histogram = { 0 };
+ ddt_stat_t totals = { 0 };
+ avl_tree_t tree;
+ zdb_ddt_entry_t *entry;
+ void *walk = NULL;
+
+ avl_create(&tree, ddt_entry_compare, sizeof (zdb_ddt_entry_t),
+ offsetof(zdb_ddt_entry_t, zdde_node));
+
+ /* The traversal runs with SCL_CONFIG held as reader. */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ (void) traverse_pool(spa, zdb_ddt_add_cb, &tree, 0);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ for (;;) {
+ ddt_stat_t dds;
+ uint64_t refcnt;
+
+ entry = avl_destroy_nodes(&tree, &walk);
+ if (entry == NULL)
+ break;
+
+ refcnt = entry->zdde_ref_blocks;
+ ASSERT(refcnt != 0);
+
+ /* Referenced totals, exactly as accumulated by the callback. */
+ dds.dds_ref_blocks = entry->zdde_ref_blocks;
+ dds.dds_ref_lsize = entry->zdde_ref_lsize;
+ dds.dds_ref_psize = entry->zdde_ref_psize;
+ dds.dds_ref_dsize = entry->zdde_ref_dsize;
+
+ /* Non-referenced figures: the same totals divided by refcnt. */
+ dds.dds_blocks = entry->zdde_ref_blocks / refcnt;
+ dds.dds_lsize = entry->zdde_ref_lsize / refcnt;
+ dds.dds_psize = entry->zdde_ref_psize / refcnt;
+ dds.dds_dsize = entry->zdde_ref_dsize / refcnt;
+
+ ddt_stat_add(&histogram.ddh_stat[highbit(refcnt) - 1], &dds, 0);
+
+ umem_free(entry, sizeof (zdb_ddt_entry_t));
+ }
+
+ avl_destroy(&tree);
+
+ ddt_histogram_stat(&totals, &histogram);
+
+ (void) printf("Simulated DDT histogram:\n");
+
+ dump_ddt_histogram(&histogram);
+
+ dump_dedup_ratio(&totals);
+}
+
static void
dump_zpool(spa_t *spa)
{
dsl_pool_t *dp = spa_get_dsl(spa);
int rc = 0;
+ if (dump_opt['S']) {
+ dump_simulated_ddt(spa);
+ return;
+ }
+
if (!dump_opt['e'] && dump_opt['C'] > 1) {
(void) printf("\nCached configuration:\n");
dump_nvlist(spa->spa_config, 8);
@@ -1898,6 +2262,9 @@
if (dump_opt['u'])
dump_uberblock(&spa->spa_uberblock);
+ if (dump_opt['D'])
+ dump_all_ddts(spa);
+
if (dump_opt['d'] > 2 || dump_opt['m'])
dump_metaslabs(spa);
@@ -1905,13 +2272,13 @@
dump_dir(dp->dp_meta_objset);
if (dump_opt['d'] >= 3) {
dump_bplist(dp->dp_meta_objset,
- spa->spa_sync_bplist_obj, "Deferred frees");
+ spa->spa_deferred_bplist_obj, "Deferred frees");
dump_dtl(spa->spa_root_vdev, 0);
}
(void) dmu_objset_find(spa_name(spa), dump_one_dir,
NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
}
- if (dump_opt['b'] || dump_opt['c'] || dump_opt['S'])
+ if (dump_opt['b'] || dump_opt['c'])
rc = dump_block_stats(spa);
if (dump_opt['s'])
@@ -1938,51 +2305,13 @@
static void
zdb_print_blkptr(blkptr_t *bp, int flags)
{
- dva_t *dva = bp->blk_dva;
- int d;
+ char blkbuf[BP_SPRINTF_LEN];
if (flags & ZDB_FLAG_BSWAP)
byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
- /*
- * Super-ick warning: This code is also duplicated in
- * cmd/mdb/common/modules/zfs/zfs.c . Yeah, I hate code
- * replication, too.
- */
- for (d = 0; d < BP_GET_NDVAS(bp); d++) {
- (void) printf("\tDVA[%d]: vdev_id %lld / %llx\n", d,
- (longlong_t)DVA_GET_VDEV(&dva[d]),
- (longlong_t)DVA_GET_OFFSET(&dva[d]));
- (void) printf("\tDVA[%d]: GANG: %-5s GRID: %04llx\t"
- "ASIZE: %llx\n", d,
- DVA_GET_GANG(&dva[d]) ? "TRUE" : "FALSE",
- (longlong_t)DVA_GET_GRID(&dva[d]),
- (longlong_t)DVA_GET_ASIZE(&dva[d]));
- (void) printf("\tDVA[%d]: %llu:%llx:%llx:%s%s%s%s\n", d,
- (u_longlong_t)DVA_GET_VDEV(&dva[d]),
- (longlong_t)DVA_GET_OFFSET(&dva[d]),
- (longlong_t)BP_GET_PSIZE(bp),
- BP_SHOULD_BYTESWAP(bp) ? "e" : "",
- !DVA_GET_GANG(&dva[d]) && BP_GET_LEVEL(bp) != 0 ?
- "d" : "",
- DVA_GET_GANG(&dva[d]) ? "g" : "",
- BP_GET_COMPRESS(bp) != 0 ? "d" : "");
- }
- (void) printf("\tLSIZE: %-16llx\t\tPSIZE: %llx\n",
- (longlong_t)BP_GET_LSIZE(bp), (longlong_t)BP_GET_PSIZE(bp));
- (void) printf("\tENDIAN: %6s\t\t\t\t\tTYPE: %s\n",
- BP_GET_BYTEORDER(bp) ? "LITTLE" : "BIG",
- ZDB_OT_NAME(BP_GET_TYPE(bp)));
- (void) printf("\tBIRTH: %-16llx LEVEL: %-2llu\tFILL: %llx\n",
- (u_longlong_t)bp->blk_birth, (u_longlong_t)BP_GET_LEVEL(bp),
- (u_longlong_t)bp->blk_fill);
- (void) printf("\tCKFUNC: %-16s\t\tCOMP: %s\n",
- ZDB_CHECKSUM_NAME(BP_GET_CHECKSUM(bp)),
- ZDB_COMPRESS_NAME(BP_GET_COMPRESS(bp)));
- (void) printf("\tCKSUM: %llx:%llx:%llx:%llx\n",
- (u_longlong_t)bp->blk_cksum.zc_word[0],
- (u_longlong_t)bp->blk_cksum.zc_word[1],
- (u_longlong_t)bp->blk_cksum.zc_word[2],
- (u_longlong_t)bp->blk_cksum.zc_word[3]);
+
+ sprintf_blkptr(blkbuf, bp);
+ (void) printf("%s\n", blkbuf);
}
static void
@@ -2005,7 +2334,7 @@
{
if (flags & ZDB_FLAG_BSWAP)
byteswap_uint64_array(buf, size);
- (void) write(2, buf, size);
+ (void) write(1, buf, size);
}
static void
@@ -2108,10 +2437,10 @@
* flags - A string of characters specifying options
* b: Decode a blkptr at given offset within block
* *c: Calculate and display checksums
- * *d: Decompress data before dumping
+ * d: Decompress data before dumping
* e: Byteswap data before dumping
- * *g: Display data as a gang block header
- * *i: Display as an indirect block
+ * g: Display data as a gang block header
+ * i: Display as an indirect block
* p: Do I/O to physical offset
* r: Dump raw data to stdout
*
@@ -2120,13 +2449,15 @@
static void
zdb_read_block(char *thing, spa_t *spa)
{
+ blkptr_t blk, *bp = &blk;
+ dva_t *dva = bp->blk_dva;
int flags = 0;
- uint64_t offset = 0, size = 0, blkptr_offset = 0;
+ uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
zio_t *zio;
vdev_t *vd;
- void *buf;
+ void *pbuf, *lbuf, *buf;
char *s, *p, *dup, *vdev, *flagstr;
- int i, error, zio_flags;
+ int i, error;
dup = strdup(thing);
s = strtok(dup, ":");
@@ -2163,7 +2494,7 @@
flags |= bit;
/* If it's not something with an argument, keep going */
- if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_DECOMPRESS |
+ if ((bit & (ZDB_FLAG_CHECKSUM |
ZDB_FLAG_PRINT_BLKPTR)) == 0)
continue;
@@ -2185,22 +2516,58 @@
return;
} else {
if (vd->vdev_path)
- (void) printf("Found vdev: %s\n", vd->vdev_path);
+ (void) fprintf(stderr, "Found vdev: %s\n",
+ vd->vdev_path);
else
- (void) printf("Found vdev type: %s\n",
+ (void) fprintf(stderr, "Found vdev type: %s\n",
vd->vdev_ops->vdev_op_type);
}
- buf = umem_alloc(size, UMEM_NOFAIL);
-
- zio_flags = ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
- ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY;
+ psize = size;
+ lsize = size;
+
+ pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+ lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+
+ BP_ZERO(bp);
+
+ DVA_SET_VDEV(&dva[0], vd->vdev_id);
+ DVA_SET_OFFSET(&dva[0], offset);
+ DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
+ DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
+
+ BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_PSIZE(bp, psize);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
+ BP_SET_TYPE(bp, DMU_OT_NONE);
+ BP_SET_LEVEL(bp, 0);
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
zio = zio_root(spa, NULL, NULL, 0);
- /* XXX todo - cons up a BP so RAID-Z will be happy */
- zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, buf, size,
- ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, zio_flags, NULL, NULL));
+
+ if (vd == vd->vdev_top) {
+ /*
+ * Treat this as a normal block read.
+ */
+ zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
+ } else {
+ /*
+ * Treat this as a vdev child I/O.
+ */
+ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
+ ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
+ }
+
error = zio_wait(zio);
spa_config_exit(spa, SCL_STATE, FTAG);
@@ -2209,6 +2576,52 @@
goto out;
}
+ if (flags & ZDB_FLAG_DECOMPRESS) {
+ /*
+ * We don't know how the data was compressed, so just try
+ * every decompress function at every inflated blocksize,
+ * walking lsize down from SPA_MAXBLOCKSIZE towards psize in
+ * SPA_MINBLOCKSIZE steps.
+ *
+ * Each attempt is made twice: on pbuf and on pbuf2, a copy of
+ * the first psize bytes.  The bytes past psize in each buffer
+ * are filled by separate random_get_pseudo_bytes() calls.  A
+ * candidate is accepted only if both runs succeed and produce
+ * identical output.
+ */
+ enum zio_compress c;
+ void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+ void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+
+ bcopy(pbuf, pbuf2, psize);
+
+ VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
+ SPA_MAXBLOCKSIZE - psize) == 0);
+
+ VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
+ SPA_MAXBLOCKSIZE - psize) == 0);
+
+ /*
+ * lsize is stepped only by the for-header, so that every
+ * multiple of SPA_MINBLOCKSIZE above psize is tried.  On a
+ * match the outer break leaves lsize at the matching size.
+ */
+ for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
+ lsize -= SPA_MINBLOCKSIZE) {
+ for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
+ if (zio_decompress_data(c, pbuf, lbuf,
+ psize, lsize) == 0 &&
+ zio_decompress_data(c, pbuf2, lbuf2,
+ psize, lsize) == 0 &&
+ bcmp(lbuf, lbuf2, lsize) == 0)
+ break;
+ }
+ if (c != ZIO_COMPRESS_FUNCTIONS)
+ break;
+ }
+
+ umem_free(pbuf2, SPA_MAXBLOCKSIZE);
+ umem_free(lbuf2, SPA_MAXBLOCKSIZE);
+
+ /* The loop ran to completion without any candidate matching. */
+ if (lsize <= psize) {
+ (void) printf("Decompress of %s failed\n", thing);
+ goto out;
+ }
+ buf = lbuf;
+ size = lsize;
+ } else {
+ buf = pbuf;
+ size = psize;
+ }
+
+
if (flags & ZDB_FLAG_PRINT_BLKPTR)
zdb_print_blkptr((blkptr_t *)(void *)
((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
@@ -2223,7 +2636,8 @@
zdb_dump_block(thing, buf, size, flags);
out:
- umem_free(buf, size);
+ umem_free(pbuf, SPA_MAXBLOCKSIZE);
+ umem_free(lbuf, SPA_MAXBLOCKSIZE);
free(dup);
}
@@ -2312,7 +2726,6 @@
struct rlimit rl = { 1024, 1024 };
spa_t *spa = NULL;
objset_t *os = NULL;
- char *endstr;
int dump_all = 1;
int verbose = 0;
int error;
@@ -2327,19 +2740,21 @@
dprintf_setup(&argc, argv);
- while ((c = getopt(argc, argv, "udhibcmsvCLS:RU:lep:t:")) != -1) {
+ while ((c = getopt(argc, argv, "bcdhilmsuCDRSLevp:t:U:")) != -1) {
switch (c) {
- case 'u':
- case 'd':
- case 'i':
- case 'h':
case 'b':
case 'c':
+ case 'd':
+ case 'h':
+ case 'i':
+ case 'l':
case 'm':
case 's':
+ case 'u':
case 'C':
- case 'l':
+ case 'D':
case 'R':
+ case 'S':
dump_opt[c]++;
dump_all = 0;
break;
@@ -2350,9 +2765,6 @@
case 'v':
verbose++;
break;
- case 'U':
- spa_config_path = optarg;
- break;
case 'p':
if (searchdirs == NULL) {
searchdirs = umem_alloc(sizeof (char *),
@@ -2368,24 +2780,6 @@
}
searchdirs[nsearch++] = optarg;
break;
- case 'S':
- dump_opt[c]++;
- dump_all = 0;
- zdb_sig_user_data = (strncmp(optarg, "user:", 5) == 0);
- if (!zdb_sig_user_data && strncmp(optarg, "all:", 4))
- usage();
- endstr = strchr(optarg, ':') + 1;
- if (strcmp(endstr, "fletcher2") == 0)
- zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_2;
- else if (strcmp(endstr, "fletcher4") == 0)
- zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_4;
- else if (strcmp(endstr, "sha256") == 0)
- zdb_sig_cksumalg = ZIO_CHECKSUM_SHA256;
- else if (strcmp(endstr, "all") == 0)
- zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_2;
- else
- usage();
- break;
case 't':
max_txg = strtoull(optarg, NULL, 0);
if (max_txg < TXG_INITIAL) {
@@ -2394,6 +2788,9 @@
usage();
}
break;
+ case 'U':
+ spa_config_path = optarg;
+ break;
default:
usage();
break;
@@ -2409,8 +2806,11 @@
g_zfs = libzfs_init();
ASSERT(g_zfs != NULL);
+ if (dump_all)
+ verbose = MAX(verbose, 1);
+
for (c = 0; c < 256; c++) {
- if (dump_all && !strchr("elLR", c))
+ if (dump_all && !strchr("elLRS", c))
dump_opt[c] = 1;
if (dump_opt[c])
dump_opt[c] += verbose;
--- a/usr/src/cmd/zdb/zdb_il.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/cmd/zdb/zdb_il.c Sun Nov 01 14:14:46 2009 -0800
@@ -40,12 +40,14 @@
extern uint8_t dump_opt[256];
+static char prefix[4] = "\t\t\t";
+
static void
print_log_bp(const blkptr_t *bp, const char *prefix)
{
char blkbuf[BP_SPRINTF_LEN];
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
+ sprintf_blkptr(blkbuf, bp);
(void) printf("%s%s\n", prefix, blkbuf);
}
@@ -58,15 +60,15 @@
char *link = name + strlen(name) + 1;
if (txtype == TX_SYMLINK)
- (void) printf("\t\t\t%s -> %s\n", name, link);
+ (void) printf("%s%s -> %s\n", prefix, name, link);
else
- (void) printf("\t\t\t%s\n", name);
+ (void) printf("%s%s\n", prefix, name);
- (void) printf("\t\t\t%s", ctime(&crtime));
- (void) printf("\t\t\tdoid %llu, foid %llu, mode %llo\n",
+ (void) printf("%s%s", prefix, ctime(&crtime));
+ (void) printf("%sdoid %llu, foid %llu, mode %llo\n", prefix,
(u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_foid,
(longlong_t)lr->lr_mode);
- (void) printf("\t\t\tuid %llu, gid %llu, gen %llu, rdev 0x%llx\n",
+ (void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n", prefix,
(u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid,
(u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev);
}
@@ -75,7 +77,7 @@
static void
zil_prt_rec_remove(zilog_t *zilog, int txtype, lr_remove_t *lr)
{
- (void) printf("\t\t\tdoid %llu, name %s\n",
+ (void) printf("%sdoid %llu, name %s\n", prefix,
(u_longlong_t)lr->lr_doid, (char *)(lr + 1));
}
@@ -83,7 +85,7 @@
static void
zil_prt_rec_link(zilog_t *zilog, int txtype, lr_link_t *lr)
{
- (void) printf("\t\t\tdoid %llu, link_obj %llu, name %s\n",
+ (void) printf("%sdoid %llu, link_obj %llu, name %s\n", prefix,
(u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj,
(char *)(lr + 1));
}
@@ -95,9 +97,9 @@
char *snm = (char *)(lr + 1);
char *tnm = snm + strlen(snm) + 1;
- (void) printf("\t\t\tsdoid %llu, tdoid %llu\n",
+ (void) printf("%ssdoid %llu, tdoid %llu\n", prefix,
(u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid);
- (void) printf("\t\t\tsrc %s tgt %s\n", snm, tnm);
+ (void) printf("%ssrc %s tgt %s\n", prefix, snm, tnm);
}
/* ARGSUSED */
@@ -106,44 +108,48 @@
{
char *data, *dlimit;
blkptr_t *bp = &lr->lr_blkptr;
+ zbookmark_t zb;
char buf[SPA_MAXBLOCKSIZE];
int verbose = MAX(dump_opt['d'], dump_opt['i']);
int error;
- (void) printf("\t\t\tfoid %llu, offset 0x%llx,"
- " length 0x%llx, blkoff 0x%llx\n",
- (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
- (u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blkoff);
+ (void) printf("%sfoid %llu, offset %llx, length %llx\n", prefix,
+ (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
+ (u_longlong_t)lr->lr_length);
if (txtype == TX_WRITE2 || verbose < 5)
return;
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
- (void) printf("\t\t\thas blkptr, %s\n",
+ (void) printf("%shas blkptr, %s\n", prefix,
bp->blk_birth >= spa_first_txg(zilog->zl_spa) ?
"will claim" : "won't claim");
- print_log_bp(bp, "\t\t\t");
+ print_log_bp(bp, prefix);
+
if (BP_IS_HOLE(bp)) {
(void) printf("\t\t\tLSIZE 0x%llx\n",
(u_longlong_t)BP_GET_LSIZE(bp));
}
if (bp->blk_birth == 0) {
bzero(buf, sizeof (buf));
- } else {
- zbookmark_t zb;
-
- zb.zb_objset = dmu_objset_id(zilog->zl_os);
- zb.zb_object = lr->lr_foid;
- zb.zb_level = 0;
- zb.zb_blkid = -1; /* unknown */
+ (void) printf("%s<hole>\n", prefix);
+ return;
+ }
+ if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
+ (void) printf("%s<block already committed>\n", prefix);
+ return;
+ }
- error = zio_wait(zio_read(NULL, zilog->zl_spa,
- bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
- ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
- if (error)
- return;
- }
- data = buf + lr->lr_blkoff;
+ SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os),
+ lr->lr_foid, ZB_ZIL_LEVEL,
+ lr->lr_offset / BP_GET_LSIZE(bp));
+
+ error = zio_wait(zio_read(NULL, zilog->zl_spa,
+ bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
+ if (error)
+ return;
+ data = buf;
} else {
data = (char *)(lr + 1);
}
@@ -151,7 +157,7 @@
dlimit = data + MIN(lr->lr_length,
(verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));
- (void) printf("\t\t\t");
+ (void) printf("%s", prefix);
while (data < dlimit) {
if (isprint(*data))
(void) printf("%c ", *data);
@@ -166,7 +172,7 @@
static void
zil_prt_rec_truncate(zilog_t *zilog, int txtype, lr_truncate_t *lr)
{
- (void) printf("\t\t\tfoid %llu, offset 0x%llx, length 0x%llx\n",
+ (void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", prefix,
(u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
(u_longlong_t)lr->lr_length);
}
@@ -178,38 +184,38 @@
time_t atime = (time_t)lr->lr_atime[0];
time_t mtime = (time_t)lr->lr_mtime[0];
- (void) printf("\t\t\tfoid %llu, mask 0x%llx\n",
+ (void) printf("%sfoid %llu, mask 0x%llx\n", prefix,
(u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask);
if (lr->lr_mask & AT_MODE) {
- (void) printf("\t\t\tAT_MODE %llo\n",
+ (void) printf("%sAT_MODE %llo\n", prefix,
(longlong_t)lr->lr_mode);
}
if (lr->lr_mask & AT_UID) {
- (void) printf("\t\t\tAT_UID %llu\n",
+ (void) printf("%sAT_UID %llu\n", prefix,
(u_longlong_t)lr->lr_uid);
}
if (lr->lr_mask & AT_GID) {
- (void) printf("\t\t\tAT_GID %llu\n",
+ (void) printf("%sAT_GID %llu\n", prefix,
(u_longlong_t)lr->lr_gid);
}
if (lr->lr_mask & AT_SIZE) {
- (void) printf("\t\t\tAT_SIZE %llu\n",
+ (void) printf("%sAT_SIZE %llu\n", prefix,
(u_longlong_t)lr->lr_size);
}
if (lr->lr_mask & AT_ATIME) {
- (void) printf("\t\t\tAT_ATIME %llu.%09llu %s",
+ (void) printf("%sAT_ATIME %llu.%09llu %s", prefix,
(u_longlong_t)lr->lr_atime[0],
(u_longlong_t)lr->lr_atime[1],
ctime(&atime));
}
if (lr->lr_mask & AT_MTIME) {
- (void) printf("\t\t\tAT_MTIME %llu.%09llu %s",
+ (void) printf("%sAT_MTIME %llu.%09llu %s", prefix,
(u_longlong_t)lr->lr_mtime[0],
(u_longlong_t)lr->lr_mtime[1],
ctime(&mtime));
@@ -220,7 +226,7 @@
static void
zil_prt_rec_acl(zilog_t *zilog, int txtype, lr_acl_t *lr)
{
- (void) printf("\t\t\tfoid %llu, aclcnt %llu\n",
+ (void) printf("%sfoid %llu, aclcnt %llu\n", prefix,
(u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt);
}
@@ -256,7 +262,7 @@
};
/* ARGSUSED */
-static void
+static int
print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg)
{
int txtype;
@@ -280,23 +286,24 @@
zil_rec_info[txtype].zri_count++;
zil_rec_info[0].zri_count++;
+
+ return (0);
}
/* ARGSUSED */
-static void
+static int
print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
- char blkbuf[BP_SPRINTF_LEN];
+ char blkbuf[BP_SPRINTF_LEN + 10];
int verbose = MAX(dump_opt['d'], dump_opt['i']);
char *claim;
if (verbose <= 3)
- return;
+ return (0);
if (verbose >= 5) {
(void) strcpy(blkbuf, ", ");
- sprintf_blkptr(blkbuf + strlen(blkbuf),
- BP_SPRINTF_LEN - strlen(blkbuf), bp);
+ sprintf_blkptr(blkbuf + strlen(blkbuf), bp);
} else {
blkbuf[0] = '\0';
}
@@ -310,6 +317,8 @@
(void) printf("\tBlock seqno %llu, %s%s\n",
(u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf);
+
+ return (0);
}
static void
@@ -342,17 +351,17 @@
int verbose = MAX(dump_opt['d'], dump_opt['i']);
int i;
- if (zh->zh_log.blk_birth == 0 || verbose < 2)
+ if (zh->zh_log.blk_birth == 0 || verbose < 1)
return;
- (void) printf("\n ZIL header: claim_txg %llu, claim_seq %llu",
- (u_longlong_t)zh->zh_claim_txg, (u_longlong_t)zh->zh_claim_seq);
+ (void) printf("\n ZIL header: claim_txg %llu, "
+ "claim_blk_seq %llu, claim_lr_seq %llu",
+ (u_longlong_t)zh->zh_claim_txg,
+ (u_longlong_t)zh->zh_claim_blk_seq,
+ (u_longlong_t)zh->zh_claim_lr_seq);
(void) printf(" replay_seq %llu, flags 0x%llx\n",
(u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags);
- if (verbose >= 4)
- print_log_bp(&zh->zh_log, "\n\tfirst block: ");
-
for (i = 0; i < TX_MAX_TYPE; i++)
zil_rec_info[i].zri_count = 0;
--- a/usr/src/cmd/zpool/zpool_main.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/cmd/zpool/zpool_main.c Sun Nov 01 14:14:46 2009 -0800
@@ -250,12 +250,12 @@
{
FILE *fp = cb;
- (void) fprintf(fp, "\t%-13s ", zpool_prop_to_name(prop));
+ (void) fprintf(fp, "\t%-15s ", zpool_prop_to_name(prop));
if (zpool_prop_readonly(prop))
(void) fprintf(fp, " NO ");
else
- (void) fprintf(fp, " YES ");
+ (void) fprintf(fp, " YES ");
if (zpool_prop_values(prop) == NULL)
(void) fprintf(fp, "-\n");
@@ -302,7 +302,7 @@
(void) fprintf(fp,
gettext("\nthe following properties are supported:\n"));
- (void) fprintf(fp, "\n\t%-13s %s %s\n\n",
+ (void) fprintf(fp, "\n\t%-15s %s %s\n\n",
"PROPERTY", "EDIT", "VALUES");
/* Iterate over all properties */
@@ -2449,7 +2449,7 @@
int ret;
list_cbdata_t cb = { 0 };
static char default_props[] =
- "name,size,used,available,capacity,health,altroot";
+ "name,size,used,available,capacity,dedupratio,health,altroot";
char *props = default_props;
/* check options */
@@ -3672,9 +3672,12 @@
(void) printf(gettext(" 15 user/group space accounting\n"));
(void) printf(gettext(" 16 stmf property support\n"));
(void) printf(gettext(" 17 Triple-parity RAID-Z\n"));
- (void) printf(gettext(" 18 snapshot user holds\n"));
+ (void) printf(gettext(" 18 Snapshot user holds\n"));
(void) printf(gettext(" 19 Log device removal\n"));
- (void) printf(gettext("For more information on a particular "
+ (void) printf(gettext(" 20 Compression using zle "
+ "(zero-length encoding)\n"));
+ (void) printf(gettext(" 21 Deduplication\n"));
+ (void) printf(gettext("\nFor more information on a particular "
"version, including supported releases, see:\n\n"));
(void) printf("http://www.opensolaris.org/os/community/zfs/"
"version/N\n\n");
--- a/usr/src/cmd/ztest/ztest.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/cmd/ztest/ztest.c Sun Nov 01 14:14:46 2009 -0800
@@ -86,9 +86,8 @@
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
#include <sys/zil.h>
+#include <sys/zil_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/spa_impl.h>
@@ -106,6 +105,7 @@
#include <ctype.h>
#include <math.h>
#include <sys/fs/zfs.h>
+#include <libnvpair.h>
static char cmdname[] = "ztest";
static char *zopt_pool = cmdname;
@@ -127,112 +127,171 @@
static uint64_t zopt_time = 300; /* 5 minutes */
static int zopt_maxfaults;
+#define BT_MAGIC 0x123456789abcdefULL
+
+enum ztest_io_type {
+ ZTEST_IO_WRITE_TAG,
+ ZTEST_IO_WRITE_PATTERN,
+ ZTEST_IO_WRITE_ZEROES,
+ ZTEST_IO_TRUNCATE,
+ ZTEST_IO_SETATTR,
+ ZTEST_IO_TYPES
+};
+
typedef struct ztest_block_tag {
+ uint64_t bt_magic;
uint64_t bt_objset;
uint64_t bt_object;
uint64_t bt_offset;
+ uint64_t bt_gen;
uint64_t bt_txg;
- uint64_t bt_thread;
- uint64_t bt_seq;
+ uint64_t bt_crtxg;
} ztest_block_tag_t;
-typedef struct ztest_args {
- char za_pool[MAXNAMELEN];
- spa_t *za_spa;
- objset_t *za_os;
- zilog_t *za_zilog;
- thread_t za_thread;
- uint64_t za_instance;
- uint64_t za_random;
- uint64_t za_diroff;
- uint64_t za_diroff_shared;
- uint64_t za_zil_seq;
- hrtime_t za_start;
- hrtime_t za_stop;
- hrtime_t za_kill;
- /*
- * Thread-local variables can go here to aid debugging.
- */
- ztest_block_tag_t za_rbt;
- ztest_block_tag_t za_wbt;
- dmu_object_info_t za_doi;
- dmu_buf_t *za_dbuf;
-} ztest_args_t;
-
-typedef void ztest_func_t(ztest_args_t *);
+typedef struct bufwad {
+ uint64_t bw_index;
+ uint64_t bw_txg;
+ uint64_t bw_data;
+} bufwad_t;
+
+/*
+ * XXX -- fix zfs range locks to be generic so we can use them here.
+ */
+typedef enum {
+ RL_READER,
+ RL_WRITER,
+ RL_APPEND
+} rl_type_t;
+
+typedef struct rll {
+ void *rll_writer;
+ int rll_readers;
+ mutex_t rll_lock;
+ cond_t rll_cv;
+} rll_t;
+
+typedef struct rl {
+ uint64_t rl_object;
+ uint64_t rl_offset;
+ uint64_t rl_size;
+ rll_t *rl_lock;
+} rl_t;
+
+#define ZTEST_RANGE_LOCKS 64
+#define ZTEST_OBJECT_LOCKS 64
+
+/*
+ * Object descriptor. Used as a template for object lookup/create/remove.
+ */
+typedef struct ztest_od {
+ uint64_t od_dir;
+ uint64_t od_object;
+ dmu_object_type_t od_type;
+ dmu_object_type_t od_crtype;
+ uint64_t od_blocksize;
+ uint64_t od_crblocksize;
+ uint64_t od_gen;
+ uint64_t od_crgen;
+ char od_name[MAXNAMELEN];
+} ztest_od_t;
+
+/*
+ * Per-dataset state.
+ */
+typedef struct ztest_ds {
+ objset_t *zd_os;
+ zilog_t *zd_zilog;
+ uint64_t zd_seq;
+ ztest_od_t *zd_od; /* debugging aid */
+ char zd_name[MAXNAMELEN];
+ mutex_t zd_dirobj_lock;
+ rll_t zd_object_lock[ZTEST_OBJECT_LOCKS];
+ rll_t zd_range_lock[ZTEST_RANGE_LOCKS];
+} ztest_ds_t;
+
+/*
+ * Per-iteration state.
+ */
+typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);
+
+typedef struct ztest_info {
+ ztest_func_t *zi_func; /* test function */
+ uint64_t zi_iters; /* iterations per execution */
+ uint64_t *zi_interval; /* execute every <interval> seconds */
+ uint64_t zi_call_count; /* per-pass count */
+ uint64_t zi_call_time; /* per-pass time */
+ uint64_t zi_call_next; /* next time to call this function */
+} ztest_info_t;
/*
* Note: these aren't static because we want dladdr() to work.
*/
ztest_func_t ztest_dmu_read_write;
-ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
+ztest_func_t ztest_zap_parallel;
+ztest_func_t ztest_zil_commit;
+ztest_func_t ztest_dmu_read_write_zcopy;
+ztest_func_t ztest_dmu_objset_create_destroy;
+ztest_func_t ztest_dmu_prealloc;
ztest_func_t ztest_fzap;
-ztest_func_t ztest_zap_parallel;
-ztest_func_t ztest_traverse;
+ztest_func_t ztest_dmu_snapshot_create_destroy;
ztest_func_t ztest_dsl_prop_get_set;
-ztest_func_t ztest_dmu_objset_create_destroy;
-ztest_func_t ztest_dmu_snapshot_create_destroy;
-ztest_func_t ztest_dsl_dataset_promote_busy;
+ztest_func_t ztest_spa_prop_get_set;
ztest_func_t ztest_spa_create_destroy;
ztest_func_t ztest_fault_inject;
+ztest_func_t ztest_ddt_repair;
+ztest_func_t ztest_dmu_snapshot_hold;
ztest_func_t ztest_spa_rename;
+ztest_func_t ztest_scrub;
+ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_vdev_attach_detach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
ztest_func_t ztest_vdev_aux_add_remove;
-ztest_func_t ztest_scrub;
-ztest_func_t ztest_dmu_snapshot_hold;
-
-typedef struct ztest_info {
- ztest_func_t *zi_func; /* test function */
- uint64_t zi_iters; /* iterations per execution */
- uint64_t *zi_interval; /* execute every <interval> seconds */
- uint64_t zi_calls; /* per-pass count */
- uint64_t zi_call_time; /* per-pass time */
- uint64_t zi_call_total; /* cumulative total */
- uint64_t zi_call_target; /* target cumulative total */
-} ztest_info_t;
-
-uint64_t zopt_always = 0; /* all the time */
-uint64_t zopt_often = 1; /* every second */
-uint64_t zopt_sometimes = 10; /* every 10 seconds */
-uint64_t zopt_rarely = 60; /* every 60 seconds */
+
+uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
+uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
+uint64_t zopt_often = 1ULL * NANOSEC; /* every second */
+uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */
+uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */
ztest_info_t ztest_info[] = {
{ ztest_dmu_read_write, 1, &zopt_always },
- { ztest_dmu_write_parallel, 30, &zopt_always },
+ { ztest_dmu_write_parallel, 10, &zopt_always },
{ ztest_dmu_object_alloc_free, 1, &zopt_always },
- { ztest_dmu_commit_callbacks, 10, &zopt_always },
+ { ztest_dmu_commit_callbacks, 1, &zopt_always },
{ ztest_zap, 30, &zopt_always },
- { ztest_fzap, 1, &zopt_always },
{ ztest_zap_parallel, 100, &zopt_always },
- { ztest_dmu_read_write_zcopy, 1, &zopt_sometimes },
- { ztest_dsl_prop_get_set, 1, &zopt_sometimes },
- { ztest_dmu_objset_create_destroy, 1, &zopt_sometimes },
- { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes },
- { ztest_spa_create_destroy, 1, &zopt_sometimes },
+ { ztest_zil_commit, 1, &zopt_incessant },
+ { ztest_dmu_read_write_zcopy, 1, &zopt_often },
+ { ztest_dmu_objset_create_destroy, 1, &zopt_often },
+ { ztest_dsl_prop_get_set, 1, &zopt_often },
+ { ztest_spa_prop_get_set, 1, &zopt_sometimes },
+#if 0
+ { ztest_dmu_prealloc, 1, &zopt_sometimes },
+#endif
+ { ztest_fzap, 1, &zopt_sometimes },
+ { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes },
+ { ztest_spa_create_destroy, 1, &zopt_sometimes },
{ ztest_fault_inject, 1, &zopt_sometimes },
+ { ztest_ddt_repair, 1, &zopt_sometimes },
{ ztest_dmu_snapshot_hold, 1, &zopt_sometimes },
{ ztest_spa_rename, 1, &zopt_rarely },
+ { ztest_scrub, 1, &zopt_rarely },
+ { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely },
{ ztest_vdev_attach_detach, 1, &zopt_rarely },
{ ztest_vdev_LUN_growth, 1, &zopt_rarely },
- { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely },
{ ztest_vdev_add_remove, 1, &zopt_vdevtime },
{ ztest_vdev_aux_add_remove, 1, &zopt_vdevtime },
- { ztest_scrub, 1, &zopt_vdevtime },
};
#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
-#define ZTEST_SYNC_LOCKS 16
-
/*
* The following struct is used to hold a list of uncalled commit callbacks.
- *
* The callbacks are ordered by txg number.
*/
typedef struct ztest_cb_list {
@@ -244,28 +303,34 @@
* Stuff we need to share writably between parent and child.
*/
typedef struct ztest_shared {
- mutex_t zs_vdev_lock;
- rwlock_t zs_name_lock;
+ /* pool name and the spa pointer used by the tests */
+ char *zs_pool;
+ spa_t *zs_spa;
+ hrtime_t zs_proc_start;
+ hrtime_t zs_proc_stop;
+ hrtime_t zs_thread_start;
+ hrtime_t zs_thread_stop;
+ hrtime_t zs_thread_kill;
+ /* incremented by ztest_record_enospc() */
+ uint64_t zs_enospc_count;
 uint64_t zs_vdev_next_leaf;
 uint64_t zs_vdev_aux;
- uint64_t zs_enospc_count;
- hrtime_t zs_start_time;
- hrtime_t zs_stop_time;
+ /* zs_alloc and zs_space are filled in by ztest_kill() */
 uint64_t zs_alloc;
 uint64_t zs_space;
+ mutex_t zs_vdev_lock;
+ rwlock_t zs_name_lock;
 ztest_info_t zs_info[ZTEST_FUNCS];
- mutex_t zs_sync_lock[ZTEST_SYNC_LOCKS];
- uint64_t zs_seq[ZTEST_SYNC_LOCKS];
+ /* flexible array member: must remain the last field */
+ ztest_ds_t zs_zd[];
} ztest_shared_t;
+/* All-ones uint64_t; parenthesized so it expands safely in any expression. */
+#define ID_PARALLEL (-1ULL)
+
static char ztest_dev_template[] = "%s/%s.%llua";
static char ztest_aux_template[] = "%s/%s.%s.%llu";
-static ztest_shared_t *ztest_shared;
+ztest_shared_t *ztest_shared;
+uint64_t *ztest_seq;
static int ztest_random_fd;
static int ztest_dump_core = 1;
-static uint64_t metaslab_sz;
static boolean_t ztest_exiting;
/* Global commit callback list */
@@ -273,13 +338,13 @@
extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
-
-#define ZTEST_DIROBJ 1
-#define ZTEST_MICROZAP_OBJ 2
-#define ZTEST_FATZAP_OBJ 3
-
-#define ZTEST_DIROBJ_BLOCKSIZE (1 << 10)
-#define ZTEST_DIRSIZE 256
+static uint64_t metaslab_sz;
+
+/*
+ * Well-known object numbers.  ZTEST_DIROBJ is the directory object in
+ * which test objects are named; ZTEST_OBJECTS is the number of entries.
+ */
+enum ztest_object {
+ ZTEST_META_DNODE = 0,
+ ZTEST_DIROBJ,
+ ZTEST_OBJECTS
+};
static void usage(boolean_t) __NORETURN;
@@ -433,27 +498,6 @@
exit(requested ? 0 : 1);
}
-static uint64_t
-ztest_random(uint64_t range)
-{
- uint64_t r;
-
- if (range == 0)
- return (0);
-
- if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r))
- fatal(1, "short read from /dev/urandom");
-
- return (r % range);
-}
-
-/* ARGSUSED */
-static void
-ztest_record_enospc(char *s)
-{
- ztest_shared->zs_enospc_count++;
-}
-
static void
process_options(int argc, char **argv)
{
@@ -546,10 +590,40 @@
zopt_raidz_parity = MIN(zopt_raidz_parity, zopt_raidz - 1);
- zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time / zopt_vdevs : UINT64_MAX);
+ zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time * NANOSEC / zopt_vdevs :
+ UINT64_MAX >> 2);
zopt_maxfaults = MAX(zopt_mirrors, 1) * (zopt_raidz_parity + 1) - 1;
}
+/*
+ * Save the normal class's allocated and total space in the shared
+ * state, then terminate this process with SIGKILL.
+ */
+static void
+ztest_kill(ztest_shared_t *zs)
+{
+	spa_t *spa = zs->zs_spa;
+
+	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+	zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
+	(void) kill(getpid(), SIGKILL);
+}
+
+/*
+ * Return a value in [0, range) derived from 64 bits read from
+ * ztest_random_fd.  A range of zero always yields zero without reading.
+ */
+static uint64_t
+ztest_random(uint64_t range)
+{
+	uint64_t bits;
+
+	if (range != 0) {
+		if (read(ztest_random_fd, &bits, sizeof (bits)) !=
+		    sizeof (bits))
+			fatal(1, "short read from /dev/urandom");
+
+		return (bits % range);
+	}
+
+	return (0);
+}
+
+/*
+ * Count one more ENOSPC event in the shared state.  The string argument
+ * is accepted but not used.
+ */
+/* ARGSUSED */
+static void
+ztest_record_enospc(const char *s)
+{
+	ztest_shared_t *zs = ztest_shared;
+
+	zs->zs_enospc_count++;
+}
+
static uint64_t
ztest_get_ashift(void)
{
@@ -687,100 +761,805 @@
return (root);
}
-static void
-ztest_set_random_blocksize(objset_t *os, uint64_t object, dmu_tx_t *tx)
+static int
+ztest_random_blocksize(void)
{
- int bs = SPA_MINBLOCKSHIFT +
- ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1);
- int ibs = DN_MIN_INDBLKSHIFT +
- ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1);
- int error;
-
- error = dmu_object_set_blocksize(os, object, 1ULL << bs, ibs, tx);
- if (error) {
- char osname[300];
- dmu_objset_name(os, osname);
- fatal(0, "dmu_object_set_blocksize('%s', %llu, %d, %d) = %d",
- osname, object, 1 << bs, ibs, error);
- }
-}
-
-static uint8_t
-ztest_random_checksum(void)
-{
- uint8_t checksum;
-
- do {
- checksum = ztest_random(ZIO_CHECKSUM_FUNCTIONS);
- } while (zio_checksum_table[checksum].ci_zbt);
-
- if (checksum == ZIO_CHECKSUM_OFF)
- checksum = ZIO_CHECKSUM_ON;
-
- return (checksum);
-}
-
-static uint8_t
-ztest_random_compress(void)
-{
- return ((uint8_t)ztest_random(ZIO_COMPRESS_FUNCTIONS));
+ return (1 << (SPA_MINBLOCKSHIFT +
+ ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)));
}
static int
-ztest_replay_create(objset_t *os, lr_create_t *lr, boolean_t byteswap)
+ztest_random_ibshift(void)
+{
+ return (DN_MIN_INDBLKSHIFT +
+ ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
+}
+
+/*
+ * Pick a random child index of the root vdev, retrying until the chosen
+ * child is not a hole, is not a log device (unless log_ok is set), and
+ * has a metaslab group with a class.  The ASSERT requires that a config
+ * lock is held on entry.
+ */
+static uint64_t
+ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
{
- dmu_tx_t *tx;
+ uint64_t top;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *tvd;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+ do {
+ top = ztest_random(rvd->vdev_children);
+ tvd = rvd->vdev_child[top];
+ } while (tvd->vdev_ishole || (tvd->vdev_islog && !log_ok) ||
+ tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);
+
+ return (top);
+}
+
+/*
+ * Return a random value for the given dataset property.  For the
+ * checksum property, ZIO_CHECKSUM_OFF is never returned.
+ */
+static uint64_t
+ztest_random_dsl_prop(zfs_prop_t prop)
+{
+	for (;;) {
+		uint64_t pick =
+		    zfs_prop_random_value(prop, ztest_random(-1ULL));
+
+		if (prop != ZFS_PROP_CHECKSUM || pick != ZIO_CHECKSUM_OFF)
+			return (pick);
+	}
+}
+
+/*
+ * Set a uint64 dataset property on osname (passing zero as the element
+ * count when 'inherit' is set), then read it back.  ENOSPC is recorded
+ * and returned; any other failure trips the ASSERT.  At verbosity 6 and
+ * above the resulting value and its setpoint are printed.
+ */
+static int
+ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
+ boolean_t inherit)
+{
+ const char *propname = zfs_prop_to_name(prop);
+ const char *valname;
+ char setpoint[MAXPATHLEN];
+ uint64_t curval;
 int error;
- if (byteswap)
- byteswap_uint64_array(lr, sizeof (*lr));
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
+ error = dsl_prop_set(osname, propname, sizeof (value),
+ inherit ? 0 : 1, &value);
+
+ if (error == ENOSPC) {
+ ztest_record_enospc(FTAG);
 return (error);
 }
-
- error = dmu_object_claim(os, lr->lr_doid, lr->lr_mode, 0,
- DMU_OT_NONE, 0, tx);
 ASSERT3U(error, ==, 0);
- dmu_tx_commit(tx);
-
- if (zopt_verbose >= 5) {
- char osname[MAXNAMELEN];
- dmu_objset_name(os, osname);
- (void) printf("replay create of %s object %llu"
- " in txg %llu = %d\n",
- osname, (u_longlong_t)lr->lr_doid,
- (u_longlong_t)dmu_tx_get_txg(tx), error);
+
+ /* Read the property back to learn the current value and setpoint. */
+ VERIFY3U(dsl_prop_get(osname, propname, sizeof (curval),
+ 1, &curval, setpoint), ==, 0);
+
+ if (zopt_verbose >= 6) {
+ VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0);
+ (void) printf("%s %s = %s at '%s'\n",
+ osname, propname, valname, setpoint);
 }
 return (error);
}
+#if 0
+/*
+ * Currently compiled out.  Sets a single uint64 pool property through
+ * spa_prop_set() using a one-entry nvlist.  ENOSPC is recorded and
+ * returned; any other failure trips the ASSERT.
+ */
 static int
-ztest_replay_remove(objset_t *os, lr_remove_t *lr, boolean_t byteswap)
+ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop, uint64_t value)
+{
+ spa_t *spa = zs->zs_spa;
+ nvlist_t *props = NULL;
+ int error;
+
+ VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0);
+
+ error = spa_prop_set(spa, props);
+
+ nvlist_free(props);
+
+ if (error == ENOSPC) {
+ ztest_record_enospc(FTAG);
+ return (error);
+ }
+ ASSERT3U(error, ==, 0);
+
+ return (error);
+}
+#endif
+
+/*
+ * Put a reader/writer lock into the unowned state and initialize its
+ * mutex and condition variable.
+ */
+static void
+ztest_rll_init(rll_t *rll)
+{
+	rll->rll_readers = 0;
+	rll->rll_writer = NULL;
+
+	VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0);
+	VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0);
+}
+
+/*
+ * Tear down a reader/writer lock.  The lock is asserted to have no
+ * writer and no readers.
+ */
+static void
+ztest_rll_destroy(rll_t *rll)
+{
+	/* The lock must be idle before its primitives are destroyed. */
+	ASSERT(rll->rll_writer == NULL);
+	ASSERT(rll->rll_readers == 0);
+
+	VERIFY(_mutex_destroy(&rll->rll_lock) == 0);
+	VERIFY(cond_destroy(&rll->rll_cv) == 0);
+}
+
+/*
+ * Acquire a reader/writer lock, blocking on the condition variable
+ * until the requested kind of ownership is available.
+ */
+static void
+ztest_rll_lock(rll_t *rll, rl_type_t type)
+{
+	VERIFY(mutex_lock(&rll->rll_lock) == 0);
+
+	if (type != RL_READER) {
+		/* A writer waits until there is no owner of either kind. */
+		while (rll->rll_readers || rll->rll_writer != NULL)
+			(void) cond_wait(&rll->rll_cv, &rll->rll_lock);
+		rll->rll_writer = curthread;
+	} else {
+		/* A reader only waits for the writer to leave. */
+		while (rll->rll_writer != NULL)
+			(void) cond_wait(&rll->rll_cv, &rll->rll_lock);
+		rll->rll_readers++;
+	}
+
+	VERIFY(mutex_unlock(&rll->rll_lock) == 0);
+}
+
+/*
+ * Release one hold on a reader/writer lock.  If a writer is recorded
+ * the writer hold is dropped; otherwise one reader hold is dropped.
+ * Waiters are woken once the lock has no owner.
+ */
+static void
+ztest_rll_unlock(rll_t *rll)
+{
+	VERIFY(mutex_lock(&rll->rll_lock) == 0);
+
+	if (!rll->rll_writer) {
+		ASSERT(rll->rll_readers != 0);
+		ASSERT(rll->rll_writer == NULL);
+		rll->rll_readers--;
+	} else {
+		ASSERT(rll->rll_readers == 0);
+		rll->rll_writer = NULL;
+	}
+
+	if (rll->rll_readers == 0 && rll->rll_writer == NULL)
+		VERIFY(cond_broadcast(&rll->rll_cv) == 0);
+
+	VERIFY(mutex_unlock(&rll->rll_lock) == 0);
+}
+
+/*
+ * Take the per-dataset lock that the object number maps to.
+ */
+static void
+ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
+{
+	uint64_t slot = object & (ZTEST_OBJECT_LOCKS - 1);
+
+	ztest_rll_lock(&zd->zd_object_lock[slot], type);
+}
+
+/*
+ * Drop the per-dataset lock that the object number maps to.
+ */
+static void
+ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
+{
+	uint64_t slot = object & (ZTEST_OBJECT_LOCKS - 1);
+
+	ztest_rll_unlock(&zd->zd_object_lock[slot]);
+}
+
+/*
+ * Allocate a range-lock record for (object, offset, size) and acquire
+ * the hashed per-dataset range lock it maps to.  The record is released
+ * with ztest_range_unlock().
+ */
+static rl_t *
+ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
+    uint64_t size, rl_type_t type)
+{
+	uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
+	rl_t *rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
+
+	rl->rl_lock = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
+	rl->rl_object = object;
+	rl->rl_offset = offset;
+	rl->rl_size = size;
+
+	ztest_rll_lock(rl->rl_lock, type);
+
+	return (rl);
+}
+
+/*
+ * Release the lock referenced by a range-lock record and free the record.
+ */
+static void
+ztest_range_unlock(rl_t *rl)
+{
+	ztest_rll_unlock(rl->rl_lock);
+	umem_free(rl, sizeof (*rl));
+}
+
+/*
+ * Bind a ztest_ds_t to an objset: record the objset, its ZIL and its
+ * name, reset the sequence number, and initialize every lock.
+ */
+static void
+ztest_zd_init(ztest_ds_t *zd, objset_t *os)
+{
+	int i;
+
+	zd->zd_os = os;
+	zd->zd_zilog = dmu_objset_zil(os);
+	zd->zd_seq = 0;
+	dmu_objset_name(os, zd->zd_name);
+
+	VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0);
+
+	for (i = 0; i < ZTEST_OBJECT_LOCKS; i++)
+		ztest_rll_init(&zd->zd_object_lock[i]);
+
+	for (i = 0; i < ZTEST_RANGE_LOCKS; i++)
+		ztest_rll_init(&zd->zd_range_lock[i]);
+}
+
+/*
+ * Destroy every lock that ztest_zd_init() set up.
+ */
+static void
+ztest_zd_fini(ztest_ds_t *zd)
+{
+	int i;
+
+	VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0);
+
+	for (i = 0; i < ZTEST_OBJECT_LOCKS; i++)
+		ztest_rll_destroy(&zd->zd_object_lock[i]);
+
+	for (i = 0; i < ZTEST_RANGE_LOCKS; i++)
+		ztest_rll_destroy(&zd->zd_range_lock[i]);
+}
+
+#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
+
+/*
+ * Assign tx to a transaction group using txg_how and return the txg.
+ * On failure the tx is aborted and 0 is returned: ERESTART (asserted to
+ * occur only with TXG_NOWAIT) first waits via dmu_tx_wait(); any other
+ * error is asserted to be ENOSPC and is recorded against 'tag'.
+ */
+static uint64_t
+ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
{
+ uint64_t txg;
+ int error;
+
+ /*
+ * Attempt to assign tx to some transaction group.
+ */
+ error = dmu_tx_assign(tx, txg_how);
+ if (error) {
+ if (error == ERESTART) {
+ ASSERT(txg_how == TXG_NOWAIT);
+ dmu_tx_wait(tx);
+ } else {
+ ASSERT3U(error, ==, ENOSPC);
+ ztest_record_enospc(tag);
+ }
+ dmu_tx_abort(tx);
+ return (0);
+ }
+ txg = dmu_tx_get_txg(tx);
+ ASSERT(txg != 0);
+ return (txg);
+}
+
+/*
+ * Fill a buffer with a repeated 64-bit value, one word at a time.
+ */
+static void
+ztest_pattern_set(void *buf, uint64_t size, uint64_t value)
+{
+	uint64_t *end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
+	uint64_t *cur;
+
+	for (cur = buf; cur < end; cur++)
+		*cur = value;
+}
+
+/*
+ * Report whether every 64-bit word of the buffer equals 'value'.  All
+ * words are examined; there is no early exit.
+ */
+static boolean_t
+ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
+{
+	uint64_t *end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
+	uint64_t mismatch = 0;
+	uint64_t *cur;
+
+	for (cur = buf; cur < end; cur++)
+		mismatch |= (value - *cur);
+
+	return (mismatch == 0);
+}
+
+/*
+ * Fill in a block tag with the supplied identity and version fields.
+ */
+static void
+ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+    uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+{
+	/* Where the block lives. */
+	bt->bt_magic = BT_MAGIC;
+	bt->bt_objset = dmu_objset_id(os);
+	bt->bt_object = object;
+	bt->bt_offset = offset;
+
+	/* Which version of it this is. */
+	bt->bt_crtxg = crtxg;
+	bt->bt_txg = txg;
+	bt->bt_gen = gen;
+}
+
+/*
+ * Assert that a block tag carries the expected magic, objset, object,
+ * offset and creation txg, and that its gen and txg do not exceed the
+ * supplied bounds.  Every check is an ASSERT.
+ */
+static void
+ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+ uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
+{
+ ASSERT(bt->bt_magic == BT_MAGIC);
+ ASSERT(bt->bt_objset == dmu_objset_id(os));
+ ASSERT(bt->bt_object == object);
+ ASSERT(bt->bt_offset == offset);
+ ASSERT(bt->bt_gen <= gen);
+ ASSERT(bt->bt_txg <= txg);
+ ASSERT(bt->bt_crtxg == crtxg);
+}
+
+/*
+ * Return a pointer to the block tag stored in the last
+ * sizeof (ztest_block_tag_t) bytes of the bonus buffer's in-use area.
+ */
+static ztest_block_tag_t *
+ztest_bt_bonus(dmu_buf_t *db)
+{
+	dmu_object_info_t doi;
+	char *bonus_end;
+
+	dmu_object_info_from_db(db, &doi);
+	ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
+	ASSERT3U(doi.doi_bonus_size, >=, sizeof (ztest_block_tag_t));
+
+	bonus_end = (char *)db->db_data + doi.doi_bonus_size;
+
+	return ((void *)(bonus_end - sizeof (ztest_block_tag_t)));
+}
+
+/*
+ * ZIL logging ops
+ */
+
+#define lrz_type lr_mode
+#define lrz_blocksize lr_uid
+#define lrz_ibshift lr_gid
+#define lrz_bonustype lr_rdev
+#define lrz_bonuslen lr_crtime[1]
+
+/*
+ * Log a TX_CREATE record (the lr body plus the trailing name) to the
+ * dataset's ZIL.  Returns 0 without logging if the ZIL is replaying.
+ */
+static uint64_t
+ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
+{
+	char *name = (void *)(lr + 1);		/* name follows lr */
+	size_t reclen = sizeof (*lr) + strlen(name) + 1;
+	itx_t *itx;
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return (0);
+
+	itx = zil_itx_create(TX_CREATE, reclen);
+
+	/* Copy everything after the common header, name included. */
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, reclen - sizeof (lr_t));
+
+	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+}
+
+/*
+ * Log a TX_REMOVE record (the lr body plus the trailing name) to the
+ * dataset's ZIL.  Returns 0 without logging if the ZIL is replaying.
+ */
+static uint64_t
+ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr)
+{
+	char *name = (void *)(lr + 1);		/* name follows lr */
+	size_t reclen = sizeof (*lr) + strlen(name) + 1;
+	itx_t *itx;
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return (0);
+
+	itx = zil_itx_create(TX_REMOVE, reclen);
+
+	/* Copy everything after the common header, name included. */
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, reclen - sizeof (lr_t));
+
+	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+}
+
+/*
+ * Log a TX_WRITE record to the dataset's ZIL using a randomly chosen
+ * write state.  Returns 0 without logging if the ZIL is replaying.
+ */
+static uint64_t
+ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
+{
+ itx_t *itx;
+ itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);
+
+ if (zil_replaying(zd->zd_zilog, tx))
+ return (0);
+
+ /* Writes larger than ZIL_MAX_LOG_DATA are always logged indirectly. */
+ if (lr->lr_length > ZIL_MAX_LOG_DATA)
+ write_state = WR_INDIRECT;
+
+ /* Only WR_COPIED reserves room for the data after the record. */
+ itx = zil_itx_create(TX_WRITE,
+ sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));
+
+ /*
+ * For WR_COPIED, read the data into the itx right after the record.
+ * If that read fails, start over with a record-only itx and fall
+ * back to WR_NEED_COPY.
+ */
+ if (write_state == WR_COPIED &&
+ dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
+ ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
+ zil_itx_destroy(itx);
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+ write_state = WR_NEED_COPY;
+ }
+ itx->itx_private = zd;
+ itx->itx_wr_state = write_state;
+ itx->itx_sync = (ztest_random(8) == 0);
+ itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);
+
+ /* Copy the record body that follows the common header. */
+ bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ sizeof (*lr) - sizeof (lr_t));
+
+ return (zil_itx_assign(zd->zd_zilog, itx, tx));
+}
+
+/*
+ * Log a TX_TRUNCATE record to the dataset's ZIL.  Returns 0 without
+ * logging if the ZIL is replaying.
+ */
+static uint64_t
+ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
+{
+	size_t reclen = sizeof (*lr);
+	itx_t *itx;
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return (0);
+
+	itx = zil_itx_create(TX_TRUNCATE, reclen);
+
+	/* Copy the record body that follows the common header. */
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, reclen - sizeof (lr_t));
+
+	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+}
+
+/*
+ * Log a TX_SETATTR record to the dataset's ZIL.  Returns 0 without
+ * logging if the ZIL is replaying.
+ */
+static uint64_t
+ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
+{
+	size_t reclen = sizeof (*lr);
+	itx_t *itx;
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return (0);
+
+	itx = zil_itx_create(TX_SETATTR, reclen);
+
+	/* Copy the record body that follows the common header. */
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, reclen - sizeof (lr_t));
+
+	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+}
+
+/*
+ * ZIL replay ops
+ */
+/*
+ * Create the object described by lr and enter it, under the name that
+ * follows lr, in the directory lr_doid.  An lr_foid of 0 allocates a new
+ * object and stores its number back into lr_foid; a nonzero lr_foid
+ * claims that specific object.  Returns ENOSPC if the tx cannot be
+ * assigned, the claim error (asserted to be EEXIST) if the claim fails,
+ * and 0 on success.
+ */
+static int
+ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap)
+{
+ char *name = (void *)(lr + 1); /* name follows lr */
+ objset_t *os = zd->zd_os;
+ ztest_block_tag_t *bbt;
+ dmu_buf_t *db;
 dmu_tx_t *tx;
- int error;
+ uint64_t txg;
+ int error = 0;
 if (byteswap)
 byteswap_uint64_array(lr, sizeof (*lr));
+ ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+ ASSERT(name[0] != '\0');
+
 tx = dmu_tx_create(os);
- dmu_tx_hold_free(tx, lr->lr_doid, 0, DMU_OBJECT_END);
- error = dmu_tx_assign(tx, TXG_WAIT);
+
+ dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);
+
+ if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ } else {
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ }
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0)
+ return (ENOSPC);
+
+ /* A preset lr_foid is expected exactly when the ZIL is replaying. */
+ ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);
+
+ if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+ if (lr->lr_foid == 0) {
+ lr->lr_foid = zap_create(os,
+ lr->lrz_type, lr->lrz_bonustype,
+ lr->lrz_bonuslen, tx);
+ } else {
+ error = zap_create_claim(os, lr->lr_foid,
+ lr->lrz_type, lr->lrz_bonustype,
+ lr->lrz_bonuslen, tx);
+ }
+ } else {
+ if (lr->lr_foid == 0) {
+ lr->lr_foid = dmu_object_alloc(os,
+ lr->lrz_type, 0, lr->lrz_bonustype,
+ lr->lrz_bonuslen, tx);
+ } else {
+ error = dmu_object_claim(os, lr->lr_foid,
+ lr->lrz_type, 0, lr->lrz_bonustype,
+ lr->lrz_bonuslen, tx);
+ }
+ }
+
 if (error) {
- dmu_tx_abort(tx);
+ ASSERT3U(error, ==, EEXIST);
+ ASSERT(zd->zd_zilog->zl_replay);
+ dmu_tx_commit(tx);
 return (error);
 }
- error = dmu_object_free(os, lr->lr_doid, tx);
+ ASSERT(lr->lr_foid != 0);
+
+ if (lr->lrz_type != DMU_OT_ZAP_OTHER)
+ VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
+ lr->lrz_blocksize, lr->lrz_ibshift, tx));
+
+ /* Stamp the new object's bonus buffer with its block tag. */
+ VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+ bbt = ztest_bt_bonus(db);
+ dmu_buf_will_dirty(db, tx);
+ ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg);
+ dmu_buf_rele(db, FTAG);
+
+ VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
+ &lr->lr_foid, tx));
+
+ (void) ztest_log_create(zd, tx, lr);
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * Remove the object entered under the name that follows lr in the
+ * directory lr_doid: look up its object number, free the object (via
+ * zap_destroy() for DMU_OT_ZAP_OTHER, dmu_object_free() otherwise), and
+ * delete the directory entry.  The object is write-locked throughout.
+ * Returns ENOSPC if the tx cannot be assigned, 0 on success.
+ */
+static int
+ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap)
+{
+ char *name = (void *)(lr + 1); /* name follows lr */
+ objset_t *os = zd->zd_os;
+ dmu_object_info_t doi;
+ dmu_tx_t *tx;
+ uint64_t object, txg;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+ ASSERT(name[0] != '\0');
+
+ VERIFY3U(0, ==,
+ zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
+ ASSERT(object != 0);
+
+ ztest_object_lock(zd, object, RL_WRITER);
+
+ VERIFY3U(0, ==, dmu_object_info(os, object, &doi));
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
+ dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ ztest_object_unlock(zd, object);
+ return (ENOSPC);
+ }
+
+ if (doi.doi_type == DMU_OT_ZAP_OTHER) {
+ VERIFY3U(0, ==, zap_destroy(os, object, tx));
+ } else {
+ VERIFY3U(0, ==, dmu_object_free(os, object, tx));
+ }
+
+ VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));
+
+ (void) ztest_log_remove(zd, tx, lr);
+
 dmu_tx_commit(tx);
- return (error);
+ ztest_object_unlock(zd, object);
+
+ return (0);
+}
+
+/*
+ * Write the data that follows lr to object lr_foid at lr_offset.  The
+ * object is read-locked and the byte range write-locked for the duration.
+ * If the data begins with a block tag, the tag is checked and then
+ * regenerated from the bonus buffer's gen/crtxg and the current txg
+ * before the write.  One write in eight that covers exactly one aligned
+ * data block goes through a loaned ARC buffer instead of dmu_write().
+ * Returns ENOSPC if the tx cannot be assigned, 0 on success.
+ */
+static int
+ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
+{
+ objset_t *os = zd->zd_os;
+ void *data = lr + 1; /* data follows lr */
+ uint64_t offset, length;
+ ztest_block_tag_t *bt = data;
+ ztest_block_tag_t *bbt;
+ uint64_t gen, txg, lrtxg, crtxg;
+ dmu_object_info_t doi;
+ dmu_tx_t *tx;
+ dmu_buf_t *db;
+ arc_buf_t *abuf = NULL;
+ rl_t *rl;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+
+ /* If it's a dmu_sync() block, write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ if (length < blocksize) {
+ offset -= offset % blocksize;
+ length = blocksize;
+ }
+ }
+
+ /* Accept a byte-swapped tag; treat anything else as untagged data. */
+ if (bt->bt_magic == BSWAP_64(BT_MAGIC))
+ byteswap_uint64_array(bt, sizeof (*bt));
+
+ if (bt->bt_magic != BT_MAGIC)
+ bt = NULL;
+
+ ztest_object_lock(zd, lr->lr_foid, RL_READER);
+ rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+ dmu_object_info_from_db(db, &doi);
+
+ bbt = ztest_bt_bonus(db);
+ ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+ gen = bbt->bt_gen;
+ crtxg = bbt->bt_crtxg;
+ lrtxg = lr->lr_common.lrc_txg;
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_write(tx, lr->lr_foid, offset, length);
+
+ /* Occasionally borrow an ARC buffer for a whole, aligned block. */
+ if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
+ P2PHASE(offset, length) == 0)
+ abuf = dmu_request_arcbuf(db, length);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ if (abuf != NULL)
+ dmu_return_arcbuf(abuf);
+ dmu_buf_rele(db, FTAG);
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, lr->lr_foid);
+ return (ENOSPC);
+ }
+
+ if (bt != NULL) {
+ /*
+ * Usually, verify the old data before writing new data --
+ * but not always, because we also want to verify correct
+ * behavior when the data was not recently read into cache.
+ */
+ ASSERT(offset % doi.doi_data_block_size == 0);
+ if (ztest_random(4) != 0) {
+ int prefetch = ztest_random(2) ?
+ DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
+ ztest_block_tag_t rbt;
+
+ VERIFY(dmu_read(os, lr->lr_foid, offset,
+ sizeof (rbt), &rbt, prefetch) == 0);
+ if (rbt.bt_magic == BT_MAGIC) {
+ ztest_bt_verify(&rbt, os, lr->lr_foid,
+ offset, gen, txg, crtxg);
+ }
+ }
+
+ /*
+ * Writes can appear to be newer than the bonus buffer because
+ * the ztest_get_data() callback does a dmu_read() of the
+ * open-context data, which may be different than the data
+ * as it was when the write was generated.
+ */
+ if (zd->zd_zilog->zl_replay) {
+ ztest_bt_verify(bt, os, lr->lr_foid, offset,
+ MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
+ bt->bt_crtxg);
+ }
+
+ /*
+ * Set the bt's gen/txg to the bonus buffer's gen/txg
+ * so that all of the usual ASSERTs will work.
+ */
+ ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg);
+ }
+
+ if (abuf == NULL) {
+ dmu_write(os, lr->lr_foid, offset, length, data, tx);
+ } else {
+ bcopy(data, abuf->b_data, length);
+ dmu_assign_arcbuf(db, offset, abuf, tx);
+ }
+
+ (void) ztest_log_write(zd, tx, lr);
+
+ dmu_buf_rele(db, FTAG);
+
+ dmu_tx_commit(tx);
+
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, lr->lr_foid);
+
+ return (0);
+}
+
+/*
+ * Free the range [lr_offset, lr_offset + lr_length) of object lr_foid
+ * and log the truncate.  The object is read-locked and the range
+ * write-locked for the duration.  Returns ENOSPC if the tx cannot be
+ * assigned, 0 on success.
+ */
+static int
+ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap)
+{
+ objset_t *os = zd->zd_os;
+ dmu_tx_t *tx;
+ uint64_t txg;
+ rl_t *rl;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ ztest_object_lock(zd, lr->lr_foid, RL_READER);
+ rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
+ RL_WRITER);
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, lr->lr_foid);
+ return (ENOSPC);
+ }
+
+ VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
+ lr->lr_length, tx) == 0);
+
+ (void) ztest_log_truncate(zd, tx, lr);
+
+ dmu_tx_commit(tx);
+
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, lr->lr_foid);
+
+ return (0);
+}
+
+/*
+ * Resize the bonus buffer of object lr_foid to lr_size and rewrite its
+ * block tag with generation lr_mode.  When the ZIL is not replaying,
+ * lr_size is chosen at random and lr_mode is set to the current
+ * generation plus one; during replay both come from the log record.
+ * The object is write-locked for the duration.  Returns ENOSPC if the
+ * tx cannot be assigned, 0 on success.
+ */
+static int
+ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap)
+{
+ objset_t *os = zd->zd_os;
+ dmu_tx_t *tx;
+ dmu_buf_t *db;
+ ztest_block_tag_t *bbt;
+ uint64_t txg, lrtxg, crtxg;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ ztest_object_lock(zd, lr->lr_foid, RL_WRITER);
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, lr->lr_foid);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ dmu_buf_rele(db, FTAG);
+ ztest_object_unlock(zd, lr->lr_foid);
+ return (ENOSPC);
+ }
+
+ bbt = ztest_bt_bonus(db);
+ ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+ crtxg = bbt->bt_crtxg;
+ lrtxg = lr->lr_common.lrc_txg;
+
+ if (zd->zd_zilog->zl_replay) {
+ ASSERT(lr->lr_size != 0);
+ ASSERT(lr->lr_mode != 0);
+ ASSERT(lrtxg != 0);
+ } else {
+ /*
+ * Randomly change the size and increment the generation.
+ */
+ lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
+ sizeof (*bbt);
+ lr->lr_mode = bbt->bt_gen + 1;
+ ASSERT(lrtxg == 0);
+ }
+
+ /*
+ * Verify that the current bonus buffer is not newer than our txg.
+ */
+ ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode,
+ MAX(txg, lrtxg), crtxg);
+
+ dmu_buf_will_dirty(db, tx);
+
+ ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
+ ASSERT3U(lr->lr_size, <=, db->db_size);
+ VERIFY3U(dmu_set_bonus(db, lr->lr_size, tx), ==, 0);
+ /* The tag sits at the end of the bonus area, so locate it again. */
+ bbt = ztest_bt_bonus(db);
+
+ ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg);
+
+ dmu_buf_rele(db, FTAG);
+
+ (void) ztest_log_setattr(zd, tx, lr);
+
+ dmu_tx_commit(tx);
+
+ ztest_object_unlock(zd, lr->lr_foid);
+
+ return (0);
 }
zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
@@ -793,9 +1572,9 @@
NULL, /* TX_RMDIR */
NULL, /* TX_LINK */
NULL, /* TX_RENAME */
- NULL, /* TX_WRITE */
- NULL, /* TX_TRUNCATE */
- NULL, /* TX_SETATTR */
+ ztest_replay_write, /* TX_WRITE */
+ ztest_replay_truncate, /* TX_TRUNCATE */
+ ztest_replay_setattr, /* TX_SETATTR */
NULL, /* TX_ACL */
NULL, /* TX_CREATE_ACL */
NULL, /* TX_CREATE_ATTR */
@@ -807,13 +1586,472 @@
};
/*
+ * ZIL get_data callbacks
+ */
+
+/*
+ * Finish a get_data request: release the held buffer (if any), drop the
+ * range and object locks, add the block to the ZIL when the request
+ * succeeded and a block pointer was recorded, and free the zgd.
+ */
+static void
+ztest_get_done(zgd_t *zgd, int error)
+{
+	ztest_ds_t *zd = zgd->zgd_private;
+	rl_t *rl = zgd->zgd_rl;
+	uint64_t object = rl->rl_object;
+
+	if (zgd->zgd_db)
+		dmu_buf_rele(zgd->zgd_db, zgd);
+
+	/* The object number was saved above because this frees rl. */
+	ztest_range_unlock(rl);
+	ztest_object_unlock(zd, object);
+
+	if (error == 0 && zgd->zgd_bp)
+		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+
+	umem_free(zgd, sizeof (*zgd));
+}
+
+/*
+ * Supply the data for a logged write.  Returns ENOENT if the object's
+ * bonus block tag shows a creation txg of 0 or one later than the
+ * record's txg.  With a non-NULL buf the data is read straight into buf
+ * (immediate write); otherwise the containing block is held and handed
+ * to dmu_sync(), with ztest_get_done() as the completion callback.  On
+ * every path except a successful dmu_sync(), ztest_get_done() is called
+ * here to release locks and free the zgd.
+ */
+static int
+ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+{
+ ztest_ds_t *zd = arg;
+ objset_t *os = zd->zd_os;
+ uint64_t object = lr->lr_foid;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length;
+ blkptr_t *bp = &lr->lr_blkptr;
+ uint64_t txg = lr->lr_common.lrc_txg;
+ uint64_t crtxg;
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ zgd_t *zgd;
+ int error;
+
+ ztest_object_lock(zd, object, RL_READER);
+ error = dmu_bonus_hold(os, object, FTAG, &db);
+ if (error) {
+ ztest_object_unlock(zd, object);
+ return (error);
+ }
+
+ crtxg = ztest_bt_bonus(db)->bt_crtxg;
+
+ if (crtxg == 0 || crtxg > txg) {
+ dmu_buf_rele(db, FTAG);
+ ztest_object_unlock(zd, object);
+ return (ENOENT);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ dmu_buf_rele(db, FTAG);
+ db = NULL;
+
+ zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
+ zgd->zgd_zilog = zd->zd_zilog;
+ zgd->zgd_private = zd;
+
+ if (buf != NULL) { /* immediate write */
+ zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+ RL_READER);
+
+ error = dmu_read(os, object, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ ASSERT(error == 0);
+ } else {
+ /* Operate on the whole data block containing the offset. */
+ size = doi.doi_data_block_size;
+ if (ISP2(size))
+ offset = P2ALIGN(offset, size);
+
+ zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
+ RL_READER);
+
+ error = dmu_buf_hold(os, object, offset, zgd, &db);
+
+ if (error == 0) {
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ ztest_get_done, zgd);
+
+ /* On success, cleanup is left to the done callback. */
+ if (error == 0)
+ return (0);
+ }
+ }
+
+ ztest_get_done(zgd, error);
+
+ return (error);
+}
+
+/*
+ * Allocate a zeroed log record of lrsize bytes.  If name is non-NULL,
+ * room for the name and its terminator is added and the name is copied
+ * in immediately after the record.
+ */
+static void *
+ztest_lr_alloc(size_t lrsize, char *name)
+{
+	size_t namesize = 0;
+	char *rec;
+
+	if (name != NULL)
+		namesize = strlen(name) + 1;
+
+	rec = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);
+
+	if (name != NULL)
+		bcopy(name, rec + lrsize, namesize);
+
+	return (rec);
+}
+
+/*
+ * Free a log record allocated by ztest_lr_alloc() with the same lrsize
+ * and name.
+ */
+void
+ztest_lr_free(void *lr, size_t lrsize, char *name)
+{
+	size_t total = lrsize;
+
+	if (name != NULL)
+		total += strlen(name) + 1;
+
+	umem_free(lr, total);
+}
+
+/*
+ * Look up a bunch of objects by name.  For each one found, od_object,
+ * od_type, od_blocksize and od_gen are filled in from the object and
+ * its bonus block tag; for each one not found, od_object is left 0.
+ * The caller must hold zd_dirobj_lock.
+ * Returns the number of objects not found.
+ */
+static int
+ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+ int missing = 0;
+ int error;
+
+ ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+
+ for (int i = 0; i < count; i++, od++) {
+ od->od_object = 0;
+ error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
+ sizeof (uint64_t), 1, &od->od_object);
+ if (error) {
+ ASSERT(error == ENOENT);
+ ASSERT(od->od_object == 0);
+ missing++;
+ } else {
+ dmu_buf_t *db;
+ ztest_block_tag_t *bbt;
+ dmu_object_info_t doi;
+
+ ASSERT(od->od_object != 0);
+ ASSERT(missing == 0); /* there should be no gaps */
+
+ ztest_object_lock(zd, od->od_object, RL_READER);
+ VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
+ od->od_object, FTAG, &db));
+ dmu_object_info_from_db(db, &doi);
+ bbt = ztest_bt_bonus(db);
+ ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+ od->od_type = doi.doi_type;
+ od->od_blocksize = doi.doi_data_block_size;
+ od->od_gen = bbt->bt_gen;
+ dmu_buf_rele(db, FTAG);
+ ztest_object_unlock(zd, od->od_object);
+ }
+ }
+
+ return (missing);
+}
+
+/*
+ * Create the objects described by od[0..count-1] in order, using each
+ * entry's od_cr* fields.  After the first failure the remaining entries
+ * are not attempted: their od_object is set to 0 and they are counted
+ * as missing.  The caller must hold zd_dirobj_lock.
+ * Returns the number of objects not created.
+ */
+static int
+ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+ int missing = 0;
+
+ ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+
+ for (int i = 0; i < count; i++, od++) {
+ if (missing) {
+ od->od_object = 0;
+ missing++;
+ continue;
+ }
+
+ lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+
+ lr->lr_doid = od->od_dir;
+ lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */
+ lr->lrz_type = od->od_crtype;
+ lr->lrz_blocksize = od->od_crblocksize;
+ lr->lrz_ibshift = ztest_random_ibshift();
+ lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
+ lr->lrz_bonuslen = dmu_bonus_max();
+ lr->lr_gen = od->od_crgen;
+ lr->lr_crtime[0] = time(NULL);
+
+ if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
+ ASSERT(missing == 0);
+ od->od_object = 0;
+ missing++;
+ } else {
+ /* ztest_replay_create() stored the new number in lr_foid. */
+ od->od_object = lr->lr_foid;
+ od->od_type = od->od_crtype;
+ od->od_blocksize = od->od_crblocksize;
+ od->od_gen = od->od_crgen;
+ ASSERT(od->od_object != 0);
+ }
+
+ ztest_lr_free(lr, sizeof (*lr), od->od_name);
+ }
+
+ return (missing);
+}
+
+/*
+ * Remove the objects described by od[0..count-1], walking the array from
+ * the last entry to the first.  Entries whose od_object is 0 are skipped.
+ * After the first failure (asserted to be ENOSPC) every remaining entry
+ * is counted as missing without being attempted.  The caller must hold
+ * zd_dirobj_lock.  Returns the number of entries counted as missing.
+ */
+static int
+ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+ int missing = 0;
+ int error;
+
+ ASSERT(_mutex_held(&zd->zd_dirobj_lock));
+
+ od += count - 1;
+
+ for (int i = count - 1; i >= 0; i--, od--) {
+ if (missing) {
+ missing++;
+ continue;
+ }
+
+ if (od->od_object == 0)
+ continue;
+
+ lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+
+ lr->lr_doid = od->od_dir;
+
+ if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
+ ASSERT3U(error, ==, ENOSPC);
+ missing++;
+ } else {
+ od->od_object = 0;
+ }
+ ztest_lr_free(lr, sizeof (*lr), od->od_name);
+ }
+
+ return (missing);
+}
+
+/*
+ * Build a write record carrying a copy of 'data' and apply it through
+ * ztest_replay_write().  Returns that function's result.
+ */
+static int
+ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
+    void *data)
+{
+	size_t lrsize = sizeof (lr_write_t) + size;
+	lr_write_t *lr = ztest_lr_alloc(lrsize, NULL);
+	int error;
+
+	lr->lr_foid = object;
+	lr->lr_offset = offset;
+	lr->lr_length = size;
+	lr->lr_blkoff = 0;
+	BP_ZERO(&lr->lr_blkptr);
+
+	/* The payload lives directly after the record header. */
+	bcopy(data, lr + 1, size);
+
+	error = ztest_replay_write(zd, lr, B_FALSE);
+
+	ztest_lr_free(lr, lrsize, NULL);
+
+	return (error);
+}
+
+/*
+ * Build a truncate record for the given range and apply it through
+ * ztest_replay_truncate().  Returns that function's result.
+ */
+static int
+ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+{
+	lr_truncate_t *lr = ztest_lr_alloc(sizeof (*lr), NULL);
+	int error;
+
+	lr->lr_foid = object;
+	lr->lr_offset = offset;
+	lr->lr_length = size;
+
+	error = ztest_replay_truncate(zd, lr, B_FALSE);
+
+	ztest_lr_free(lr, sizeof (*lr), NULL);
+
+	return (error);
+}
+
+/*
+ * Build a setattr record with zero size and mode and apply it through
+ * ztest_replay_setattr().  Returns that function's result.
+ */
+static int
+ztest_setattr(ztest_ds_t *zd, uint64_t object)
+{
+	lr_setattr_t *lr = ztest_lr_alloc(sizeof (*lr), NULL);
+	int error;
+
+	lr->lr_foid = object;
+	lr->lr_size = 0;
+	lr->lr_mode = 0;
+
+	error = ztest_replay_setattr(zd, lr, B_FALSE);
+
+	ztest_lr_free(lr, sizeof (*lr), NULL);
+
+	return (error);
+}
+
+/*
+ * Preallocate a range of an object and wait for that txg to sync.  If
+ * the tx cannot be assigned, the range is freed instead.  The object is
+ * read-locked and the range write-locked for the duration.
+ */
+static void
+ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+{
+	objset_t *os = zd->zd_os;
+	dmu_tx_t *tx;
+	uint64_t txg;
+	rl_t *rl;
+
+	txg_wait_synced(dmu_objset_pool(os), 0);
+
+	ztest_object_lock(zd, object, RL_READER);
+	rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_write(tx, object, offset, size);
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+
+	if (txg == 0) {
+		(void) dmu_free_long_range(os, object, offset, size);
+	} else {
+		dmu_prealloc(os, object, offset, size, tx);
+		dmu_tx_commit(tx);
+		txg_wait_synced(dmu_objset_pool(os), txg);
+	}
+
+	ztest_range_unlock(rl);
+	ztest_object_unlock(zd, object);
+}
+
+/*
+ * Perform one randomly chosen i/o operation (tag write, pattern write,
+ * zero write, truncate, or setattr) on the object at the given offset.
+ * The results of the individual operations are ignored.
+ */
+static void
+ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
+{
+ ztest_block_tag_t wbt;
+ dmu_object_info_t doi;
+ enum ztest_io_type io_type;
+ uint64_t blocksize;
+ void *data;
+
+ VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
+ blocksize = doi.doi_data_block_size;
+ /* Scratch buffer of one data block; used only by the full-block writes. */
+ data = umem_alloc(blocksize, UMEM_NOFAIL);
+
+ /*
+ * Pick an i/o type at random, biased toward writing block tags.
+ */
+ io_type = ztest_random(ZTEST_IO_TYPES);
+ if (ztest_random(2) == 0)
+ io_type = ZTEST_IO_WRITE_TAG;
+
+ switch (io_type) {
+
+ case ZTEST_IO_WRITE_TAG:
+ ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0);
+ (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
+ break;
+
+ case ZTEST_IO_WRITE_PATTERN:
+ (void) memset(data, 'a' + (object + offset) % 5, blocksize);
+ if (ztest_random(2) == 0) {
+ /*
+ * Induce fletcher2 collisions to ensure that
+ * zio_ddt_collision() detects and resolves them
+ * when using fletcher2-verify for deduplication.
+ */
+ ((uint64_t *)data)[0] ^= 1ULL << 63;
+ ((uint64_t *)data)[4] ^= 1ULL << 63;
+ }
+ (void) ztest_write(zd, object, offset, blocksize, data);
+ break;
+
+ case ZTEST_IO_WRITE_ZEROES:
+ bzero(data, blocksize);
+ (void) ztest_write(zd, object, offset, blocksize, data);
+ break;
+
+ case ZTEST_IO_TRUNCATE:
+ (void) ztest_truncate(zd, object, offset, blocksize);
+ break;
+
+ case ZTEST_IO_SETATTR:
+ (void) ztest_setattr(zd, object);
+ break;
+ }
+
+ umem_free(data, blocksize);
+}
+
+/*
+ * Initialize an object description template.
+ *
+ * The directory is always ZTEST_DIROBJ.  The creation parameters are
+ * taken from 'type', 'blocksize' (a random block size is chosen when 0)
+ * and 'gen'; the current-object fields are reset.  The name is built as
+ * "tag(id)[index]", with id printed as a signed value.
+ */
+static void
+ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
+    dmu_object_type_t type, uint64_t blocksize, uint64_t gen)
+{
+	od->od_dir = ZTEST_DIROBJ;
+	od->od_object = 0;
+
+	od->od_crtype = type;
+	od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
+	od->od_crgen = gen;
+
+	od->od_type = DMU_OT_NONE;
+	od->od_blocksize = 0;
+	od->od_gen = 0;
+
+	/*
+	 * %lld and %llu take (unsigned) long long arguments, which are not
+	 * guaranteed to be the same types as int64_t and uint64_t, so cast
+	 * explicitly.
+	 */
+	(void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
+	    tag, (longlong_t)id, (u_longlong_t)index);
+}
+
+/*
+ * Look up or create the objects for a test using the od template.
+ * 'size' is the size in bytes of the od array.  If the objects do not
+ * all exist, or if 'remove' is specified, remove any existing objects
+ * and create new ones.  Otherwise, use the existing objects.
+ * Returns 0 on success, or -1 if the removal or creation failed.
+ */
+static int
+ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
+{
+	int count = size / sizeof (*od);
+	int rv = 0;
+
+	VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0);
+
+	if (ztest_lookup(zd, od, count) != 0 || remove) {
+		/* Start over: clear out what is there, then recreate. */
+		if (ztest_remove(zd, od, count) != 0 ||
+		    ztest_create(zd, od, count) != 0)
+			rv = -1;
+	}
+	zd->zd_od = od;
+
+	VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);
+
+	return (rv);
+}
+
+/*
+ * Commit the ZIL for a randomly chosen object number and record the
+ * committed sequence number in zd.
+ */
+/* ARGSUSED */
+void
+ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
+{
+	zilog_t *zilog = zd->zd_zilog;
+	uint64_t foid = ztest_random(ZTEST_OBJECTS);
+
+	zil_commit(zilog, UINT64_MAX, foid);
+
+	/*
+	 * Remember the committed values in zd, which is in parent/child
+	 * shared memory.  If we die, the next iteration of ztest_run()
+	 * will verify that the log really does contain this record.
+	 */
+	mutex_enter(&zilog->zl_lock);
+	ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq);
+	zd->zd_seq = zilog->zl_commit_lr_seq;
+	mutex_exit(&zilog->zl_lock);
+}
+
+/*
* Verify that we can't destroy an active pool, create an existing pool,
* or create a pool with a bad vdev spec.
*/
+/* ARGSUSED */
void
-ztest_spa_create_destroy(ztest_args_t *za)
+ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
{
- int error;
+ ztest_shared_t *zs = ztest_shared;
spa_t *spa;
nvlist_t *nvroot;
@@ -821,41 +2059,31 @@
* Attempt to create using a bad file.
*/
nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
- error = spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL);
+ VERIFY3U(ENOENT, ==,
+ spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL));
nvlist_free(nvroot);
- if (error != ENOENT)
- fatal(0, "spa_create(bad_file) = %d", error);
/*
* Attempt to create using a bad mirror.
*/
nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1);
- error = spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL);
+ VERIFY3U(ENOENT, ==,
+ spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL));
nvlist_free(nvroot);
- if (error != ENOENT)
- fatal(0, "spa_create(bad_mirror) = %d", error);
/*
* Attempt to create an existing pool. It shouldn't matter
* what's in the nvroot; we should fail with EEXIST.
*/
- (void) rw_rdlock(&ztest_shared->zs_name_lock);
+ (void) rw_rdlock(&zs->zs_name_lock);
nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
- error = spa_create(za->za_pool, nvroot, NULL, NULL, NULL);
+ VERIFY3U(EEXIST, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL));
nvlist_free(nvroot);
- if (error != EEXIST)
- fatal(0, "spa_create(whatever) = %d", error);
-
- error = spa_open(za->za_pool, &spa, FTAG);
- if (error)
- fatal(0, "spa_open() = %d", error);
-
- error = spa_destroy(za->za_pool);
- if (error != EBUSY)
- fatal(0, "spa_destroy() = %d", error);
-
+ VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+ VERIFY3U(EBUSY, ==, spa_destroy(zs->zs_pool));
spa_close(spa, FTAG);
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+
+ (void) rw_unlock(&zs->zs_name_lock);
}
static vdev_t *
@@ -897,16 +2125,18 @@
/*
* Verify that vdev_add() works as expected.
*/
+/* ARGSUSED */
void
-ztest_vdev_add_remove(ztest_args_t *za)
+ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
{
- spa_t *spa = za->za_spa;
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
uint64_t guid;
nvlist_t *nvroot;
int error;
- (void) mutex_lock(&ztest_shared->zs_vdev_lock);
+ VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
@@ -919,7 +2149,7 @@
/*
* Grab the guid from the head of the log class rotor.
*/
- guid = spa->spa_log_class->mc_rotor->mg_vd->vdev_guid;
+ guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;
spa_config_exit(spa, SCL_VDEV, FTAG);
@@ -931,9 +2161,9 @@
* dmu_objset_destroy() to fail with EBUSY thus
* leaving the dataset in an inconsistent state.
*/
- (void) rw_wrlock(&ztest_shared->zs_name_lock);
+ VERIFY(rw_wrlock(&ztest_shared->zs_name_lock) == 0);
error = spa_vdev_remove(spa, guid, B_FALSE);
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+ VERIFY(rw_unlock(&ztest_shared->zs_name_lock) == 0);
if (error && error != EEXIST)
fatal(0, "spa_vdev_remove() = %d", error);
@@ -955,16 +2185,18 @@
fatal(0, "spa_vdev_add() = %d", error);
}
- (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+ VERIFY(mutex_unlock(&ztest_shared->zs_vdev_lock) == 0);
}
/*
* Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
*/
+/* ARGSUSED */
void
-ztest_vdev_aux_add_remove(ztest_args_t *za)
+ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
{
- spa_t *spa = za->za_spa;
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
vdev_t *rvd = spa->spa_root_vdev;
spa_aux_vdev_t *sav;
char *aux;
@@ -979,7 +2211,7 @@
aux = ZPOOL_CONFIG_L2CACHE;
}
- (void) mutex_lock(&ztest_shared->zs_vdev_lock);
+ VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
@@ -992,12 +2224,12 @@
/*
* Find an unused device we can add.
*/
- ztest_shared->zs_vdev_aux = 0;
+ zs->zs_vdev_aux = 0;
for (;;) {
char path[MAXPATHLEN];
int c;
(void) sprintf(path, ztest_aux_template, zopt_dir,
- zopt_pool, aux, ztest_shared->zs_vdev_aux);
+ zopt_pool, aux, zs->zs_vdev_aux);
for (c = 0; c < sav->sav_count; c++)
if (strcmp(sav->sav_vdevs[c]->vdev_path,
path) == 0)
@@ -1005,7 +2237,7 @@
if (c == sav->sav_count &&
vdev_lookup_by_path(rvd, path) == NULL)
break;
- ztest_shared->zs_vdev_aux++;
+ zs->zs_vdev_aux++;
}
}
@@ -1035,16 +2267,18 @@
fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
}
- (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
}
/*
* Verify that we can attach and detach devices.
*/
+/* ARGSUSED */
void
-ztest_vdev_attach_detach(ztest_args_t *za)
+ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
{
- spa_t *spa = za->za_spa;
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
spa_aux_vdev_t *sav = &spa->spa_spares;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *oldvd, *newvd, *pvd;
@@ -1061,7 +2295,7 @@
int oldvd_is_log;
int error, expected_error;
- (void) mutex_lock(&ztest_shared->zs_vdev_lock);
+ VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
@@ -1073,7 +2307,7 @@
/*
* Pick a random top-level vdev.
*/
- top = ztest_random(rvd->vdev_children);
+ top = ztest_random_vdev_top(spa, B_TRUE);
/*
* Pick a random leaf within it.
@@ -1121,7 +2355,7 @@
if (error != 0 && error != ENODEV && error != EBUSY &&
error != ENOTSUP)
fatal(0, "detach (%s) returned %d", oldpath, error);
- (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
return;
}
@@ -1214,7 +2448,7 @@
(longlong_t)newsize, replacing, error, expected_error);
}
- (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
}
/*
@@ -1291,7 +2525,8 @@
if (zopt_verbose >= 5) {
(void) printf("vdev configuration has changed, "
"guid %llu, state %llu, expected gen %llu, "
- "got gen %llu\n", (u_longlong_t)guid,
+ "got gen %llu\n",
+ (u_longlong_t)guid,
(u_longlong_t)tvd->vdev_state,
(u_longlong_t)generation,
(u_longlong_t)spa->spa_config_generation);
@@ -1329,23 +2564,29 @@
/*
* Verify that dynamic LUN growth works as expected.
*/
+/* ARGSUSED */
void
-ztest_vdev_LUN_growth(ztest_args_t *za)
+ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
{
- spa_t *spa = za->za_spa;
- vdev_t *vd, *tvd = NULL;
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
+ vdev_t *vd, *tvd;
+ metaslab_class_t *mc;
+ metaslab_group_t *mg;
size_t psize, newsize;
- uint64_t spa_newsize, spa_cursize, ms_count;
-
- (void) mutex_lock(&ztest_shared->zs_vdev_lock);
+ uint64_t top;
+ uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
+
+ VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
spa_config_enter(spa, SCL_STATE, spa, RW_READER);
- while (tvd == NULL || tvd->vdev_islog) {
- uint64_t vdev;
-
- vdev = ztest_random(spa->spa_root_vdev->vdev_children);
- tvd = spa->spa_root_vdev->vdev_child[vdev];
- }
+ top = ztest_random_vdev_top(spa, B_TRUE);
+
+ tvd = spa->spa_root_vdev->vdev_child[top];
+ mg = tvd->vdev_mg;
+ mc = mg->mg_class;
+ old_ms_count = tvd->vdev_ms_count;
+ old_class_space = metaslab_class_get_space(mc);
/*
* Determine the size of the first leaf vdev associated with
@@ -1364,21 +2605,18 @@
if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
psize == 0 || psize >= 4 * zopt_vdev_size) {
spa_config_exit(spa, SCL_STATE, spa);
- (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
return;
}
ASSERT(psize > 0);
newsize = psize + psize / 8;
ASSERT3U(newsize, >, psize);
- if (zopt_verbose >= 5) {
- (void) printf("Expanding vdev %s from %lu to %lu\n",
+ if (zopt_verbose >= 6) {
+ (void) printf("Expanding LUN %s from %lu to %lu\n",
vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
}
- spa_cursize = spa_get_space(spa);
- ms_count = tvd->vdev_ms_count;
-
/*
* Growing the vdev is a two step process:
* 1). expand the physical size (i.e. relabel)
@@ -1391,154 +2629,176 @@
(void) printf("Could not expand LUN because "
"the vdev configuration changed.\n");
}
- (void) spa_config_exit(spa, SCL_STATE, spa);
- (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+ spa_config_exit(spa, SCL_STATE, spa);
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
return;
}
- (void) spa_config_exit(spa, SCL_STATE, spa);
+ spa_config_exit(spa, SCL_STATE, spa);
/*
* Expanding the LUN will update the config asynchronously,
* thus we must wait for the async thread to complete any
* pending tasks before proceeding.
*/
- mutex_enter(&spa->spa_async_lock);
- while (spa->spa_async_thread != NULL || spa->spa_async_tasks)
- cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
- mutex_exit(&spa->spa_async_lock);
+ for (;;) {
+ boolean_t done;
+ mutex_enter(&spa->spa_async_lock);
+ done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks);
+ mutex_exit(&spa->spa_async_lock);
+ if (done)
+ break;
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ (void) poll(NULL, 0, 100);
+ }
spa_config_enter(spa, SCL_STATE, spa, RW_READER);
- spa_newsize = spa_get_space(spa);
+
+ tvd = spa->spa_root_vdev->vdev_child[top];
+ new_ms_count = tvd->vdev_ms_count;
+ new_class_space = metaslab_class_get_space(mc);
+
+ if (tvd->vdev_mg != mg || mg->mg_class != mc) {
+ if (zopt_verbose >= 5) {
+ (void) printf("Could not verify LUN expansion due to "
+ "intervening vdev offline or remove.\n");
+ }
+ spa_config_exit(spa, SCL_STATE, spa);
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+ return;
+ }
+
+ /*
+ * Make sure we were able to grow the vdev. The message reports
+ * the comparison that actually failed: new count <= old count.
+ */
+ if (new_ms_count <= old_ms_count)
+ fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n",
+ (u_longlong_t)new_ms_count, (u_longlong_t)old_ms_count);
/*
* Make sure we were able to grow the pool.
*/
- if (ms_count >= tvd->vdev_ms_count ||
- spa_cursize >= spa_newsize) {
- (void) printf("Top-level vdev metaslab count: "
- "before %llu, after %llu\n",
- (u_longlong_t)ms_count,
- (u_longlong_t)tvd->vdev_ms_count);
- fatal(0, "LUN expansion failed: before %llu, "
- "after %llu\n", spa_cursize, spa_newsize);
- } else if (zopt_verbose >= 5) {
+ /* Report new <= old, matching the comparison that failed. */
+ if (new_class_space <= old_class_space)
+ fatal(0, "LUN expansion failed: class_space %llu <= %llu\n",
+ (u_longlong_t)new_class_space,
+ (u_longlong_t)old_class_space);
+
+ if (zopt_verbose >= 5) {
char oldnumbuf[6], newnumbuf[6];
- nicenum(spa_cursize, oldnumbuf);
- nicenum(spa_newsize, newnumbuf);
+ nicenum(old_class_space, oldnumbuf);
+ nicenum(new_class_space, newnumbuf);
(void) printf("%s grew from %s to %s\n",
spa->spa_name, oldnumbuf, newnumbuf);
}
+
spa_config_exit(spa, SCL_STATE, spa);
- (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+ VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+}
+
+/*
+ * Verify that dmu_objset_{create,destroy,open,close} work as expected.
+ *
+ * The test itself is ztest_dmu_objset_create_destroy(); the callbacks
+ * and snapshot helpers it relies on come first.
+ */
+/* ARGSUSED */
+static void
+ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+{
+ /*
+ * Create the objects common to all ztest datasets. This runs as
+ * the creation callback handed to dmu_objset_create(), and claims
+ * ZTEST_DIROBJ as a ZAP; VERIFY makes any failure fatal.
+ */
+ VERIFY(zap_create_claim(os, ZTEST_DIROBJ,
+ DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
}
/* ARGSUSED */
-static void
-ztest_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+static int
+ztest_objset_destroy_cb(char *name, void *arg)
{
- /*
- * Create the directory object.
- */
- VERIFY(dmu_object_claim(os, ZTEST_DIROBJ,
- DMU_OT_UINT64_OTHER, ZTEST_DIROBJ_BLOCKSIZE,
- DMU_OT_UINT64_OTHER, 5 * sizeof (ztest_block_tag_t), tx) == 0);
-
- VERIFY(zap_create_claim(os, ZTEST_MICROZAP_OBJ,
- DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
-
- VERIFY(zap_create_claim(os, ZTEST_FATZAP_OBJ,
- DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
-}
-
-static int
-ztest_destroy_cb(char *name, void *arg)
-{
- ztest_args_t *za = arg;
objset_t *os;
- dmu_object_info_t *doi = &za->za_doi;
+ dmu_object_info_t doi;
int error;
/*
* Verify that the dataset contains a directory object.
*/
- error = dmu_objset_hold(name, FTAG, &os);
- ASSERT3U(error, ==, 0);
- error = dmu_object_info(os, ZTEST_DIROBJ, doi);
+ VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os));
+ error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
if (error != ENOENT) {
/* We could have crashed in the middle of destroying it */
ASSERT3U(error, ==, 0);
- ASSERT3U(doi->doi_type, ==, DMU_OT_UINT64_OTHER);
- ASSERT3S(doi->doi_physical_blks, >=, 0);
+ ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
+ ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
}
dmu_objset_rele(os, FTAG);
/*
* Destroy the dataset.
*/
- error = dmu_objset_destroy(name, B_FALSE);
- if (error) {
- (void) dmu_objset_hold(name, FTAG, &os);
- fatal(0, "dmu_objset_destroy(os=%p) = %d\n", os, error);
- }
+ VERIFY3U(0, ==, dmu_objset_destroy(name, B_FALSE));
return (0);
}
-/*
- * Verify that dmu_objset_{create,destroy,open,close} work as expected.
- */
-static uint64_t
-ztest_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t object, int mode)
+static boolean_t
+ztest_snapshot_create(char *osname, uint64_t id)
{
- itx_t *itx;
- lr_create_t *lr;
- size_t namesize;
- char name[24];
-
- (void) sprintf(name, "ZOBJ_%llu", (u_longlong_t)object);
- namesize = strlen(name) + 1;
-
- itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize +
- ztest_random(ZIL_MAX_BLKSZ));
- lr = (lr_create_t *)&itx->itx_lr;
- bzero(lr + 1, lr->lr_common.lrc_reclen - sizeof (*lr));
- lr->lr_doid = object;
- lr->lr_foid = 0;
- lr->lr_mode = mode;
- lr->lr_uid = 0;
- lr->lr_gid = 0;
- lr->lr_gen = dmu_tx_get_txg(tx);
- lr->lr_crtime[0] = time(NULL);
- lr->lr_crtime[1] = 0;
- lr->lr_rdev = 0;
- bcopy(name, (char *)(lr + 1), namesize);
-
- return (zil_itx_assign(zilog, itx, tx));
+ char snapname[MAXNAMELEN];
+ int error;
+
+ (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
+ (u_longlong_t)id);
+
+ error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1,
+ NULL, B_FALSE);
+ if (error == ENOSPC) {
+ ztest_record_enospc(FTAG);
+ return (B_FALSE);
+ }
+ if (error != 0 && error != EEXIST)
+ fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error);
+ return (B_TRUE);
}
+/*
+ * Destroy the snapshot "<osname>@<id>". A snapshot that does not exist
+ * (ENOENT) is not an error; any other failure is reported via fatal().
+ * Always returns B_TRUE.
+ */
+static boolean_t
+ztest_snapshot_destroy(char *osname, uint64_t id)
+{
+ char snapname[MAXNAMELEN];
+ int error;
+
+ (void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname,
+ (u_longlong_t)id);
+
+ error = dmu_objset_destroy(snapname, B_FALSE);
+ switch (error) {
+ case 0:
+ case ENOENT:
+ break;
+ default:
+ fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
+ }
+ return (B_TRUE);
+}
+
+/* ARGSUSED */
void
-ztest_dmu_objset_create_destroy(ztest_args_t *za)
+ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
{
+ ztest_shared_t *zs = ztest_shared;
+ ztest_ds_t zdtmp;
+ int iters;
int error;
objset_t *os, *os2;
- char name[100];
+ char name[MAXNAMELEN];
zilog_t *zilog;
- uint64_t seq;
- uint64_t objects;
-
- (void) rw_rdlock(&ztest_shared->zs_name_lock);
- (void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool,
- (u_longlong_t)za->za_instance);
+
+ (void) rw_rdlock(&zs->zs_name_lock);
+
+ (void) snprintf(name, MAXNAMELEN, "%s/temp_%llu",
+ zs->zs_pool, (u_longlong_t)id);
/*
* If this dataset exists from a previous run, process its replay log
* half of the time. If we don't replay it, then dmu_objset_destroy()
- * (invoked from ztest_destroy_cb() below) should just throw it away.
+ * (invoked from ztest_objset_destroy_cb()) should just throw it away.
*/
if (ztest_random(2) == 0 &&
dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) {
- zil_replay(os, os, ztest_replay_vector);
+ ztest_zd_init(&zdtmp, os);
+ zil_replay(os, &zdtmp, ztest_replay_vector);
+ ztest_zd_fini(&zdtmp);
dmu_objset_disown(os, FTAG);
}
@@ -1547,152 +2807,106 @@
* create lying around from a previous run. If so, destroy it
* and all of its snapshots.
*/
- (void) dmu_objset_find(name, ztest_destroy_cb, za,
+ (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
/*
* Verify that the destroyed dataset is no longer in the namespace.
*/
- error = dmu_objset_hold(name, FTAG, &os);
- if (error != ENOENT)
- fatal(1, "dmu_objset_open(%s) found destroyed dataset %p",
- name, os);
+ VERIFY3U(ENOENT, ==, dmu_objset_hold(name, FTAG, &os));
/*
* Verify that we can create a new dataset.
*/
error = dmu_objset_create(name, DMU_OST_OTHER, 0,
- ztest_create_cb, NULL);
+ ztest_objset_create_cb, NULL);
if (error) {
if (error == ENOSPC) {
- ztest_record_enospc("dmu_objset_create");
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+ ztest_record_enospc(FTAG);
+ (void) rw_unlock(&zs->zs_name_lock);
return;
}
fatal(0, "dmu_objset_create(%s) = %d", name, error);
}
- error = dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os);
- if (error) {
- fatal(0, "dmu_objset_open(%s) = %d", name, error);
- }
+ VERIFY3U(0, ==,
+ dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
+
+ ztest_zd_init(&zdtmp, os);
/*
* Open the intent log for it.
*/
- zilog = zil_open(os, NULL);
+ zilog = zil_open(os, ztest_get_data);
/*
- * Put a random number of objects in there.
+ * Put some objects in there, do a little I/O to them,
+ * and randomly take a couple of snapshots along the way.
*/
- objects = ztest_random(20);
- seq = 0;
- while (objects-- != 0) {
- uint64_t object;
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, sizeof (name));
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- } else {
- object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_NONE, 0, tx);
- ztest_set_random_blocksize(os, object, tx);
- seq = ztest_log_create(zilog, tx, object,
- DMU_OT_UINT64_OTHER);
- dmu_write(os, object, 0, sizeof (name), name, tx);
- dmu_tx_commit(tx);
- }
- if (ztest_random(5) == 0) {
- zil_commit(zilog, seq, object);
- }
- if (ztest_random(100) == 0) {
- error = zil_suspend(zilog);
- if (error == 0) {
- zil_resume(zilog);
- }
- }
+ iters = ztest_random(5);
+ for (int i = 0; i < iters; i++) {
+ ztest_dmu_object_alloc_free(&zdtmp, id);
+ if (ztest_random(iters) == 0)
+ (void) ztest_snapshot_create(name, i);
}
/*
* Verify that we cannot create an existing dataset.
*/
- error = dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL);
- if (error != EEXIST)
- fatal(0, "created existing dataset, error = %d", error);
+ VERIFY3U(EEXIST, ==,
+ dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL));
/*
* Verify that we can hold an objset that is also owned.
*/
- error = dmu_objset_hold(name, FTAG, &os2);
- if (error)
- fatal(0, "dmu_objset_open('%s') = %d", name, error);
+ VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2));
dmu_objset_rele(os2, FTAG);
/*
- * Verify that we can not own an objset that is already owned.
+ * Verify that we cannot own an objset that is already owned.
*/
- error = dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2);
- if (error != EBUSY)
- fatal(0, "dmu_objset_open('%s') = %d, expected EBUSY",
- name, error);
+ VERIFY3U(EBUSY, ==,
+ dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2));
zil_close(zilog);
dmu_objset_disown(os, FTAG);
-
- error = dmu_objset_destroy(name, B_FALSE);
- if (error)
- fatal(0, "dmu_objset_destroy(%s) = %d", name, error);
-
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+ ztest_zd_fini(&zdtmp);
+
+ (void) rw_unlock(&zs->zs_name_lock);
}
/*
* Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
*/
void
-ztest_dmu_snapshot_create_destroy(ztest_args_t *za)
+ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
{
- int error;
- objset_t *os = za->za_os;
- char snapname[100];
- char osname[MAXNAMELEN];
-
- (void) rw_rdlock(&ztest_shared->zs_name_lock);
- dmu_objset_name(os, osname);
- (void) snprintf(snapname, 100, "%s@%llu", osname,
- (u_longlong_t)za->za_instance);
-
- error = dmu_objset_destroy(snapname, B_FALSE);
- if (error != 0 && error != ENOENT)
- fatal(0, "dmu_objset_destroy() = %d", error);
- error = dmu_objset_snapshot(osname, strchr(snapname, '@')+1,
- NULL, FALSE);
- if (error == ENOSPC)
- ztest_record_enospc("dmu_take_snapshot");
- else if (error != 0 && error != EEXIST)
- fatal(0, "dmu_take_snapshot() = %d", error);
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+ ztest_shared_t *zs = ztest_shared;
+
+ (void) rw_rdlock(&zs->zs_name_lock);
+ (void) ztest_snapshot_destroy(zd->zd_name, id);
+ (void) ztest_snapshot_create(zd->zd_name, id);
+ (void) rw_unlock(&zs->zs_name_lock);
}
/*
* Cleanup non-standard snapshots and clones.
*/
void
-ztest_dsl_dataset_cleanup(char *osname, uint64_t curval)
+ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
{
- char snap1name[100];
- char clone1name[100];
- char snap2name[100];
- char clone2name[100];
- char snap3name[100];
+ char snap1name[MAXNAMELEN];
+ char clone1name[MAXNAMELEN];
+ char snap2name[MAXNAMELEN];
+ char clone2name[MAXNAMELEN];
+ char snap3name[MAXNAMELEN];
int error;
- (void) snprintf(snap1name, 100, "%s@s1_%llu", osname, curval);
- (void) snprintf(clone1name, 100, "%s/c1_%llu", osname, curval);
- (void) snprintf(snap2name, 100, "%s@s2_%llu", clone1name, curval);
- (void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval);
- (void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval);
+ (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id);
+ (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id);
+ (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id);
+ (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id);
+ (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id);
error = dmu_objset_destroy(clone2name, B_FALSE);
if (error && error != ENOENT)
@@ -1715,36 +2929,34 @@
* Verify dsl_dataset_promote handles EBUSY
*/
void
-ztest_dsl_dataset_promote_busy(ztest_args_t *za)
+ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
{
- int error;
- objset_t *os = za->za_os;
+ ztest_shared_t *zs = ztest_shared;
objset_t *clone;
dsl_dataset_t *ds;
- char snap1name[100];
- char clone1name[100];
- char snap2name[100];
- char clone2name[100];
- char snap3name[100];
- char osname[MAXNAMELEN];
- uint64_t curval = za->za_instance;
-
- (void) rw_rdlock(&ztest_shared->zs_name_lock);
-
- dmu_objset_name(os, osname);
- ztest_dsl_dataset_cleanup(osname, curval);
-
- (void) snprintf(snap1name, 100, "%s@s1_%llu", osname, curval);
- (void) snprintf(clone1name, 100, "%s/c1_%llu", osname, curval);
- (void) snprintf(snap2name, 100, "%s@s2_%llu", clone1name, curval);
- (void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval);
- (void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval);
+ char snap1name[MAXNAMELEN];
+ char clone1name[MAXNAMELEN];
+ char snap2name[MAXNAMELEN];
+ char clone2name[MAXNAMELEN];
+ char snap3name[MAXNAMELEN];
+ char *osname = zd->zd_name;
+ int error;
+
+ (void) rw_rdlock(&zs->zs_name_lock);
+
+ ztest_dsl_dataset_cleanup(osname, id);
+
+ (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id);
+ (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id);
+ (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id);
+ (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id);
+ (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id);
error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1,
- NULL, FALSE);
+ NULL, B_FALSE);
if (error && error != EEXIST) {
if (error == ENOSPC) {
- ztest_record_enospc("dmu_take_snapshot");
+ ztest_record_enospc(FTAG);
goto out;
}
fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error);
@@ -1758,27 +2970,27 @@
dmu_objset_rele(clone, FTAG);
if (error) {
if (error == ENOSPC) {
- ztest_record_enospc("dmu_objset_create");
+ ztest_record_enospc(FTAG);
goto out;
}
fatal(0, "dmu_objset_create(%s) = %d", clone1name, error);
}
error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1,
- NULL, FALSE);
+ NULL, B_FALSE);
if (error && error != EEXIST) {
if (error == ENOSPC) {
- ztest_record_enospc("dmu_take_snapshot");
+ ztest_record_enospc(FTAG);
goto out;
}
fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error);
}
error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1,
- NULL, FALSE);
+ NULL, B_FALSE);
if (error && error != EEXIST) {
if (error == ENOSPC) {
- ztest_record_enospc("dmu_take_snapshot");
+ ztest_record_enospc(FTAG);
goto out;
}
fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
@@ -1792,7 +3004,7 @@
dmu_objset_rele(clone, FTAG);
if (error) {
if (error == ENOSPC) {
- ztest_record_enospc("dmu_objset_create");
+ ztest_record_enospc(FTAG);
goto out;
}
fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
@@ -1808,246 +3020,49 @@
dsl_dataset_disown(ds, FTAG);
out:
- ztest_dsl_dataset_cleanup(osname, curval);
-
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+ ztest_dsl_dataset_cleanup(osname, id);
+
+ (void) rw_unlock(&zs->zs_name_lock);
}
/*
* Verify that dmu_object_{alloc,free} work as expected.
*/
void
-ztest_dmu_object_alloc_free(ztest_args_t *za)
+ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
- dmu_buf_t *db;
- dmu_tx_t *tx;
- uint64_t batchobj, object, batchsize, endoff, temp;
- int b, c, error, bonuslen;
- dmu_object_info_t *doi = &za->za_doi;
- char osname[MAXNAMELEN];
-
- dmu_objset_name(os, osname);
-
- endoff = -8ULL;
- batchsize = 2;
-
- /*
- * Create a batch object if necessary, and record it in the directory.
- */
- VERIFY3U(0, ==, dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t), &batchobj, DMU_READ_PREFETCH));
- if (batchobj == 0) {
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t));
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("create a batch object");
- dmu_tx_abort(tx);
- return;
- }
- batchobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_NONE, 0, tx);
- ztest_set_random_blocksize(os, batchobj, tx);
- dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t), &batchobj, tx);
- dmu_tx_commit(tx);
- }
+ ztest_od_t od[4];
+ int batchsize = sizeof (od) / sizeof (od[0]);
+
+ for (int b = 0; b < batchsize; b++)
+ ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);
/*
- * Destroy the previous batch of objects.
- */
- for (b = 0; b < batchsize; b++) {
- VERIFY3U(0, ==, dmu_read(os, batchobj, b * sizeof (uint64_t),
- sizeof (uint64_t), &object, DMU_READ_PREFETCH));
- if (object == 0)
- continue;
- /*
- * Read and validate contents.
- * We expect the nth byte of the bonus buffer to be n.
- */
- VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
- za->za_dbuf = db;
-
- dmu_object_info_from_db(db, doi);
- ASSERT(doi->doi_type == DMU_OT_UINT64_OTHER);
- ASSERT(doi->doi_bonus_type == DMU_OT_PLAIN_OTHER);
- ASSERT3S(doi->doi_physical_blks, >=, 0);
-
- bonuslen = doi->doi_bonus_size;
-
- for (c = 0; c < bonuslen; c++) {
- if (((uint8_t *)db->db_data)[c] !=
- (uint8_t)(c + bonuslen)) {
- fatal(0,
- "bad bonus: %s, obj %llu, off %d: %u != %u",
- osname, object, c,
- ((uint8_t *)db->db_data)[c],
- (uint8_t)(c + bonuslen));
- }
- }
-
- dmu_buf_rele(db, FTAG);
- za->za_dbuf = NULL;
-
- /*
- * We expect the word at endoff to be our object number.
- */
- VERIFY(0 == dmu_read(os, object, endoff,
- sizeof (uint64_t), &temp, DMU_READ_PREFETCH));
-
- if (temp != object) {
- fatal(0, "bad data in %s, got %llu, expected %llu",
- osname, temp, object);
- }
-
- /*
- * Destroy old object and clear batch entry.
- */
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, batchobj,
- b * sizeof (uint64_t), sizeof (uint64_t));
- dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("free object");
- dmu_tx_abort(tx);
- return;
- }
- error = dmu_object_free(os, object, tx);
- if (error) {
- fatal(0, "dmu_object_free('%s', %llu) = %d",
- osname, object, error);
- }
- object = 0;
-
- dmu_object_set_checksum(os, batchobj,
- ztest_random_checksum(), tx);
- dmu_object_set_compress(os, batchobj,
- ztest_random_compress(), tx);
-
- dmu_write(os, batchobj, b * sizeof (uint64_t),
- sizeof (uint64_t), &object, tx);
-
- dmu_tx_commit(tx);
- }
-
- /*
- * Before creating the new batch of objects, generate a bunch of churn.
+ * Destroy the previous batch of objects, create a new batch,
+ * and do some I/O on the new objects.
*/
- for (b = ztest_random(100); b > 0; b--) {
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("churn objects");
- dmu_tx_abort(tx);
- return;
- }
- object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_NONE, 0, tx);
- ztest_set_random_blocksize(os, object, tx);
- error = dmu_object_free(os, object, tx);
- if (error) {
- fatal(0, "dmu_object_free('%s', %llu) = %d",
- osname, object, error);
- }
- dmu_tx_commit(tx);
- }
-
- /*
- * Create a new batch of objects with randomly chosen
- * blocksizes and record them in the batch directory.
- */
- for (b = 0; b < batchsize; b++) {
- uint32_t va_blksize;
- u_longlong_t va_nblocks;
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, batchobj, b * sizeof (uint64_t),
- sizeof (uint64_t));
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT, endoff,
- sizeof (uint64_t));
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("create batchobj");
- dmu_tx_abort(tx);
- return;
- }
- bonuslen = (int)ztest_random(dmu_bonus_max()) + 1;
-
- object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_PLAIN_OTHER, bonuslen, tx);
-
- ztest_set_random_blocksize(os, object, tx);
-
- dmu_object_set_checksum(os, object,
- ztest_random_checksum(), tx);
- dmu_object_set_compress(os, object,
- ztest_random_compress(), tx);
-
- dmu_write(os, batchobj, b * sizeof (uint64_t),
- sizeof (uint64_t), &object, tx);
-
- /*
- * Write to both the bonus buffer and the regular data.
- */
- VERIFY(dmu_bonus_hold(os, object, FTAG, &db) == 0);
- za->za_dbuf = db;
- ASSERT3U(bonuslen, <=, db->db_size);
-
- dmu_object_size_from_db(db, &va_blksize, &va_nblocks);
- ASSERT3S(va_nblocks, >=, 0);
-
- dmu_buf_will_dirty(db, tx);
-
- /*
- * See comments above regarding the contents of
- * the bonus buffer and the word at endoff.
- */
- for (c = 0; c < bonuslen; c++)
- ((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen);
-
- dmu_buf_rele(db, FTAG);
- za->za_dbuf = NULL;
-
- /*
- * Write to a large offset to increase indirection.
- */
- dmu_write(os, object, endoff, sizeof (uint64_t), &object, tx);
-
- dmu_tx_commit(tx);
- }
+ if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0)
+ return;
+
+ while (ztest_random(4 * batchsize) != 0)
+ ztest_io(zd, od[ztest_random(batchsize)].od_object,
+ ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
}
/*
* Verify that dmu_{read,write} work as expected.
*/
-typedef struct bufwad {
- uint64_t bw_index;
- uint64_t bw_txg;
- uint64_t bw_data;
-} bufwad_t;
-
-typedef struct dmu_read_write_dir {
- uint64_t dd_packobj;
- uint64_t dd_bigobj;
- uint64_t dd_chunk;
-} dmu_read_write_dir_t;
-
void
-ztest_dmu_read_write(ztest_args_t *za)
+ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
- dmu_read_write_dir_t dd;
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[2];
dmu_tx_t *tx;
int i, freeit, error;
uint64_t n, s, txg;
bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
- uint64_t packoff, packsize, bigoff, bigsize;
+ uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
+ uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t);
uint64_t regions = 997;
uint64_t stride = 123456789ULL;
uint64_t width = 40;
@@ -2080,34 +3095,16 @@
/*
* Read the directory info. If it's the first time, set things up.
*/
- VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (dd), &dd, DMU_READ_PREFETCH));
- if (dd.dd_chunk == 0) {
- ASSERT(dd.dd_packobj == 0);
- ASSERT(dd.dd_bigobj == 0);
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd));
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("create r/w directory");
- dmu_tx_abort(tx);
- return;
- }
-
- dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_NONE, 0, tx);
- dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_NONE, 0, tx);
- dd.dd_chunk = (1000 + ztest_random(1000)) * sizeof (uint64_t);
-
- ztest_set_random_blocksize(os, dd.dd_packobj, tx);
- ztest_set_random_blocksize(os, dd.dd_bigobj, tx);
-
- dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd,
- tx);
- dmu_tx_commit(tx);
- }
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, chunksize);
+ ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
+
+ if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+ return;
+
+ bigobj = od[0].od_object;
+ packobj = od[1].od_object;
+ chunksize = od[0].od_gen;
+ ASSERT(chunksize == od[1].od_gen);
/*
* Prefetch a random chunk of the big object.
@@ -2117,7 +3114,7 @@
*/
n = ztest_random(regions) * stride + ztest_random(width);
s = 1 + ztest_random(2 * width - 1);
- dmu_prefetch(os, dd.dd_bigobj, n * dd.dd_chunk, s * dd.dd_chunk);
+ dmu_prefetch(os, bigobj, n * chunksize, s * chunksize);
/*
* Pick a random index and compute the offsets into packobj and bigobj.
@@ -2128,8 +3125,8 @@
packoff = n * sizeof (bufwad_t);
packsize = s * sizeof (bufwad_t);
- bigoff = n * dd.dd_chunk;
- bigsize = s * dd.dd_chunk;
+ bigoff = n * chunksize;
+ bigsize = s * chunksize;
packbuf = umem_alloc(packsize, UMEM_NOFAIL);
bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
@@ -2143,10 +3140,10 @@
/*
* Read the current contents of our objects.
*/
- error = dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf,
+ error = dmu_read(os, packobj, packoff, packsize, packbuf,
DMU_READ_PREFETCH);
ASSERT3U(error, ==, 0);
- error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf,
+ error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf,
DMU_READ_PREFETCH);
ASSERT3U(error, ==, 0);
@@ -2155,24 +3152,25 @@
*/
tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize);
+ dmu_tx_hold_write(tx, packobj, packoff, packsize);
if (freeit)
- dmu_tx_hold_free(tx, dd.dd_bigobj, bigoff, bigsize);
+ dmu_tx_hold_free(tx, bigobj, bigoff, bigsize);
else
- dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize);
-
- error = dmu_tx_assign(tx, TXG_WAIT);
-
- if (error) {
- ztest_record_enospc("dmu r/w range");
- dmu_tx_abort(tx);
+ dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
+
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0) {
umem_free(packbuf, packsize);
umem_free(bigbuf, bigsize);
return;
}
- txg = dmu_tx_get_txg(tx);
+ dmu_object_set_checksum(os, bigobj,
+ (enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx);
+
+ dmu_object_set_compress(os, bigobj,
+ (enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx);
/*
* For each index from n to n + s, verify that the existing bufwad
@@ -2184,9 +3182,9 @@
/* LINTED */
pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
/* LINTED */
- bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk);
+ bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
/* LINTED */
- bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1;
+ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
@@ -2220,27 +3218,26 @@
* We've verified all the old bufwads, and made new ones.
* Now write them out.
*/
- dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx);
+ dmu_write(os, packobj, packoff, packsize, packbuf, tx);
if (freeit) {
- if (zopt_verbose >= 6) {
+ if (zopt_verbose >= 7) {
(void) printf("freeing offset %llx size %llx"
" txg %llx\n",
(u_longlong_t)bigoff,
(u_longlong_t)bigsize,
(u_longlong_t)txg);
}
- VERIFY(0 == dmu_free_range(os, dd.dd_bigobj, bigoff,
- bigsize, tx));
+ VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx));
} else {
- if (zopt_verbose >= 6) {
+ if (zopt_verbose >= 7) {
(void) printf("writing offset %llx size %llx"
" txg %llx\n",
(u_longlong_t)bigoff,
(u_longlong_t)bigsize,
(u_longlong_t)txg);
}
- dmu_write(os, dd.dd_bigobj, bigoff, bigsize, bigbuf, tx);
+ dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx);
}
dmu_tx_commit(tx);
@@ -2252,9 +3249,9 @@
void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
- VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
+ VERIFY(0 == dmu_read(os, packobj, packoff,
packsize, packcheck, DMU_READ_PREFETCH));
- VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
+ VERIFY(0 == dmu_read(os, bigobj, bigoff,
bigsize, bigcheck, DMU_READ_PREFETCH));
ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
@@ -2270,7 +3267,7 @@
void
compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
- uint64_t bigsize, uint64_t n, dmu_read_write_dir_t dd, uint64_t txg)
+ uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg)
{
uint64_t i;
bufwad_t *pack;
@@ -2287,9 +3284,9 @@
/* LINTED */
pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
/* LINTED */
- bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk);
+ bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
/* LINTED */
- bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1;
+ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
@@ -2318,22 +3315,24 @@
}
void
-ztest_dmu_read_write_zcopy(ztest_args_t *za)
+ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
- dmu_read_write_dir_t dd;
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[2];
dmu_tx_t *tx;
uint64_t i;
int error;
uint64_t n, s, txg;
bufwad_t *packbuf, *bigbuf;
- uint64_t packoff, packsize, bigoff, bigsize;
+ uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
+ uint64_t blocksize = ztest_random_blocksize();
+ uint64_t chunksize = blocksize;
uint64_t regions = 997;
uint64_t stride = 123456789ULL;
uint64_t width = 9;
dmu_buf_t *bonus_db;
arc_buf_t **bigbuf_arcbufs;
- dmu_object_info_t *doi = &za->za_doi;
+ dmu_object_info_t doi;
/*
* This test uses two objects, packobj and bigobj, that are always
@@ -2354,42 +3353,22 @@
/*
* Read the directory info. If it's the first time, set things up.
*/
- VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (dd), &dd, DMU_READ_PREFETCH));
- if (dd.dd_chunk == 0) {
- ASSERT(dd.dd_packobj == 0);
- ASSERT(dd.dd_bigobj == 0);
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd));
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("create r/w directory");
- dmu_tx_abort(tx);
- return;
- }
-
- dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_NONE, 0, tx);
- dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
- DMU_OT_NONE, 0, tx);
- ztest_set_random_blocksize(os, dd.dd_packobj, tx);
- ztest_set_random_blocksize(os, dd.dd_bigobj, tx);
-
- VERIFY(dmu_object_info(os, dd.dd_bigobj, doi) == 0);
- ASSERT(doi->doi_data_block_size >= 2 * sizeof (bufwad_t));
- ASSERT(ISP2(doi->doi_data_block_size));
- dd.dd_chunk = doi->doi_data_block_size;
-
- dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd,
- tx);
- dmu_tx_commit(tx);
- } else {
- VERIFY(dmu_object_info(os, dd.dd_bigobj, doi) == 0);
- VERIFY(ISP2(doi->doi_data_block_size));
- VERIFY(dd.dd_chunk == doi->doi_data_block_size);
- VERIFY(dd.dd_chunk >= 2 * sizeof (bufwad_t));
- }
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
+ ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
+
+ if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+ return;
+
+ bigobj = od[0].od_object;
+ packobj = od[1].od_object;
+ blocksize = od[0].od_blocksize;
+ chunksize = blocksize;
+ ASSERT(chunksize == od[1].od_gen);
+
+ VERIFY(dmu_object_info(os, bigobj, &doi) == 0);
+ VERIFY(ISP2(doi.doi_data_block_size));
+ VERIFY(chunksize == doi.doi_data_block_size);
+ VERIFY(chunksize >= 2 * sizeof (bufwad_t));
/*
* Pick a random index and compute the offsets into packobj and bigobj.
@@ -2400,13 +3379,13 @@
packoff = n * sizeof (bufwad_t);
packsize = s * sizeof (bufwad_t);
- bigoff = n * dd.dd_chunk;
- bigsize = s * dd.dd_chunk;
+ bigoff = n * chunksize;
+ bigsize = s * chunksize;
packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);
- VERIFY(dmu_bonus_hold(os, dd.dd_bigobj, FTAG, &bonus_db) == 0);
+ VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db));
bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);
@@ -2432,15 +3411,12 @@
for (j = 0; j < s; j++) {
if (i != 5) {
bigbuf_arcbufs[j] =
- dmu_request_arcbuf(bonus_db,
- dd.dd_chunk);
+ dmu_request_arcbuf(bonus_db, chunksize);
} else {
bigbuf_arcbufs[2 * j] =
- dmu_request_arcbuf(bonus_db,
- dd.dd_chunk / 2);
+ dmu_request_arcbuf(bonus_db, chunksize / 2);
bigbuf_arcbufs[2 * j + 1] =
- dmu_request_arcbuf(bonus_db,
- dd.dd_chunk / 2);
+ dmu_request_arcbuf(bonus_db, chunksize / 2);
}
}
@@ -2449,20 +3425,11 @@
*/
tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize);
- dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize);
-
- if (ztest_random(100) == 0) {
- error = -1;
- } else {
- error = dmu_tx_assign(tx, TXG_WAIT);
- }
-
- if (error) {
- if (error != -1) {
- ztest_record_enospc("dmu r/w range");
- }
- dmu_tx_abort(tx);
+ dmu_tx_hold_write(tx, packobj, packoff, packsize);
+ dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
+
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0) {
umem_free(packbuf, packsize);
umem_free(bigbuf, bigsize);
for (j = 0; j < s; j++) {
@@ -2480,53 +3447,51 @@
return;
}
- txg = dmu_tx_get_txg(tx);
-
/*
* 50% of the time don't read objects in the 1st iteration to
* test dmu_assign_arcbuf() for the case when there're no
* existing dbufs for the specified offsets.
*/
if (i != 0 || ztest_random(2) != 0) {
- error = dmu_read(os, dd.dd_packobj, packoff,
+ error = dmu_read(os, packobj, packoff,
packsize, packbuf, DMU_READ_PREFETCH);
ASSERT3U(error, ==, 0);
- error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize,
+ error = dmu_read(os, bigobj, bigoff, bigsize,
bigbuf, DMU_READ_PREFETCH);
ASSERT3U(error, ==, 0);
}
compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
- n, dd, txg);
+ n, chunksize, txg);
/*
* We've verified all the old bufwads, and made new ones.
* Now write them out.
*/
- dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx);
- if (zopt_verbose >= 6) {
+ dmu_write(os, packobj, packoff, packsize, packbuf, tx);
+ if (zopt_verbose >= 7) {
(void) printf("writing offset %llx size %llx"
" txg %llx\n",
(u_longlong_t)bigoff,
(u_longlong_t)bigsize,
(u_longlong_t)txg);
}
- for (off = bigoff, j = 0; j < s; j++, off += dd.dd_chunk) {
+ for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
dmu_buf_t *dbt;
if (i != 5) {
bcopy((caddr_t)bigbuf + (off - bigoff),
- bigbuf_arcbufs[j]->b_data, dd.dd_chunk);
+ bigbuf_arcbufs[j]->b_data, chunksize);
} else {
bcopy((caddr_t)bigbuf + (off - bigoff),
bigbuf_arcbufs[2 * j]->b_data,
- dd.dd_chunk / 2);
+ chunksize / 2);
bcopy((caddr_t)bigbuf + (off - bigoff) +
- dd.dd_chunk / 2,
+ chunksize / 2,
bigbuf_arcbufs[2 * j + 1]->b_data,
- dd.dd_chunk / 2);
+ chunksize / 2);
}
if (i == 1) {
- VERIFY(dmu_buf_hold(os, dd.dd_bigobj, off,
+ VERIFY(dmu_buf_hold(os, bigobj, off,
FTAG, &dbt) == 0);
}
if (i != 5) {
@@ -2536,7 +3501,7 @@
dmu_assign_arcbuf(bonus_db, off,
bigbuf_arcbufs[2 * j], tx);
dmu_assign_arcbuf(bonus_db,
- off + dd.dd_chunk / 2,
+ off + chunksize / 2,
bigbuf_arcbufs[2 * j + 1], tx);
}
if (i == 1) {
@@ -2552,9 +3517,9 @@
void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
- VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
+ VERIFY(0 == dmu_read(os, packobj, packoff,
packsize, packcheck, DMU_READ_PREFETCH));
- VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
+ VERIFY(0 == dmu_read(os, bigobj, bigoff,
bigsize, bigcheck, DMU_READ_PREFETCH));
ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
@@ -2576,256 +3541,60 @@
umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
}
+/* ARGSUSED */
void
-ztest_dmu_check_future_leak(ztest_args_t *za)
+ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
- dmu_buf_t *db;
- ztest_block_tag_t *bt;
- dmu_object_info_t *doi = &za->za_doi;
+ ztest_od_t od[1];
+ uint64_t offset = (1ULL << (ztest_random(20) + 43)) +
+ (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
/*
- * Make sure that, if there is a write record in the bonus buffer
- * of the ZTEST_DIROBJ, that the txg for this record is <= the
- * last synced txg of the pool.
+ * Have multiple threads write to large offsets in an object
+ * to verify that parallel writes to an object -- even to the
+ * same blocks within the object -- don't cause any trouble.
*/
- VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0);
- za->za_dbuf = db;
- VERIFY(dmu_object_info(os, ZTEST_DIROBJ, doi) == 0);
- ASSERT3U(doi->doi_bonus_size, >=, sizeof (*bt));
- ASSERT3U(doi->doi_bonus_size, <=, db->db_size);
- ASSERT3U(doi->doi_bonus_size % sizeof (*bt), ==, 0);
- bt = (void *)((char *)db->db_data + doi->doi_bonus_size - sizeof (*bt));
- if (bt->bt_objset != 0) {
- ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
- ASSERT3U(bt->bt_object, ==, ZTEST_DIROBJ);
- ASSERT3U(bt->bt_offset, ==, -1ULL);
- ASSERT3U(bt->bt_txg, <, spa_first_txg(za->za_spa));
- }
- dmu_buf_rele(db, FTAG);
- za->za_dbuf = NULL;
+ ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+
+ if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+ return;
+
+ while (ztest_random(10) != 0)
+ ztest_io(zd, od[0].od_object, offset);
}
void
-ztest_dmu_write_parallel(ztest_args_t *za)
+ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
- ztest_block_tag_t *rbt = &za->za_rbt;
- ztest_block_tag_t *wbt = &za->za_wbt;
- const size_t btsize = sizeof (ztest_block_tag_t);
- dmu_buf_t *db;
- int b, error;
- int bs = ZTEST_DIROBJ_BLOCKSIZE;
- int do_free = 0;
- uint64_t off, txg, txg_how;
- mutex_t *lp;
- char osname[MAXNAMELEN];
- char iobuf[SPA_MAXBLOCKSIZE];
- blkptr_t blk = { 0 };
- uint64_t blkoff;
- zbookmark_t zb;
- dmu_tx_t *tx = dmu_tx_create(os);
- dmu_buf_t *bonus_db;
- arc_buf_t *abuf = NULL;
-
- dmu_objset_name(os, osname);
-
- /*
- * Have multiple threads write to large offsets in ZTEST_DIROBJ
- * to verify that having multiple threads writing to the same object
- * in parallel doesn't cause any trouble.
- */
- if (ztest_random(4) == 0) {
- /*
- * Do the bonus buffer instead of a regular block.
- * We need a lock to serialize resize vs. others,
- * so we hash on the objset ID.
- */
- b = dmu_objset_id(os) % ZTEST_SYNC_LOCKS;
- off = -1ULL;
- dmu_tx_hold_bonus(tx, ZTEST_DIROBJ);
- } else {
- b = ztest_random(ZTEST_SYNC_LOCKS);
- off = za->za_diroff_shared + (b << SPA_MAXBLOCKSHIFT);
- if (ztest_random(4) == 0) {
- do_free = 1;
- dmu_tx_hold_free(tx, ZTEST_DIROBJ, off, bs);
- } else {
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, off, bs);
- }
- }
-
- if (off != -1ULL && P2PHASE(off, bs) == 0 && !do_free &&
- ztest_random(8) == 0) {
- VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &bonus_db) == 0);
- abuf = dmu_request_arcbuf(bonus_db, bs);
- }
-
- txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT;
- error = dmu_tx_assign(tx, txg_how);
- if (error) {
- if (error == ERESTART) {
- ASSERT(txg_how == TXG_NOWAIT);
- dmu_tx_wait(tx);
- } else {
- ztest_record_enospc("dmu write parallel");
- }
- dmu_tx_abort(tx);
- if (abuf != NULL) {
- dmu_return_arcbuf(abuf);
- dmu_buf_rele(bonus_db, FTAG);
- }
+ ztest_od_t od[1];
+ uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) +
+ (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+ uint64_t count = ztest_random(20) + 1;
+ uint64_t blocksize = ztest_random_blocksize();
+ void *data;
+
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
+
+ if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
return;
+
+ if (ztest_truncate(zd, od[0].od_object, offset, count * blocksize) != 0)
+ return;
+
+ ztest_prealloc(zd, od[0].od_object, offset, count * blocksize);
+
+ data = umem_zalloc(blocksize, UMEM_NOFAIL);
+
+ while (ztest_random(count) != 0) {
+ uint64_t randoff = offset + (ztest_random(count) * blocksize);
+ if (ztest_write(zd, od[0].od_object, randoff, blocksize,
+ data) != 0)
+ break;
+ while (ztest_random(4) != 0)
+ ztest_io(zd, od[0].od_object, randoff);
}
- txg = dmu_tx_get_txg(tx);
-
- lp = &ztest_shared->zs_sync_lock[b];
- (void) mutex_lock(lp);
-
- wbt->bt_objset = dmu_objset_id(os);
- wbt->bt_object = ZTEST_DIROBJ;
- wbt->bt_offset = off;
- wbt->bt_txg = txg;
- wbt->bt_thread = za->za_instance;
- wbt->bt_seq = ztest_shared->zs_seq[b]++; /* protected by lp */
-
- /*
- * Occasionally, write an all-zero block to test the behavior
- * of blocks that compress into holes.
- */
- if (off != -1ULL && ztest_random(8) == 0)
- bzero(wbt, btsize);
-
- if (off == -1ULL) {
- dmu_object_info_t *doi = &za->za_doi;
- char *dboff;
-
- VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0);
- za->za_dbuf = db;
- dmu_object_info_from_db(db, doi);
- ASSERT3U(doi->doi_bonus_size, <=, db->db_size);
- ASSERT3U(doi->doi_bonus_size, >=, btsize);
- ASSERT3U(doi->doi_bonus_size % btsize, ==, 0);
- dboff = (char *)db->db_data + doi->doi_bonus_size - btsize;
- bcopy(dboff, rbt, btsize);
- if (rbt->bt_objset != 0) {
- ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset);
- ASSERT3U(rbt->bt_object, ==, wbt->bt_object);
- ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset);
- ASSERT3U(rbt->bt_txg, <=, wbt->bt_txg);
- }
- if (ztest_random(10) == 0) {
- int newsize = (ztest_random(db->db_size /
- btsize) + 1) * btsize;
-
- ASSERT3U(newsize, >=, btsize);
- ASSERT3U(newsize, <=, db->db_size);
- VERIFY3U(dmu_set_bonus(db, newsize, tx), ==, 0);
- dboff = (char *)db->db_data + newsize - btsize;
- }
- dmu_buf_will_dirty(db, tx);
- bcopy(wbt, dboff, btsize);
- dmu_buf_rele(db, FTAG);
- za->za_dbuf = NULL;
- } else if (do_free) {
- VERIFY(dmu_free_range(os, ZTEST_DIROBJ, off, bs, tx) == 0);
- } else if (abuf == NULL) {
- dmu_write(os, ZTEST_DIROBJ, off, btsize, wbt, tx);
- } else {
- bcopy(wbt, abuf->b_data, btsize);
- dmu_assign_arcbuf(bonus_db, off, abuf, tx);
- dmu_buf_rele(bonus_db, FTAG);
- }
-
- (void) mutex_unlock(lp);
-
- if (ztest_random(1000) == 0)
- (void) poll(NULL, 0, 1); /* open dn_notxholds window */
-
- dmu_tx_commit(tx);
-
- if (ztest_random(10000) == 0)
- txg_wait_synced(dmu_objset_pool(os), txg);
-
- if (off == -1ULL || do_free)
- return;
-
- if (ztest_random(2) != 0)
- return;
-
- /*
- * dmu_sync() the block we just wrote.
- */
- (void) mutex_lock(lp);
-
- blkoff = P2ALIGN_TYPED(off, bs, uint64_t);
- error = dmu_buf_hold(os, ZTEST_DIROBJ, blkoff, FTAG, &db);
- za->za_dbuf = db;
- if (error) {
- (void) mutex_unlock(lp);
- return;
- }
- blkoff = off - blkoff;
- error = dmu_sync(NULL, db, &blk, txg, NULL, NULL);
- dmu_buf_rele(db, FTAG);
- za->za_dbuf = NULL;
-
- if (error) {
- (void) mutex_unlock(lp);
- return;
- }
-
- if (blk.blk_birth == 0) { /* concurrent free */
- (void) mutex_unlock(lp);
- return;
- }
-
- txg_suspend(dmu_objset_pool(os));
-
- (void) mutex_unlock(lp);
-
- ASSERT(blk.blk_fill == 1);
- ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER);
- ASSERT3U(BP_GET_LEVEL(&blk), ==, 0);
- ASSERT3U(BP_GET_LSIZE(&blk), ==, bs);
-
- /*
- * Read the block that dmu_sync() returned to make sure its contents
- * match what we wrote. We do this while still txg_suspend()ed
- * to ensure that the block can't be reused before we read it.
- */
- zb.zb_objset = dmu_objset_id(os);
- zb.zb_object = ZTEST_DIROBJ;
- zb.zb_level = 0;
- zb.zb_blkid = off / bs;
- error = zio_wait(zio_read(NULL, za->za_spa, &blk, iobuf, bs,
- NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb));
- ASSERT3U(error, ==, 0);
-
- txg_resume(dmu_objset_pool(os));
-
- bcopy(&iobuf[blkoff], rbt, btsize);
-
- if (rbt->bt_objset == 0) /* concurrent free */
- return;
-
- if (wbt->bt_objset == 0) /* all-zero overwrite */
- return;
-
- ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset);
- ASSERT3U(rbt->bt_object, ==, wbt->bt_object);
- ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset);
-
- /*
- * The semantic of dmu_sync() is that we always push the most recent
- * version of the data, so in the face of concurrent updates we may
- * see a newer version of the block. That's OK.
- */
- ASSERT3U(rbt->bt_txg, >=, wbt->bt_txg);
- if (rbt->bt_thread == wbt->bt_thread)
- ASSERT3U(rbt->bt_seq, ==, wbt->bt_seq);
- else
- ASSERT3U(rbt->bt_seq, >, wbt->bt_seq);
+
+ umem_free(data, blocksize);
}
/*
@@ -2836,9 +3605,10 @@
#define ZTEST_ZAP_MAX_PROPS 1000
void
-ztest_zap(ztest_args_t *za)
+ztest_zap(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[1];
uint64_t object;
uint64_t txg, last_txg;
uint64_t value[ZTEST_ZAP_MAX_INTS];
@@ -2847,64 +3617,45 @@
dmu_tx_t *tx;
char propname[100], txgname[100];
int error;
- char osname[MAXNAMELEN];
char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
- dmu_objset_name(os, osname);
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
+
+ if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
+ return;
+
+ object = od[0].od_object;
/*
- * Create a new object if necessary, and record it in the directory.
+ * Generate a known hash collision, and verify that
+ * we can lookup and remove both entries.
*/
- VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t), &object, DMU_READ_PREFETCH));
-
- if (object == 0) {
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t));
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("create zap test obj");
- dmu_tx_abort(tx);
- return;
- }
- object = zap_create(os, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx);
- if (error) {
- fatal(0, "zap_create('%s', %llu) = %d",
- osname, object, error);
- }
- ASSERT(object != 0);
- dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t), &object, tx);
- /*
- * Generate a known hash collision, and verify that
- * we can lookup and remove both entries.
- */
- for (i = 0; i < 2; i++) {
- value[i] = i;
- error = zap_add(os, object, hc[i], sizeof (uint64_t),
- 1, &value[i], tx);
- ASSERT3U(error, ==, 0);
- }
- for (i = 0; i < 2; i++) {
- error = zap_add(os, object, hc[i], sizeof (uint64_t),
- 1, &value[i], tx);
- ASSERT3U(error, ==, EEXIST);
- error = zap_length(os, object, hc[i],
- &zl_intsize, &zl_ints);
- ASSERT3U(error, ==, 0);
- ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
- ASSERT3U(zl_ints, ==, 1);
- }
- for (i = 0; i < 2; i++) {
- error = zap_remove(os, object, hc[i], tx);
- ASSERT3U(error, ==, 0);
- }
-
- dmu_tx_commit(tx);
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0)
+ return;
+ for (i = 0; i < 2; i++) {
+ value[i] = i;
+ VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t),
+ 1, &value[i], tx));
}
-
+ for (i = 0; i < 2; i++) {
+ VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i],
+ sizeof (uint64_t), 1, &value[i], tx));
+ VERIFY3U(0, ==,
+ zap_length(os, object, hc[i], &zl_intsize, &zl_ints));
+ ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+ ASSERT3U(zl_ints, ==, 1);
+ }
+ for (i = 0; i < 2; i++) {
+ VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx));
+ }
+ dmu_tx_commit(tx);
+
+ /*
+ * Generate a bunch of random entries.
+ */
ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
@@ -2948,14 +3699,10 @@
* should be txg + object + n.
*/
tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("create zap entry");
- dmu_tx_abort(tx);
+ dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0)
return;
- }
- txg = dmu_tx_get_txg(tx);
if (last_txg > txg)
fatal(0, "zap future leak: old %llu new %llu", last_txg, txg);
@@ -2963,16 +3710,10 @@
for (i = 0; i < ints; i++)
value[i] = txg + object + i;
- error = zap_update(os, object, txgname, sizeof (uint64_t), 1, &txg, tx);
- if (error)
- fatal(0, "zap_update('%s', %llu, '%s') = %d",
- osname, object, txgname, error);
-
- error = zap_update(os, object, propname, sizeof (uint64_t),
- ints, value, tx);
- if (error)
- fatal(0, "zap_update('%s', %llu, '%s') = %d",
- osname, object, propname, error);
+ VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t),
+ 1, &txg, tx));
+ VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t),
+ ints, value, tx));
dmu_tx_commit(tx);
@@ -2991,47 +3732,12 @@
ASSERT3U(error, ==, 0);
tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("remove zap entry");
- dmu_tx_abort(tx);
+ dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0)
return;
- }
- error = zap_remove(os, object, txgname, tx);
- if (error)
- fatal(0, "zap_remove('%s', %llu, '%s') = %d",
- osname, object, txgname, error);
-
- error = zap_remove(os, object, propname, tx);
- if (error)
- fatal(0, "zap_remove('%s', %llu, '%s') = %d",
- osname, object, propname, error);
-
- dmu_tx_commit(tx);
-
- /*
- * Once in a while, destroy the object.
- */
- if (ztest_random(1000) != 0)
- return;
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t));
- dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("destroy zap object");
- dmu_tx_abort(tx);
- return;
- }
- error = zap_destroy(os, object, tx);
- if (error)
- fatal(0, "zap_destroy('%s', %llu) = %d",
- osname, object, error);
- object = 0;
- dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t),
- &object, tx);
+ VERIFY3U(0, ==, zap_remove(os, object, txgname, tx));
+ VERIFY3U(0, ==, zap_remove(os, object, propname, tx));
dmu_tx_commit(tx);
}
@@ -3039,108 +3745,65 @@
* Testcase to test the upgrading of a microzap to fatzap.
*/
void
-ztest_fzap(ztest_args_t *za)
+ztest_fzap(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
- uint64_t object;
- uint64_t value;
- dmu_tx_t *tx;
- int i, error;
- char osname[MAXNAMELEN];
- char *name = "aaa";
- char entname[MAXNAMELEN];
-
- dmu_objset_name(os, osname);
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[1];
+ uint64_t object, txg;
+
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
+
+ if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
+ return;
+
+ object = od[0].od_object;
/*
- * Create a new object if necessary, and record it in the directory.
+ * Add entries to this ZAP and make sure it spills over
+ * and gets upgraded to a fatzap. Also, since we are adding
+ * 2050 entries we should see ptrtbl growth and leaf-block split.
*/
- VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t), &object, DMU_READ_PREFETCH));
-
- if (object == 0) {
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t));
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("create zap test obj");
- dmu_tx_abort(tx);
- return;
- }
- object = zap_create(os, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx);
- if (error) {
- fatal(0, "zap_create('%s', %llu) = %d",
- osname, object, error);
- }
- ASSERT(object != 0);
- dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
- sizeof (uint64_t), &object, tx);
- dmu_tx_commit(tx);
- }
-
- /*
- * Add entries to this ZAP amd make sure it spills over
- * and gets upgraded to a fatzap. Also, since we are adding
- * 2050 entries we should see ptrtbl growth and leaf-block
- * split.
- */
- for (i = 0; i < 2050; i++) {
- (void) snprintf(entname, sizeof (entname), "%s-%d", name, i);
- value = i;
+ for (int i = 0; i < 2050; i++) {
+ char name[MAXNAMELEN];
+ uint64_t value = i;
+ dmu_tx_t *tx;
+ int error;
+
+ (void) snprintf(name, sizeof (name), "fzap-%llu-%llu",
+ id, value);
tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, TRUE, entname);
- error = dmu_tx_assign(tx, TXG_WAIT);
-
- if (error) {
- ztest_record_enospc("create zap entry");
- dmu_tx_abort(tx);
+ dmu_tx_hold_zap(tx, object, B_TRUE, name);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0)
return;
- }
- error = zap_add(os, object, entname, sizeof (uint64_t),
- 1, &value, tx);
-
+ error = zap_add(os, object, name, sizeof (uint64_t), 1,
+ &value, tx);
ASSERT(error == 0 || error == EEXIST);
dmu_tx_commit(tx);
}
-
- /*
- * Once in a while, destroy the object.
- */
- if (ztest_random(1000) != 0)
- return;
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t));
- dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("destroy zap object");
- dmu_tx_abort(tx);
- return;
- }
- error = zap_destroy(os, object, tx);
- if (error)
- fatal(0, "zap_destroy('%s', %llu) = %d",
- osname, object, error);
- object = 0;
- dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t),
- &object, tx);
- dmu_tx_commit(tx);
}
+/* ARGSUSED */
void
-ztest_zap_parallel(ztest_args_t *za)
+ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[1];
uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
dmu_tx_t *tx;
int i, namelen, error;
+ int micro = ztest_random(2);
char name[20], string_value[20];
void *data;
+ ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0);
+
+ if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+ return;
+
+ object = od[0].od_object;
+
/*
* Generate a random name of the form 'xxx.....' where each
* x is a random printable character and the dots are dots.
@@ -3155,12 +3818,7 @@
name[i] = '.';
name[i] = '\0';
- if (ztest_random(2) == 0)
- object = ZTEST_MICROZAP_OBJ;
- else
- object = ZTEST_FATZAP_OBJ;
-
- if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) {
+ if ((namelen & 1) || micro) {
wsize = sizeof (txg);
wc = 1;
data = &txg;
@@ -3181,14 +3839,10 @@
if (i >= 2) {
tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, object, TRUE, NULL);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- ztest_record_enospc("zap parallel");
- dmu_tx_abort(tx);
+ dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0)
return;
- }
- txg = dmu_tx_get_txg(tx);
bcopy(name, string_value, namelen);
} else {
tx = NULL;
@@ -3322,20 +3976,26 @@
* Commit callback test.
*/
void
-ztest_dmu_commit_callbacks(ztest_args_t *za)
+ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[1];
dmu_tx_t *tx;
ztest_cb_data_t *cb_data[3], *tmp_cb;
uint64_t old_txg, txg;
int i, error;
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+
+ if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+ return;
+
tx = dmu_tx_create(os);
cb_data[0] = ztest_create_cb_data(os, 0);
dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
- dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t));
+ dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t));
/* Every once in a while, abort the transaction on purpose */
if (ztest_random(100) == 0)
@@ -3378,14 +4038,14 @@
/*
* Read existing data to make sure there isn't a future leak.
*/
- VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t),
+ VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t),
&old_txg, DMU_READ_PREFETCH));
if (old_txg > txg)
fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
old_txg, txg);
- dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t), &txg, tx);
+ dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx);
(void) mutex_lock(&zcl.zcl_callbacks_lock);
@@ -3439,69 +4099,60 @@
dmu_tx_commit(tx);
}
+/* ARGSUSED */
void
-ztest_dsl_prop_get_set(ztest_args_t *za)
+ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
{
- objset_t *os = za->za_os;
- int i, inherit;
- uint64_t value;
- const char *prop, *valname;
- char setpoint[MAXPATHLEN];
- char osname[MAXNAMELEN];
- int error;
-
- (void) rw_rdlock(&ztest_shared->zs_name_lock);
-
- dmu_objset_name(os, osname);
-
- for (i = 0; i < 2; i++) {
- if (i == 0) {
- prop = "checksum";
- value = ztest_random_checksum();
- inherit = (value == ZIO_CHECKSUM_INHERIT);
- } else {
- prop = "compression";
- value = ztest_random_compress();
- inherit = (value == ZIO_COMPRESS_INHERIT);
- }
-
- error = dsl_prop_set(osname, prop, sizeof (value),
- !inherit, &value);
-
- if (error == ENOSPC) {
- ztest_record_enospc("dsl_prop_set");
- break;
- }
-
- ASSERT3U(error, ==, 0);
-
- VERIFY3U(dsl_prop_get(osname, prop, sizeof (value),
- 1, &value, setpoint), ==, 0);
-
- if (i == 0)
- valname = zio_checksum_table[value].ci_name;
- else
- valname = zio_compress_table[value].ci_name;
-
- if (zopt_verbose >= 6) {
- (void) printf("%s %s = %s for '%s'\n",
- osname, prop, valname, setpoint);
- }
- }
-
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+ zfs_prop_t proplist[] = {
+ ZFS_PROP_CHECKSUM,
+ ZFS_PROP_COMPRESSION,
+ ZFS_PROP_COPIES,
+ ZFS_PROP_DEDUP
+ };
+ ztest_shared_t *zs = ztest_shared;
+
+ (void) rw_rdlock(&zs->zs_name_lock);
+
+ for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
+ (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
+ ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
+
+ (void) rw_unlock(&zs->zs_name_lock);
+}
+
+/* ARGSUSED */
+void
+ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_t *zs = ztest_shared;
+ nvlist_t *props = NULL;
+
+ (void) rw_rdlock(&zs->zs_name_lock);
+
+#if 0
+ (void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO,
+ ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
+#endif
+
+ VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0);
+
+ if (zopt_verbose >= 6)
+ dump_nvlist(props, 4);
+
+ nvlist_free(props);
+
+ (void) rw_unlock(&zs->zs_name_lock);
}
/*
* Test snapshot hold/release and deferred destroy.
*/
void
-ztest_dmu_snapshot_hold(ztest_args_t *za)
+ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
{
int error;
- objset_t *os = za->za_os;
+ objset_t *os = zd->zd_os;
objset_t *origin;
- uint64_t curval = za->za_instance;
char snapname[100];
char fullname[100];
char clonename[100];
@@ -3512,10 +4163,10 @@
dmu_objset_name(os, osname);
- (void) snprintf(snapname, 100, "sh1_%llu", curval);
+ (void) snprintf(snapname, 100, "sh1_%llu", id);
(void) snprintf(fullname, 100, "%s@%s", osname, snapname);
- (void) snprintf(clonename, 100, "%s/ch1_%llu", osname, curval);
- (void) snprintf(tag, 100, "%tag_%llu", curval);
+ (void) snprintf(clonename, 100, "%s/ch1_%llu", osname, id);
+ (void) snprintf(tag, 100, "%tag_%llu", id);
/*
* Clean up from any previous run.
@@ -3608,9 +4259,12 @@
/*
* Inject random faults into the on-disk data.
*/
+/* ARGSUSED */
void
-ztest_fault_inject(ztest_args_t *za)
+ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
{
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
int fd;
uint64_t offset;
uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
@@ -3619,7 +4273,6 @@
char path0[MAXPATHLEN];
char pathrand[MAXPATHLEN];
size_t fsize;
- spa_t *spa = za->za_spa;
int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
int iters = 1000;
int maxfaults = zopt_maxfaults;
@@ -3636,9 +4289,9 @@
if (ztest_random(2) == 0) {
/*
- * Inject errors on a normal data device.
+ * Inject errors on a normal data device or slog device.
*/
- top = ztest_random(spa->spa_root_vdev->vdev_children);
+ top = ztest_random_vdev_top(spa, B_TRUE);
leaf = ztest_random(leaves);
/*
@@ -3751,7 +4404,7 @@
if (offset >= fsize)
continue;
- if (zopt_verbose >= 6)
+ if (zopt_verbose >= 7)
(void) printf("injecting bad word into %s,"
" offset 0x%llx\n", pathrand, (u_longlong_t)offset);
@@ -3764,31 +4417,129 @@
}
/*
- * Scrub the pool.
+ * Verify that DDT repair works as expected.
*/
void
-ztest_scrub(ztest_args_t *za)
+ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
{
- spa_t *spa = za->za_spa;
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
+ objset_t *os = zd->zd_os;
+ ztest_od_t od[1];
+ uint64_t object, blocksize, txg, pattern, psize;
+ enum zio_checksum checksum = spa_dedup_checksum(spa);
+ dmu_buf_t *db;
+ dmu_tx_t *tx;
+ void *buf;
+ blkptr_t blk;
+ int copies = 2 * ZIO_DEDUPDITTO_MIN;
+
+ blocksize = ztest_random_blocksize();
+ blocksize = MIN(blocksize, 2048); /* because we write so many */
+
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
+
+ if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
+ return;
+
+ /*
+ * Take the name lock as writer to prevent anyone else from changing
+ * the pool and dataset properties we need to maintain during this test.
+ */
+ (void) rw_wrlock(&zs->zs_name_lock);
+
+ if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
+ B_FALSE) != 0 ||
+ ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
+ B_FALSE) != 0) {
+ (void) rw_unlock(&zs->zs_name_lock);
+ return;
+ }
+
+ object = od[0].od_object;
+ blocksize = od[0].od_blocksize;
+ pattern = spa_guid(spa) ^ dmu_objset_fsid_guid(os);
+
+ ASSERT(object != 0);
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_write(tx, object, 0, copies * blocksize);
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ (void) rw_unlock(&zs->zs_name_lock);
+ return;
+ }
+
+ /*
+ * Write all the copies of our block.
+ */
+ for (int i = 0; i < copies; i++) {
+ uint64_t offset = i * blocksize;
+ VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db) == 0);
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == blocksize);
+ ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
+ ztest_pattern_match(db->db_data, db->db_size, 0ULL));
+ dmu_buf_will_fill(db, tx);
+ ztest_pattern_set(db->db_data, db->db_size, pattern);
+ dmu_buf_rele(db, FTAG);
+ }
+
+ dmu_tx_commit(tx);
+ txg_wait_synced(spa_get_dsl(spa), txg);
+
+ /*
+ * Find out what block we got.
+ */
+ VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db) == 0);
+ blk = *((dmu_buf_impl_t *)db)->db_blkptr;
+ dmu_buf_rele(db, FTAG);
+
+ /*
+ * Damage the block. Dedup-ditto will save us when we read it later.
+ */
+ psize = BP_GET_PSIZE(&blk);
+ buf = zio_buf_alloc(psize);
+ ztest_pattern_set(buf, psize, ~pattern);
+
+ (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
+ buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
+
+ zio_buf_free(buf, psize);
+
+ (void) rw_unlock(&zs->zs_name_lock);
+}
+
+/*
+ * Scrub the pool.
+ */
+/* ARGSUSED */
+void
+ztest_scrub(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = zs->zs_spa;
(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
- (void) poll(NULL, 0, 1000); /* wait a second, then force a restart */
+ (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
}
/*
* Rename the pool to a different name and then rename it back.
*/
+/* ARGSUSED */
void
-ztest_spa_rename(ztest_args_t *za)
+ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
{
+ ztest_shared_t *zs = ztest_shared;
char *oldname, *newname;
- int error;
spa_t *spa;
- (void) rw_wrlock(&ztest_shared->zs_name_lock);
-
- oldname = za->za_pool;
+ (void) rw_wrlock(&zs->zs_name_lock);
+
+ oldname = zs->zs_pool;
newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
(void) strcpy(newname, oldname);
(void) strcat(newname, "_tmp");
@@ -3796,128 +4547,44 @@
/*
* Do the rename
*/
- error = spa_rename(oldname, newname);
- if (error)
- fatal(0, "spa_rename('%s', '%s') = %d", oldname,
- newname, error);
+ VERIFY3U(0, ==, spa_rename(oldname, newname));
/*
* Try to open it under the old name, which shouldn't exist
*/
- error = spa_open(oldname, &spa, FTAG);
- if (error != ENOENT)
- fatal(0, "spa_open('%s') = %d", oldname, error);
+ VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
/*
* Open it under the new name and make sure it's still the same spa_t.
*/
- error = spa_open(newname, &spa, FTAG);
- if (error != 0)
- fatal(0, "spa_open('%s') = %d", newname, error);
-
- ASSERT(spa == za->za_spa);
+ VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
+
+ ASSERT(spa == zs->zs_spa);
spa_close(spa, FTAG);
/*
* Rename it back to the original
*/
- error = spa_rename(newname, oldname);
- if (error)
- fatal(0, "spa_rename('%s', '%s') = %d", newname,
- oldname, error);
+ VERIFY3U(0, ==, spa_rename(newname, oldname));
/*
* Make sure it can still be opened
*/
- error = spa_open(oldname, &spa, FTAG);
- if (error != 0)
- fatal(0, "spa_open('%s') = %d", oldname, error);
-
- ASSERT(spa == za->za_spa);
+ VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
+
+ ASSERT(spa == zs->zs_spa);
spa_close(spa, FTAG);
umem_free(newname, strlen(newname) + 1);
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+ (void) rw_unlock(&zs->zs_name_lock);
}
-
/*
- * Completely obliterate one disk.
+ * Verify pool integrity by running zdb.
*/
static void
-ztest_obliterate_one_disk(uint64_t vdev)
-{
- int fd;
- char dev_name[MAXPATHLEN], copy_name[MAXPATHLEN];
- size_t fsize;
-
- if (zopt_maxfaults < 2)
- return;
-
- (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
- (void) snprintf(copy_name, MAXPATHLEN, "%s.old", dev_name);
-
- fd = open(dev_name, O_RDWR);
-
- if (fd == -1)
- fatal(1, "can't open %s", dev_name);
-
- /*
- * Determine the size.
- */
- fsize = lseek(fd, 0, SEEK_END);
-
- (void) close(fd);
-
- /*
- * Rename the old device to dev_name.old (useful for debugging).
- */
- VERIFY(rename(dev_name, copy_name) == 0);
-
- /*
- * Create a new one.
- */
- VERIFY((fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666)) >= 0);
- VERIFY(ftruncate(fd, fsize) == 0);
- (void) close(fd);
-}
-
-static void
-ztest_replace_one_disk(spa_t *spa, uint64_t vdev)
-{
- char dev_name[MAXPATHLEN];
- nvlist_t *root;
- int error;
- uint64_t guid;
- vdev_t *vd;
-
- (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
-
- /*
- * Build the nvlist describing dev_name.
- */
- root = make_vdev_root(dev_name, NULL, 0, 0, 0, 0, 0, 1);
-
- spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, dev_name)) == NULL)
- guid = 0;
- else
- guid = vd->vdev_guid;
- spa_config_exit(spa, SCL_VDEV, FTAG);
- error = spa_vdev_attach(spa, guid, root, B_TRUE);
- if (error != 0 &&
- error != EBUSY &&
- error != ENOTSUP &&
- error != ENODEV &&
- error != EDOM)
- fatal(0, "spa_vdev_attach(in-place) = %d", error);
-
- nvlist_free(root);
-}
-
-static void
-ztest_verify_blocks(char *pool)
+ztest_run_zdb(char *pool)
{
int status;
char zdb[MAXPATHLEN + MAXNAMELEN + 20];
@@ -3988,7 +4655,6 @@
nvlist_t *config, *newconfig;
uint64_t pool_guid;
spa_t *spa;
- int error;
if (zopt_verbose >= 4) {
(void) printf("import/export: old = %s, new = %s\n",
@@ -4003,9 +4669,7 @@
/*
* Get the pool's configuration and guid.
*/
- error = spa_open(oldname, &spa, FTAG);
- if (error)
- fatal(0, "spa_open('%s') = %d", oldname, error);
+ VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
/*
* Kick off a scrub to tickle scrub/export races.
@@ -4021,9 +4685,7 @@
/*
* Export it.
*/
- error = spa_export(oldname, &config, B_FALSE, B_FALSE);
- if (error)
- fatal(0, "spa_export('%s') = %d", oldname, error);
+ VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE));
ztest_walk_pool_directory("pools after export");
@@ -4037,39 +4699,29 @@
/*
* Import it under the new name.
*/
- error = spa_import(newname, config, NULL);
- if (error)
- fatal(0, "spa_import('%s') = %d", newname, error);
+ VERIFY3U(0, ==, spa_import(newname, config, NULL));
ztest_walk_pool_directory("pools after import");
/*
* Try to import it again -- should fail with EEXIST.
*/
- error = spa_import(newname, config, NULL);
- if (error != EEXIST)
- fatal(0, "spa_import('%s') twice", newname);
+ VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL));
/*
* Try to import it under a different name -- should fail with EEXIST.
*/
- error = spa_import(oldname, config, NULL);
- if (error != EEXIST)
- fatal(0, "spa_import('%s') under multiple names", newname);
+ VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL));
/*
* Verify that the pool is no longer visible under the old name.
*/
- error = spa_open(oldname, &spa, FTAG);
- if (error != ENOENT)
- fatal(0, "spa_open('%s') = %d", newname, error);
+ VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
/*
* Verify that we can open and close the pool using the new name.
*/
- error = spa_open(newname, &spa, FTAG);
- if (error)
- fatal(0, "spa_open('%s') = %d", newname, error);
+ VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
ASSERT(pool_guid == spa_guid(spa));
spa_close(spa, FTAG);
@@ -4079,12 +4731,12 @@
static void
ztest_resume(spa_t *spa)
{
- if (spa_suspended(spa)) {
- spa_vdev_state_enter(spa, SCL_NONE);
- vdev_clear(spa, NULL);
- (void) spa_vdev_state_exit(spa, NULL, 0);
- (void) zio_resume(spa);
- }
+ if (spa_suspended(spa) && zopt_verbose >= 6)
+ (void) printf("resuming from suspended state\n");
+ spa_vdev_state_enter(spa, SCL_NONE);
+ vdev_clear(spa, NULL);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ (void) zio_resume(spa);
}
static void *
@@ -4093,154 +4745,246 @@
spa_t *spa = arg;
while (!ztest_exiting) {
- (void) poll(NULL, 0, 1000);
- ztest_resume(spa);
+ if (spa_suspended(spa))
+ ztest_resume(spa);
+ (void) poll(NULL, 0, 100);
}
return (NULL);
}
static void *
+ztest_deadman_thread(void *arg)
+{
+ ztest_shared_t *zs = arg;
+ int grace = 300;
+ hrtime_t delta;
+
+ delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace;
+
+ (void) poll(NULL, 0, (int)(1000 * delta));
+
+ fatal(0, "failed to complete within %d seconds of deadline", grace);
+
+ return (NULL);
+}
+
+static void
+ztest_execute(ztest_info_t *zi, uint64_t id)
+{
+ ztest_shared_t *zs = ztest_shared;
+ ztest_ds_t *zd = &zs->zs_zd[id % zopt_datasets];
+ hrtime_t functime = gethrtime();
+
+ for (int i = 0; i < zi->zi_iters; i++)
+ zi->zi_func(zd, id);
+
+ functime = gethrtime() - functime;
+
+ atomic_add_64(&zi->zi_call_count, 1);
+ atomic_add_64(&zi->zi_call_time, functime);
+
+ if (zopt_verbose >= 4) {
+ Dl_info dli;
+ (void) dladdr((void *)zi->zi_func, &dli);
+ (void) printf("%6.2f sec in %s\n",
+ (double)functime / NANOSEC, dli.dli_sname);
+ }
+}
+
+static void *
ztest_thread(void *arg)
{
- ztest_args_t *za = arg;
+ uint64_t id = (uintptr_t)arg;
ztest_shared_t *zs = ztest_shared;
- hrtime_t now, functime;
+ uint64_t call_next;
+ hrtime_t now;
ztest_info_t *zi;
- int f, i;
-
- while ((now = gethrtime()) < za->za_stop) {
+
+ while ((now = gethrtime()) < zs->zs_thread_stop) {
/*
* See if it's time to force a crash.
*/
- if (now > za->za_kill) {
- zs->zs_alloc = spa_get_alloc(za->za_spa);
- zs->zs_space = spa_get_space(za->za_spa);
- (void) kill(getpid(), SIGKILL);
- }
-
- /*
- * Pick a random function.
- */
- f = ztest_random(ZTEST_FUNCS);
- zi = &zs->zs_info[f];
-
- /*
- * Decide whether to call it, based on the requested frequency.
- */
- if (zi->zi_call_target == 0 ||
- (double)zi->zi_call_total / zi->zi_call_target >
- (double)(now - zs->zs_start_time) / (zopt_time * NANOSEC))
- continue;
-
- atomic_add_64(&zi->zi_calls, 1);
- atomic_add_64(&zi->zi_call_total, 1);
-
- za->za_diroff = (za->za_instance * ZTEST_FUNCS + f) *
- ZTEST_DIRSIZE;
- za->za_diroff_shared = (1ULL << 63);
-
- for (i = 0; i < zi->zi_iters; i++)
- zi->zi_func(za);
-
- functime = gethrtime() - now;
-
- atomic_add_64(&zi->zi_call_time, functime);
-
- if (zopt_verbose >= 4) {
- Dl_info dli;
- (void) dladdr((void *)zi->zi_func, &dli);
- (void) printf("%6.2f sec in %s\n",
- (double)functime / NANOSEC, dli.dli_sname);
- }
+ if (now > zs->zs_thread_kill)
+ ztest_kill(zs);
/*
* If we're getting ENOSPC with some regularity, stop.
*/
if (zs->zs_enospc_count > 10)
break;
+
+ /*
+ * Pick a random function to execute.
+ */
+ zi = &zs->zs_info[ztest_random(ZTEST_FUNCS)];
+ call_next = zi->zi_call_next;
+
+ if (now >= call_next &&
+ atomic_cas_64(&zi->zi_call_next, call_next, call_next +
+ ztest_random(2 * zi->zi_interval[0] + 1)) == call_next)
+ ztest_execute(zi, id);
}
return (NULL);
}
+static void
+ztest_dataset_name(char *dsname, char *pool, int d)
+{
+ (void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d);
+}
+
+static void
+ztest_dataset_destroy(ztest_shared_t *zs, int d)
+{
+ char name[MAXNAMELEN];
+
+ ztest_dataset_name(name, zs->zs_pool, d);
+
+ if (zopt_verbose >= 3)
+ (void) printf("Destroying %s to free up space\n", name);
+
+ /*
+ * Clean up any non-standard clones and snapshots. In general,
+ * ztest thread t operates on dataset (t % zopt_datasets),
+ * so there may be more than one thing to clean up.
+ */
+ for (int t = d; t < zopt_threads; t += zopt_datasets)
+ ztest_dsl_dataset_cleanup(name, t);
+
+ (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
+ DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+}
+
+static void
+ztest_dataset_dirobj_verify(ztest_ds_t *zd)
+{
+ uint64_t usedobjs, dirobjs, scratch;
+
+ /*
+ * ZTEST_DIROBJ is the object directory for the entire dataset.
+ * Therefore, the number of objects in use should equal the
+ * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself.
+ * If not, we have an object leak.
+ *
+ * Note that we can only check this in ztest_dataset_open(),
+ * when the open-context and syncing-context values agree.
+ * That's because zap_count() returns the open-context value,
+ * while dmu_objset_space() returns the rootbp fill count.
+ */
+ VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
+ dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
+ ASSERT3U(dirobjs + 1, ==, usedobjs);
+}
+
+static int
+ztest_dataset_open(ztest_shared_t *zs, int d)
+{
+ ztest_ds_t *zd = &zs->zs_zd[d];
+ uint64_t committed_seq = zd->zd_seq;
+ objset_t *os;
+ zilog_t *zilog;
+ char name[MAXNAMELEN];
+ int error;
+
+ ztest_dataset_name(name, zs->zs_pool, d);
+
+ (void) rw_rdlock(&zs->zs_name_lock);
+
+ error = dmu_objset_create(name, DMU_OST_OTHER, 0,
+ ztest_objset_create_cb, NULL);
+ if (error == ENOSPC) {
+ (void) rw_unlock(&zs->zs_name_lock);
+ ztest_record_enospc(FTAG);
+ return (error);
+ }
+ ASSERT(error == 0 || error == EEXIST);
+
+ VERIFY3U(dmu_objset_hold(name, zd, &os), ==, 0);
+ (void) rw_unlock(&zs->zs_name_lock);
+
+ ztest_zd_init(zd, os);
+
+ zilog = zd->zd_zilog;
+
+ if (zilog->zl_header->zh_claim_lr_seq != 0 &&
+ zilog->zl_header->zh_claim_lr_seq < committed_seq)
+ fatal(0, "missing log records: claimed %llu < committed %llu",
+ zilog->zl_header->zh_claim_lr_seq, committed_seq);
+
+ ztest_dataset_dirobj_verify(zd);
+
+ zil_replay(os, zd, ztest_replay_vector);
+
+ ztest_dataset_dirobj_verify(zd);
+
+ if (zopt_verbose >= 6)
+ (void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
+ zd->zd_name,
+ (u_longlong_t)zilog->zl_parse_blk_count,
+ (u_longlong_t)zilog->zl_parse_lr_count,
+ (u_longlong_t)zilog->zl_replaying_seq);
+
+ zilog = zil_open(os, ztest_get_data);
+
+ if (zilog->zl_replaying_seq != 0 &&
+ zilog->zl_replaying_seq < committed_seq)
+ fatal(0, "missing log records: replayed %llu < committed %llu",
+ zilog->zl_replaying_seq, committed_seq);
+
+ return (0);
+}
+
+static void
+ztest_dataset_close(ztest_shared_t *zs, int d)
+{
+ ztest_ds_t *zd = &zs->zs_zd[d];
+
+ zil_close(zd->zd_zilog);
+ dmu_objset_rele(zd->zd_os, zd);
+
+ ztest_zd_fini(zd);
+}
+
/*
* Kick off threads to run tests on all datasets in parallel.
*/
static void
-ztest_run(char *pool)
+ztest_run(ztest_shared_t *zs)
{
- int t, d, error;
- ztest_shared_t *zs = ztest_shared;
- ztest_args_t *za;
+ thread_t *tid;
spa_t *spa;
- char name[100];
thread_t resume_tid;
+ int error;
ztest_exiting = B_FALSE;
- (void) _mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL);
- (void) rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL);
-
- (void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD,
- NULL);
+ /*
+ * Initialize parent/child shared state.
+ */
+ VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0);
+ VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0);
+
+ zs->zs_thread_start = gethrtime();
+ zs->zs_thread_stop = zs->zs_thread_start + zopt_passtime * NANOSEC;
+ zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
+ zs->zs_thread_kill = zs->zs_thread_stop;
+ if (ztest_random(100) < zopt_killrate)
+ zs->zs_thread_kill -= ztest_random(zopt_passtime * NANOSEC);
+
+ (void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD, NULL);
list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
offsetof(ztest_cb_data_t, zcd_node));
- for (t = 0; t < ZTEST_SYNC_LOCKS; t++)
- (void) _mutex_init(&zs->zs_sync_lock[t], USYNC_THREAD, NULL);
-
- /*
- * Destroy one disk before we even start.
- * It's mirrored, so everything should work just fine.
- * This makes us exercise fault handling very early in spa_load().
- */
- ztest_obliterate_one_disk(0);
-
- /*
- * Verify that the sum of the sizes of all blocks in the pool
- * equals the SPA's allocated space total.
- */
- ztest_verify_blocks(pool);
-
- /*
- * Kick off a replacement of the disk we just obliterated.
- */
- kernel_init(FREAD | FWRITE);
- VERIFY(spa_open(pool, &spa, FTAG) == 0);
- ztest_replace_one_disk(spa, 0);
- if (zopt_verbose >= 5)
- show_pool_stats(spa);
- spa_close(spa, FTAG);
- kernel_fini();
-
- kernel_init(FREAD | FWRITE);
-
- /*
- * Verify that we can export the pool and reimport it under a
- * different name.
- */
- if (ztest_random(2) == 0) {
- (void) snprintf(name, 100, "%s_import", pool);
- ztest_spa_import_export(pool, name);
- ztest_spa_import_export(name, pool);
- }
-
- /*
- * Verify that we can loop over all pools.
- */
- mutex_enter(&spa_namespace_lock);
- for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
- if (zopt_verbose > 3) {
- (void) printf("spa_next: found %s\n", spa_name(spa));
- }
- }
- mutex_exit(&spa_namespace_lock);
-
/*
* Open our pool.
*/
- VERIFY(spa_open(pool, &spa, FTAG) == 0);
+ kernel_init(FREAD | FWRITE);
+ VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0);
+ zs->zs_spa = spa;
+
+ spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
/*
* We don't expect the pool to suspend unless maxfaults == 0,
@@ -4259,13 +5003,19 @@
&resume_tid) == 0);
/*
+ * Create a deadman thread to abort() if we hang.
+ */
+ VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND,
+ NULL) == 0);
+
+ /*
* Verify that we can safely inquire about about any object,
* whether it's allocated or not. To make it interesting,
* we probe a 5-wide window around each power of two.
* This hits all edge cases, including zero and the max.
*/
- for (t = 0; t < 64; t++) {
- for (d = -5; d <= 5; d++) {
+ for (int t = 0; t < 64; t++) {
+ for (int d = -5; d <= 5; d++) {
error = dmu_object_info(spa->spa_meta_objset,
(1ULL << t) + d, NULL);
ASSERT(error == 0 || error == ENOENT ||
@@ -4274,104 +5024,45 @@
}
/*
- * Now kick off all the tests that run in parallel.
+ * If we got any ENOSPC errors on the previous run, destroy something.
*/
+ if (zs->zs_enospc_count != 0) {
+ int d = ztest_random(zopt_datasets);
+ ztest_dataset_destroy(zs, d);
+ }
zs->zs_enospc_count = 0;
- za = umem_zalloc(zopt_threads * sizeof (ztest_args_t), UMEM_NOFAIL);
+ tid = umem_zalloc(zopt_threads * sizeof (thread_t), UMEM_NOFAIL);
if (zopt_verbose >= 4)
(void) printf("starting main threads...\n");
- za[0].za_start = gethrtime();
- za[0].za_stop = za[0].za_start + zopt_passtime * NANOSEC;
- za[0].za_stop = MIN(za[0].za_stop, zs->zs_stop_time);
- za[0].za_kill = za[0].za_stop;
- if (ztest_random(100) < zopt_killrate)
- za[0].za_kill -= ztest_random(zopt_passtime * NANOSEC);
-
- for (t = 0; t < zopt_threads; t++) {
- d = t % zopt_datasets;
-
- (void) strcpy(za[t].za_pool, pool);
- za[t].za_os = za[d].za_os;
- za[t].za_spa = spa;
- za[t].za_zilog = za[d].za_zilog;
- za[t].za_instance = t;
- za[t].za_random = ztest_random(-1ULL);
- za[t].za_start = za[0].za_start;
- za[t].za_stop = za[0].za_stop;
- za[t].za_kill = za[0].za_kill;
-
- if (t < zopt_datasets) {
- int test_future = FALSE;
- (void) rw_rdlock(&ztest_shared->zs_name_lock);
- (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
- error = dmu_objset_create(name, DMU_OST_OTHER, 0,
- ztest_create_cb, NULL);
- if (error == EEXIST) {
- test_future = TRUE;
- } else if (error == ENOSPC) {
- zs->zs_enospc_count++;
- (void) rw_unlock(&ztest_shared->zs_name_lock);
- break;
- } else if (error != 0) {
- fatal(0, "dmu_objset_create(%s) = %d",
- name, error);
- }
- error = dmu_objset_hold(name, FTAG, &za[d].za_os);
- if (error)
- fatal(0, "dmu_objset_open('%s') = %d",
- name, error);
- (void) rw_unlock(&ztest_shared->zs_name_lock);
- if (test_future)
- ztest_dmu_check_future_leak(&za[t]);
- zil_replay(za[d].za_os, za[d].za_os,
- ztest_replay_vector);
- za[d].za_zilog = zil_open(za[d].za_os, NULL);
- }
-
- VERIFY(thr_create(0, 0, ztest_thread, &za[t], THR_BOUND,
- &za[t].za_thread) == 0);
+ /*
+ * Kick off all the tests that run in parallel.
+ */
+ for (int t = 0; t < zopt_threads; t++) {
+ if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0)
+ return;
+ VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t,
+ THR_BOUND, &tid[t]) == 0);
}
- while (--t >= 0) {
- VERIFY(thr_join(za[t].za_thread, NULL, NULL) == 0);
- if (t < zopt_datasets) {
- zil_close(za[t].za_zilog);
- dmu_objset_rele(za[t].za_os, FTAG);
- }
- }
-
- if (zopt_verbose >= 3)
- show_pool_stats(spa);
-
- txg_wait_synced(spa_get_dsl(spa), 0);
-
- zs->zs_alloc = spa_get_alloc(spa);
- zs->zs_space = spa_get_space(spa);
-
/*
- * If we had out-of-space errors, destroy a random objset.
+ * Wait for all of the tests to complete. We go in reverse order
+ * so we don't close datasets while threads are still using them.
*/
- if (zs->zs_enospc_count != 0) {
- (void) rw_rdlock(&ztest_shared->zs_name_lock);
- d = (int)ztest_random(zopt_datasets);
- (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
- if (zopt_verbose >= 3)
- (void) printf("Destroying %s to free up space\n", name);
-
- /* Cleanup any non-standard clones and snapshots */
- ztest_dsl_dataset_cleanup(name, za[d].za_instance);
-
- (void) dmu_objset_find(name, ztest_destroy_cb, &za[d],
- DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
- (void) rw_unlock(&ztest_shared->zs_name_lock);
+ for (int t = zopt_threads - 1; t >= 0; t--) {
+ VERIFY(thr_join(tid[t], NULL, NULL) == 0);
+ if (t < zopt_datasets)
+ ztest_dataset_close(zs, t);
}
txg_wait_synced(spa_get_dsl(spa), 0);
- umem_free(za, zopt_threads * sizeof (ztest_args_t));
+ zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+ zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
+
+ umem_free(tid, zopt_threads * sizeof (thread_t));
/* Kill the resume thread */
ztest_exiting = B_TRUE;
@@ -4382,11 +5073,99 @@
* Right before closing the pool, kick off a bunch of async I/O;
* spa_close() should wait for it to complete.
*/
- for (t = 1; t < 50; t++)
- dmu_prefetch(spa->spa_meta_objset, t, 0, 1 << 15);
+ for (uint64_t object = 1; object < 50; object++)
+ dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20);
spa_close(spa, FTAG);
+ /*
+ * Verify that we can loop over all pools.
+ */
+ mutex_enter(&spa_namespace_lock);
+ for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
+ if (zopt_verbose > 3)
+ (void) printf("spa_next: found %s\n", spa_name(spa));
+ mutex_exit(&spa_namespace_lock);
+
+ /*
+ * Verify that we can export the pool and reimport it under a
+ * different name.
+ */
+ if (ztest_random(2) == 0) {
+ char name[MAXNAMELEN];
+ (void) snprintf(name, MAXNAMELEN, "%s_import", zs->zs_pool);
+ ztest_spa_import_export(zs->zs_pool, name);
+ ztest_spa_import_export(name, zs->zs_pool);
+ }
+
+ kernel_fini();
+}
+
+static void
+ztest_freeze(ztest_shared_t *zs)
+{
+ ztest_ds_t *zd = &zs->zs_zd[0];
+ spa_t *spa;
+
+ if (zopt_verbose >= 3)
+ (void) printf("testing spa_freeze()...\n");
+
+ kernel_init(FREAD | FWRITE);
+ VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+ VERIFY3U(0, ==, ztest_dataset_open(zs, 0));
+
+ /*
+ * Force the first log block to be transactionally allocated.
+ * We have to do this before we freeze the pool -- otherwise
+ * the log chain won't be anchored.
+ */
+ while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
+ ztest_dmu_object_alloc_free(zd, 0);
+ zil_commit(zd->zd_zilog, UINT64_MAX, 0);
+ }
+
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ /*
+ * Freeze the pool. This stops spa_sync() from doing anything,
+ * so that the only way to record changes from now on is the ZIL.
+ */
+ spa_freeze(spa);
+
+ /*
+ * Run tests that generate log records but don't alter the pool config
+ * or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
+ * We do a txg_wait_synced() after each iteration to force the txg
+ * to increase well beyond the last synced value in the uberblock.
+ * The ZIL should be OK with that.
+ */
+ while (ztest_random(20) != 0) {
+ ztest_dmu_write_parallel(zd, 0);
+ ztest_dmu_object_alloc_free(zd, 0);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ }
+
+ /*
+ * Commit all of the changes we just generated.
+ */
+ zil_commit(zd->zd_zilog, UINT64_MAX, 0);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ /*
+ * Close our dataset and close the pool.
+ */
+ ztest_dataset_close(zs, 0);
+ spa_close(spa, FTAG);
+ kernel_fini();
+
+ /*
+ * Open and close the pool and dataset to induce log replay.
+ */
+ kernel_init(FREAD | FWRITE);
+ VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
+ VERIFY3U(0, ==, ztest_dataset_open(zs, 0));
+ ztest_dataset_close(zs, 0);
+ spa_close(spa, FTAG);
kernel_fini();
list_destroy(&zcl.zcl_callbacks);
@@ -4424,41 +5203,40 @@
/*
* Create a storage pool with the given name and initial vdev size.
- * Then create the specified number of datasets in the pool.
+ * Then test spa_freeze() functionality.
*/
static void
-ztest_init(char *pool)
+ztest_init(ztest_shared_t *zs)
{
spa_t *spa;
- int error;
nvlist_t *nvroot;
+ VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0);
+ VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0);
+
kernel_init(FREAD | FWRITE);
/*
* Create the storage pool.
*/
- (void) spa_destroy(pool);
+ (void) spa_destroy(zs->zs_pool);
ztest_shared->zs_vdev_next_leaf = 0;
nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
0, zopt_raidz, zopt_mirrors, 1);
- error = spa_create(pool, nvroot, NULL, NULL, NULL);
+ VERIFY3U(0, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL));
nvlist_free(nvroot);
- if (error)
- fatal(0, "spa_create() = %d", error);
- error = spa_open(pool, &spa, FTAG);
- if (error)
- fatal(0, "spa_open() = %d", error);
-
+ VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
-
- if (zopt_verbose >= 3)
- show_pool_stats(spa);
-
spa_close(spa, FTAG);
kernel_fini();
+
+ ztest_run_zdb(zs->zs_pool);
+
+ ztest_freeze(zs);
+
+ ztest_run_zdb(zs->zs_pool);
}
int
@@ -4466,11 +5244,12 @@
{
int kills = 0;
int iters = 0;
- int i, f;
ztest_shared_t *zs;
+ size_t shared_size;
ztest_info_t *zi;
char timebuf[100];
char numbuf[6];
+ spa_t *spa;
(void) setvbuf(stdout, NULL, _IOLBF, 0);
@@ -4487,8 +5266,10 @@
if (zopt_init != 0)
(void) remove("/tmp/zpool.cache");
+ shared_size = sizeof (*zs) + zopt_datasets * sizeof (ztest_ds_t);
+
zs = ztest_shared = (void *)mmap(0,
- P2ROUNDUP(sizeof (ztest_shared_t), getpagesize()),
+ P2ROUNDUP(shared_size, getpagesize()),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
if (zopt_verbose >= 1) {
@@ -4501,46 +5282,43 @@
/*
* Create and initialize our storage pool.
*/
- for (i = 1; i <= zopt_init; i++) {
+ for (int i = 1; i <= zopt_init; i++) {
bzero(zs, sizeof (ztest_shared_t));
if (zopt_verbose >= 3 && zopt_init != 1)
(void) printf("ztest_init(), pass %d\n", i);
- ztest_init(zopt_pool);
+ zs->zs_pool = zopt_pool;
+ ztest_init(zs);
}
- /*
- * Initialize the call targets for each function.
- */
- for (f = 0; f < ZTEST_FUNCS; f++) {
+ zs->zs_pool = zopt_pool;
+ zs->zs_proc_start = gethrtime();
+ zs->zs_proc_stop = zs->zs_proc_start + zopt_time * NANOSEC;
+
+ for (int f = 0; f < ZTEST_FUNCS; f++) {
zi = &zs->zs_info[f];
-
*zi = ztest_info[f];
-
- if (*zi->zi_interval == 0)
- zi->zi_call_target = UINT64_MAX;
+ if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
+ zi->zi_call_next = UINT64_MAX;
else
- zi->zi_call_target = zopt_time / *zi->zi_interval;
+ zi->zi_call_next = zs->zs_proc_start +
+ ztest_random(2 * zi->zi_interval[0] + 1);
}
- zs->zs_start_time = gethrtime();
- zs->zs_stop_time = zs->zs_start_time + zopt_time * NANOSEC;
-
/*
* Run the tests in a loop. These tests include fault injection
* to verify that self-healing data works, and forced crashes
* to verify that we never lose on-disk consistency.
*/
- while (gethrtime() < zs->zs_stop_time) {
+ while (gethrtime() < zs->zs_proc_stop) {
int status;
pid_t pid;
- char *tmp;
/*
* Initialize the workload counters for each function.
*/
- for (f = 0; f < ZTEST_FUNCS; f++) {
+ for (int f = 0; f < ZTEST_FUNCS; f++) {
zi = &zs->zs_info[f];
- zi->zi_calls = 0;
+ zi->zi_call_count = 0;
zi->zi_call_time = 0;
}
@@ -4556,7 +5334,7 @@
struct rlimit rl = { 1024, 1024 };
(void) setrlimit(RLIMIT_NOFILE, &rl);
(void) enable_extended_FILE_stdio(-1, -1);
- ztest_run(zopt_pool);
+ ztest_run(zs);
exit(0);
}
@@ -4589,8 +5367,8 @@
if (zopt_verbose >= 1) {
hrtime_t now = gethrtime();
- now = MIN(now, zs->zs_stop_time);
- print_time(zs->zs_stop_time - now, timebuf);
+ now = MIN(now, zs->zs_proc_stop);
+ print_time(zs->zs_proc_stop - now, timebuf);
nicenum(zs->zs_space, numbuf);
(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
@@ -4600,7 +5378,7 @@
(u_longlong_t)zs->zs_enospc_count,
100.0 * zs->zs_alloc / zs->zs_space,
numbuf,
- 100.0 * (now - zs->zs_start_time) /
+ 100.0 * (now - zs->zs_proc_start) /
(zopt_time * NANOSEC), timebuf);
}
@@ -4610,35 +5388,40 @@
"Calls", "Time", "Function");
(void) printf("%7s %9s %s\n",
"-----", "----", "--------");
- for (f = 0; f < ZTEST_FUNCS; f++) {
+ for (int f = 0; f < ZTEST_FUNCS; f++) {
Dl_info dli;
zi = &zs->zs_info[f];
print_time(zi->zi_call_time, timebuf);
(void) dladdr((void *)zi->zi_func, &dli);
(void) printf("%7llu %9s %s\n",
- (u_longlong_t)zi->zi_calls, timebuf,
+ (u_longlong_t)zi->zi_call_count, timebuf,
dli.dli_sname);
}
(void) printf("\n");
}
/*
- * It's possible that we killed a child during a rename test, in
- * which case we'll have a 'ztest_tmp' pool lying around instead
- * of 'ztest'. Do a blind rename in case this happened.
+ * It's possible that we killed a child during a rename test,
+ * in which case we'll have a 'ztest_tmp' pool lying around
+ * instead of 'ztest'. Do a blind rename in case this happened.
*/
- tmp = umem_alloc(strlen(zopt_pool) + 5, UMEM_NOFAIL);
- (void) strcpy(tmp, zopt_pool);
- (void) strcat(tmp, "_tmp");
- kernel_init(FREAD | FWRITE);
- (void) spa_rename(tmp, zopt_pool);
+ kernel_init(FREAD);
+ if (spa_open(zopt_pool, &spa, FTAG) == 0) {
+ spa_close(spa, FTAG);
+ } else {
+ char tmpname[MAXNAMELEN];
+ kernel_fini();
+ kernel_init(FREAD | FWRITE);
+ (void) snprintf(tmpname, sizeof (tmpname), "%s_tmp",
+ zopt_pool);
+ (void) spa_rename(tmpname, zopt_pool);
+ }
kernel_fini();
- umem_free(tmp, strlen(tmp) + 1);
+
+ ztest_run_zdb(zopt_pool);
}
- ztest_verify_blocks(zopt_pool);
-
if (zopt_verbose >= 1) {
(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
kills, iters - kills, (100.0 * kills) / MAX(1, iters));
--- a/usr/src/common/avl/avl.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/common/avl/avl.c Sun Nov 01 14:14:46 2009 -0800
@@ -19,13 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-
/*
* AVL - generic AVL tree implementation for kernel use
*
@@ -243,7 +240,7 @@
* "void *" of the found tree node
*/
void *
-avl_find(avl_tree_t *tree, void *value, avl_index_t *where)
+avl_find(avl_tree_t *tree, const void *value, avl_index_t *where)
{
avl_node_t *node;
avl_node_t *prev = NULL;
--- a/usr/src/common/zfs/zfs_fletcher.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/common/zfs/zfs_fletcher.c Sun Nov 01 14:14:46 2009 -0800
@@ -128,6 +128,7 @@
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/byteorder.h>
+#include <sys/zio.h>
#include <sys/spa.h>
void
--- a/usr/src/common/zfs/zfs_prop.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/common/zfs/zfs_prop.c Sun Nov 01 14:14:46 2009 -0800
@@ -69,6 +69,18 @@
{ NULL }
};
+ static zprop_index_t dedup_table[] = {
+ { "on", ZIO_CHECKSUM_ON },
+ { "off", ZIO_CHECKSUM_OFF },
+ { "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY },
+ { "sha256", ZIO_CHECKSUM_SHA256 },
+ { "sha256,verify",
+ ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY },
+ { "fletcher4,verify",
+ ZIO_CHECKSUM_FLETCHER_4 | ZIO_CHECKSUM_VERIFY },
+ { NULL }
+ };
+
static zprop_index_t compress_table[] = {
{ "on", ZIO_COMPRESS_ON },
{ "off", ZIO_COMPRESS_OFF },
@@ -83,6 +95,7 @@
{ "gzip-7", ZIO_COMPRESS_GZIP_7 },
{ "gzip-8", ZIO_COMPRESS_GZIP_8 },
{ "gzip-9", ZIO_COMPRESS_GZIP_9 },
+ { "zle", ZIO_COMPRESS_ZLE },
{ NULL }
};
@@ -177,10 +190,15 @@
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM",
checksum_table);
+ register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "on | off | verify | sha256[,verify] | fletcher4,verify", "DEDUP",
+ dedup_table);
register_index(ZFS_PROP_COMPRESSION, "compression",
ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
- "on | off | lzjb | gzip | gzip-[1-9]", "COMPRESS", compress_table);
+ "on | off | lzjb | gzip | gzip-[1-9] | zle", "COMPRESS",
+ compress_table);
register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
"hidden | visible", "SNAPDIR", snapdir_table);
@@ -321,9 +339,9 @@
/* hidden properties */
register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
- PROP_READONLY, ZFS_TYPE_DATASET, NULL);
+ PROP_READONLY, ZFS_TYPE_DATASET, "CREATETXG");
register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
- PROP_READONLY, ZFS_TYPE_SNAPSHOT, NULL);
+ PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES");
register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
PROP_READONLY, ZFS_TYPE_DATASET, "NAME");
register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING,
@@ -334,9 +352,10 @@
register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, PROP_READONLY,
ZFS_TYPE_DATASET, "GUID");
register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
- PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, NULL);
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET,
+ "USERACCOUNTING");
register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER,
- PROP_READONLY, ZFS_TYPE_DATASET, NULL);
+ PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE");
register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER,
PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID");
@@ -434,6 +453,19 @@
return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET));
}
+/*
+ * Thin wrapper around zprop_random_value() for dataset properties: the
+ * property and seed are passed through unchanged with ZFS_TYPE_DATASET.
+ */
+uint64_t
+zfs_prop_random_value(zfs_prop_t prop, uint64_t seed)
+{
+	uint64_t value;
+
+	value = zprop_random_value(prop, seed, ZFS_TYPE_DATASET);
+	return (value);
+}
+
/*
* Returns TRUE if the property applies to any of the given dataset types.
*/
--- a/usr/src/common/zfs/zfs_prop.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/common/zfs/zfs_prop.h Sun Nov 01 14:14:46 2009 -0800
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _ZFS_PROP_H
#define _ZFS_PROP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/fs/zfs.h>
#include <sys/types.h>
@@ -79,6 +77,7 @@
/* "zfs get" help message */
const zprop_index_t *pd_table; /* for index properties, a table */
/* defining the possible values */
+ size_t pd_table_size; /* number of entries in pd_table[] */
} zprop_desc_t;
/*
@@ -118,6 +117,7 @@
int zprop_name_to_prop(const char *, zfs_type_t);
int zprop_string_to_index(int, const char *, uint64_t *, zfs_type_t);
int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t);
+uint64_t zprop_random_value(int, uint64_t, zfs_type_t);
const char *zprop_values(int, zfs_type_t);
size_t zprop_width(int, boolean_t *, zfs_type_t);
boolean_t zprop_valid_for_type(int, zfs_type_t);
--- a/usr/src/common/zfs/zpool_prop.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/common/zfs/zpool_prop.c Sun Nov 01 14:14:46 2009 -0800
@@ -84,6 +84,8 @@
ZFS_TYPE_POOL, "<guid>", "GUID");
register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<state>", "HEALTH");
+ register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<1.00x or higher if deduped>", "DEDUP");
/* default number properties */
register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
@@ -107,6 +109,8 @@
/* hidden properties */
register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
PROP_READONLY, ZFS_TYPE_POOL, "NAME");
+ register_hidden(ZPOOL_PROP_DEDUPDITTO, "dedupditto", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_POOL, "DEDUPDITTO");
}
/*
@@ -166,6 +170,19 @@
return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL));
}
+/*
+ * Thin wrapper around zprop_random_value() for pool properties: the
+ * property and seed are passed through unchanged with ZFS_TYPE_POOL.
+ */
+uint64_t
+zpool_prop_random_value(zpool_prop_t prop, uint64_t seed)
+{
+	uint64_t value;
+
+	value = zprop_random_value(prop, seed, ZFS_TYPE_POOL);
+	return (value);
+}
+
#ifndef _KERNEL
const char *
--- a/usr/src/common/zfs/zprop_common.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/common/zfs/zprop_common.c Sun Nov 01 14:14:46 2009 -0800
@@ -76,6 +76,8 @@
pd = &prop_tbl[prop];
ASSERT(pd->pd_name == NULL || pd->pd_name == name);
+ ASSERT(name != NULL);
+ ASSERT(colname != NULL);
pd->pd_name = name;
pd->pd_propnum = prop;
@@ -89,6 +91,9 @@
pd->pd_rightalign = rightalign;
pd->pd_visible = visible;
pd->pd_table = idx_tbl;
+ pd->pd_table_size = 0;
+ while (idx_tbl && (idx_tbl++)->pi_name != NULL)
+ pd->pd_table_size++;
}
void
@@ -307,6 +312,34 @@
return (-1);
}
+/*
+ * Return a random valid property value.  Used by ztest.
+ *
+ * For an index property the seed selects one entry of the property's
+ * index table (seed modulo the number of entries) and that entry's value
+ * is returned.  A property with no index table, or with an empty one,
+ * has nothing to select from, so the seed itself is returned.
+ */
+uint64_t
+zprop_random_value(int prop, uint64_t seed, zfs_type_t type)
+{
+	zprop_desc_t *prop_tbl;
+	const zprop_index_t *idx_tbl;
+
+	ASSERT((uint_t)prop < zprop_get_numprops(type));
+	prop_tbl = zprop_get_proptable(type);
+	idx_tbl = prop_tbl[prop].pd_table;
+
+	/*
+	 * pd_table_size is zero for a table holding only its NULL
+	 * terminator; guard the modulo below against dividing by zero.
+	 */
+	if (idx_tbl == NULL || prop_tbl[prop].pd_table_size == 0)
+		return (seed);
+
+	return (idx_tbl[seed % prop_tbl[prop].pd_table_size].pi_value);
+}
+
const char *
zprop_values(int prop, zfs_type_t type)
{
--- a/usr/src/grub/capability Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/grub/capability Sun Nov 01 14:14:46 2009 -0800
@@ -40,7 +40,7 @@
# This file and the associated version are Solaris specific and are
# not a part of the open source distribution of GRUB.
#
-VERSION=12
+VERSION=13
dboot
xVM
zfs
--- a/usr/src/grub/grub-0.97/stage2/fsys_zfs.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/grub/grub-0.97/stage2/fsys_zfs.c Sun Nov 01 14:14:46 2009 -0800
@@ -513,7 +513,7 @@
* those are the onces that we first pay attention to when
* chosing the bucket.
*/
- crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+ crc &= ~((1ULL << (64 - 28)) - 1);
return (crc);
}
@@ -623,7 +623,8 @@
int blksft = zfs_log2(zap_dnode->dn_datablkszsec << DNODE_SHIFT);
/* Verify if this is a fat zap header block */
- if (zap->zap_magic != (uint64_t)ZAP_MAGIC)
+ if (zap->zap_magic != (uint64_t)ZAP_MAGIC ||
+ zap->zap_flags != 0)
return (ERR_FSYS_CORRUPT);
hash = zap_hash(zap->zap_salt, name);
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/spa.h Sun Nov 01 14:14:46 2009 -0800
@@ -17,15 +17,13 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_SPA_H
#define _SYS_SPA_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* General-purpose 32-bit and 64-bit bitfield encodings.
*/
@@ -65,6 +63,11 @@
#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
/*
+ * Size of block to hold the configuration data (a packed nvlist)
+ */
+#define SPA_CONFIG_BLOCKSIZE (1 << 14)
+
+/*
* The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
* The ASIZE encoding should be at least 64 times larger (6 more bits)
* to support up to 4-way RAID-Z mirror mode with worst-case gang block
@@ -108,15 +111,15 @@
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 5 |G| offset3 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE |
+ * 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 8 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * 9 | padding |
+ * 9 | physical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * a | birth txg |
+ * a | logical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* b | fill count |
* +-------+-------+-------+-------+-------+-------+-------+-------+
@@ -140,25 +143,29 @@
* cksum checksum function
* comp compression function
* G gang block indicator
- * E endianness
+ * B byteorder (endianness)
+ * D dedup
+ * X unused
+ * lvl level of indirection
* type DMU object type
- * lvl level of indirection
- * birth txg transaction group in which the block was born
+ * phys birth txg of block allocation; zero if same as logical birth txg
+ * log. birth transaction group in which the block was logically born
* fill count number of non-zero blocks under this bp
* checksum[4] 256-bit checksum of the data this bp describes
*/
-typedef struct blkptr {
- dva_t blk_dva[3]; /* 128-bit Data Virtual Address */
- uint64_t blk_prop; /* size, compression, type, etc */
- uint64_t blk_pad[3]; /* Extra space for the future */
- uint64_t blk_birth; /* transaction group at birth */
- uint64_t blk_fill; /* fill count */
- zio_cksum_t blk_cksum; /* 256-bit checksum */
-} blkptr_t;
-
#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
+typedef struct blkptr {
+ dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
+ uint64_t blk_prop; /* size, compression, type, etc */
+ uint64_t blk_pad[2]; /* Extra space for the future */
+ uint64_t blk_phys_birth; /* allocation txg; 0 if == blk_birth */
+ uint64_t blk_birth; /* txg in which block was logically born */
+ uint64_t blk_fill; /* fill count */
+ zio_cksum_t blk_cksum; /* 256-bit checksum */
+} blkptr_t;
+
/*
* Macros to get and set fields in a bp or DVA.
*/
@@ -182,8 +189,7 @@
#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
#define BP_GET_LSIZE(bp) \
- (BP_IS_HOLE(bp) ? 0 : \
- BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1))
+ BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
#define BP_SET_LSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
@@ -192,20 +198,35 @@
#define BP_SET_PSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
-#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
-#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
+#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
+#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
+
+#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
+#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
-#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
-#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
+#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
+#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
+
+#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
+#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
+
+#define BP_GET_PROP_BIT_61(bp) BF64_GET((bp)->blk_prop, 61, 1)
+#define BP_SET_PROP_BIT_61(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x)
-#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
-#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
+#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
+#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
+
+#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
-#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
-#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
+#define BP_PHYSICAL_BIRTH(bp) \
+ ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
-#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
-#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
+#define BP_SET_BIRTH(bp, logical, physical) \
+{ \
+ (bp)->blk_birth = (logical); \
+ (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
+}
#define BP_GET_ASIZE(bp) \
(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
@@ -229,13 +250,18 @@
((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
(dva1)->dva_word[0] == (dva2)->dva_word[0])
+#define BP_EQUAL(bp1, bp2) \
+ (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \
+ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \
+ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \
+ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
+
#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \
(0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
((zc1).zc_word[1] - (zc2).zc_word[1]) | \
((zc1).zc_word[2] - (zc2).zc_word[2]) | \
((zc1).zc_word[3] - (zc2).zc_word[3])))
-
#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0)
#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \
@@ -249,7 +275,6 @@
#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
-#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
#define BP_ZERO(bp) \
{ \
@@ -262,7 +287,7 @@
(bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \
- (bp)->blk_pad[2] = 0; \
+ (bp)->blk_phys_birth = 0; \
(bp)->blk_birth = 0; \
(bp)->blk_fill = 0; \
ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zap_impl.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zap_impl.h Sun Nov 01 14:14:46 2009 -0800
@@ -17,18 +17,15 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_ZAP_IMPL_H
#define _SYS_ZAP_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#define ZAP_MAGIC 0x2F52AB2ABULL
-#define ZAP_HASHBITS 28
#define MZAP_ENT_LEN 64
#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT
@@ -101,6 +98,8 @@
uint64_t zap_num_leafs; /* number of leafs */
uint64_t zap_num_entries; /* number of entries */
uint64_t zap_salt; /* salt to stir into hash function */
+ uint64_t zap_normflags; /* flags for u8_textprep_str() */
+ uint64_t zap_flags; /* zap_flag_t */
/*
* This structure is followed by padding, and then the embedded
* pointer table. The embedded pointer table takes up second
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h Sun Nov 01 14:14:46 2009 -0800
@@ -27,7 +27,7 @@
/*
* On-disk version number.
*/
-#define SPA_VERSION 19ULL
+#define SPA_VERSION 21ULL
/*
* The following are configuration names used in the nvlist describing a pool's
--- a/usr/src/lib/libzfs/common/libzfs_dataset.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/lib/libzfs/common/libzfs_dataset.c Sun Nov 01 14:14:46 2009 -0800
@@ -1814,8 +1814,9 @@
case ZFS_PROP_COMPRESSRATIO:
if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
return (-1);
- (void) snprintf(propbuf, proplen, "%lld.%02lldx", (longlong_t)
- val / 100, (longlong_t)val % 100);
+ (void) snprintf(propbuf, proplen, "%llu.%02llux",
+ (u_longlong_t)(val / 100),
+ (u_longlong_t)(val % 100));
break;
case ZFS_PROP_TYPE:
--- a/usr/src/lib/libzfs/common/libzfs_pool.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c Sun Nov 01 14:14:46 2009 -0800
@@ -321,6 +321,12 @@
(u_longlong_t)intval);
break;
+ case ZPOOL_PROP_DEDUPRATIO:
+ (void) snprintf(buf, len, "%llu.%02llux",
+ (u_longlong_t)(intval / 100),
+ (u_longlong_t)(intval % 100));
+ break;
+
case ZPOOL_PROP_HEALTH:
verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
--- a/usr/src/lib/libzpool/common/llib-lzpool Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/lib/libzpool/common/llib-lzpool Sun Nov 01 14:14:46 2009 -0800
@@ -47,6 +47,9 @@
#include <sys/bplist.h>
#include <sys/zfs_znode.h>
#include <sys/arc.h>
+#include <sys/dbuf.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
--- a/usr/src/uts/common/Makefile.files Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/Makefile.files Sun Nov 01 14:14:46 2009 -0800
@@ -1296,6 +1296,8 @@
arc.o \
bplist.o \
dbuf.o \
+ ddt.o \
+ ddt_zap.o \
dmu.o \
dmu_send.o \
dmu_object.o \
@@ -1346,7 +1348,8 @@
zio.o \
zio_checksum.o \
zio_compress.o \
- zio_inject.o
+ zio_inject.o \
+ zle.o
ZFS_SHARED_OBJS += \
zfs_namecheck.o \
--- a/usr/src/uts/common/fs/vfs.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/vfs.c Sun Nov 01 14:14:46 2009 -0800
@@ -81,6 +81,7 @@
#include <sys/console.h>
#include <sys/reboot.h>
#include <sys/attr.h>
+#include <sys/zio.h>
#include <sys/spa.h>
#include <sys/lofi.h>
#include <sys/bootprops.h>
--- a/usr/src/uts/common/fs/zfs/arc.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/arc.c Sun Nov 01 14:14:46 2009 -0800
@@ -119,7 +119,6 @@
#include <sys/spa.h>
#include <sys/zio.h>
-#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
@@ -133,6 +132,7 @@
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
+#include <zfs_fletcher.h>
static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
@@ -178,7 +178,6 @@
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
-int zfs_mdcomp_disable = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
@@ -347,7 +346,7 @@
#define ARCSTAT_INCR(stat, val) \
atomic_add_64(&arc_stats.stat.value.ui64, (val));
-#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
+#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
#define ARCSTAT_MAX(stat, val) { \
@@ -381,7 +380,7 @@
}
kstat_t *arc_ksp;
-static arc_state_t *arc_anon;
+static arc_state_t *arc_anon;
static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
@@ -498,7 +497,6 @@
#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
-#define ARC_STORED (1 << 19) /* has been store()d to */
#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
@@ -815,6 +813,7 @@
{
arc_buf_hdr_t *buf = vbuf;
+ ASSERT(BUF_EMPTY(buf));
refcount_destroy(&buf->b_refcnt);
cv_destroy(&buf->b_cv);
mutex_destroy(&buf->b_freeze_lock);
@@ -1038,6 +1037,8 @@
ASSERT(new_state != old_state);
ASSERT(refcnt == 0 || ab->b_datacnt > 0);
ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
+ ASSERT(ab->b_datacnt <= 1 || new_state != arc_anon);
+ ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
from_delta = to_delta = ab->b_datacnt * ab->b_size;
@@ -1255,6 +1256,8 @@
arc_buf_hdr_t *hdr = from->b_hdr;
uint64_t size = hdr->b_size;
+ ASSERT(hdr->b_state != arc_anon);
+
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
@@ -1336,6 +1339,7 @@
arc_buf_contents_t type = buf->b_hdr->b_type;
arc_cksum_verify(buf);
+
if (!recycle) {
if (type == ARC_BUFC_METADATA) {
arc_buf_data_free(buf->b_hdr, zio_buf_free,
@@ -1387,34 +1391,36 @@
ASSERT(refcount_is_zero(&hdr->b_refcnt));
ASSERT3P(hdr->b_state, ==, arc_anon);
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT(!(hdr->b_flags & ARC_STORED));
-
- if (hdr->b_l2hdr != NULL) {
- if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
- /*
- * To prevent arc_free() and l2arc_evict() from
- * attempting to free the same buffer at the same time,
- * a FREE_IN_PROGRESS flag is given to arc_free() to
- * give it priority. l2arc_evict() can't destroy this
- * header while we are waiting on l2arc_buflist_mtx.
- *
- * The hdr may be removed from l2ad_buflist before we
- * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
- */
+ l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
+
+ if (l2hdr != NULL) {
+ boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
+ /*
+ * To prevent arc_free() and l2arc_evict() from
+ * attempting to free the same buffer at the same time,
+ * a FREE_IN_PROGRESS flag is given to arc_free() to
+ * give it priority. l2arc_evict() can't destroy this
+ * header while we are waiting on l2arc_buflist_mtx.
+ *
+ * The hdr may be removed from l2ad_buflist before we
+ * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
+ */
+ if (!buflist_held) {
mutex_enter(&l2arc_buflist_mtx);
- if (hdr->b_l2hdr != NULL) {
- list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
- hdr);
- }
+ l2hdr = hdr->b_l2hdr;
+ }
+
+ if (l2hdr != NULL) {
+ list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+ ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+ kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ if (hdr->b_state == arc_l2c_only)
+ l2arc_hdr_stat_remove();
+ hdr->b_l2hdr = NULL;
+ }
+
+ if (!buflist_held)
mutex_exit(&l2arc_buflist_mtx);
- } else {
- list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
- }
- ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
- kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
- if (hdr->b_state == arc_l2c_only)
- l2arc_hdr_stat_remove();
- hdr->b_l2hdr = NULL;
}
if (!BUF_EMPTY(hdr)) {
@@ -1466,10 +1472,13 @@
mutex_enter(hash_lock);
(void) remove_reference(hdr, hash_lock, tag);
- if (hdr->b_datacnt > 1)
+ if (hdr->b_datacnt > 1) {
arc_buf_destroy(buf, FALSE, TRUE);
- else
+ } else {
+ ASSERT(buf == hdr->b_buf);
+ ASSERT(buf->b_efunc == NULL);
hdr->b_flags |= ARC_BUF_AVAILABLE;
+ }
mutex_exit(hash_lock);
} else if (HDR_IO_IN_PROGRESS(hdr)) {
int destroy_hdr;
@@ -1503,6 +1512,7 @@
int no_callback = (buf->b_efunc == NULL);
if (hdr->b_state == arc_anon) {
+ ASSERT(hdr->b_datacnt == 1);
arc_buf_free(buf, tag);
return (no_callback);
}
@@ -1517,6 +1527,7 @@
arc_buf_destroy(buf, FALSE, TRUE);
} else if (no_callback) {
ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+ ASSERT(buf->b_efunc == NULL);
hdr->b_flags |= ARC_BUF_AVAILABLE;
}
ASSERT(no_callback || hdr->b_datacnt > 1 ||
@@ -2463,6 +2474,16 @@
arc_cksum_compute(buf, B_FALSE);
+ if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
+ /*
+ * Only call arc_access on anonymous buffers. This is because
+ * if we've issued an I/O for an evicted buffer, we've already
+ * called arc_access (to prevent any simultaneous readers from
+ * getting confused).
+ */
+ arc_access(hdr, hash_lock);
+ }
+
/* create copies of the data buffer for the callers */
abuf = buf;
for (acb = callback_list; acb; acb = acb->acb_next) {
@@ -2476,8 +2497,11 @@
hdr->b_acb = NULL;
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
ASSERT(!HDR_BUF_AVAILABLE(hdr));
- if (abuf == buf)
+ if (abuf == buf) {
+ ASSERT(buf->b_efunc == NULL);
+ ASSERT(hdr->b_datacnt == 1);
hdr->b_flags |= ARC_BUF_AVAILABLE;
+ }
ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
@@ -2498,14 +2522,6 @@
cv_broadcast(&hdr->b_cv);
if (hash_lock) {
- /*
- * Only call arc_access on anonymous buffers. This is because
- * if we've issued an I/O for an evicted buffer, we've already
- * called arc_access (to prevent any simultaneous readers from
- * getting confused).
- */
- if (zio->io_error == 0 && hdr->b_state == arc_anon)
- arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
} else {
/*
@@ -2559,7 +2575,7 @@
* arc_read_bp.
*/
int
-arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
arc_done_func_t *done, void *private, int priority, int zio_flags,
uint32_t *arc_flags, const zbookmark_t *zb)
{
@@ -2577,7 +2593,7 @@
}
int
-arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_done_func_t *done, void *private, int priority, int zio_flags,
uint32_t *arc_flags, const zbookmark_t *zb)
{
@@ -2588,7 +2604,8 @@
uint64_t guid = spa_guid(spa);
top:
- hdr = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+ hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
+ &hash_lock);
if (hdr && hdr->b_datacnt > 0) {
*arc_flags |= ARC_CACHED;
@@ -2642,6 +2659,7 @@
} else {
buf = arc_buf_clone(buf);
}
+
} else if (*arc_flags & ARC_PREFETCH &&
refcount_count(&hdr->b_refcnt) == 0) {
hdr->b_flags |= ARC_PREFETCH;
@@ -2672,7 +2690,7 @@
buf = arc_buf_alloc(spa, size, private, type);
hdr = buf->b_hdr;
hdr->b_dva = *BP_IDENTITY(bp);
- hdr->b_birth = bp->blk_birth;
+ hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
exists = buf_hash_insert(hdr, &hash_lock);
if (exists) {
@@ -2718,7 +2736,6 @@
arc_get_data_buf(buf);
ASSERT(hdr->b_datacnt == 0);
hdr->b_datacnt = 1;
-
}
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
@@ -2848,6 +2865,9 @@
ASSERT(buf->b_hdr != NULL);
ASSERT(buf->b_hdr->b_state != arc_anon);
ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+ ASSERT(buf->b_efunc == NULL);
+ ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
+
buf->b_efunc = func;
buf->b_private = private;
}
@@ -2953,7 +2973,6 @@
/* this buffer is not on any list */
ASSERT(refcount_count(&hdr->b_refcnt) > 0);
- ASSERT(!(hdr->b_flags & ARC_STORED));
if (hdr->b_state == arc_anon) {
/* this buffer is already released */
@@ -3124,11 +3143,16 @@
arc_buf_t *buf = callback->awcb_buf;
arc_buf_hdr_t *hdr = buf->b_hdr;
- hdr->b_acb = NULL;
-
- hdr->b_dva = *BP_IDENTITY(zio->io_bp);
- hdr->b_birth = zio->io_bp->blk_birth;
- hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+ ASSERT(hdr->b_acb == NULL);
+
+ if (zio->io_error == 0) {
+ hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+ hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+ hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+ } else {
+ ASSERT(BUF_EMPTY(hdr));
+ }
+
/*
* If the block to be written was all-zero, we may have
* compressed it away. In this case no write was performed
@@ -3139,6 +3163,8 @@
arc_buf_hdr_t *exists;
kmutex_t *hash_lock;
+ ASSERT(zio->io_error == 0);
+
arc_cksum_verify(buf);
exists = buf_hash_insert(hdr, &hash_lock);
@@ -3148,109 +3174,54 @@
* sync-to-convergence, because we remove
* buffers from the hash table when we arc_free().
*/
- if (!(zio->io_flags & ZIO_FLAG_IO_REWRITE) ||
- !DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
- BP_IDENTITY(zio->io_bp)) ||
- zio->io_bp_orig.blk_birth !=
- zio->io_bp->blk_birth) {
- panic("bad overwrite, hdr=%p exists=%p",
- (void *)hdr, (void *)exists);
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+ panic("bad overwrite, hdr=%p exists=%p",
+ (void *)hdr, (void *)exists);
+ ASSERT(refcount_is_zero(&exists->b_refcnt));
+ arc_change_state(arc_anon, exists, hash_lock);
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(exists);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ ASSERT3P(exists, ==, NULL);
+ } else {
+ /* Dedup */
+ ASSERT(hdr->b_datacnt == 1);
+ ASSERT(hdr->b_state == arc_anon);
+ ASSERT(BP_GET_DEDUP(zio->io_bp));
+ ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
}
-
- ASSERT(refcount_is_zero(&exists->b_refcnt));
- arc_change_state(arc_anon, exists, hash_lock);
- mutex_exit(hash_lock);
- arc_hdr_destroy(exists);
- exists = buf_hash_insert(hdr, &hash_lock);
- ASSERT3P(exists, ==, NULL);
}
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
/* if it's not anon, we are doing a scrub */
- if (hdr->b_state == arc_anon)
+ if (!exists && hdr->b_state == arc_anon)
arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
- } else if (callback->awcb_done == NULL) {
- int destroy_hdr;
- /*
- * This is an anonymous buffer with no user callback,
- * destroy it if there are no active references.
- */
- mutex_enter(&arc_eviction_mtx);
- destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
- hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
- mutex_exit(&arc_eviction_mtx);
- if (destroy_hdr)
- arc_hdr_destroy(hdr);
} else {
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
}
- hdr->b_flags &= ~ARC_STORED;
-
- if (callback->awcb_done) {
- ASSERT(!refcount_is_zero(&hdr->b_refcnt));
- callback->awcb_done(zio, buf, callback->awcb_private);
- }
+
+ ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+ callback->awcb_done(zio, buf, callback->awcb_private);
kmem_free(callback, sizeof (arc_write_callback_t));
}
-void
-write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
-{
- boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
-
- /* Determine checksum setting */
- if (ismd) {
- /*
- * Metadata always gets checksummed. If the data
- * checksum is multi-bit correctable, and it's not a
- * ZBT-style checksum, then it's suitable for metadata
- * as well. Otherwise, the metadata checksum defaults
- * to fletcher4.
- */
- if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
- !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
- zp->zp_checksum = wp->wp_oschecksum;
- else
- zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
- } else {
- zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
- wp->wp_oschecksum);
- }
-
- /* Determine compression setting */
- if (ismd) {
- /*
- * XXX -- we should design a compression algorithm
- * that specializes in arrays of bps.
- */
- zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
- ZIO_COMPRESS_LZJB;
- } else {
- zp->zp_compress = zio_compress_select(wp->wp_dncompress,
- wp->wp_oscompress);
- }
-
- zp->zp_type = wp->wp_type;
- zp->zp_level = wp->wp_level;
- zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa));
-}
-
zio_t *
-arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
- boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
- arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
- int zio_flags, const zbookmark_t *zb)
+arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+ blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+ arc_done_func_t *ready, arc_done_func_t *done, void *private,
+ int priority, int zio_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
zio_t *zio;
- zio_prop_t zp;
ASSERT(ready != NULL);
+ ASSERT(done != NULL);
ASSERT(!HDR_IO_ERROR(hdr));
ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
- ASSERT(hdr->b_acb == 0);
+ ASSERT(hdr->b_acb == NULL);
if (l2arc)
hdr->b_flags |= ARC_L2CACHE;
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
@@ -3259,37 +3230,25 @@
callback->awcb_private = private;
callback->awcb_buf = buf;
- write_policy(spa, wp, &zp);
- zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
+ zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
return (zio);
}
-int
-arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, uint32_t arc_flags)
+void
+arc_free(spa_t *spa, const blkptr_t *bp)
{
arc_buf_hdr_t *ab;
kmutex_t *hash_lock;
- zio_t *zio;
uint64_t guid = spa_guid(spa);
/*
- * If this buffer is in the cache, release it, so it
- * can be re-used.
+ * If this buffer is in the cache, release it, so it can be re-used.
*/
- ab = buf_hash_find(guid, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
+ ab = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
+ &hash_lock);
if (ab != NULL) {
- /*
- * The checksum of blocks to free is not always
- * preserved (eg. on the deadlist). However, if it is
- * nonzero, it should match what we have in the cache.
- */
- ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
- bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
- bp->blk_fill == BLK_FILL_ALREADY_FREED);
-
if (ab->b_state != arc_anon)
arc_change_state(arc_anon, ab, hash_lock);
if (HDR_IO_IN_PROGRESS(ab)) {
@@ -3308,37 +3267,14 @@
ab->b_buf->b_efunc = NULL;
ab->b_buf->b_private = NULL;
mutex_exit(hash_lock);
- } else if (refcount_is_zero(&ab->b_refcnt)) {
+ } else {
+ ASSERT(refcount_is_zero(&ab->b_refcnt));
ab->b_flags |= ARC_FREE_IN_PROGRESS;
mutex_exit(hash_lock);
arc_hdr_destroy(ab);
ARCSTAT_BUMP(arcstat_deleted);
- } else {
- /*
- * We still have an active reference on this
- * buffer. This can happen, e.g., from
- * dbuf_unoverride().
- */
- ASSERT(!HDR_IN_HASH_TABLE(ab));
- ab->b_arc_access = 0;
- bzero(&ab->b_dva, sizeof (dva_t));
- ab->b_birth = 0;
- ab->b_cksum0 = 0;
- ab->b_buf->b_efunc = NULL;
- ab->b_buf->b_private = NULL;
- mutex_exit(hash_lock);
}
}
-
- zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
-
- if (arc_flags & ARC_WAIT)
- return (zio_wait(zio));
-
- ASSERT(arc_flags & ARC_NOWAIT);
- zio_nowait(zio);
-
- return (0);
}
static int
@@ -4237,7 +4173,7 @@
}
mutex_exit(&l2arc_buflist_mtx);
- spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
+ vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
dev->l2ad_evict = taddr;
}
@@ -4397,15 +4333,15 @@
ARCSTAT_BUMP(arcstat_l2_writes_sent);
ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
ARCSTAT_INCR(arcstat_l2_size, write_sz);
- spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
+ vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
/*
* Bump device hand to the device start if it is approaching the end.
* l2arc_evict() will already have evicted ahead for this case.
*/
if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
- spa_l2cache_space_update(dev->l2ad_vdev, 0,
- dev->l2ad_end - dev->l2ad_hand);
+ vdev_space_update(dev->l2ad_vdev,
+ dev->l2ad_end - dev->l2ad_hand, 0, 0);
dev->l2ad_hand = dev->l2ad_start;
dev->l2ad_evict = dev->l2ad_start;
dev->l2ad_first = B_FALSE;
@@ -4556,7 +4492,7 @@
list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l2node));
- spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);
+ vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
/*
* Add device to global list
--- a/usr/src/uts/common/fs/zfs/bplist.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/bplist.c Sun Nov 01 14:14:46 2009 -0800
@@ -19,13 +19,27 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/bplist.h>
#include <sys/zfs_context.h>
+void
+bplist_init(bplist_t *bpl)
+{
+ bzero(bpl, sizeof (*bpl));
+ mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+bplist_fini(bplist_t *bpl)
+{
+ ASSERT(bpl->bpl_queue == NULL);
+ mutex_destroy(&bpl->bpl_lock);
+}
+
static int
bplist_hold(bplist_t *bpl)
{
@@ -208,12 +222,13 @@
bparray[off].blk_fill = 0;
/* The bplist will compress better if we can leave off the checksum */
- bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
+ if (!BP_GET_DEDUP(&bparray[off]))
+ bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
bpl->bpl_phys->bpl_entries++;
bpl->bpl_phys->bpl_bytes +=
- bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp);
+ bp_get_dsize_sync(dmu_objset_spa(bpl->bpl_mos), bp);
if (bpl->bpl_havecomp) {
bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
@@ -223,8 +238,14 @@
return (0);
}
+void
+bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ VERIFY(bplist_enqueue(bpl, bp, tx) == 0);
+}
+
/*
- * Deferred entry; will be written later by bplist_sync().
+ * Deferred entry; will be processed later by bplist_sync().
*/
void
bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp)
@@ -240,7 +261,7 @@
}
void
-bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
+bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func, void *arg, dmu_tx_t *tx)
{
bplist_q_t *bpq;
@@ -248,7 +269,7 @@
while ((bpq = bpl->bpl_queue) != NULL) {
bpl->bpl_queue = bpq->bpq_next;
mutex_exit(&bpl->bpl_lock);
- VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx));
+ func(arg, &bpq->bpq_blk, tx);
kmem_free(bpq, sizeof (*bpq));
mutex_enter(&bpl->bpl_lock);
}
@@ -311,12 +332,12 @@
}
/*
- * Return (in *dasizep) the amount of space on the deadlist which is:
+ * Return (in *dsizep) the amount of space on the deadlist which is:
* mintxg < blk_birth <= maxtxg
*/
int
bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg,
- uint64_t *dasizep)
+ uint64_t *dsizep)
{
uint64_t size = 0;
uint64_t itor = 0;
@@ -331,19 +352,18 @@
mutex_enter(&bpl->bpl_lock);
err = bplist_hold(bpl);
if (err == 0)
- *dasizep = bpl->bpl_phys->bpl_bytes;
+ *dsizep = bpl->bpl_phys->bpl_bytes;
mutex_exit(&bpl->bpl_lock);
return (err);
}
while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) {
- size +=
- bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp);
+ size += bp_get_dsize(dmu_objset_spa(bpl->bpl_mos), &bp);
}
}
if (err == ENOENT)
err = 0;
- *dasizep = size;
+ *dsizep = size;
return (err);
}
--- a/usr/src/uts/common/fs/zfs/dbuf.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dbuf.c Sun Nov 01 14:14:46 2009 -0800
@@ -38,10 +38,6 @@
static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
-static arc_done_func_t dbuf_write_ready;
-static arc_done_func_t dbuf_write_done;
-static zio_done_func_t dbuf_skip_write_ready;
-static zio_done_func_t dbuf_skip_write_done;
/*
* Global data structures and functions for the dbuf cache.
@@ -285,6 +281,7 @@
dbuf_verify(dmu_buf_impl_t *db)
{
dnode_t *dn = db->db_dnode;
+ dbuf_dirty_record_t *dr;
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -310,13 +307,19 @@
ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
}
+ for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
+ ASSERT(dr->dr_dbuf == db);
+
+ for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
+ ASSERT(dr->dr_dbuf == db);
+
/*
* We can't assert that db_size matches dn_datablksz because it
* can be momentarily different when another thread is doing
* dnode_set_blksz().
*/
if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
- dbuf_dirty_record_t *dr = db->db_data_pending;
+ dr = db->db_data_pending;
/*
* It should only be modified in syncing context, so
* make sure we only have one copy of the data.
@@ -505,11 +508,9 @@
if (DBUF_IS_L2CACHEABLE(db))
aflags |= ARC_L2CACHE;
- zb.zb_objset = db->db_objset->os_dsl_dataset ?
- db->db_objset->os_dsl_dataset->ds_object : 0;
- zb.zb_object = db->db.db_object;
- zb.zb_level = db->db_level;
- zb.zb_blkid = db->db_blkid;
+ SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
+ db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ db->db.db_object, db->db_level, db->db_blkid);
dbuf_add_ref(db, NULL);
/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
@@ -682,6 +683,7 @@
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
+ blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
uint64_t txg = dr->dr_txg;
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -692,13 +694,12 @@
dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
return;
+ ASSERT(db->db_data_pending != dr);
+
/* free this block */
- if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
- /* XXX can get silent EIO here */
- (void) dsl_free(NULL,
- spa_get_dsl(db->db_dnode->dn_objset->os_spa),
- txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
- }
+ if (!BP_IS_HOLE(bp))
+ dsl_free(spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, bp);
+
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
/*
* Release the already-written buffer, so we leave it in
@@ -961,7 +962,8 @@
* we now need to reset its state.
*/
dbuf_unoverride(dr);
- if (db->db.db_object != DMU_META_DNODE_OBJECT)
+ if (db->db.db_object != DMU_META_DNODE_OBJECT &&
+ db->db_state != DB_NOFILL)
arc_buf_thaw(db->db_buf);
}
mutex_exit(&db->db_mtx);
@@ -1000,7 +1002,7 @@
* Update the accounting.
* Note: we delay "free accounting" until after we drop
* the db_mtx. This keeps us from grabbing other locks
- * (and possibly deadlocking) in bp_get_dasize() while
+ * (and possibly deadlocking) in bp_get_dsize() while
* also holding the db_mtx.
*/
dnode_willuse_space(dn, db->db.db_size, tx);
@@ -1079,7 +1081,7 @@
} else if (do_free_accounting) {
blkptr_t *bp = db->db_blkptr;
int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
- bp_get_dasize(os->os_spa, bp) : db->db.db_size;
+ bp_get_dsize(os->os_spa, bp) : db->db.db_size;
/*
* This is only a guess -- if the dbuf is dirty
* in a previous txg, we don't know how much
@@ -1172,6 +1174,7 @@
return (0);
}
ASSERT(dr->dr_txg == txg);
+ ASSERT(dr->dr_dbuf == db);
/*
* If this buffer is currently held, we cannot undirty
@@ -1231,7 +1234,7 @@
if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
arc_buf_t *buf = db->db_buf;
- ASSERT(arc_released(buf));
+ ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
dbuf_set_data(db, NULL);
VERIFY(arc_buf_remove_ref(buf, db) == 1);
dbuf_evict(db);
@@ -1649,13 +1652,12 @@
if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
if (bp && !BP_IS_HOLE(bp)) {
arc_buf_t *pbuf;
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
zbookmark_t zb;
- zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
- dn->dn_objset->os_dsl_dataset->ds_object : 0;
- zb.zb_object = dn->dn_object;
- zb.zb_level = 0;
- zb.zb_blkid = blkid;
+
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ dn->dn_object, 0, blkid);
if (db)
pbuf = db->db_buf;
@@ -1801,9 +1803,20 @@
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
+ mutex_enter(&db->db_mtx);
+ dbuf_rele_and_unlock(db, tag);
+}
+
+/*
+ * dbuf_rele() for an already-locked dbuf. This is necessary to allow
+ * db_dirtycnt and db_holds to be updated atomically.
+ */
+void
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
+{
int64_t holds;
- mutex_enter(&db->db_mtx);
+ ASSERT(MUTEX_HELD(&db->db_mtx));
DBUF_VERIFY(db);
holds = refcount_remove(&db->db_holds, tag);
@@ -2056,12 +2069,12 @@
while (*drp != dr)
drp = &(*drp)->dr_next;
ASSERT(dr->dr_next == NULL);
+ ASSERT(dr->dr_dbuf == db);
*drp = dr->dr_next;
kmem_free(dr, sizeof (dbuf_dirty_record_t));
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
- mutex_exit(&db->db_mtx);
- dbuf_rele(db, (void *)(uintptr_t)txg);
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
return;
}
@@ -2083,44 +2096,10 @@
ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
}
- /*
- * If this dbuf has already been written out via an immediate write,
- * just complete the write by copying over the new block pointer and
- * updating the accounting via the write-completion functions.
- */
- if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
- zio_t zio_fake;
-
- zio_fake.io_private = &db;
- zio_fake.io_error = 0;
- zio_fake.io_bp = db->db_blkptr;
- zio_fake.io_bp_orig = *db->db_blkptr;
- zio_fake.io_txg = txg;
- zio_fake.io_flags = 0;
-
- *db->db_blkptr = dr->dt.dl.dr_overridden_by;
- dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
- db->db_data_pending = dr;
- dr->dr_zio = &zio_fake;
- mutex_exit(&db->db_mtx);
-
- ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp),
- BP_IDENTITY(&zio_fake.io_bp_orig)) ||
- BP_IS_HOLE(zio_fake.io_bp));
-
- if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
- (void) dsl_dataset_block_kill(os->os_dsl_dataset,
- &zio_fake.io_bp_orig, dn->dn_zio, tx);
-
- dbuf_write_ready(&zio_fake, db->db_buf, db);
- dbuf_write_done(&zio_fake, db->db_buf, db);
-
- return;
- }
-
if (db->db_state != DB_NOFILL &&
dn->dn_object != DMU_META_DNODE_OBJECT &&
refcount_count(&db->db_holds) > 1 &&
+ dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
*datap == db->db_buf) {
/*
* If this buffer is currently "in use" (i.e., there
@@ -2177,130 +2156,27 @@
}
}
-static void
-dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
-{
- dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn = db->db_dnode;
- objset_t *os = dn->dn_objset;
- dmu_buf_impl_t *parent = db->db_parent;
- uint64_t txg = tx->tx_txg;
- zbookmark_t zb;
- writeprops_t wp = { 0 };
- zio_t *zio;
-
- if (!BP_IS_HOLE(db->db_blkptr) &&
- (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) {
- /*
- * Private object buffers are released here rather
- * than in dbuf_dirty() since they are only modified
- * in the syncing context and we don't want the
- * overhead of making multiple copies of the data.
- */
- arc_release(data, db);
- } else if (db->db_state != DB_NOFILL) {
- ASSERT(arc_released(data));
- /* XXX why do we need to thaw here? */
- arc_buf_thaw(data);
- }
-
- if (parent != dn->dn_dbuf) {
- ASSERT(parent && parent->db_data_pending);
- ASSERT(db->db_level == parent->db_level-1);
- ASSERT(arc_released(parent->db_buf));
- zio = parent->db_data_pending->dr_zio;
- } else {
- ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
- ASSERT3P(db->db_blkptr, ==,
- &dn->dn_phys->dn_blkptr[db->db_blkid]);
- zio = dn->dn_zio;
- }
-
- ASSERT(db->db_level == 0 || data == db->db_buf);
- ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
- ASSERT(zio);
-
- zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
- zb.zb_object = db->db.db_object;
- zb.zb_level = db->db_level;
- zb.zb_blkid = db->db_blkid;
-
- wp.wp_type = dn->dn_type;
- wp.wp_level = db->db_level;
- wp.wp_copies = os->os_copies;
- wp.wp_dncompress = dn->dn_compress;
- wp.wp_oscompress = os->os_compress;
- wp.wp_dnchecksum = dn->dn_checksum;
- wp.wp_oschecksum = os->os_checksum;
-
- if (BP_IS_OLDER(db->db_blkptr, txg))
- (void) dsl_dataset_block_kill(
- os->os_dsl_dataset, db->db_blkptr, zio, tx);
-
- if (db->db_state == DB_NOFILL) {
- zio_prop_t zp = { 0 };
-
- write_policy(os->os_spa, &wp, &zp);
- dr->dr_zio = zio_write(zio, os->os_spa,
- txg, db->db_blkptr, NULL,
- db->db.db_size, &zp, dbuf_skip_write_ready,
- dbuf_skip_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_MUSTSUCCEED, &zb);
- } else {
- dr->dr_zio = arc_write(zio, os->os_spa, &wp,
- DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr,
- data, dbuf_write_ready, dbuf_write_done, db,
- ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
- }
-}
-
-/* wrapper function for dbuf_write_ready bypassing ARC */
-static void
-dbuf_skip_write_ready(zio_t *zio)
-{
- blkptr_t *bp = zio->io_bp;
-
- if (!BP_IS_GANG(bp))
- zio_skip_write(zio);
-
- dbuf_write_ready(zio, NULL, zio->io_private);
-}
-
-/* wrapper function for dbuf_write_done bypassing ARC */
-static void
-dbuf_skip_write_done(zio_t *zio)
-{
- dbuf_write_done(zio, NULL, zio->io_private);
-}
-
/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
- dnode_t *dn = db->db_dnode;
- objset_t *os = dn->dn_objset;
blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
+ dnode_t *dn = db->db_dnode;
+ spa_t *spa = zio->io_spa;
+ int64_t delta;
uint64_t fill = 0;
- int old_size, new_size, i;
+ int i;
ASSERT(db->db_blkptr == bp);
- dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
-
- old_size = bp_get_dasize(os->os_spa, bp_orig);
- new_size = bp_get_dasize(os->os_spa, bp);
-
- dnode_diduse_space(dn, new_size - old_size);
+ delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
+ dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
+ zio->io_prev_space_delta = delta;
if (BP_IS_HOLE(bp)) {
- dsl_dataset_t *ds = os->os_dsl_dataset;
- dmu_tx_t *tx = os->os_synctx;
-
- if (bp_orig->blk_birth == tx->tx_txg)
- (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
- ASSERT3U(bp->blk_fill, ==, 0);
+ ASSERT(bp->blk_fill == 0);
return;
}
@@ -2341,17 +2217,6 @@
bp->blk_fill = fill;
mutex_exit(&db->db_mtx);
-
- if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
- ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
- } else {
- dsl_dataset_t *ds = os->os_dsl_dataset;
- dmu_tx_t *tx = os->os_synctx;
-
- if (bp_orig->blk_birth == tx->tx_txg)
- (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
- dsl_dataset_block_born(ds, bp, tx);
- }
}
/* ARGSUSED */
@@ -2359,37 +2224,50 @@
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ dnode_t *dn = db->db_dnode;
+ objset_t *os = dn->dn_objset;
uint64_t txg = zio->io_txg;
dbuf_dirty_record_t **drp, *dr;
ASSERT3U(zio->io_error, ==, 0);
+ ASSERT(db->db_blkptr == bp);
+
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ ASSERT(BP_EQUAL(bp, bp_orig));
+ } else {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, bp, tx);
+ }
mutex_enter(&db->db_mtx);
+ DBUF_VERIFY(db);
+
drp = &db->db_last_dirty;
while ((dr = *drp) != db->db_data_pending)
drp = &dr->dr_next;
ASSERT(!list_link_active(&dr->dr_dirty_node));
ASSERT(dr->dr_txg == txg);
+ ASSERT(dr->dr_dbuf == db);
ASSERT(dr->dr_next == NULL);
*drp = dr->dr_next;
if (db->db_level == 0) {
ASSERT(db->db_blkid != DB_BONUS_BLKID);
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
-
if (db->db_state != DB_NOFILL) {
if (dr->dt.dl.dr_data != db->db_buf)
VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
db) == 1);
- else if (!BP_IS_HOLE(db->db_blkptr))
+ else if (!arc_released(db->db_buf))
arc_set_callback(db->db_buf, dbuf_do_evict, db);
- else
- ASSERT(arc_released(db->db_buf));
}
} else {
- dnode_t *dn = db->db_dnode;
-
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
if (!BP_IS_HOLE(db->db_blkptr)) {
@@ -2410,9 +2288,122 @@
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
db->db_data_pending = NULL;
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+}
+
+static void
+dbuf_write_nofill_ready(zio_t *zio)
+{
+ dbuf_write_ready(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_nofill_done(zio_t *zio)
+{
+ dbuf_write_done(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_override_ready(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ dbuf_write_ready(zio, NULL, db);
+}
+
+static void
+dbuf_write_override_done(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
+
+ mutex_enter(&db->db_mtx);
+ if (!BP_EQUAL(zio->io_bp, obp)) {
+ if (!BP_IS_HOLE(obp))
+ dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
+ arc_release(dr->dt.dl.dr_data, db);
+ }
mutex_exit(&db->db_mtx);
- dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");
+ dbuf_write_done(zio, NULL, db);
+}
+
+static void
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = db->db_dnode;
+ objset_t *os = dn->dn_objset;
+ dmu_buf_impl_t *parent = db->db_parent;
+ uint64_t txg = tx->tx_txg;
+ zbookmark_t zb;
+ zio_prop_t zp;
+ zio_t *zio;
+
+ if (db->db_state != DB_NOFILL) {
+ if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
+ /*
+ * Private object buffers are released here rather
+ * than in dbuf_dirty() since they are only modified
+ * in the syncing context and we don't want the
+ * overhead of making multiple copies of the data.
+ */
+ if (BP_IS_HOLE(db->db_blkptr)) {
+ arc_buf_thaw(data);
+ } else {
+ arc_release(data, db);
+ }
+ }
+ }
- dbuf_rele(db, (void *)(uintptr_t)txg);
+ if (parent != dn->dn_dbuf) {
+ ASSERT(parent && parent->db_data_pending);
+ ASSERT(db->db_level == parent->db_level-1);
+ ASSERT(arc_released(parent->db_buf));
+ zio = parent->db_data_pending->dr_zio;
+ } else {
+ ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ zio = dn->dn_zio;
+ }
+
+ ASSERT(db->db_level == 0 || data == db->db_buf);
+ ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+ ASSERT(zio);
+
+ SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ dmu_write_policy(os, dn, db->db_level,
+ db->db_state == DB_NOFILL ? WP_NOFILL : 0, &zp);
+
+ if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ ASSERT(db->db_state != DB_NOFILL);
+ dr->dr_zio = zio_write(zio, os->os_spa, txg,
+ db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
+ dbuf_write_override_ready, dbuf_write_override_done, dr,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ mutex_enter(&db->db_mtx);
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
+ dr->dt.dl.dr_copies);
+ mutex_exit(&db->db_mtx);
+ } else if (db->db_state == DB_NOFILL) {
+ ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
+ dr->dr_zio = zio_write(zio, os->os_spa, txg,
+ db->db_blkptr, NULL, db->db.db_size, &zp,
+ dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
+ } else {
+ ASSERT(arc_released(data));
+ dr->dr_zio = arc_write(zio, os->os_spa, txg,
+ db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
+ dbuf_write_ready, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ }
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/ddt.c Sun Nov 01 14:14:46 2009 -0800
@@ -0,0 +1,955 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+
+/*
+ * Per-type operations vectors, indexed by enum ddt_type.
+ * The ZAP-backed implementation is the only entry.
+ */
+static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
+ &ddt_zap_ops,
+};
+
+/*
+ * Class names, indexed by enum ddt_class.  ddt_object_name() uses them
+ * as the last component of a DDT object's name.
+ */
+static const char *ddt_class_name[DDT_CLASSES] = {
+ "ditto",
+ "duplicate",
+ "unique",
+};
+
+/*
+ * Create the on-disk object for the given (type, class) of this table.
+ *
+ * The new object number is stored in ddt->ddt_object[type][class] and
+ * added to DMU_POOL_DIRECTORY_OBJECT under the name produced by
+ * ddt_object_name().  The table's in-core histogram for (type, class)
+ * is added to the DDT stats object under the same name.  The object
+ * must not already exist; any failure trips a VERIFY.
+ */
+static void
+ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = ddt->ddt_spa;
+ objset_t *os = ddt->ddt_os;
+ uint64_t *objectp = &ddt->ddt_object[type][class];
+ /* The checksum's ci_dedup flag is handed to the backend as 'prehash'. */
+ boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ ASSERT(*objectp == 0);
+ VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
+ ASSERT(*objectp != 0);
+
+ VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, objectp, tx) == 0);
+
+ VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class], tx) == 0);
+}
+
+/*
+ * Destroy the on-disk object for (type, class): remove its name from
+ * DMU_POOL_DIRECTORY_OBJECT and from the DDT stats object, destroy the
+ * object through the type's ops vector, and zero the cached object
+ * number.  The object must exist and is expected to be empty (zero
+ * entries and an all-zero histogram).
+ */
+static void
+ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = ddt->ddt_spa;
+ objset_t *os = ddt->ddt_os;
+ uint64_t *objectp = &ddt->ddt_object[type][class];
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ ASSERT(*objectp != 0);
+ ASSERT(ddt_object_count(ddt, type, class) == 0);
+ ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
+ VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
+ VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
+ VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
+
+ *objectp = 0;
+}
+
+/*
+ * Load the object number and histogram for (type, class) from disk.
+ *
+ * The object number is looked up by name in DMU_POOL_DIRECTORY_OBJECT;
+ * if that lookup fails its error is returned and the histogram is left
+ * untouched.  Otherwise the histogram is read from the DDT stats
+ * object; that read is asserted to succeed and its result is returned.
+ */
+static int
+ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ char name[DDT_NAMELEN];
+ int error;
+
+ ddt_object_name(ddt, type, class, name);
+
+ error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
+
+ if (error)
+ return (error);
+
+ error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class]);
+
+ ASSERT(error == 0);
+ return (error);
+}
+
+/*
+ * Write the in-core histogram for (type, class) to the DDT stats
+ * object under the object's name.  Failure trips a VERIFY.
+ */
+static void
+ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class], tx) == 0);
+}
+
+/*
+ * Look up dde's key in the (type, class) object via the type's ops
+ * vector.  Returns ENOENT without touching the backend when no object
+ * exists for (type, class); otherwise returns the backend's result.
+ */
+static int
+ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return (ENOENT);
+
+ return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde));
+}
+
+/*
+ * Store dde in the (type, class) object via the type's ops vector.
+ * The object must already exist.  Returns the backend's result.
+ */
+static int
+ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, tx));
+}
+
+/*
+ * Remove dde's key from the (type, class) object via the type's ops
+ * vector.  The object must already exist.  Returns the backend's result.
+ */
+static int
+ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, tx));
+}
+
+/*
+ * Fetch one entry from the (type, class) object into dde via the
+ * type's ops vector.  *walk is the backend's iteration cursor and is
+ * passed through unchanged.  The object must already exist.
+ */
+int
+ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde, uint64_t *walk)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, walk));
+}
+
+/*
+ * Return the entry count the backend reports for the (type, class)
+ * object.  The object must already exist.
+ */
+uint64_t
+ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
+ ddt->ddt_object[type][class]));
+}
+
+/*
+ * Fill *doi with dmu_object_info() for the (type, class) object.
+ * Returns ENOENT if no such object exists.
+ */
+int
+ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_object_info_t *doi)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return (ENOENT);
+
+ return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
+ doi));
+}
+
+/*
+ * An object exists for (type, class) iff its cached object number in
+ * ddt->ddt_object[][] is nonzero.
+ */
+boolean_t
+ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ return (!!ddt->ddt_object[type][class]);
+}
+
+/*
+ * Format the name of the (type, class) object into 'name' using the
+ * DMU_POOL_DDT format with the table's checksum name, the type's ops
+ * name and the class name.  'name' must be at least DDT_NAMELEN bytes;
+ * output is bounded to that size so it cannot overrun the buffer.
+ */
+void
+ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ char *name)
+{
+ (void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT,
+ zio_checksum_table[ddt->ddt_checksum].ci_name,
+ ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
+}
+
+/*
+ * Copy every DVA from ddp into bp and set bp's birth from txg and
+ * ddp's physical birth.  txg must be nonzero.
+ */
+void
+ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
+{
+ ASSERT(txg != 0);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ bp->blk_dva[d] = ddp->ddp_dva[d];
+ BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
+}
+
+/*
+ * Construct a block pointer from a DDT key and, optionally, one
+ * physical variant.
+ *
+ * bp is zeroed first.  If ddp is non-NULL its DVAs and birth are copied
+ * in (the physical birth doubles as the txg argument).  The checksum,
+ * logical/physical size and compression come from the key; the checksum
+ * function comes from the table.  Type is DMU_OT_NONE, level 0, the
+ * dedup bit is clear and the byte order is the host's.
+ */
+void
+ddt_bp_create(const ddt_t *ddt, const ddt_key_t *ddk, const ddt_phys_t *ddp,
+ blkptr_t *bp)
+{
+ BP_ZERO(bp);
+
+ if (ddp != NULL)
+ ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
+
+ bp->blk_cksum = ddk->ddk_cksum;
+
+ BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
+ BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
+ BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
+ BP_SET_CHECKSUM(bp, ddt->ddt_checksum);
+ BP_SET_TYPE(bp, DMU_OT_NONE);
+ BP_SET_LEVEL(bp, 0);
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+}
+
+/*
+ * Derive a DDT key from a block pointer: the block's checksum plus its
+ * logical size, physical size and compression function.  ddk_prop is
+ * zeroed before those three fields are set.
+ */
+void
+ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
+{
+ ddk->ddk_cksum = bp->blk_cksum;
+ ddk->ddk_prop = 0;
+
+ DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
+ DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
+ DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
+}
+
+/*
+ * Record bp's DVAs and physical birth in ddp.  ddp must be unused,
+ * i.e. its physical birth must be zero.
+ */
+void
+ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
+{
+ ASSERT(ddp->ddp_phys_birth == 0);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ ddp->ddp_dva[d] = bp->blk_dva[d];
+ ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
+}
+
+/*
+ * Zero the whole ddt_phys_t (DVAs, refcount and birth).
+ */
+void
+ddt_phys_clear(ddt_phys_t *ddp)
+{
+ bzero(ddp, sizeof (*ddp));
+}
+
+/*
+ * Add one reference to ddp.
+ */
+void
+ddt_phys_addref(ddt_phys_t *ddp)
+{
+ ddp->ddp_refcnt++;
+}
+
+/*
+ * Drop one reference from ddp.  The count must be positive on entry.
+ */
+void
+ddt_phys_decref(ddt_phys_t *ddp)
+{
+ ASSERT((int64_t)ddp->ddp_refcnt > 0);
+ ddp->ddp_refcnt--;
+}
+
+/*
+ * Free the block described by (ddk, ddp) in the given txg and clear ddp.
+ */
+void
+ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
+{
+ blkptr_t blk;
+
+ /* Build the bp before clearing: ddt_phys_clear() zeroes the DVAs. */
+ ddt_bp_create(ddt, ddk, ddp, &blk);
+ ddt_phys_clear(ddp);
+ zio_free(ddt->ddt_spa, txg, &blk);
+}
+
+/*
+ * Find the physical variant of dde that bp refers to: the first one
+ * whose first DVA equals BP_IDENTITY(bp) and whose physical birth
+ * equals bp's.  Returns NULL if none matches.
+ */
+ddt_phys_t *
+ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
+{
+ ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
+ BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
+ return (ddp);
+ }
+ return (NULL);
+}
+
+/*
+ * Sum the reference counts of the DDT_PHYS_SINGLE through
+ * DDT_PHYS_TRIPLE variants of dde.
+ */
+uint64_t
+ddt_phys_total_refcnt(const ddt_entry_t *dde)
+{
+ uint64_t refcnt = 0;
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
+ refcnt += dde->dde_phys[p].ddp_refcnt;
+
+ return (refcnt);
+}
+
+/*
+ * Compute the statistics contributed by one entry into *dds.
+ *
+ * *dds is zeroed, then every physical variant with a nonzero birth
+ * contributes one block plus the key's lsize/psize and the summed
+ * dva_get_dsize_sync() of its DVAs.  The "ref" counters are the same
+ * quantities multiplied by the variant's reference count.
+ */
+static void
+ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ uint64_t lsize = DDK_GET_LSIZE(ddk);
+ uint64_t psize = DDK_GET_PSIZE(ddk);
+
+ bzero(dds, sizeof (*dds));
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ uint64_t dsize = 0;
+ uint64_t refcnt = ddp->ddp_refcnt;
+
+ /* Unused variant: nothing allocated, nothing to count. */
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
+
+ dds->dds_blocks += 1;
+ dds->dds_lsize += lsize;
+ dds->dds_psize += psize;
+ dds->dds_dsize += dsize;
+
+ dds->dds_ref_blocks += refcnt;
+ dds->dds_ref_lsize += lsize * refcnt;
+ dds->dds_ref_psize += psize * refcnt;
+ dds->dds_ref_dsize += dsize * refcnt;
+ }
+}
+
+/*
+ * Add (neg == 0) or subtract (neg == -1ULL) *src to/from *dst, treating
+ * both ddt_stat_t structures as flat arrays of uint64_t.
+ *
+ * (x ^ neg) - neg is x when neg is 0 and the two's-complement negation
+ * of x when neg is all ones, so one loop serves both directions.
+ */
+void
+ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
+{
+ const uint64_t *s = (const uint64_t *)src;
+ uint64_t *d = (uint64_t *)dst;
+ uint64_t *d_end = (uint64_t *)(dst + 1);
+
+ ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */
+
+ while (d < d_end)
+ *d++ += (*s++ ^ neg) - neg;
+}
+
+/*
+ * Add (neg == 0) or subtract (neg == -1ULL) dde's statistics to/from
+ * the histogram of the entry's current type and class.
+ *
+ * The histogram bucket is highbit(referenced blocks) - 1, presumably
+ * floor(log2(refs)) -- confirm against highbit()'s definition.  The
+ * entry must have at least one referenced block, otherwise the bucket
+ * index would be negative (checked by ASSERT only).
+ */
+static void
+ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
+{
+ ddt_stat_t dds;
+ ddt_histogram_t *ddh;
+ int bucket;
+
+ ddt_stat_generate(ddt, dde, &dds);
+
+ bucket = highbit(dds.dds_ref_blocks) - 1;
+ ASSERT(bucket >= 0);
+
+ ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
+
+ ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
+}
+
+/*
+ * Add every one of the 64 buckets of *src into the matching bucket
+ * of *dst.
+ */
+void
+ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
+{
+ for (int h = 0; h < 64; h++)
+ ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
+}
+
+/*
+ * Collapse a histogram into a single ddt_stat_t: *dds is zeroed and
+ * then receives the sum of all 64 buckets of *ddh.
+ */
+void
+ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
+{
+ bzero(dds, sizeof (*dds));
+
+ for (int h = 0; h < 64; h++)
+ ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
+}
+
+/*
+ * Return B_TRUE iff every uint64_t word of the histogram is zero.
+ */
+boolean_t
+ddt_histogram_empty(const ddt_histogram_t *ddh)
+{
+ const uint64_t *s = (const uint64_t *)ddh;
+ const uint64_t *s_end = (const uint64_t *)(ddh + 1);
+
+ while (s < s_end)
+ if (*s++ != 0)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
+ * Return the pool-wide dedup ratio as a percentage.
+ *
+ * The histograms of every table (all checksum functions, types and
+ * classes) are summed; the result is referenced dsize * 100 divided by
+ * allocated dsize.  Returns 100 when the allocated dsize is zero.
+ * Every spa->spa_ddt[] slot is dereferenced, so all tables must have
+ * been allocated.
+ */
+uint64_t
+ddt_get_pool_dedup_ratio(spa_t *spa)
+{
+ ddt_histogram_t ddh_total = { 0 };
+ ddt_stat_t dds_total = { 0 };
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_histogram_add(&ddh_total,
+ &ddt->ddt_histogram[type][class]);
+ }
+ }
+ }
+
+ ddt_histogram_stat(&dds_total, &ddh_total);
+
+ if (dds_total.dds_dsize == 0)
+ return (100);
+
+ return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
+}
+
+/*
+ * Return how many additional copies of this entry's data are wanted.
+ *
+ * References are totalled over the SINGLE..TRIPLE variants: committed
+ * refs (ddp_refcnt), pending refs (the lead zio's io_parent_count, if
+ * any) and one more if the variant is ddp_willref.  Each variant with
+ * any references adds 'p' to the copies already present (this assumes
+ * the enum value of a variant equals its copy count -- confirm in
+ * ddt.h).
+ *
+ * One copy is desired for >= 1 reference, a second at >= ditto
+ * references and a third at >= ditto^2, where ditto is
+ * spa_dedup_ditto.  The result is the shortfall and is never negative.
+ */
+int
+ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
+{
+ spa_t *spa = ddt->ddt_spa;
+ uint64_t total_refcnt = 0;
+ uint64_t ditto = spa->spa_dedup_ditto;
+ int total_copies = 0;
+ int desired_copies = 0;
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ zio_t *zio = dde->dde_lead_zio[p];
+ uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */
+ if (zio != NULL)
+ refcnt += zio->io_parent_count; /* pending refs */
+ if (ddp == ddp_willref)
+ refcnt++; /* caller's ref */
+ if (refcnt != 0) {
+ total_refcnt += refcnt;
+ total_copies += p;
+ }
+ }
+
+ /*
+ * A ditto of 0 is replaced by the largest threshold; clamping to
+ * UINT32_MAX also keeps ditto * ditto below from overflowing 64 bits.
+ */
+ if (ditto == 0 || ditto > UINT32_MAX)
+ ditto = UINT32_MAX;
+
+ if (total_refcnt >= 1)
+ desired_copies++;
+ if (total_refcnt >= ditto)
+ desired_copies++;
+ if (total_refcnt >= ditto * ditto)
+ desired_copies++;
+
+ return (MAX(desired_copies, total_copies) - total_copies);
+}
+
+/*
+ * Count the copies held by the entry's DDT_PHYS_DITTO variant: the
+ * number of valid DVAs, less DVA_GET_GANG() of the first DVA.  The
+ * result is asserted to lie in [0, SPA_DVAS_PER_BP).
+ */
+int
+ddt_ditto_copies_present(ddt_entry_t *dde)
+{
+ ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
+ dva_t *dva = ddp->ddp_dva;
+ int copies = 0 - DVA_GET_GANG(dva);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
+ if (DVA_IS_VALID(dva))
+ copies++;
+
+ ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
+
+ return (copies);
+}
+
+/*
+ * Compress s_len bytes at src into dst and return the number of bytes
+ * written to dst.
+ *
+ * The first byte of dst is a "version" byte holding the host byte
+ * order bit and the compression function used.  ZLE is tried first;
+ * if it reports c_len == s_len the data is stored raw and the function
+ * is recorded as ZIO_COMPRESS_OFF.  dst must hold at least s_len + 1
+ * bytes.
+ */
+size_t
+ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
+{
+ uchar_t *version = dst++;
+ int cpfunc = ZIO_COMPRESS_ZLE;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ size_t c_len;
+
+ ASSERT(d_len >= s_len + 1); /* no compression plus version byte */
+
+ c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
+
+ if (c_len == s_len) {
+ cpfunc = ZIO_COMPRESS_OFF;
+ bcopy(src, dst, s_len);
+ }
+
+ *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc;
+
+ return (c_len + 1);
+}
+
+/*
+ * Inverse of ddt_compress(): expand src into d_len bytes at dst.
+ *
+ * The leading version byte selects the decompression function; when
+ * that function has no decompress routine, d_len bytes are copied
+ * verbatim.  If the recorded byte-order bit differs from the host's,
+ * dst is byteswapped as an array of uint64_t.  The decompressor's
+ * return value is ignored.
+ *
+ * NOTE(review): s_len is passed to the decompressor unchanged although
+ * src has already been advanced past the version byte -- confirm that
+ * callers' lengths and the decompressors tolerate this.
+ */
+void
+ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
+{
+ uchar_t version = *src++;
+ int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+
+ if (ci->ci_decompress != NULL)
+ (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
+ else
+ bcopy(src, dst, d_len);
+
+ if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK)
+ byteswap_uint64_array(dst, d_len);
+}
+
+/*
+ * Return the pool's dedup table for checksum function c.
+ */
+ddt_t *
+ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
+{
+ return (spa->spa_ddt[c]);
+}
+
+/*
+ * Return the pool's dedup table for bp, chosen by bp's checksum
+ * function.
+ */
+ddt_t *
+ddt_select(spa_t *spa, const blkptr_t *bp)
+{
+ return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
+}
+
+/*
+ * Acquire the table's lock (ddt_lock).
+ */
+void
+ddt_enter(ddt_t *ddt)
+{
+ mutex_enter(&ddt->ddt_lock);
+}
+
+/*
+ * Release the table's lock (ddt_lock).
+ */
+void
+ddt_exit(ddt_t *ddt)
+{
+ mutex_exit(&ddt->ddt_lock);
+}
+
+/*
+ * Allocate a zeroed in-core entry, initialize its condition variable
+ * and copy *ddk into it.  The entry is not inserted into any tree.
+ * Release it with ddt_free().
+ */
+static ddt_entry_t *
+ddt_alloc(const ddt_key_t *ddk)
+{
+ ddt_entry_t *dde;
+
+ dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
+ cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
+
+ dde->dde_key = *ddk;
+
+ return (dde);
+}
+
+/*
+ * Free an in-core entry.  It must not be loading and must have no lead
+ * zios outstanding.  Any repair data buffer is released too; its size
+ * is the physical size recorded in the entry's key.
+ */
+static void
+ddt_free(ddt_entry_t *dde)
+{
+ ASSERT(!dde->dde_loading);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++)
+ ASSERT(dde->dde_lead_zio[p] == NULL);
+
+ if (dde->dde_repair_data != NULL)
+ zio_buf_free(dde->dde_repair_data,
+ DDK_GET_PSIZE(&dde->dde_key));
+
+ cv_destroy(&dde->dde_cv);
+ kmem_free(dde, sizeof (*dde));
+}
+
+/*
+ * Remove dde from the table's in-core tree and free it.  The caller
+ * must hold the table lock.
+ */
+void
+ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
+{
+ ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+ avl_remove(&ddt->ddt_tree, dde);
+ ddt_free(dde);
+}
+
+/*
+ * Find the in-core entry for bp's key, loading it from disk on first
+ * use.
+ *
+ * The caller must hold the table lock.  If no in-core entry exists,
+ * NULL is returned when 'add' is false; otherwise a new entry is
+ * inserted.  If another thread is loading the entry we wait on its cv.
+ * An entry that has not been loaded yet is marked as loading, the
+ * table lock is dropped, and every (type, class) object is searched
+ * until one yields something other than ENOENT.  The lock is held
+ * again on return.
+ *
+ * On return dde_type/dde_class identify where the entry was found, or
+ * are DDT_TYPES/DDT_CLASSES if it is not on disk.  For an entry found
+ * on disk, its statistics are subtracted from the histogram here;
+ * ddt_sync_entry() adds the updated statistics back.
+ */
+ddt_entry_t *
+ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
+{
+ ddt_entry_t *dde, dde_search;
+ enum ddt_type type;
+ enum ddt_class class;
+ avl_index_t where;
+ int error;
+
+ ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+ /* Only the key of dde_search is initialized; it is a search key. */
+ ddt_key_fill(&dde_search.dde_key, bp);
+
+ dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
+ if (dde == NULL) {
+ if (!add)
+ return (NULL);
+ dde = ddt_alloc(&dde_search.dde_key);
+ avl_insert(&ddt->ddt_tree, dde, where);
+ }
+
+ while (dde->dde_loading)
+ cv_wait(&dde->dde_cv, &ddt->ddt_lock);
+
+ if (dde->dde_loaded)
+ return (dde);
+
+ dde->dde_loading = B_TRUE;
+
+ /* Drop the table lock across the on-disk lookups. */
+ ddt_exit(ddt);
+
+ error = ENOENT;
+
+ for (type = 0; type < DDT_TYPES; type++) {
+ for (class = 0; class < DDT_CLASSES; class++) {
+ error = ddt_object_lookup(ddt, type, class, dde);
+ if (error != ENOENT)
+ break;
+ }
+ if (error != ENOENT)
+ break;
+ }
+
+ ASSERT(error == 0 || error == ENOENT);
+
+ ddt_enter(ddt);
+
+ ASSERT(dde->dde_loaded == B_FALSE);
+ ASSERT(dde->dde_loading == B_TRUE);
+
+ dde->dde_type = type; /* will be DDT_TYPES if no entry found */
+ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
+ dde->dde_loaded = B_TRUE;
+ dde->dde_loading = B_FALSE;
+
+ if (error == 0)
+ ddt_stat_update(ddt, dde, -1ULL);
+
+ /* Wake any threads that waited for this load to finish. */
+ cv_broadcast(&dde->dde_cv);
+
+ return (dde);
+}
+
+/*
+ * AVL comparator for ddt_entry_t: compares the two keys word by word
+ * as DDT_KEY_WORDS unsigned 64-bit integers and returns -1, 0 or 1.
+ */
+int
+ddt_entry_compare(const void *x1, const void *x2)
+{
+ const ddt_entry_t *dde1 = x1;
+ const ddt_entry_t *dde2 = x2;
+ const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
+ const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
+
+ for (int i = 0; i < DDT_KEY_WORDS; i++) {
+ if (u1[i] < u2[i])
+ return (-1);
+ if (u1[i] > u2[i])
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * Allocate and initialize the in-core dedup table for checksum
+ * function c: its lock, the entry tree and the repair tree (both keyed
+ * by ddt_entry_compare and both linked through dde_node), and its
+ * back-pointers to the pool and the pool's meta objset.
+ */
+static ddt_t *
+ddt_table_alloc(spa_t *spa, enum zio_checksum c)
+{
+ ddt_t *ddt;
+
+ ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
+
+ mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&ddt->ddt_tree, ddt_entry_compare,
+ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
+ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ ddt->ddt_checksum = c;
+ ddt->ddt_spa = spa;
+ ddt->ddt_os = spa->spa_meta_objset;
+
+ return (ddt);
+}
+
+/*
+ * Tear down a table created by ddt_table_alloc().  Both trees are
+ * expected to be empty already.
+ */
+static void
+ddt_table_free(ddt_t *ddt)
+{
+ ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
+ ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
+ avl_destroy(&ddt->ddt_tree);
+ avl_destroy(&ddt->ddt_repair_tree);
+ mutex_destroy(&ddt->ddt_lock);
+ kmem_free(ddt, sizeof (*ddt));
+}
+
+/*
+ * Set the pool's dedup checksum to ZIO_DEDUPCHECKSUM and allocate an
+ * in-core table for every checksum function.
+ */
+void
+ddt_create(spa_t *spa)
+{
+ spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
+ spa->spa_ddt[c] = ddt_table_alloc(spa, c);
+}
+
+/*
+ * Create the in-core tables and load their on-disk state.
+ *
+ * The DDT stats object is looked up in DMU_POOL_DIRECTORY_OBJECT.  If
+ * it is absent (ENOENT) there is nothing to load and 0 is returned.
+ * Otherwise every (checksum, type, class) object is loaded; ENOENT for
+ * an individual object is ignored, any other error is returned at once.
+ * The in-core tables stay allocated on every return path.
+ */
+int
+ddt_load(spa_t *spa)
+{
+ int error;
+
+ ddt_create(spa);
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+ &spa->spa_ddt_stat_object);
+
+ if (error)
+ return (error == ENOENT ? 0 : error);
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ error = ddt_object_load(ddt, type, class);
+ if (error != 0 && error != ENOENT)
+ return (error);
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Free every allocated in-core table and clear its slot in
+ * spa->spa_ddt[].  Slots that are already NULL are skipped.
+ */
+void
+ddt_unload(spa_t *spa)
+{
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ if (spa->spa_ddt[c]) {
+ ddt_table_free(spa->spa_ddt[c]);
+ spa->spa_ddt[c] = NULL;
+ }
+ }
+}
+
+/*
+ * Allocate a private entry (not inserted into any tree) for bp's key
+ * and try to populate it from the on-disk objects of every class
+ * except DDT_CLASS_UNIQUE.
+ *
+ * An entry is always returned.  If no lookup succeeded its dde_phys
+ * array is zeroed.  The caller hands the entry back through
+ * ddt_repair_done().
+ */
+ddt_entry_t *
+ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
+{
+ ddt_key_t ddk;
+ ddt_entry_t *dde;
+
+ ddt_key_fill(&ddk, bp);
+
+ dde = ddt_alloc(&ddk);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ /*
+ * We can only do repair if there are multiple copies
+ * of the block. For anything in the UNIQUE class,
+ * there's definitely only one copy, so don't even try.
+ */
+ if (class != DDT_CLASS_UNIQUE &&
+ ddt_object_lookup(ddt, type, class, dde) == 0)
+ return (dde);
+ }
+ }
+
+ bzero(dde->dde_phys, sizeof (dde->dde_phys));
+
+ return (dde);
+}
+
+/*
+ * Finish with an entry obtained from ddt_repair_start().
+ *
+ * If the entry carries repair data, the pool is writeable and no entry
+ * with the same key is already queued, the entry is inserted into the
+ * table's repair tree.  Otherwise it is freed.  The table lock is
+ * taken internally.
+ */
+void
+ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
+{
+ avl_index_t where;
+
+ ddt_enter(ddt);
+
+ if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) &&
+ avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
+ avl_insert(&ddt->ddt_repair_tree, dde, where);
+ else
+ ddt_free(dde);
+
+ ddt_exit(ddt);
+}
+
+/*
+ * Done callback of the zio created in ddt_repair_entry(): frees the
+ * repair entry passed as the zio's private data.
+ */
+static void
+ddt_repair_entry_done(zio_t *zio)
+{
+ ddt_entry_t *rdde = zio->io_private;
+
+ ddt_free(rdde);
+}
+
+/*
+ * Rewrite the on-disk copies of one block using saved repair data.
+ *
+ * dde is the entry as currently found on disk and rdde is the queued
+ * repair entry holding the repair data.  A null zio is created under
+ * rio; its done callback frees rdde.  For every physical variant that
+ * is in use and identical in both entries (same nonzero birth and same
+ * DVAs), a zio_rewrite() of the repair data is issued as a child of
+ * that null zio.
+ */
+static void
+ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
+{
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_phys_t *rddp = rdde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ ddt_key_t *rddk = &rdde->dde_key;
+ zio_t *zio;
+ blkptr_t blk;
+
+ zio = zio_null(rio, rio->io_spa, NULL,
+ ddt_repair_entry_done, rdde, rio->io_flags);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
+ /* Skip variants that are unused or differ from the repair entry. */
+ if (ddp->ddp_phys_birth == 0 ||
+ ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
+ bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
+ continue;
+ ddt_bp_create(ddt, ddk, ddp, &blk);
+ zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
+ rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
+ }
+
+ zio_nowait(zio);
+}
+
+/*
+ * Process every entry queued in the table's repair tree.  This runs
+ * only in the first sync pass.
+ *
+ * Each queued entry is removed from the tree under the table lock.
+ * The lock is then dropped while the current on-disk entry is fetched
+ * with ddt_repair_start(), the rewrites are issued under rio by
+ * ddt_repair_entry(), and the fetched entry is released through
+ * ddt_repair_done().
+ */
+static void
+ddt_repair_table(ddt_t *ddt, zio_t *rio)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_entry_t *dde, *rdde_next, *rdde;
+ avl_tree_t *t = &ddt->ddt_repair_tree;
+ blkptr_t blk;
+
+ if (spa_sync_pass(spa) > 1)
+ return;
+
+ ddt_enter(ddt);
+ for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
+ /* Fetch the successor before rdde is unlinked from the tree. */
+ rdde_next = AVL_NEXT(t, rdde);
+ avl_remove(&ddt->ddt_repair_tree, rdde);
+ ddt_exit(ddt);
+ ddt_bp_create(ddt, &rdde->dde_key, NULL, &blk);
+ dde = ddt_repair_start(ddt, &blk);
+ ddt_repair_entry(ddt, dde, rdde, rio);
+ ddt_repair_done(ddt, dde);
+ ddt_enter(ddt);
+ }
+ ddt_exit(ddt);
+}
+
+/*
+ * Write one in-core entry back to disk.
+ *
+ * First pass over the physical variants: unused variants are skipped;
+ * the DITTO variant is freed when ddt_ditto_copies_needed() reports
+ * that no extra copies are needed; any other variant with no
+ * references is freed.  The references of the non-DITTO variants are
+ * totalled.
+ *
+ * The new class is DITTO if a ditto variant remains, DUPLICATE if the
+ * total exceeds one, otherwise UNIQUE.  If the entry was loaded from
+ * disk and its type or class changed, or it has no references left,
+ * it is removed from its old object.  If references remain, the
+ * entry's statistics are added to the histogram of the new
+ * (type, class) and the entry is stored there, creating the object on
+ * demand.
+ */
+static void
+ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
+{
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ enum ddt_type otype = dde->dde_type;
+ enum ddt_type ntype = DDT_TYPE_CURRENT;
+ enum ddt_class oclass = dde->dde_class;
+ enum ddt_class nclass;
+ uint64_t total_refcnt = 0;
+
+ ASSERT(dde->dde_loaded);
+ ASSERT(!dde->dde_loading);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ ASSERT(dde->dde_lead_zio[p] == NULL);
+ ASSERT((int64_t)ddp->ddp_refcnt >= 0);
+ if (ddp->ddp_phys_birth == 0) {
+ ASSERT(ddp->ddp_refcnt == 0);
+ continue;
+ }
+ if (p == DDT_PHYS_DITTO) {
+ if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
+ ddt_phys_free(ddt, ddk, ddp, txg);
+ continue;
+ }
+ if (ddp->ddp_refcnt == 0)
+ ddt_phys_free(ddt, ddk, ddp, txg);
+ total_refcnt += ddp->ddp_refcnt;
+ }
+
+ if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
+ nclass = DDT_CLASS_DITTO;
+ else if (total_refcnt > 1)
+ nclass = DDT_CLASS_DUPLICATE;
+ else
+ nclass = DDT_CLASS_UNIQUE;
+
+ /* otype == DDT_TYPES means ddt_lookup() found nothing on disk. */
+ if (otype != DDT_TYPES &&
+ (otype != ntype || oclass != nclass || total_refcnt == 0)) {
+ VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
+ ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
+ }
+
+ if (total_refcnt != 0) {
+ dde->dde_type = ntype;
+ dde->dde_class = nclass;
+ ddt_stat_update(ddt, dde, 0);
+ if (!ddt_object_exists(ddt, ntype, nclass))
+ ddt_object_create(ddt, ntype, nclass, tx);
+ VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
+ }
+}
+
+/*
+ * Sync one table: write back and free every in-core entry, then update
+ * the on-disk histograms.
+ *
+ * Nothing is done if the in-core tree is empty.  The DDT stats object
+ * is created (and registered in DMU_POOL_DIRECTORY_OBJECT) on first
+ * use.  After the entries are drained, each existing (type, class)
+ * object has its histogram written out and is destroyed if it no
+ * longer holds any entries.
+ */
+static void
+ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_entry_t *dde;
+ void *cookie = NULL;
+
+ if (avl_numnodes(&ddt->ddt_tree) == 0)
+ return;
+
+ ASSERT(spa_sync_pass(spa) == 1);
+ ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
+
+ if (spa->spa_ddt_stat_object == 0) {
+ spa->spa_ddt_stat_object = zap_create(ddt->ddt_os,
+ DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx);
+ VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+ &spa->spa_ddt_stat_object, tx) == 0);
+ }
+
+ while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
+ ddt_sync_entry(ddt, dde, tx, txg);
+ ddt_free(dde);
+ }
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ if (!ddt_object_exists(ddt, type, class))
+ continue;
+ ddt_object_sync(ddt, type, class, tx);
+ if (ddt_object_count(ddt, type, class) == 0)
+ ddt_object_destroy(ddt, type, class, tx);
+ }
+ }
+}
+
+/*
+ * Sync all dedup tables of the pool for the given txg, which must be
+ * the currently syncing txg.
+ *
+ * Each allocated table is synced and its queued repairs are issued
+ * under a root zio created with CANFAIL | SPECULATIVE.  The root zio
+ * is waited for and its result is ignored.  The assigned tx is then
+ * committed.
+ */
+void
+ddt_sync(spa_t *spa, uint64_t txg)
+{
+ dmu_tx_t *tx;
+ zio_t *rio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+ ASSERT(spa_syncing_txg(spa) == txg);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ if (ddt == NULL)
+ continue;
+ ddt_sync_table(ddt, tx, txg);
+ ddt_repair_table(ddt, rio);
+ }
+
+ (void) zio_wait(rio);
+
+ dmu_tx_commit(tx);
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/ddt_zap.c Sun Nov 01 14:14:46 2009 -0800
@@ -0,0 +1,150 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <util/sscanf.h>
+
+/*
+ * Leaf and indirect block shifts passed to zap_create_flags() when a
+ * DDT ZAP object is created (12 == 4K blocks).
+ */
+int ddt_zap_leaf_blockshift = 12;
+int ddt_zap_indirect_blockshift = 12;
+
+/*
+ * Create a ZAP object of type DMU_OT_DDT_ZAP with 64-bit hashes and
+ * uint64 keys.  When 'prehash' is set the keys are flagged as already
+ * hashed.  The object number is stored in *objectp.  Returns ENOTSUP
+ * if no object was created, else 0.
+ */
+static int
+ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
+{
+ zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY;
+
+ if (prehash)
+ flags |= ZAP_FLAG_PRE_HASHED_KEY;
+
+ *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP,
+ ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift,
+ DMU_OT_NONE, 0, tx);
+
+ return (*objectp == 0 ? ENOTSUP : 0);
+}
+
+/*
+ * Destroy the ZAP object; returns zap_destroy()'s result.
+ */
+static int
+ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ return (zap_destroy(os, object, tx));
+}
+
+/*
+ * Look up dde's key in the ZAP and, on success, decompress the stored
+ * value into dde->dde_phys.
+ *
+ * The stored value is a byte array (integer size 1) whose length is
+ * queried first; it is expected to fit in a buffer one byte larger
+ * than dde_phys (room for the version byte written by ddt_compress()).
+ * Returns 0 or the ZAP error.
+ */
+static int
+ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t one, csize;
+ int error;
+
+ error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, &one, &csize);
+ if (error)
+ return (error);
+
+ ASSERT(one == 1);
+ ASSERT(csize <= sizeof (cbuf));
+
+ error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, 1, csize, cbuf);
+ if (error)
+ return (error);
+
+ ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys));
+
+ return (0);
+}
+
+/*
+ * Compress dde->dde_phys with ddt_compress() and store the result in
+ * the ZAP under dde's key as a byte array.  Returns the ZAP result.
+ */
+static int
+ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t csize;
+
+ csize = ddt_compress(dde->dde_phys, cbuf,
+ sizeof (dde->dde_phys), sizeof (cbuf));
+
+ return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, 1, csize, cbuf, tx));
+}
+
+/*
+ * Remove dde's key from the ZAP; returns the ZAP result.
+ */
+static int
+ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, tx));
+}
+
+/*
+ * Fetch the entry at serialized cursor position *walk into dde and
+ * advance *walk to the next entry.
+ *
+ * On success dde's key and dde_phys are filled in and 0 is returned.
+ * If the cursor cannot retrieve an entry, its error is returned and
+ * *walk is left unchanged.  The stored value is a byte array that must
+ * fit in cbuf; this is asserted before it is read, matching
+ * ddt_zap_lookup().
+ */
+static int
+ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int error;
+
+ zap_cursor_init_serialized(&zc, os, object, *walk);
+ if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t csize = za.za_num_integers;
+ ASSERT(za.za_integer_length == 1);
+ ASSERT(csize <= sizeof (cbuf));
+ error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name,
+ DDT_KEY_WORDS, 1, csize, cbuf);
+ ASSERT(error == 0);
+ if (error == 0) {
+ ddt_decompress(cbuf, dde->dde_phys, csize,
+ sizeof (dde->dde_phys));
+ dde->dde_key = *(ddt_key_t *)za.za_name;
+ }
+ zap_cursor_advance(&zc);
+ *walk = zap_cursor_serialize(&zc);
+ }
+ zap_cursor_fini(&zc);
+ return (error);
+}
+
+/*
+ * Return the number of entries in the ZAP object.  A zap_count()
+ * failure trips a VERIFY.
+ */
+static uint64_t
+ddt_zap_count(objset_t *os, uint64_t object)
+{
+ uint64_t count = 0;
+
+ VERIFY(zap_count(os, object, &count) == 0);
+
+ return (count);
+}
+
+/*
+ * Operations vector for the ZAP-backed DDT implementation.  The
+ * initializer is positional: the name first, then create, destroy,
+ * lookup, update, remove, walk and count.
+ */
+const ddt_ops_t ddt_zap_ops = {
+ "zap",
+ ddt_zap_create,
+ ddt_zap_destroy,
+ ddt_zap_lookup,
+ ddt_zap_update,
+ ddt_zap_remove,
+ ddt_zap_walk,
+ ddt_zap_count,
+};
--- a/usr/src/uts/common/fs/zfs/dmu.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dmu.c Sun Nov 01 14:14:46 2009 -0800
@@ -88,6 +88,8 @@
{ zap_byteswap, TRUE, "ZFS user/group used" },
{ zap_byteswap, TRUE, "ZFS user/group quota" },
{ zap_byteswap, TRUE, "snapshot refcount tags"},
+ { zap_byteswap, TRUE, "DDT ZAP algorithm" },
+ { zap_byteswap, TRUE, "DDT statistics" },
};
int
@@ -859,55 +861,114 @@
}
typedef struct {
- dbuf_dirty_record_t *dr;
- dmu_sync_cb_t *done;
- void *arg;
+ dbuf_dirty_record_t *dsa_dr;
+ dmu_sync_cb_t *dsa_done;
+ zgd_t *dsa_zgd;
+ dmu_tx_t *dsa_tx;
} dmu_sync_arg_t;
/* ARGSUSED */
static void
dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
{
+ dmu_sync_arg_t *dsa = varg;
+ dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
+ dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
blkptr_t *bp = zio->io_bp;
- dmu_sync_arg_t *in = varg;
- dbuf_dirty_record_t *dr = in->dr;
- dmu_buf_impl_t *db = dr->dr_dbuf;
- if (!BP_IS_HOLE(bp)) {
- ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type);
- ASSERT(BP_GET_LEVEL(bp) == 0);
- bp->blk_fill = 1;
- } else {
- /*
- * dmu_sync() can compress a block of zeros to a null blkptr
- * but the block size still needs to be passed through to replay
- */
- BP_SET_LSIZE(bp, db->db.db_size);
+ if (zio->io_error == 0) {
+ if (BP_IS_HOLE(bp)) {
+ /*
+ * A block of zeros may compress to a hole, but the
+ * block size still needs to be known for replay.
+ */
+ BP_SET_LSIZE(bp, db->db_size);
+ } else {
+ ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
+ ASSERT(BP_GET_LEVEL(bp) == 0);
+ bp->blk_fill = 1;
+ }
}
}
+/*
+ * zio "ready" callback for the write issued by dmu_sync_late_arrival():
+ * forwards to dmu_sync_ready() with a NULL arc buf and the zio's
+ * private data as the argument.
+ */
+static void
+dmu_sync_late_arrival_ready(zio_t *zio)
+{
+ dmu_sync_ready(zio, NULL, zio->io_private);
+}
+
/* ARGSUSED */
static void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
- dmu_sync_arg_t *in = varg;
- dbuf_dirty_record_t *dr = in->dr;
+ dmu_sync_arg_t *dsa = varg;
+ dbuf_dirty_record_t *dr = dsa->dsa_dr;
dmu_buf_impl_t *db = dr->dr_dbuf;
- dmu_sync_cb_t *done = in->done;
mutex_enter(&db->db_mtx);
ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
- dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
- if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
- BP_ZERO(&dr->dt.dl.dr_overridden_by);
- dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+ if (zio->io_error == 0) {
+ dr->dt.dl.dr_overridden_by = *zio->io_bp;
+ dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+ dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
+ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
+ BP_ZERO(&dr->dt.dl.dr_overridden_by);
+ } else {
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ }
cv_broadcast(&db->db_changed);
mutex_exit(&db->db_mtx);
- if (done)
- done(&(db->db), in->arg);
+ dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+ kmem_free(dsa, sizeof (*dsa));
+}
+
+/*
+ * zio "done" callback for the write issued by dmu_sync_late_arrival().
+ *
+ * If the write succeeded and produced a non-hole block, that block is
+ * freed in the zio's txg.  The tx held for the write is then
+ * committed, the caller's done callback is invoked with the zio's
+ * error, and the argument structure is freed.
+ */
+static void
+dmu_sync_late_arrival_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ dmu_sync_arg_t *dsa = zio->io_private;
+
+ if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
+ ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+ ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+ zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+ }
+
+ dmu_tx_commit(dsa->dsa_tx);
+
+ dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+ kmem_free(dsa, sizeof (*dsa));
+}
- kmem_free(in, sizeof (dmu_sync_arg_t));
+/*
+ * Issue a standalone write of the dbuf's current contents under pio.
+ *
+ * A new tx is created with space held for the buffer and assigned with
+ * TXG_NOWAIT.  If the assignment fails the tx is aborted and EIO is
+ * returned.  Otherwise a zio_write() for the tx's txg is issued
+ * without waiting and 0 is returned.  Its done callback,
+ * dmu_sync_late_arrival_done(), commits the tx, invokes 'done' and
+ * frees the argument structure allocated here.
+ */
+static int
+dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
+ zio_prop_t *zp, zbookmark_t *zb)
+{
+ dmu_sync_arg_t *dsa;
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
+ if (dmu_tx_assign(tx, TXG_NOWAIT) != 0) {
+ dmu_tx_abort(tx);
+ return (EIO); /* Make zl_get_data do txg_wait_synced() */
+ }
+
+ dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa->dsa_dr = NULL;
+ dsa->dsa_done = done;
+ dsa->dsa_zgd = zgd;
+ dsa->dsa_tx = tx;
+
+ zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
+ zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
+ dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
+
+ return (0);
}
/*
@@ -926,156 +987,108 @@
* EALREADY: this block is already in the process of being synced.
* The caller should track its progress (somehow).
*
- * EINPROGRESS: the IO has been initiated.
- * The caller should log this blkptr in the callback.
+ * EIO: could not do the I/O.
+ * The caller should do a txg_wait_synced().
*
- * 0: completed. Sets *bp to the blkptr just written.
- * The caller should log this blkptr immediately.
+ * 0: the I/O has been initiated.
+ * The caller should log this blkptr in the done callback.
+ * It is possible that the I/O will fail, in which case
+ * the error will be reported to the done callback and
+ * propagated to pio from zio_done().
*/
int
-dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
- blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg)
+dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ blkptr_t *bp = zgd->zgd_bp;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
objset_t *os = db->db_objset;
- dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
- tx_state_t *tx = &dp->dp_tx;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
dbuf_dirty_record_t *dr;
- dmu_sync_arg_t *in;
+ dmu_sync_arg_t *dsa;
zbookmark_t zb;
- writeprops_t wp = { 0 };
- zio_t *zio;
- int err;
+ zio_prop_t zp;
+ ASSERT(pio != NULL);
ASSERT(BP_IS_HOLE(bp));
ASSERT(txg != 0);
- dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
- txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
+ SET_BOOKMARK(&zb, ds->ds_object,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ dmu_write_policy(os, db->db_dnode, db->db_level, WP_DMU_SYNC, &zp);
/*
- * XXX - would be nice if we could do this without suspending...
+ * If we're frozen (running ziltest), we always need to generate a bp.
*/
- txg_suspend(dp);
+ if (txg > spa_freeze_txg(os->os_spa))
+ return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
/*
- * If this txg already synced, there's nothing to do.
+ * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
+ * and us. If we determine that this txg is not yet syncing,
+ * but it begins to sync a moment later, that's OK because the
+ * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
*/
- if (txg <= tx->tx_synced_txg) {
- txg_resume(dp);
+ mutex_enter(&db->db_mtx);
+
+ if (txg <= spa_last_synced_txg(os->os_spa)) {
/*
- * If we're running ziltest, we need the blkptr regardless.
+ * This txg has already synced. There's nothing to do.
*/
- if (txg > spa_freeze_txg(dp->dp_spa)) {
- /* if db_blkptr == NULL, this was an empty write */
- if (db->db_blkptr)
- *bp = *db->db_blkptr; /* structure assignment */
- return (0);
- }
+ mutex_exit(&db->db_mtx);
return (EEXIST);
}
- mutex_enter(&db->db_mtx);
-
- if (txg == tx->tx_syncing_txg) {
- while (db->db_data_pending) {
- /*
- * IO is in-progress. Wait for it to finish.
- * XXX - would be nice to be able to somehow "attach"
- * this zio to the parent zio passed in.
- */
- cv_wait(&db->db_changed, &db->db_mtx);
- if (!db->db_data_pending &&
- db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) {
- /*
- * IO was compressed away
- */
- *bp = *db->db_blkptr; /* structure assignment */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (0);
- }
- ASSERT(db->db_data_pending ||
- (db->db_blkptr && db->db_blkptr->blk_birth == txg));
- }
-
- if (db->db_blkptr && db->db_blkptr->blk_birth == txg) {
- /*
- * IO is already completed.
- */
- *bp = *db->db_blkptr; /* structure assignment */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (0);
- }
+ if (txg <= spa_syncing_txg(os->os_spa)) {
+ /*
+ * This txg is currently syncing, so we can't mess with
+ * the dirty record anymore; just write a new log block.
+ */
+ mutex_exit(&db->db_mtx);
+ return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
}
dr = db->db_last_dirty;
- while (dr && dr->dr_txg > txg)
+ while (dr && dr->dr_txg != txg)
dr = dr->dr_next;
- if (dr == NULL || dr->dr_txg < txg) {
+
+ if (dr == NULL) {
/*
- * This dbuf isn't dirty, must have been free_range'd.
+ * There's no dr for this dbuf, so it must have been freed.
* There's no need to log writes to freed blocks, so we're done.
*/
mutex_exit(&db->db_mtx);
- txg_resume(dp);
return (ENOENT);
}
ASSERT(dr->dr_txg == txg);
- if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+ if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
+ dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
/*
- * We have already issued a sync write for this buffer.
+ * We have already issued a sync write for this buffer,
+ * or this buffer has already been synced. It could not
+ * have been dirtied since, or we would have cleared the state.
*/
mutex_exit(&db->db_mtx);
- txg_resume(dp);
return (EALREADY);
- } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
- /*
- * This buffer has already been synced. It could not
- * have been dirtied since, or we would have cleared the state.
- */
- *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */
- mutex_exit(&db->db_mtx);
- txg_resume(dp);
- return (0);
}
+ ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
- in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
- in->dr = dr;
- in->done = done;
- in->arg = arg;
mutex_exit(&db->db_mtx);
- txg_resume(dp);
-
- zb.zb_objset = os->os_dsl_dataset->ds_object;
- zb.zb_object = db->db.db_object;
- zb.zb_level = db->db_level;
- zb.zb_blkid = db->db_blkid;
- wp.wp_type = db->db_dnode->dn_type;
- wp.wp_level = db->db_level;
- wp.wp_copies = os->os_copies;
- wp.wp_dnchecksum = db->db_dnode->dn_checksum;
- wp.wp_oschecksum = os->os_checksum;
- wp.wp_dncompress = db->db_dnode->dn_compress;
- wp.wp_oscompress = os->os_compress;
-
- ASSERT(BP_IS_HOLE(bp));
+ dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa->dsa_dr = dr;
+ dsa->dsa_done = done;
+ dsa->dsa_zgd = zgd;
+ dsa->dsa_tx = NULL;
- zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db),
- txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in,
- ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
- if (pio) {
- zio_nowait(zio);
- err = EINPROGRESS;
- } else {
- err = zio_wait(zio);
- ASSERT(err == 0);
- }
- return (err);
+ zio_nowait(arc_write(pio, os->os_spa, txg,
+ bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp,
+ dmu_sync_ready, dmu_sync_done, dsa,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+
+ return (0);
}
int
@@ -1121,6 +1134,84 @@
dnode_rele(dn, FTAG);
}
+int zfs_mdcomp_disable = 0;
+
+void
+dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
+{
+ dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
+ boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata);
+ enum zio_checksum checksum = os->os_checksum;
+ enum zio_compress compress = os->os_compress;
+ enum zio_checksum dedup_checksum = os->os_dedup_checksum;
+ boolean_t dedup;
+ boolean_t dedup_verify = os->os_dedup_verify;
+ int copies = os->os_copies;
+
+ /*
+ * Determine checksum setting.
+ */
+ if (ismd) {
+ /*
+ * Metadata always gets checksummed. If the data
+ * checksum is multi-bit correctable, and it's not a
+ * ZBT-style checksum, then it's suitable for metadata
+ * as well. Otherwise, the metadata checksum defaults
+ * to fletcher4.
+ */
+ if (zio_checksum_table[checksum].ci_correctable < 1 ||
+ zio_checksum_table[checksum].ci_zbt)
+ checksum = ZIO_CHECKSUM_FLETCHER_4;
+ } else {
+ checksum = zio_checksum_select(dn->dn_checksum, checksum);
+ }
+
+ /*
+ * Determine compression setting.
+ */
+ if (ismd) {
+ /*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
+ ZIO_COMPRESS_LZJB;
+ } else {
+ compress = zio_compress_select(dn->dn_compress, compress);
+ }
+
+ /*
+ * Determine dedup setting. If we are in dmu_sync(), we won't
+ * actually dedup now because that's all done in syncing context;
+ * but we do want to use the dedup checksum. If the checksum is not
+ * strong enough to ensure unique signatures, force dedup_verify.
+ */
+ dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF);
+ if (dedup) {
+ checksum = dedup_checksum;
+ if (!zio_checksum_table[checksum].ci_dedup)
+ dedup_verify = 1;
+ }
+
+ if (wp & WP_DMU_SYNC)
+ dedup = 0;
+
+ if (wp & WP_NOFILL) {
+ ASSERT(!ismd && level == 0);
+ checksum = ZIO_CHECKSUM_OFF;
+ compress = ZIO_COMPRESS_OFF;
+ dedup = B_FALSE;
+ }
+
+ zp->zp_checksum = checksum;
+ zp->zp_compress = compress;
+ zp->zp_type = type;
+ zp->zp_level = level;
+ zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
+ zp->zp_dedup = dedup;
+ zp->zp_dedup_verify = dedup && dedup_verify;
+}
+
int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
@@ -1155,21 +1246,27 @@
void
dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
+ dnode_phys_t *dnp;
+
rw_enter(&dn->dn_struct_rwlock, RW_READER);
mutex_enter(&dn->dn_mtx);
+ dnp = dn->dn_phys;
+
doi->doi_data_block_size = dn->dn_datablksz;
doi->doi_metadata_block_size = dn->dn_indblkshift ?
1ULL << dn->dn_indblkshift : 0;
+ doi->doi_type = dn->dn_type;
+ doi->doi_bonus_type = dn->dn_bonustype;
+ doi->doi_bonus_size = dn->dn_bonuslen;
doi->doi_indirection = dn->dn_nlevels;
doi->doi_checksum = dn->dn_checksum;
doi->doi_compress = dn->dn_compress;
- doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) +
- SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT;
- doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
- doi->doi_type = dn->dn_type;
- doi->doi_bonus_size = dn->dn_bonuslen;
- doi->doi_bonus_type = dn->dn_bonustype;
+ doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
+ doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz;
+ doi->doi_fill_count = 0;
+ for (int i = 0; i < dnp->dn_nblkptr; i++)
+ doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
mutex_exit(&dn->dn_mtx);
rw_exit(&dn->dn_struct_rwlock);
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c Sun Nov 01 14:14:46 2009 -0800
@@ -36,7 +36,6 @@
#include <sys/dbuf.h>
#include <sys/zvol.h>
#include <sys/dmu_tx.h>
-#include <sys/zio_checksum.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/dmu_impl.h>
@@ -138,6 +137,24 @@
}
static void
+dedup_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+ spa_t *spa = os->os_spa;
+ enum zio_checksum checksum;
+
+ /*
+ * Inheritance should have been done by now.
+ */
+ ASSERT(newval != ZIO_CHECKSUM_INHERIT);
+
+ checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
+
+ os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
+ os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
+}
+
+static void
primary_cache_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
@@ -209,10 +226,9 @@
if (!BP_IS_HOLE(os->os_rootbp)) {
uint32_t aflags = ARC_WAIT;
zbookmark_t zb;
- zb.zb_objset = ds ? ds->ds_object : 0;
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = 0;
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
if (DMU_OS_IS_L2CACHEABLE(os))
aflags |= ARC_L2CACHE;
@@ -281,6 +297,9 @@
err = dsl_prop_register(ds, "copies",
copies_changed_cb, os);
if (err == 0)
+ err = dsl_prop_register(ds, "dedup",
+ dedup_changed_cb, os);
+ if (err == 0)
err = dsl_prop_register(ds, "logbias",
logbias_changed_cb, os);
}
@@ -295,6 +314,9 @@
os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
os->os_compress = ZIO_COMPRESS_LZJB;
os->os_copies = spa_max_replication(spa);
+ os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
+ os->os_dedup_verify = 0;
+ os->os_logbias = 0;
os->os_primary_cache = ZFS_CACHE_ALL;
os->os_secondary_cache = ZFS_CACHE_ALL;
}
@@ -454,12 +476,9 @@
dmu_objset_evict(objset_t *os)
{
dsl_dataset_t *ds = os->os_dsl_dataset;
- int i;
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT(list_head(&os->os_dirty_dnodes[i]) == NULL);
- ASSERT(list_head(&os->os_free_dnodes[i]) == NULL);
- }
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT(!dmu_objset_is_dirty(os, t));
if (ds) {
if (!dsl_dataset_is_snapshot(ds)) {
@@ -469,6 +488,8 @@
compression_changed_cb, os));
VERIFY(0 == dsl_prop_unregister(ds, "copies",
copies_changed_cb, os));
+ VERIFY(0 == dsl_prop_unregister(ds, "dedup",
+ dedup_changed_cb, os));
VERIFY(0 == dsl_prop_unregister(ds, "logbias",
logbias_changed_cb, os));
}
@@ -873,10 +894,9 @@
/* ARGSUSED */
static void
-ready(zio_t *zio, arc_buf_t *abuf, void *arg)
+dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
blkptr_t *bp = zio->io_bp;
- blkptr_t *bp_orig = &zio->io_bp_orig;
objset_t *os = arg;
dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
@@ -893,14 +913,24 @@
bp->blk_fill = 0;
for (int i = 0; i < dnp->dn_nblkptr; i++)
bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+}
+
+/* ARGSUSED */
+static void
+dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ objset_t *os = arg;
if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
- ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
+ ASSERT(BP_EQUAL(bp, bp_orig));
} else {
- if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
- (void) dsl_dataset_block_kill(os->os_dsl_dataset,
- &zio->io_bp_orig, zio, os->os_synctx);
- dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx);
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, bp, tx);
}
}
@@ -910,7 +940,7 @@
{
int txgoff;
zbookmark_t zb;
- writeprops_t wp = { 0 };
+ zio_prop_t zp;
zio_t *zio;
list_t *list;
list_t *newlist = NULL;
@@ -934,26 +964,17 @@
/*
* Create the root block IO
*/
- zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
- zb.zb_object = 0;
- zb.zb_level = -1; /* for block ordering; it's level 0 on disk */
- zb.zb_blkid = 0;
-
- wp.wp_type = DMU_OT_OBJSET;
- wp.wp_level = 0; /* on-disk BP level; see above */
- wp.wp_copies = os->os_copies;
- wp.wp_oschecksum = os->os_checksum;
- wp.wp_oscompress = os->os_compress;
-
- if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) {
- (void) dsl_dataset_block_kill(os->os_dsl_dataset,
- os->os_rootbp, pio, tx);
- }
-
arc_release(os->os_phys_buf, &os->os_phys_buf);
- zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os),
- tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os,
+ SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
+ dmu_write_policy(os, NULL, 0, 0, &zp);
+
+ zio = arc_write(pio, os->os_spa, tx->tx_txg,
+ os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp,
+ dmu_objset_write_ready, dmu_objset_write_done, os,
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
/*
@@ -1002,6 +1023,13 @@
zio_nowait(zio);
}
+boolean_t
+dmu_objset_is_dirty(objset_t *os, uint64_t txg)
+{
+ return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) ||
+ !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
+}
+
static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
void
@@ -1431,10 +1459,8 @@
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
zbookmark_t zb;
- zb.zb_objset = ds->ds_object;
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = 0;
+ SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
+ ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
(void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds),
&ds->ds_phys->ds_bp, NULL, NULL,
--- a/usr/src/uts/common/fs/zfs/dmu_send.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c Sun Nov 01 14:14:46 2009 -0800
@@ -150,9 +150,10 @@
(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+/* ARGSUSED */
static int
-backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
- const dnode_phys_t *dnp, void *arg)
+backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
struct backuparg *ba = arg;
dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
@@ -161,9 +162,10 @@
if (issig(JUSTLOOKING) && issig(FORREAL))
return (EINTR);
- if (zb->zb_object != 0 && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
+ if (zb->zb_object != DMU_META_DNODE_OBJECT &&
+ DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
return (0);
- } else if (bp == NULL && zb->zb_object == 0) {
+ } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
uint64_t span = BP_SPAN(dnp, zb->zb_level);
uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c Sun Nov 01 14:14:46 2009 -0800
@@ -35,14 +35,6 @@
#include <sys/dmu_impl.h>
#include <sys/callb.h>
-#define SET_BOOKMARK(zb, objset, object, level, blkid) \
-{ \
- (zb)->zb_objset = objset; \
- (zb)->zb_object = object; \
- (zb)->zb_level = level; \
- (zb)->zb_blkid = blkid; \
-}
-
struct prefetch_data {
kmutex_t pd_mtx;
kcondvar_t pd_cv;
@@ -68,27 +60,28 @@
arc_buf_t *buf, uint64_t objset, uint64_t object);
/* ARGSUSED */
-static void
+static int
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
struct traverse_data *td = arg;
zbookmark_t zb;
if (bp->blk_birth == 0)
- return;
+ return (0);
if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
- return;
+ return (0);
- zb.zb_objset = td->td_objset;
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
- (void) td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg);
+ SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);
+
+ return (0);
}
/* ARGSUSED */
-static void
+static int
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
struct traverse_data *td = arg;
@@ -99,17 +92,18 @@
zbookmark_t zb;
if (bp->blk_birth == 0)
- return;
+ return (0);
if (claim_txg == 0 || bp->blk_birth < claim_txg)
- return;
+ return (0);
+
+ SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL,
+ lr->lr_offset / BP_GET_LSIZE(bp));
- zb.zb_objset = td->td_objset;
- zb.zb_object = lr->lr_foid;
- zb.zb_level = BP_GET_LEVEL(bp);
- zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
- (void) td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg);
+ (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
+ td->td_arg);
}
+ return (0);
}
static void
@@ -120,7 +114,7 @@
/*
* We only want to visit blocks that have been claimed but not yet
- * replayed (or, in read-only mode, blocks that *would* be claimed).
+ * replayed; plus, in read-only mode, blocks that are already stable.
*/
if (claim_txg == 0 && spa_writeable(td->td_spa))
return;
@@ -143,7 +137,7 @@
struct prefetch_data *pd = td->td_pfd;
if (bp->blk_birth == 0) {
- err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg);
+ err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg);
return (err);
}
@@ -163,7 +157,7 @@
}
if (td->td_flags & TRAVERSE_PRE) {
- err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
+ err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
if (err)
return (err);
}
@@ -224,7 +218,8 @@
traverse_zil(td, &osp->os_zil_header);
dnp = &osp->os_meta_dnode;
- err = traverse_dnode(td, dnp, buf, zb->zb_objset, 0);
+ err = traverse_dnode(td, dnp, buf, zb->zb_objset,
+ DMU_META_DNODE_OBJECT);
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
dnp = &osp->os_userused_dnode;
err = traverse_dnode(td, dnp, buf, zb->zb_objset,
@@ -241,7 +236,7 @@
(void) arc_buf_remove_ref(buf, &buf);
if (err == 0 && (td->td_flags & TRAVERSE_POST))
- err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
+ err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
return (err);
}
@@ -265,8 +260,8 @@
/* ARGSUSED */
static int
-traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
- const dnode_phys_t *dnp, void *arg)
+traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
struct prefetch_data *pfd = arg;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
@@ -305,7 +300,8 @@
td.td_arg = td_main->td_pfd;
td.td_pfd = NULL;
- SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0);
+ SET_BOOKMARK(&czb, td.td_objset,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
(void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);
mutex_enter(&td_main->td_pfd->pd_mtx);
@@ -346,7 +342,8 @@
&td, TQ_NOQUEUE))
pd.pd_exited = B_TRUE;
- SET_BOOKMARK(&czb, objset, 0, -1, 0);
+ SET_BOOKMARK(&czb, objset,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
mutex_enter(&pd.pd_mtx);
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c Sun Nov 01 14:14:46 2009 -0800
@@ -163,38 +163,47 @@
}
static void
-dmu_tx_count_indirects(dmu_tx_hold_t *txh, dmu_buf_impl_t *db,
- boolean_t freeable, dmu_buf_impl_t **history)
+dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
+ int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
{
- int i = db->db_level + 1;
- dnode_t *dn = db->db_dnode;
+ objset_t *os = dn->dn_objset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ dmu_buf_impl_t *parent = NULL;
+ blkptr_t *bp = NULL;
+ uint64_t space;
- if (i >= dn->dn_nlevels)
+ if (level >= dn->dn_nlevels || history[level] == blkid)
return;
- db = db->db_parent;
- if (db == NULL) {
- uint64_t lvls = dn->dn_nlevels - i;
+ history[level] = blkid;
+
+ space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
- txh->txh_space_towrite += lvls << dn->dn_indblkshift;
- return;
+ if (db == NULL || db == dn->dn_dbuf) {
+ ASSERT(level != 0);
+ db = NULL;
+ } else {
+ ASSERT(db->db_dnode == dn);
+ ASSERT(db->db_level == level);
+ ASSERT(db->db.db_size == space);
+ ASSERT(db->db_blkid == blkid);
+ bp = db->db_blkptr;
+ parent = db->db_parent;
}
- if (db != history[i]) {
- dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
- uint64_t space = 1ULL << dn->dn_indblkshift;
+ freeable = (bp && (freeable ||
+ dsl_dataset_block_freeable(ds, bp->blk_birth)));
- freeable = (db->db_blkptr && (freeable ||
- dsl_dataset_block_freeable(ds, db->db_blkptr->blk_birth)));
- if (freeable)
- txh->txh_space_tooverwrite += space;
- else
- txh->txh_space_towrite += space;
- if (db->db_blkptr)
- txh->txh_space_tounref += space;
- history[i] = db;
- dmu_tx_count_indirects(txh, db, freeable, history);
- }
+ if (freeable)
+ txh->txh_space_tooverwrite += space;
+ else
+ txh->txh_space_towrite += space;
+ if (bp)
+ txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
+
+ dmu_tx_count_twig(txh, dn, parent, level + 1,
+ blkid >> epbs, freeable, history);
}
/* ARGSUSED */
@@ -215,7 +224,7 @@
max_ibs = DN_MAX_INDBLKSHIFT;
if (dn) {
- dmu_buf_impl_t *last[DN_MAX_LEVELS];
+ uint64_t history[DN_MAX_LEVELS];
int nlvls = dn->dn_nlevels;
int delta;
@@ -291,29 +300,18 @@
* If this write is not off the end of the file
* we need to account for overwrites/unref.
*/
- if (start <= dn->dn_maxblkid)
- bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS);
+ if (start <= dn->dn_maxblkid) {
+ for (int l = 0; l < DN_MAX_LEVELS; l++)
+ history[l] = -1ULL;
+ }
while (start <= dn->dn_maxblkid) {
- spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
- dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
dmu_buf_impl_t *db;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
db = dbuf_hold_level(dn, 0, start, FTAG);
rw_exit(&dn->dn_struct_rwlock);
- if (db->db_blkptr && dsl_dataset_block_freeable(ds,
- db->db_blkptr->blk_birth)) {
- dprintf_bp(db->db_blkptr, "can free old%s", "");
- txh->txh_space_tooverwrite += dn->dn_datablksz;
- txh->txh_space_tounref += dn->dn_datablksz;
- dmu_tx_count_indirects(txh, db, TRUE, last);
- } else {
- txh->txh_space_towrite += dn->dn_datablksz;
- if (db->db_blkptr)
- txh->txh_space_tounref +=
- bp_get_dasize(spa, db->db_blkptr);
- dmu_tx_count_indirects(txh, db, FALSE, last);
- }
+ dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
+ history);
dbuf_rele(db, FTAG);
if (++start > end) {
/*
@@ -461,7 +459,7 @@
bp += blkid + i;
if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
dprintf_bp(bp, "can free old%s", "");
- space += bp_get_dasize(spa, bp);
+ space += bp_get_dsize(spa, bp);
}
unref += BP_GET_ASIZE(bp);
}
@@ -538,7 +536,7 @@
for (i = 0; i < tochk; i++) {
if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) {
dprintf_bp(&bp[i], "can free old%s", "");
- space += bp_get_dasize(spa, &bp[i]);
+ space += bp_get_dsize(spa, &bp[i]);
}
unref += BP_GET_ASIZE(bp);
}
@@ -583,6 +581,8 @@
if (len != DMU_OBJECT_END)
dmu_tx_count_write(txh, off+len, 1);
+ dmu_tx_count_dnode(txh);
+
if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
return;
if (len == DMU_OBJECT_END)
@@ -625,7 +625,6 @@
}
}
- dmu_tx_count_dnode(txh);
dmu_tx_count_free(txh, off, len);
}
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c Sun Nov 01 14:14:46 2009 -0800
@@ -120,7 +120,7 @@
if (BP_IS_HOLE(bp))
continue;
- bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx);
+ bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
bzero(bp, sizeof (blkptr_t));
blocks_freed += 1;
@@ -424,6 +424,9 @@
dmu_buf_impl_t *db = dr->dr_dbuf;
uint64_t txg = dr->dr_txg;
+ if (db->db_level != 0)
+ dnode_undirty_dbufs(&dr->dt.di.dr_children);
+
mutex_enter(&db->db_mtx);
/* XXX - use dbuf_undirty()? */
list_remove(list, dr);
@@ -434,13 +437,9 @@
ASSERT(db->db_blkid == DB_BONUS_BLKID ||
dr->dt.dl.dr_data == db->db_buf);
dbuf_unoverride(dr);
- mutex_exit(&db->db_mtx);
- } else {
- mutex_exit(&db->db_mtx);
- dnode_undirty_dbufs(&dr->dt.di.dr_children);
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
- dbuf_rele(db, (void *)(uintptr_t)txg);
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
}
}
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c Sun Nov 01 14:14:46 2009 -0800
@@ -75,9 +75,9 @@
}
void
-dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
{
- int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
+ int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
int compressed = BP_GET_PSIZE(bp);
int uncompressed = BP_GET_UCSIZE(bp);
int64_t delta;
@@ -118,29 +118,26 @@
}
int
-dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
- dmu_tx_t *tx)
+dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
+ boolean_t async)
{
- int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
+ if (BP_IS_HOLE(bp))
+ return (0);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(bp->blk_birth <= tx->tx_txg);
+
+ int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
int compressed = BP_GET_PSIZE(bp);
int uncompressed = BP_GET_UCSIZE(bp);
- ASSERT(pio != NULL);
- ASSERT(dmu_tx_is_syncing(tx));
- /* No block pointer => nothing to free */
- if (BP_IS_HOLE(bp))
- return (0);
-
ASSERT(used > 0);
if (ds == NULL) {
- int err;
/*
* Account for the meta-objset space in its placeholder
* dataset.
*/
- err = dsl_free(pio, tx->tx_pool,
- tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
- ASSERT(err == 0);
+ dsl_free(tx->tx_pool, tx->tx_txg, bp);
dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
-used, -compressed, -uncompressed, tx);
@@ -153,13 +150,10 @@
dmu_buf_will_dirty(ds->ds_dbuf, tx);
if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
- int err;
int64_t delta;
dprintf_bp(bp, "freeing: %s", "");
- err = dsl_free(pio, tx->tx_pool,
- tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT);
- ASSERT(err == 0);
+ dsl_free(tx->tx_pool, tx->tx_txg, bp);
mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
@@ -175,7 +169,18 @@
mutex_exit(&ds->ds_dir->dd_lock);
} else {
dprintf_bp(bp, "putting on dead list: %s", "");
- VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
+ if (async) {
+ /*
+ * We are here as part of zio's write done callback,
+ * which means we're a zio interrupt thread. We can't
+ * call bplist_enqueue() now because it may block
+ * waiting for I/O. Instead, put bp on the deferred
+ * queue and let dsl_pool_sync() finish the job.
+ */
+ bplist_enqueue_deferred(&ds->ds_deadlist, bp);
+ } else {
+ VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
+ }
ASSERT3U(ds->ds_prev->ds_object, ==,
ds->ds_phys->ds_prev_snap_obj);
ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
@@ -261,9 +266,9 @@
mutex_destroy(&ds->ds_lock);
mutex_destroy(&ds->ds_recvlock);
mutex_destroy(&ds->ds_opening_lock);
- mutex_destroy(&ds->ds_deadlist.bpl_lock);
rw_destroy(&ds->ds_rwlock);
cv_destroy(&ds->ds_exclusive_cv);
+ bplist_fini(&ds->ds_deadlist);
kmem_free(ds, sizeof (dsl_dataset_t));
}
@@ -361,10 +366,9 @@
mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT,
- NULL);
rw_init(&ds->ds_rwlock, 0, 0, 0);
cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
+ bplist_init(&ds->ds_deadlist);
err = bplist_open(&ds->ds_deadlist,
mos, ds->ds_phys->ds_deadlist_obj);
@@ -380,9 +384,9 @@
mutex_destroy(&ds->ds_lock);
mutex_destroy(&ds->ds_recvlock);
mutex_destroy(&ds->ds_opening_lock);
- mutex_destroy(&ds->ds_deadlist.bpl_lock);
rw_destroy(&ds->ds_rwlock);
cv_destroy(&ds->ds_exclusive_cv);
+ bplist_fini(&ds->ds_deadlist);
kmem_free(ds, sizeof (dsl_dataset_t));
dmu_buf_rele(dbuf, tag);
return (err);
@@ -459,9 +463,9 @@
mutex_destroy(&ds->ds_lock);
mutex_destroy(&ds->ds_recvlock);
mutex_destroy(&ds->ds_opening_lock);
- mutex_destroy(&ds->ds_deadlist.bpl_lock);
rw_destroy(&ds->ds_rwlock);
cv_destroy(&ds->ds_exclusive_cv);
+ bplist_fini(&ds->ds_deadlist);
kmem_free(ds, sizeof (dsl_dataset_t));
if (err) {
dmu_buf_rele(dbuf, tag);
@@ -1238,31 +1242,31 @@
struct killarg {
dsl_dataset_t *ds;
- zio_t *zio;
dmu_tx_t *tx;
};
/* ARGSUSED */
static int
-kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
- const dnode_phys_t *dnp, void *arg)
+kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
struct killarg *ka = arg;
+ dmu_tx_t *tx = ka->tx;
if (bp == NULL)
return (0);
- if ((zb->zb_level == -1ULL && zb->zb_blkid != 0) ||
- (zb->zb_object != 0 && dnp == NULL)) {
+ if (zb->zb_level == ZB_ZIL_LEVEL) {
+ ASSERT(zilog != NULL);
/*
* It's a block in the intent log. It has no
* accounting, so just free it.
*/
- VERIFY3U(0, ==, dsl_free(ka->zio, ka->tx->tx_pool,
- ka->tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT));
+ dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
} else {
+ ASSERT(zilog == NULL);
ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
- (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx);
+ (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
}
return (0);
@@ -1490,7 +1494,6 @@
{
struct dsl_ds_destroyarg *dsda = arg1;
dsl_dataset_t *ds = dsda->ds;
- zio_t *zio;
int err;
int after_branch_point = FALSE;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
@@ -1577,8 +1580,6 @@
}
}
- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-
if (ds->ds_phys->ds_next_snap_obj != 0) {
blkptr_t bp;
dsl_dataset_t *ds_next;
@@ -1616,15 +1617,13 @@
bp.blk_birth >
ds_prev->ds_phys->ds_prev_snap_txg) {
ds_prev->ds_phys->ds_unique_bytes +=
- bp_get_dasize(dp->dp_spa, &bp);
+ bp_get_dsize_sync(dp->dp_spa, &bp);
}
} else {
- used += bp_get_dasize(dp->dp_spa, &bp);
+ used += bp_get_dsize_sync(dp->dp_spa, &bp);
compressed += BP_GET_PSIZE(&bp);
uncompressed += BP_GET_UCSIZE(&bp);
- /* XXX check return value? */
- (void) dsl_free(zio, dp, tx->tx_txg,
- &bp, NULL, NULL, ARC_NOWAIT);
+ dsl_free(dp, tx->tx_txg, &bp);
}
}
@@ -1726,7 +1725,6 @@
* freed all the objects in open context.
*/
ka.ds = ds;
- ka.zio = zio;
ka.tx = tx;
err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
TRAVERSE_POST, kill_blkptr, &ka);
@@ -1740,9 +1738,6 @@
}
}
- err = zio_wait(zio);
- ASSERT3U(err, ==, 0);
-
if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
/* Erase the link in the dir */
dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
@@ -2785,7 +2780,7 @@
if (err == 0) {
err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
dsl_dataset_promote_sync, ds, &pa,
- 2 + 2 * doi.doi_physical_blks);
+ 2 + 2 * doi.doi_physical_blocks_512);
if (err && pa.err_ds && conflsnap)
(void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN);
}
--- a/usr/src/uts/common/fs/zfs/dsl_deleg.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dsl_deleg.c Sun Nov 01 14:14:46 2009 -0800
@@ -75,8 +75,6 @@
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
#include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/zio_checksum.h> /* for the default checksum value */
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/cred.h>
@@ -739,5 +737,5 @@
boolean_t
dsl_delegation_on(objset_t *os)
{
- return (os->os_spa->spa_delegation);
+ return (!!spa_delegation(os->os_spa));
}
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c Sun Nov 01 14:14:46 2009 -0800
@@ -32,6 +32,7 @@
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
#include <sys/spa.h>
+#include <sys/metaslab.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/arc.h>
@@ -650,7 +651,8 @@
* dsl_pool_adjustedsize()), something is very
* wrong.
*/
- ASSERT3U(used, <=, spa_get_space(dd->dd_pool->dp_spa));
+ ASSERT3U(used, <=, metaslab_class_get_space(
+ spa_normal_class(dd->dd_pool->dp_spa)));
} else {
/*
* the lesser of the space provided by our parent and
@@ -736,8 +738,9 @@
* removes to get through.
*/
if (dd->dd_parent == NULL) {
+ spa_t *spa = dd->dd_pool->dp_spa;
uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
- deferred = spa_get_defers(dd->dd_pool->dp_spa);
+ deferred = metaslab_class_get_deferred(spa_normal_class(spa));
if (poolsize - deferred < quota) {
quota = poolsize - deferred;
retval = ENOSPC;
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c Sun Nov 01 14:14:46 2009 -0800
@@ -346,6 +346,15 @@
}
err = zio_wait(zio);
+ /*
+ * If anything was added to a deadlist during a zio done callback,
+ * it had to be put on the deferred queue. Enqueue it for real now.
+ */
+ for (ds = list_head(&dp->dp_synced_datasets); ds;
+ ds = list_next(&dp->dp_synced_datasets, ds))
+ bplist_sync(&ds->ds_deadlist,
+ bplist_enqueue_cb, &ds->ds_deadlist, tx);
+
while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) {
/*
* No more sync tasks should have been added while we
@@ -422,16 +431,19 @@
}
void
-dsl_pool_zil_clean(dsl_pool_t *dp)
+dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
{
dsl_dataset_t *ds;
+ objset_t *os;
while (ds = list_head(&dp->dp_synced_datasets)) {
list_remove(&dp->dp_synced_datasets, ds);
- ASSERT(ds->ds_objset != NULL);
- zil_clean(ds->ds_objset->os_zil);
+ os = ds->ds_objset;
+ zil_clean(os->os_zil);
+ ASSERT(!dmu_objset_is_dirty(os, txg));
dmu_buf_rele(ds->ds_dbuf, ds);
}
+ ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
}
/*
@@ -460,7 +472,7 @@
* cut the reservation in half to allow forward progress
* (e.g. make it possible to rm(1) files from a full pool).
*/
- space = spa_get_dspace(dp->dp_spa);
+ space = metaslab_class_get_dspace(spa_normal_class(dp->dp_spa));
resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
if (netfree)
resv >>= 1;
--- a/usr/src/uts/common/fs/zfs/dsl_prop.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dsl_prop.c Sun Nov 01 14:14:46 2009 -0800
@@ -31,7 +31,6 @@
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/spa.h>
-#include <sys/zio_checksum.h> /* for the default checksum value */
#include <sys/zap.h>
#include <sys/fs/zfs.h>
--- a/usr/src/uts/common/fs/zfs/dsl_scrub.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dsl_scrub.c Sun Nov 01 14:14:46 2009 -0800
@@ -40,6 +40,8 @@
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zil_impl.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
@@ -59,14 +61,6 @@
dsl_pool_scrub_clean_cb
};
-#define SET_BOOKMARK(zb, objset, object, level, blkid) \
-{ \
- (zb)->zb_objset = objset; \
- (zb)->zb_object = object; \
- (zb)->zb_level = level; \
- (zb)->zb_blkid = blkid; \
-}
-
/* ARGSUSED */
static void
dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
@@ -126,6 +120,7 @@
ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
dp->dp_scrub_restart = B_FALSE;
+ dp->dp_scrub_ditto = B_FALSE;
dp->dp_spa->spa_scrub_errors = 0;
VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
@@ -241,15 +236,13 @@
dsl_pool_scrub_cancel_sync, dp, &complete, 3));
}
-int
-dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
- zio_done_func_t *done, void *private, uint32_t arc_flags)
+void
+dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
{
/*
* This function will be used by bp-rewrite wad to intercept frees.
*/
- return (arc_free(pio, dp->dp_spa, txg, (blkptr_t *)bpp,
- done, private, arc_flags));
+ zio_free(dp->dp_spa, txg, bpp);
}
static boolean_t
@@ -267,14 +260,14 @@
uint64_t zb1nextL0, zb2thisobj;
ASSERT(zb1->zb_objset == zb2->zb_objset);
- ASSERT(zb1->zb_object != -1ULL);
+ ASSERT(zb1->zb_object != DMU_DEADLIST_OBJECT);
ASSERT(zb2->zb_level == 0);
/*
* A bookmark in the deadlist is considered to be after
* everything else.
*/
- if (zb2->zb_object == -1ULL)
+ if (zb2->zb_object == DMU_DEADLIST_OBJECT)
return (B_TRUE);
/* The objset_phys_t isn't before anything. */
@@ -287,7 +280,7 @@
zb2thisobj = zb2->zb_object ? zb2->zb_object :
zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
- if (zb1->zb_object == 0) {
+ if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
uint64_t nextobj = zb1nextL0 *
(dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
return (nextobj <= zb2thisobj);
@@ -297,7 +290,7 @@
return (B_TRUE);
if (zb1->zb_object > zb2thisobj)
return (B_FALSE);
- if (zb2->zb_object == 0)
+ if (zb2->zb_object == DMU_META_DNODE_OBJECT)
return (B_FALSE);
return (zb1nextL0 <= zb2->zb_blkid);
}
@@ -339,7 +332,7 @@
} zil_traverse_arg_t;
/* ARGSUSED */
-static void
+static int
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
zil_traverse_arg_t *zta = arg;
@@ -348,7 +341,7 @@
zbookmark_t zb;
if (bp->blk_birth <= dp->dp_scrub_min_txg)
- return;
+ return (0);
/*
* One block ("stubby") can be allocated a long time ago; we
@@ -357,17 +350,17 @@
* plain scrub there's nothing to do to it).
*/
if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
- return;
+ return (0);
- zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
+ return (0);
}
/* ARGSUSED */
-static void
+static int
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
if (lrc->lrc_txtype == TX_WRITE) {
@@ -379,7 +372,7 @@
zbookmark_t zb;
if (bp->blk_birth <= dp->dp_scrub_min_txg)
- return;
+ return (0);
/*
* birth can be < claim_txg if this record's txg is
@@ -387,14 +380,15 @@
* other records that are not synced)
*/
if (claim_txg == 0 || bp->blk_birth < claim_txg)
- return;
+ return (0);
- zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = lr->lr_foid;
- zb.zb_level = BP_GET_LEVEL(bp);
- zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
+ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ lr->lr_foid, ZB_ZIL_LEVEL,
+ lr->lr_offset / BP_GET_LSIZE(bp));
+
VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
}
+ return (0);
}
static void
@@ -522,12 +516,12 @@
traverse_zil(dp, &osp->os_zil_header);
scrub_visitdnode(dp, &osp->os_meta_dnode,
- buf, zb->zb_objset, 0);
+ buf, zb->zb_objset, DMU_META_DNODE_OBJECT);
if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
scrub_visitdnode(dp, &osp->os_userused_dnode,
- buf, zb->zb_objset, 0);
+ buf, zb->zb_objset, DMU_USERUSED_OBJECT);
scrub_visitdnode(dp, &osp->os_groupused_dnode,
- buf, zb->zb_objset, 0);
+ buf, zb->zb_objset, DMU_GROUPUSED_OBJECT);
}
}
@@ -556,7 +550,8 @@
{
zbookmark_t zb;
- SET_BOOKMARK(&zb, ds ? ds->ds_object : 0, 0, -1, 0);
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
scrub_visitbp(dp, NULL, NULL, bp, &zb);
}
@@ -569,7 +564,8 @@
return;
if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
- SET_BOOKMARK(&dp->dp_scrub_bookmark, -1, 0, 0, 0);
+ SET_BOOKMARK(&dp->dp_scrub_bookmark, ZB_DESTROYED_OBJSET,
+ 0, 0, 0);
} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
ds->ds_object, tx) != 0) {
return;
@@ -775,6 +771,36 @@
return (0);
}
+static void
+dsl_pool_scrub_ddt(dsl_pool_t *dp, enum zio_checksum c, enum ddt_type type,
+ enum ddt_class class)
+{
+ ddt_t *ddt = ddt_select_by_checksum(dp->dp_spa, c);
+ ddt_entry_t dde;
+ blkptr_t blk;
+ zbookmark_t zb = { 0 };
+ uint64_t walk = 0;
+ int error;
+
+ if (!ddt_object_exists(ddt, type, class))
+ return;
+
+ while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0) {
+ int p = DDT_PHYS_DITTO;
+ ddt_bp_create(ddt, &dde.dde_key, &dde.dde_phys[p], &blk);
+ scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb);
+ }
+ ASSERT(error == ENOENT);
+}
+
+static void
+dsl_pool_scrub_ditto(dsl_pool_t *dp)
+{
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++)
+ dsl_pool_scrub_ddt(dp, c, type, DDT_CLASS_DITTO);
+}
+
void
dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
{
@@ -814,7 +840,12 @@
dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
spa->spa_scrub_active = B_TRUE;
- if (dp->dp_scrub_bookmark.zb_objset == 0) {
+ if (!dp->dp_scrub_ditto) {
+ dsl_pool_scrub_ditto(dp);
+ dp->dp_scrub_ditto = B_TRUE;
+ }
+
+ if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) {
/* First do the MOS & ORIGIN */
scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp);
if (dp->dp_scrub_pausing)
@@ -827,12 +858,12 @@
scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
}
ASSERT(!dp->dp_scrub_pausing);
- } else if (dp->dp_scrub_bookmark.zb_objset != -1ULL) {
+ } else if (dp->dp_scrub_bookmark.zb_objset != ZB_DESTROYED_OBJSET) {
/*
- * If we were paused, continue from here. Note if the
- * ds we were paused on was deleted, the zb_objset will
- * be -1, so we will skip this and find a new objset
- * below.
+ * If we were paused, continue from here. Note that if the
+ * ds we were paused on was destroyed, the zb_objset will be
+ * ZB_DESTROYED_OBJSET, so we will skip this and find a new
+ * objset below.
*/
scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx);
if (dp->dp_scrub_pausing)
@@ -961,13 +992,13 @@
{
size_t size = BP_GET_PSIZE(bp);
spa_t *spa = dp->dp_spa;
+ uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
boolean_t needs_io;
int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
int zio_priority;
- ASSERT(bp->blk_birth > dp->dp_scrub_min_txg);
-
- if (bp->blk_birth >= dp->dp_scrub_max_txg)
+ if (phys_birth <= dp->dp_scrub_min_txg ||
+ phys_birth >= dp->dp_scrub_max_txg)
return (0);
count_block(dp->dp_blkstats, bp);
@@ -985,7 +1016,7 @@
}
/* If it's an intent log block, failure is expected. */
- if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+ if (zb->zb_level == ZB_ZIL_LEVEL)
zio_flags |= ZIO_FLAG_SPECULATIVE;
for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
@@ -1015,7 +1046,7 @@
needs_io = B_TRUE;
} else {
needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
- bp->blk_birth, 1);
+ phys_birth, 1);
}
}
}
--- a/usr/src/uts/common/fs/zfs/dsl_synctask.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/dsl_synctask.c Sun Nov 01 14:14:46 2009 -0800
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
@@ -118,8 +116,10 @@
txg_wait_synced(dstg->dstg_pool, txg);
- if (dstg->dstg_err == EAGAIN)
+ if (dstg->dstg_err == EAGAIN) {
+ txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE);
goto top;
+ }
return (dstg->dstg_err);
}
--- a/usr/src/uts/common/fs/zfs/lzjb.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/lzjb.c Sun Nov 01 14:14:46 2009 -0800
@@ -20,18 +20,18 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* We keep our own copy of this algorithm for 2 main reasons:
- * 1. If we didn't, anyone modifying common/os/compress.c would
+ * 1. If we didn't, anyone modifying common/os/compress.c would
* directly break our on disk format
- * 2. Our version of lzjb does not have a number of checks that the
+ * 2. Our version of lzjb does not have a number of checks that the
* common/os version needs and uses
+ * 3. We initialize the lempel to ensure deterministic results,
+ * so that identical blocks can always be deduplicated.
* In particular, we are adding the "feature" that compress() can
* take a destination buffer size and return -1 if the data will not
* compress to d_len or less.
@@ -43,7 +43,7 @@
#define MATCH_MIN 3
#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1))
#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
-#define LEMPEL_SIZE 256
+#define LEMPEL_SIZE 1024
/*ARGSUSED*/
size_t
@@ -53,20 +53,14 @@
uchar_t *dst = d_start;
uchar_t *cpy, *copymap;
int copymask = 1 << (NBBY - 1);
- int mlen, offset;
+ int mlen, offset, hash;
uint16_t *hp;
- uint16_t lempel[LEMPEL_SIZE]; /* uninitialized; see above */
+ uint16_t lempel[LEMPEL_SIZE] = { 0 };
while (src < (uchar_t *)s_start + s_len) {
if ((copymask <<= 1) == (1 << NBBY)) {
- if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) {
- if (d_len != s_len)
- return (s_len);
- mlen = s_len;
- for (src = s_start, dst = d_start; mlen; mlen--)
- *dst++ = *src++;
+ if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY)
return (s_len);
- }
copymask = 1;
copymap = dst;
*dst++ = 0;
@@ -75,8 +69,10 @@
*dst++ = *src++;
continue;
}
- hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) &
- (LEMPEL_SIZE - 1)];
+ hash = (src[0] << 16) + (src[1] << 8) + src[2];
+ hash += hash >> 9;
+ hash += hash >> 5;
+ hp = &lempel[hash & (LEMPEL_SIZE - 1)];
offset = (intptr_t)(src - *hp) & OFFSET_MASK;
*hp = (uint16_t)(uintptr_t)src;
cpy = src - offset;
--- a/usr/src/uts/common/fs/zfs/metaslab.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/metaslab.c Sun Nov 01 14:14:46 2009 -0800
@@ -24,7 +24,6 @@
*/
#include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
@@ -36,6 +35,11 @@
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
+ * Metaslab debugging: when set, keeps all space maps in core to verify frees.
+ */
+static int metaslab_debug = 0;
+
+/*
* Minimum size which forces the dynamic allocator to change
* it's allocation strategy. Once the space map cannot satisfy
* an allocation of this size then it switches to using more
@@ -153,6 +157,45 @@
return (0);
}
+void
+metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
+ int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
+{
+ atomic_add_64(&mc->mc_alloc, alloc_delta);
+ atomic_add_64(&mc->mc_deferred, defer_delta);
+ atomic_add_64(&mc->mc_space, space_delta);
+ atomic_add_64(&mc->mc_dspace, dspace_delta);
+
+ ASSERT((int64_t)mc->mc_alloc >= 0 &&
+ (int64_t)mc->mc_deferred >= 0 &&
+ (int64_t)mc->mc_space >= 0 &&
+ (int64_t)mc->mc_dspace >= 0);
+}
+
+uint64_t
+metaslab_class_get_alloc(metaslab_class_t *mc)
+{
+ return (mc->mc_alloc);
+}
+
+uint64_t
+metaslab_class_get_deferred(metaslab_class_t *mc)
+{
+ return (mc->mc_deferred);
+}
+
+uint64_t
+metaslab_class_get_space(metaslab_class_t *mc)
+{
+ return (mc->mc_space);
+}
+
+uint64_t
+metaslab_class_get_dspace(metaslab_class_t *mc)
+{
+ return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
+}
+
/*
* ==========================================================================
* Metaslab groups
@@ -493,6 +536,13 @@
metaslab_group_add(mg, msp);
+ if (metaslab_debug && smo->smo_object != 0) {
+ mutex_enter(&msp->ms_lock);
+ VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
+ SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
+ mutex_exit(&msp->ms_lock);
+ }
+
/*
* If we're opening an existing pool (txg == 0) or creating
* a new one (txg == TXG_INITIAL), all space is available now.
@@ -515,8 +565,8 @@
{
metaslab_group_t *mg = msp->ms_group;
- vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
- -msp->ms_smo.smo_alloc, 0, B_TRUE);
+ vdev_space_update(mg->mg_vd,
+ -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
metaslab_group_remove(mg, msp);
@@ -607,7 +657,7 @@
if (!sm->sm_loaded) {
int error = space_map_load(sm, sm_ops, SM_FREE,
&msp->ms_smo,
- msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
+ spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
if (error) {
metaslab_group_sort(msp->ms_group, msp, 0);
return (error);
@@ -641,7 +691,9 @@
* this metaslab again. In that case, it had better be empty,
* or we would be leaving space on the table.
*/
+#if 0
ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
+#endif
metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}
@@ -654,7 +706,7 @@
{
vdev_t *vd = msp->ms_group->mg_vd;
spa_t *spa = vd->vdev_spa;
- objset_t *mos = spa->spa_meta_objset;
+ objset_t *mos = spa_meta_objset(spa);
space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
@@ -780,14 +832,13 @@
space_map_create(&msp->ms_defermap[t], sm->sm_start,
sm->sm_size, sm->sm_shift, sm->sm_lock);
- vdev_space_update(vd, sm->sm_size, 0, 0, B_TRUE);
+ vdev_space_update(vd, 0, 0, sm->sm_size);
}
alloc_delta = smosync->smo_alloc - smo->smo_alloc;
defer_delta = freed_map->sm_space - defer_map->sm_space;
- vdev_space_update(vd, 0, alloc_delta + defer_delta,
- defer_delta, B_TRUE);
+ vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
@@ -827,7 +878,7 @@
if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
evictable = 0;
- if (evictable)
+ if (evictable && !metaslab_debug)
space_map_unload(sm);
}
@@ -973,7 +1024,7 @@
/*
* Start at the rotor and loop through all mgs until we find something.
- * Note that there's no locking on mc_rotor or mc_allocated because
+ * Note that there's no locking on mc_rotor or mc_aliquot because
* nothing actually breaks if we miss a few updates -- we just won't
* allocate quite as evenly. It all balances out over time.
*
@@ -1071,32 +1122,28 @@
* over- or under-used relative to the pool,
* and set an allocation bias to even it out.
*/
- if (mc->mc_allocated == 0) {
+ if (mc->mc_aliquot == 0) {
vdev_stat_t *vs = &vd->vdev_stat;
- uint64_t alloc, space;
- int64_t vu, su;
-
- alloc = spa_get_alloc(spa);
- space = spa_get_space(spa);
+ int64_t vu, cu;
/*
* Determine percent used in units of 0..1024.
* (This is just to avoid floating point.)
*/
vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
- su = (alloc << 10) / (space + 1);
+ cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
/*
* Bias by at most +/- 25% of the aliquot.
*/
- mg->mg_bias = ((su - vu) *
+ mg->mg_bias = ((cu - vu) *
(int64_t)mg->mg_aliquot) / (1024 * 4);
}
- if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
+ if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
mg->mg_aliquot + mg->mg_bias) {
mc->mc_rotor = mg->mg_next;
- mc->mc_allocated = 0;
+ mc->mc_aliquot = 0;
}
DVA_SET_VDEV(&dva[d], vd->vdev_id);
@@ -1108,7 +1155,7 @@
}
next:
mc->mc_rotor = mg->mg_next;
- mc->mc_allocated = 0;
+ mc->mc_aliquot = 0;
} while ((mg = mg->mg_next) != rotor);
if (!all_zero) {
@@ -1188,7 +1235,7 @@
uint64_t size = DVA_GET_ASIZE(dva);
vdev_t *vd;
metaslab_t *msp;
- int error;
+ int error = 0;
ASSERT(DVA_IS_VALID(dva));
@@ -1203,7 +1250,12 @@
mutex_enter(&msp->ms_lock);
- error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+ if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
+ error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+
+ if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
+ error = ENOENT;
+
if (error || txg == 0) { /* txg == 0 indicates dry run */
mutex_exit(&msp->ms_lock);
return (error);
@@ -1231,6 +1283,7 @@
int error = 0;
ASSERT(bp->blk_birth == 0);
+ ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
@@ -1260,7 +1313,7 @@
spa_config_exit(spa, SCL_ALLOC, FTAG);
- bp->blk_birth = txg;
+ BP_SET_BIRTH(bp, txg, txg);
return (0);
}
@@ -1272,7 +1325,7 @@
int ndvas = BP_GET_NDVAS(bp);
ASSERT(!BP_IS_HOLE(bp));
- ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg);
+ ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
--- a/usr/src/uts/common/fs/zfs/sha256.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sha256.c Sun Nov 01 14:14:46 2009 -0800
@@ -19,15 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/zio.h>
-#include <sys/zio_checksum.h>
/*
* SHA-256 checksum, as specified in FIPS 180-3, available at:
--- a/usr/src/uts/common/fs/zfs/spa.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/spa.c Sun Nov 01 14:14:46 2009 -0800
@@ -35,11 +35,11 @@
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
-#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
+#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
@@ -154,8 +154,8 @@
ASSERT(MUTEX_HELD(&spa->spa_props_lock));
if (spa->spa_root_vdev != NULL) {
- size = spa_get_space(spa);
- used = spa_get_alloc(spa);
+ used = metaslab_class_get_alloc(spa_normal_class(spa));
+ size = metaslab_class_get_space(spa_normal_class(spa));
spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
@@ -165,6 +165,9 @@
cap = (size == 0) ? 0 : (used * 100 / size);
spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
+ ddt_get_pool_dedup_ratio(spa), src);
+
spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
spa->spa_root_vdev->vdev_state, src);
@@ -199,9 +202,9 @@
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
+ objset_t *mos = spa->spa_meta_objset;
zap_cursor_t zc;
zap_attribute_t za;
- objset_t *mos = spa->spa_meta_objset;
int err;
VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
@@ -436,6 +439,16 @@
strcmp(slash, "/..") == 0)
error = EINVAL;
break;
+
+ case ZPOOL_PROP_DEDUPDITTO:
+ if (spa_version(spa) < SPA_VERSION_DEDUP)
+ error = ENOTSUP;
+ else
+ error = nvpair_value_uint64(elem, &intval);
+ if (error == 0 &&
+ intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
+ error = EINVAL;
+ break;
}
if (error)
@@ -771,6 +784,8 @@
spa->spa_dsl_pool = NULL;
}
+ ddt_unload(spa);
+
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
/*
@@ -1145,12 +1160,24 @@
static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
- int i;
-
- for (i = 0; i < sav->sav_count; i++)
+ for (int i = 0; i < sav->sav_count; i++)
spa_check_removed(sav->sav_vdevs[i]);
}
+void
+spa_claim_notify(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+
+ if (zio->io_error)
+ return;
+
+ mutex_enter(&spa->spa_props_lock); /* any mutex will do */
+ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
+ spa->spa_claim_max_txg = zio->io_bp->blk_birth;
+ mutex_exit(&spa->spa_props_lock);
+}
+
typedef struct spa_load_error {
uint64_t sle_metadata_count;
uint64_t sle_data_count;
@@ -1176,8 +1203,8 @@
/*ARGSUSED*/
static int
-spa_load_verify_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
- const dnode_phys_t *dnp, void *arg)
+spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
if (bp != NULL) {
zio_t *rio = arg;
@@ -1380,6 +1407,8 @@
TXG_INITIAL : spa_last_synced_txg(spa) - TXG_DEFER_SIZE;
spa->spa_first_txg = spa->spa_last_ubsync_txg ?
spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
+ spa->spa_claim_max_txg = spa->spa_first_txg;
+
error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
if (error) {
vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
@@ -1448,7 +1477,7 @@
if (zap_lookup(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
- sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
+ sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj) != 0) {
vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
error = EIO;
@@ -1562,20 +1591,6 @@
spa_config_exit(spa, SCL_ALL, FTAG);
}
- VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
- &nvroot) == 0);
- spa_load_log_state(spa, nvroot);
- nvlist_free(nvconfig);
-
- if (spa_check_logs(spa)) {
- vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_BAD_LOG);
- error = ENXIO;
- ereport = FM_EREPORT_ZFS_LOG_REPLAY;
- goto out;
- }
-
-
spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
@@ -1610,6 +1625,10 @@
spa->spa_pool_props_object,
zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND),
sizeof (uint64_t), 1, &spa->spa_autoexpand);
+ (void) zap_lookup(spa->spa_meta_objset,
+ spa->spa_pool_props_object,
+ zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO),
+ sizeof (uint64_t), 1, &spa->spa_dedup_ditto);
}
/*
@@ -1653,6 +1672,17 @@
goto out;
}
+ /*
+ * Load the DDTs (dedup tables).
+ */
+ error = ddt_load(spa);
+ if (error != 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
if (state != SPA_LOAD_TRYIMPORT) {
error = spa_load_verify(spa);
if (error) {
@@ -1662,6 +1692,22 @@
}
}
+ /*
+ * Load the intent log state and check log integrity.
+ */
+ VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ spa_load_log_state(spa, nvroot);
+ nvlist_free(nvconfig);
+
+ if (spa_check_logs(spa)) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LOG);
+ error = ENXIO;
+ ereport = FM_EREPORT_ZFS_LOG_REPLAY;
+ goto out;
+ }
+
if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
spa->spa_load_max_txg == UINT64_MAX)) {
dmu_tx_t *tx;
@@ -1672,22 +1718,32 @@
/*
* Claim log blocks that haven't been committed yet.
* This must all happen in a single txg.
+ * Note: spa_claim_max_txg is updated by spa_claim_notify(),
+ * invoked from zil_claim_log_block()'s i/o done callback.
* Price of rollback is that we abandon the log.
*/
+ spa->spa_claiming = B_TRUE;
+
tx = dmu_tx_create_assigned(spa_get_dsl(spa),
spa_first_txg(spa));
(void) dmu_objset_find(spa_name(spa),
zil_claim, tx, DS_FIND_CHILDREN);
dmu_tx_commit(tx);
+ spa->spa_claiming = B_FALSE;
+
spa->spa_log_state = SPA_LOG_GOOD;
spa->spa_sync_on = B_TRUE;
txg_sync_start(spa->spa_dsl_pool);
/*
- * Wait for all claims to sync.
+ * Wait for all claims to sync. We sync up to the highest
+ * claimed log block birth time so that claimed log blocks
+ * don't appear to be from the future. spa_claim_max_txg
+ * will have been set for us by either zil_check_log_chain()
+ * (invoked from spa_check_logs()) or zil_claim() above.
*/
- txg_wait_synced(spa->spa_dsl_pool, 0);
+ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
/*
* If the config cache is stale, or we have uninitialized
@@ -2350,8 +2406,6 @@
spa = spa_add(pool, NULL, altroot);
spa_activate(spa, spa_mode_global);
- spa->spa_uberblock.ub_txg = txg - 1;
-
if (props && (error = spa_prop_validate(spa, props))) {
spa_deactivate(spa);
spa_remove(spa);
@@ -2363,6 +2417,9 @@
&version) != 0)
version = SPA_VERSION;
ASSERT(version <= SPA_VERSION);
+
+ spa->spa_first_txg = txg;
+ spa->spa_uberblock.ub_txg = txg - 1;
spa->spa_uberblock.ub_version = version;
spa->spa_ubsync = spa->spa_uberblock;
@@ -2468,14 +2525,14 @@
* because sync-to-convergence takes longer if the blocksize
* keeps changing.
*/
- spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
+ spa->spa_deferred_bplist_obj = bplist_create(spa->spa_meta_objset,
1 << 14, tx);
- dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
- ZIO_COMPRESS_OFF, tx);
+ dmu_object_set_compress(spa->spa_meta_objset,
+ spa->spa_deferred_bplist_obj, ZIO_COMPRESS_OFF, tx);
if (zap_add(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
- sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
+ sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj, tx) != 0) {
cmn_err(CE_PANIC, "failed to add bplist");
}
@@ -2492,11 +2549,17 @@
spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
+
if (props != NULL) {
spa_configfile_set(spa, props, B_FALSE);
spa_sync_props(spa, props, CRED(), tx);
}
+ /*
+ * Create DDTs (dedup tables).
+ */
+ ddt_create(spa);
+
dmu_tx_commit(tx);
spa->spa_sync_on = B_TRUE;
@@ -3732,6 +3795,7 @@
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+ ASSERT(vd == vd->vdev_top);
/*
* Remove our vdev from the allocatable vdevs
@@ -3751,6 +3815,7 @@
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+ ASSERT(vd == vd->vdev_top);
/*
* Evacuate the device. We don't hold the config lock as writer
@@ -3799,8 +3864,15 @@
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+ ASSERT(vd == vd->vdev_top);
(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+
+ if (list_link_active(&vd->vdev_state_dirty_node))
+ vdev_state_clean(vd);
+ if (list_link_active(&vd->vdev_config_dirty_node))
+ vdev_config_clean(vd);
+
vdev_free(vd);
/*
@@ -3873,6 +3945,7 @@
spa->spa_l2cache.sav_sync = B_TRUE;
} else if (vd != NULL && vd->vdev_islog) {
ASSERT(!locked);
+ ASSERT(vd == vd->vdev_top);
/*
* XXX - Once we have bp-rewrite this should
@@ -3887,7 +3960,13 @@
*/
spa_vdev_remove_start(spa, vd);
- spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+ /*
+ * Wait for the youngest allocations and frees to sync,
+ * and then wait for the deferral of those frees to finish.
+ */
+ spa_vdev_config_exit(spa, NULL,
+ txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+
if ((error = spa_vdev_remove_evacuate(spa, vd)) != 0)
return (error);
txg = spa_vdev_config_enter(spa);
@@ -4165,24 +4244,23 @@
* See if the config needs to be updated.
*/
if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
- uint64_t oldsz, space_update;
+ uint64_t old_space, new_space;
mutex_enter(&spa_namespace_lock);
- oldsz = spa_get_space(spa);
+ old_space = metaslab_class_get_space(spa_normal_class(spa));
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
- space_update = spa_get_space(spa) - oldsz;
+ new_space = metaslab_class_get_space(spa_normal_class(spa));
mutex_exit(&spa_namespace_lock);
/*
* If the pool grew as a result of the config update,
* then log an internal history event.
*/
- if (space_update) {
+ if (new_space != old_space) {
spa_history_internal_log(LOG_POOL_VDEV_ONLINE,
spa, NULL, CRED(),
"pool '%s' size: %llu(+%llu)",
- spa_name(spa), spa_get_space(spa),
- space_update);
+ spa_name(spa), new_space, new_space - old_space);
}
}
@@ -4280,38 +4358,34 @@
* SPA syncing routines
* ==========================================================================
*/
-
static void
-spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
+spa_sync_deferred_bplist(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx, uint64_t txg)
{
- bplist_t *bpl = &spa->spa_sync_bplist;
- dmu_tx_t *tx;
blkptr_t blk;
uint64_t itor = 0;
- zio_t *zio;
- int error;
uint8_t c = 1;
- zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-
while (bplist_iterate(bpl, &itor, &blk) == 0) {
ASSERT(blk.blk_birth < txg);
- zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL,
- ZIO_FLAG_MUSTSUCCEED));
+ zio_free(spa, txg, &blk);
}
- error = zio_wait(zio);
- ASSERT3U(error, ==, 0);
-
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
bplist_vacate(bpl, tx);
/*
* Pre-dirty the first block so we sync to convergence faster.
* (Usually only the first block is needed.)
*/
- dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
- dmu_tx_commit(tx);
+ dmu_write(bpl->bpl_mos, spa->spa_deferred_bplist_obj, 0, 1, &c, tx);
+}
+
+static void
+spa_sync_free(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ zio_t *zio = arg;
+
+ zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
+ zio->io_flags));
}
static void
@@ -4469,8 +4543,6 @@
* Set pool property values in the poolprops mos object.
*/
if (spa->spa_pool_props_object == 0) {
- objset_t *mos = spa->spa_meta_objset;
-
VERIFY((spa->spa_pool_props_object =
zap_create(mos, DMU_OT_POOL_PROPS,
DMU_OT_NONE, 0, tx)) > 0);
@@ -4521,6 +4593,9 @@
spa->spa_autoexpand = intval;
spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
break;
+ case ZPOOL_PROP_DEDUPDITTO:
+ spa->spa_dedup_ditto = intval;
+ break;
default:
break;
}
@@ -4547,11 +4622,11 @@
{
dsl_pool_t *dp = spa->spa_dsl_pool;
objset_t *mos = spa->spa_meta_objset;
- bplist_t *bpl = &spa->spa_sync_bplist;
+ bplist_t *defer_bpl = &spa->spa_deferred_bplist;
+ bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd;
dmu_tx_t *tx;
- int dirty_vdevs;
int error;
/*
@@ -4586,7 +4661,7 @@
}
spa_config_exit(spa, SCL_STATE, FTAG);
- VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
+ VERIFY(0 == bplist_open(defer_bpl, mos, spa->spa_deferred_bplist_obj));
tx = dmu_tx_create_assigned(dp, txg);
@@ -4632,13 +4707,13 @@
if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
!txg_list_empty(&dp->dp_dirty_dirs, txg) ||
!txg_list_empty(&dp->dp_sync_tasks, txg))
- spa_sync_deferred_frees(spa, txg);
+ spa_sync_deferred_bplist(spa, defer_bpl, tx, txg);
/*
* Iterate to convergence.
*/
do {
- spa->spa_sync_pass++;
+ int pass = ++spa->spa_sync_pass;
spa_sync_config_object(spa, tx);
spa_sync_aux_dev(spa, &spa->spa_spares, tx,
@@ -4648,18 +4723,24 @@
spa_errlog_sync(spa, txg);
dsl_pool_sync(dp, txg);
- dirty_vdevs = 0;
- while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
- vdev_sync(vd, txg);
- dirty_vdevs++;
+ if (pass <= SYNC_PASS_DEFERRED_FREE) {
+ zio_t *zio = zio_root(spa, NULL, NULL, 0);
+ bplist_sync(free_bpl, spa_sync_free, zio, tx);
+ VERIFY(zio_wait(zio) == 0);
+ } else {
+ bplist_sync(free_bpl, bplist_enqueue_cb, defer_bpl, tx);
}
- bplist_sync(bpl, tx);
- } while (dirty_vdevs);
-
- bplist_close(bpl);
-
- dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
+ ddt_sync(spa, txg);
+
+ while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
+ vdev_sync(vd, txg);
+
+ } while (dmu_objset_is_dirty(mos, txg));
+
+ ASSERT(free_bpl->bpl_queue == NULL);
+
+ bplist_close(defer_bpl);
/*
* Rewrite the vdev configuration (which includes the uberblock)
@@ -4730,10 +4811,7 @@
spa->spa_ubsync = spa->spa_uberblock;
- /*
- * Clean up the ZIL records for the synced txg.
- */
- dsl_pool_zil_clean(dp);
+ dsl_pool_sync_done(dp, txg);
/*
* Update usable space statistics.
@@ -4748,7 +4826,10 @@
ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
- ASSERT(bpl->bpl_queue == NULL);
+ ASSERT(defer_bpl->bpl_queue == NULL);
+ ASSERT(free_bpl->bpl_queue == NULL);
+
+ spa->spa_sync_pass = 0;
spa_config_exit(spa, SCL_CONFIG, FTAG);
--- a/usr/src/uts/common/fs/zfs/spa_history.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/spa_history.c Sun Nov 01 14:14:46 2009 -0800
@@ -103,7 +103,8 @@
* Figure out maximum size of history log. We set it at
* 1% of pool size, with a max of 32MB and min of 128KB.
*/
- shpp->sh_phys_max_off = spa_get_dspace(spa) / 100;
+ shpp->sh_phys_max_off =
+ metaslab_class_get_dspace(spa_normal_class(spa)) / 100;
shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20);
shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
--- a/usr/src/uts/common/fs/zfs/spa_misc.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c Sun Nov 01 14:14:46 2009 -0800
@@ -186,7 +186,7 @@
*
* SCL_VDEV
* Held as reader to prevent changes to the vdev tree during trivial
- * inquiries such as bp_get_dasize(). SCL_VDEV is distinct from the
+ * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the
* other locks, and lower than all of them, to ensure that it's safe
* to acquire regardless of caller context.
*
@@ -433,7 +433,6 @@
mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -441,6 +440,10 @@
cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
+ for (int t = 0; t < TXG_SIZE; t++)
+ bplist_init(&spa->spa_free_bplist[t]);
+ bplist_init(&spa->spa_deferred_bplist);
+
(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
spa->spa_state = POOL_STATE_UNINITIALIZED;
spa->spa_freeze_txg = UINT64_MAX;
@@ -514,6 +517,10 @@
spa_config_lock_destroy(spa);
+ for (int t = 0; t < TXG_SIZE; t++)
+ bplist_fini(&spa->spa_free_bplist[t]);
+ bplist_fini(&spa->spa_deferred_bplist);
+
cv_destroy(&spa->spa_async_cv);
cv_destroy(&spa->spa_scrub_io_cv);
cv_destroy(&spa->spa_suspend_cv);
@@ -522,7 +529,6 @@
mutex_destroy(&spa->spa_scrub_lock);
mutex_destroy(&spa->spa_errlog_lock);
mutex_destroy(&spa->spa_errlist_lock);
- mutex_destroy(&spa->spa_sync_bplist.bpl_lock);
mutex_destroy(&spa->spa_history_lock);
mutex_destroy(&spa->spa_props_lock);
mutex_destroy(&spa->spa_suspend_lock);
@@ -819,12 +825,6 @@
mutex_exit(&spa_l2cache_lock);
}
-void
-spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc)
-{
- vdev_space_update(vd, space, alloc, 0, B_FALSE);
-}
-
/*
* ==========================================================================
* SPA vdev locking
@@ -890,8 +890,8 @@
/*
* Verify the metaslab classes.
*/
- ASSERT(metaslab_class_validate(spa->spa_normal_class) == 0);
- ASSERT(metaslab_class_validate(spa->spa_log_class) == 0);
+ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
spa_config_exit(spa, SCL_ALL, spa);
@@ -955,6 +955,10 @@
int
spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
{
+ if (vd != NULL || error == 0)
+ vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
+ 0, 0, B_FALSE);
+
if (vd != NULL) {
vdev_state_dirty(vd->vdev_top);
spa->spa_config_generation++;
@@ -1105,50 +1109,13 @@
}
void
-sprintf_blkptr(char *buf, int len, const blkptr_t *bp)
+sprintf_blkptr(char *buf, const blkptr_t *bp)
{
- int d;
-
- if (bp == NULL) {
- (void) snprintf(buf, len, "<NULL>");
- return;
- }
-
- if (BP_IS_HOLE(bp)) {
- (void) snprintf(buf, len, "<hole>");
- return;
- }
-
- (void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ",
- (u_longlong_t)BP_GET_LEVEL(bp),
- BP_GET_TYPE(bp) < DMU_OT_NUMTYPES ?
- dmu_ot[BP_GET_TYPE(bp)].ot_name : "UNKNOWN",
- (u_longlong_t)BP_GET_LSIZE(bp),
- (u_longlong_t)BP_GET_PSIZE(bp));
+ char *type = dmu_ot[BP_GET_TYPE(bp)].ot_name;
+ char *checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+ char *compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
- for (d = 0; d < BP_GET_NDVAS(bp); d++) {
- const dva_t *dva = &bp->blk_dva[d];
- (void) snprintf(buf + strlen(buf), len - strlen(buf),
- "DVA[%d]=<%llu:%llx:%llx> ", d,
- (u_longlong_t)DVA_GET_VDEV(dva),
- (u_longlong_t)DVA_GET_OFFSET(dva),
- (u_longlong_t)DVA_GET_ASIZE(dva));
- }
-
- (void) snprintf(buf + strlen(buf), len - strlen(buf),
- "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
- BP_GET_CHECKSUM(bp) < ZIO_CHECKSUM_FUNCTIONS ?
- zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name : "UNKNOWN",
- BP_GET_COMPRESS(bp) < ZIO_COMPRESS_FUNCTIONS ?
- zio_compress_table[BP_GET_COMPRESS(bp)].ci_name : "UNKNOWN",
- BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
- BP_IS_GANG(bp) ? "gang" : "contiguous",
- (u_longlong_t)bp->blk_birth,
- (u_longlong_t)bp->blk_fill,
- (u_longlong_t)bp->blk_cksum.zc_word[0],
- (u_longlong_t)bp->blk_cksum.zc_word[1],
- (u_longlong_t)bp->blk_cksum.zc_word[2],
- (u_longlong_t)bp->blk_cksum.zc_word[3]);
+ SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress);
}
void
@@ -1254,6 +1221,12 @@
return (spa->spa_first_txg);
}
+uint64_t
+spa_syncing_txg(spa_t *spa)
+{
+ return (spa->spa_syncing_txg);
+}
+
pool_state_t
spa_state(spa_t *spa)
{
@@ -1266,56 +1239,18 @@
return (spa->spa_freeze_txg);
}
-/*
- * Return how much space is allocated in the pool (ie. sum of all asize)
- */
-uint64_t
-spa_get_alloc(spa_t *spa)
-{
- return (spa->spa_root_vdev->vdev_stat.vs_alloc);
-}
-
-/*
- * Return how much (raid-z inflated) space there is in the pool.
- */
-uint64_t
-spa_get_space(spa_t *spa)
-{
- return (spa->spa_root_vdev->vdev_stat.vs_space);
-}
-
-/*
- * Return the amount of raid-z-deflated space in the pool.
- */
-uint64_t
-spa_get_dspace(spa_t *spa)
-{
- if (spa->spa_deflate)
- return (spa->spa_root_vdev->vdev_stat.vs_dspace);
- else
- return (spa->spa_root_vdev->vdev_stat.vs_space);
-}
-
-/*
- * Return the amount of space deferred from freeing (in in-core maps only)
- */
-uint64_t
-spa_get_defers(spa_t *spa)
-{
- return (spa->spa_root_vdev->vdev_stat.vs_defer);
-}
-
/* ARGSUSED */
uint64_t
spa_get_asize(spa_t *spa, uint64_t lsize)
{
/*
- * For now, the worst case is 512-byte RAID-Z blocks, in which
- * case the space requirement is exactly 2x; so just assume that.
- * Add to this the fact that we can have up to 3 DVAs per bp, and
- * we have to multiply by a total of 6x.
+ * The worst case is single-sector max-parity RAID-Z blocks, in which
+ * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
+ * times the size; so just assume that. Add to this the fact that
+ * we can have up to 3 DVAs per bp, and one more factor of 2 because
+ * the block may be dittoed with up to 3 DVAs by ddt_sync().
*/
- return (lsize * 6);
+ return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2);
}
/*
@@ -1340,6 +1275,24 @@
return (spa->spa_ubsync.ub_version);
}
+boolean_t
+spa_deflate(spa_t *spa)
+{
+ return (spa->spa_deflate);
+}
+
+metaslab_class_t *
+spa_normal_class(spa_t *spa)
+{
+ return (spa->spa_normal_class);
+}
+
+metaslab_class_t *
+spa_log_class(spa_t *spa)
+{
+ return (spa->spa_log_class);
+}
+
int
spa_max_replication(spa_t *spa)
{
@@ -1354,23 +1307,45 @@
}
uint64_t
-bp_get_dasize(spa_t *spa, const blkptr_t *bp)
+dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
{
- int sz = 0, i;
+ uint64_t asize = DVA_GET_ASIZE(dva);
+ uint64_t dsize = asize;
+ vdev_t *vd;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+ /* Deflate only if the DVA's top-level vdev exists; else keep asize. */
+ if (asize != 0 && spa->spa_deflate &&
+ (vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva))) != NULL)
+ dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
+ return (dsize);
+}
- if (!spa->spa_deflate)
- return (BP_GET_ASIZE(bp));
+uint64_t
+bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
+{
+ uint64_t dsize = 0;
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
+ return (dsize);
+}
+
+uint64_t
+bp_get_dsize(spa_t *spa, const blkptr_t *bp)
+{
+ uint64_t dsize = 0;
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- for (i = 0; i < SPA_DVAS_PER_BP; i++) {
- vdev_t *vd =
- vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
- if (vd)
- sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >>
- SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
- }
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
spa_config_exit(spa, SCL_VDEV, FTAG);
- return (sz);
+
+ return (dsize);
}
/*
@@ -1472,9 +1447,18 @@
return (spa->spa_log_class->mc_rotor != NULL);
}
-/*
- * Return whether this pool is the root pool.
- */
+spa_log_state_t
+spa_get_log_state(spa_t *spa)
+{
+ return (spa->spa_log_state);
+}
+
+void
+spa_set_log_state(spa_t *spa, spa_log_state_t state)
+{
+ spa->spa_log_state = state;
+}
+
boolean_t
spa_is_root(spa_t *spa)
{
@@ -1492,3 +1476,27 @@
{
return (spa->spa_mode);
}
+
+uint64_t
+spa_bootfs(spa_t *spa)
+{
+ return (spa->spa_bootfs);
+}
+
+uint64_t
+spa_delegation(spa_t *spa)
+{
+ return (spa->spa_delegation);
+}
+
+objset_t *
+spa_meta_objset(spa_t *spa)
+{
+ return (spa->spa_meta_objset);
+}
+
+enum zio_checksum
+spa_dedup_checksum(spa_t *spa)
+{
+ return (spa->spa_dedup_checksum);
+}
--- a/usr/src/uts/common/fs/zfs/space_map.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/space_map.c Sun Nov 01 14:14:46 2009 -0800
@@ -258,8 +258,10 @@
{
ASSERT(MUTEX_HELD(sm->sm_lock));
- while (sm->sm_loading)
+ while (sm->sm_loading) {
+ ASSERT(!sm->sm_loaded);
cv_wait(&sm->sm_load_cv, sm->sm_lock);
+ }
}
/*
--- a/usr/src/uts/common/fs/zfs/sys/arc.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h Sun Nov 01 14:14:46 2009 -0800
@@ -99,27 +99,17 @@
int arc_referenced(arc_buf_t *buf);
#endif
-typedef struct writeprops {
- dmu_object_type_t wp_type;
- uint8_t wp_level;
- uint8_t wp_copies;
- uint8_t wp_dncompress, wp_oscompress;
- uint8_t wp_dnchecksum, wp_oschecksum;
-} writeprops_t;
-
-void write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp);
-int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
arc_done_func_t *done, void *private, int priority, int zio_flags,
uint32_t *arc_flags, const zbookmark_t *zb);
-int arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+int arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_done_func_t *done, void *private, int priority, int flags,
uint32_t *arc_flags, const zbookmark_t *zb);
-zio_t *arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
- boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
- arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
- int zio_flags, const zbookmark_t *zb);
-int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, uint32_t arc_flags);
+zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+ blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+ arc_done_func_t *ready, arc_done_func_t *done, void *private,
+ int priority, int zio_flags, const zbookmark_t *zb);
+void arc_free(spa_t *spa, const blkptr_t *bp);
void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
int arc_buf_evict(arc_buf_t *buf);
--- a/usr/src/uts/common/fs/zfs/sys/bplist.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/bplist.h Sun Nov 01 14:14:46 2009 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -29,6 +29,7 @@
#include <sys/dmu.h>
#include <sys/spa.h>
#include <sys/txg.h>
+#include <sys/zio.h>
#include <sys/zfs_context.h>
#ifdef __cplusplus
@@ -67,6 +68,10 @@
dmu_buf_t *bpl_cached_dbuf;
} bplist_t;
+typedef void bplist_sync_cb_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+extern void bplist_init(bplist_t *bpl);
+extern void bplist_fini(bplist_t *bpl);
extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
@@ -74,13 +79,15 @@
extern boolean_t bplist_empty(bplist_t *bpl);
extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx);
+extern void bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx);
extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp);
-extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx);
+extern void bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func,
+ void *arg, dmu_tx_t *tx);
extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
extern int bplist_space(bplist_t *bpl,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
extern int bplist_space_birthrange(bplist_t *bpl,
- uint64_t mintxg, uint64_t maxtxg, uint64_t *dasizep);
+ uint64_t mintxg, uint64_t maxtxg, uint64_t *dsizep);
#ifdef __cplusplus
}
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h Sun Nov 01 14:14:46 2009 -0800
@@ -133,6 +133,7 @@
arc_buf_t *dr_data;
blkptr_t dr_overridden_by;
override_states_t dr_override_state;
+ uint8_t dr_copies;
} dl;
} dt;
} dbuf_dirty_record_t;
@@ -254,6 +255,7 @@
uint64_t dbuf_refcount(dmu_buf_impl_t *db);
void dbuf_rele(dmu_buf_impl_t *db, void *tag);
+void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
@@ -323,7 +325,7 @@
#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
- sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \
+ sprintf_blkptr(__blkbuf, bp); \
dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
kmem_free(__blkbuf, BP_SPRINTF_LEN); \
} \
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/ddt.h Sun Nov 01 14:14:46 2009 -0800
@@ -0,0 +1,239 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DDT_H
+#define _SYS_DDT_H
+
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * On-disk DDT formats, in the desired search order (newest version first).
+ */
+enum ddt_type {
+ DDT_TYPE_ZAP = 0,
+ DDT_TYPES
+};
+
+/*
+ * DDT classes, in the desired search order (highest replication level first).
+ */
+enum ddt_class {
+ DDT_CLASS_DITTO = 0,
+ DDT_CLASS_DUPLICATE,
+ DDT_CLASS_UNIQUE,
+ DDT_CLASSES
+};
+
+#define DDT_TYPE_CURRENT 0
+
+#define DDT_COMPRESS_BYTEORDER_MASK 0x80
+#define DDT_COMPRESS_FUNCTION_MASK 0x7f
+
+/*
+ * DDT statistics.
+ */
+typedef struct ddt_stat {
+ uint64_t dds_blocks; /* blocks */
+ uint64_t dds_lsize; /* logical size */
+ uint64_t dds_psize; /* physical size */
+ uint64_t dds_dsize; /* deflated allocated size */
+ uint64_t dds_ref_blocks; /* referenced blocks */
+ uint64_t dds_ref_lsize; /* referenced lsize * refcnt */
+ uint64_t dds_ref_psize; /* referenced psize * refcnt */
+ uint64_t dds_ref_dsize; /* referenced dsize * refcnt */
+} ddt_stat_t;
+
+typedef struct ddt_histogram {
+ ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */
+} ddt_histogram_t;
+
+/*
+ * On-disk ddt entry: key (name) and physical storage (value).
+ */
+typedef struct ddt_key {
+ zio_cksum_t ddk_cksum; /* 256-bit block checksum */
+ uint64_t ddk_prop; /* LSIZE, PSIZE, compression */
+} ddt_key_t;
+
+/*
+ * ddk_prop layout:
+ *
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * | 0 | 0 | 0 | comp | PSIZE | LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ */
+#define DDK_GET_LSIZE(ddk) \
+ BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
+#define DDK_SET_LSIZE(ddk, x) \
+ BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define DDK_GET_PSIZE(ddk) \
+ BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
+#define DDK_SET_PSIZE(ddk, x) \
+ BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 8)
+#define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 8, x)
+
+#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t))
+
+typedef struct ddt_phys {
+ dva_t ddp_dva[SPA_DVAS_PER_BP];
+ uint64_t ddp_refcnt;
+ uint64_t ddp_phys_birth;
+} ddt_phys_t;
+
+typedef enum ddt_phys_type {
+ DDT_PHYS_DITTO = 0,
+ DDT_PHYS_SINGLE = 1,
+ DDT_PHYS_DOUBLE = 2,
+ DDT_PHYS_TRIPLE = 3,
+ DDT_PHYS_TYPES /* number of phys types; sizes dde_phys[] */
+} ddt_phys_type_t;
+
+/*
+ * In-core ddt entry
+ */
+struct ddt_entry {
+ ddt_key_t dde_key;
+ ddt_phys_t dde_phys[DDT_PHYS_TYPES];
+ zio_t *dde_lead_zio[DDT_PHYS_TYPES];
+ void *dde_repair_data;
+ enum ddt_type dde_type;
+ enum ddt_class dde_class;
+ uint8_t dde_loading;
+ uint8_t dde_loaded;
+ kcondvar_t dde_cv;
+ avl_node_t dde_node;
+};
+
+/*
+ * In-core ddt
+ */
+struct ddt {
+ kmutex_t ddt_lock;
+ avl_tree_t ddt_tree;
+ avl_tree_t ddt_repair_tree;
+ enum zio_checksum ddt_checksum;
+ spa_t *ddt_spa;
+ objset_t *ddt_os;
+ uint64_t ddt_stat_object;
+ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];
+ ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES];
+ avl_node_t ddt_node;
+};
+
+typedef struct ddt_ops {
+ char ddt_op_name[32];
+ int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx,
+ boolean_t prehash);
+ int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
+ int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde);
+ int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+ dmu_tx_t *tx);
+ int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+ dmu_tx_t *tx);
+ int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+ uint64_t *walk);
+ uint64_t (*ddt_op_count)(objset_t *os, uint64_t object);
+} ddt_ops_t;
+
+#define DDT_NAMELEN 80
+
+extern void ddt_object_name(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class, char *name);
+extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class, ddt_entry_t *dde, uint64_t *walk);
+extern uint64_t ddt_object_count(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class);
+extern int ddt_object_info(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class, dmu_object_info_t *);
+extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type,
+ enum ddt_class class);
+
+extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
+ uint64_t txg);
+extern void ddt_bp_create(const ddt_t *ddt, const ddt_key_t *ddk,
+ const ddt_phys_t *ddp, blkptr_t *bp);
+
+extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
+
+extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
+extern void ddt_phys_clear(ddt_phys_t *ddp);
+extern void ddt_phys_addref(ddt_phys_t *ddp);
+extern void ddt_phys_decref(ddt_phys_t *ddp);
+extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp,
+ uint64_t txg);
+extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp);
+extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde);
+
+extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);
+
+extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
+extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
+extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
+
+extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa);
+
+extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde,
+ ddt_phys_t *ddp_willref);
+extern int ddt_ditto_copies_present(ddt_entry_t *dde);
+
+extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len);
+extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len);
+
+extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp);
+extern ddt_t *ddt_select_by_checksum(spa_t *spa, enum zio_checksum c);
+
+extern void ddt_enter(ddt_t *ddt);
+extern void ddt_exit(ddt_t *ddt);
+extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
+extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
+
+extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
+extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);
+
+extern int ddt_entry_compare(const void *x1, const void *x2);
+
+extern void ddt_create(spa_t *spa);
+extern int ddt_load(spa_t *spa);
+extern void ddt_unload(spa_t *spa);
+extern void ddt_sync(spa_t *spa, uint64_t txg);
+
+extern const ddt_ops_t ddt_zap_ops;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DDT_H */
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h Sun Nov 01 14:14:46 2009 -0800
@@ -61,6 +61,7 @@
struct spa;
struct nvlist;
struct arc_buf;
+struct zio_prop;
typedef struct objset objset_t;
typedef struct dmu_tx dmu_tx_t;
@@ -118,6 +119,8 @@
DMU_OT_USERGROUP_USED, /* ZAP */
DMU_OT_USERGROUP_QUOTA, /* ZAP */
DMU_OT_USERREFS, /* ZAP */
+ DMU_OT_DDT_ZAP, /* ZAP */
+ DMU_OT_DDT_STATS, /* ZAP */
DMU_OT_NUMTYPES
} dmu_object_type_t;
@@ -152,6 +155,7 @@
#define DMU_USERUSED_OBJECT (-1ULL)
#define DMU_GROUPUSED_OBJECT (-2ULL)
+#define DMU_DEADLIST_OBJECT (-3ULL)
/*
* Public routines to create, destroy, open, and close objsets.
@@ -202,6 +206,8 @@
#define DMU_POOL_PROPS "pool_props"
#define DMU_POOL_L2CACHE "l2cache"
#define DMU_POOL_TMP_USERREFS "tmp_userrefs"
+#define DMU_POOL_DDT "DDT-%s-%s-%s"
+#define DMU_POOL_DDT_STATS "DDT-statistics"
/* 4x8 zbookmark_t */
#define DMU_POOL_SCRUB_BOOKMARK "scrub_bookmark"
@@ -299,11 +305,13 @@
dmu_tx_t *tx);
/*
- * Decide how many copies of a given block we should make. Can be from
- * 1 to SPA_DVAS_PER_BP.
+ * Decide how to write a block: checksum, compression, number of copies, etc.
*/
-int dmu_get_replication_level(objset_t *os, struct zbookmark *zb,
- dmu_object_type_t ot);
+#define WP_NOFILL 0x1
+#define WP_DMU_SYNC 0x2
+
+void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
+ struct zio_prop *zp);
/*
* The bonus data is accessed more or less like a regular buffer.
* You must dmu_bonus_hold() to get the buffer, which will give you a
@@ -498,19 +506,19 @@
uint64_t len);
typedef struct dmu_object_info {
- /* All sizes are in bytes. */
+ /* All sizes are in bytes unless otherwise indicated. */
uint32_t doi_data_block_size;
uint32_t doi_metadata_block_size;
- uint64_t doi_bonus_size;
dmu_object_type_t doi_type;
dmu_object_type_t doi_bonus_type;
+ uint64_t doi_bonus_size;
uint8_t doi_indirection; /* 2 = dnode->indirect->data */
uint8_t doi_checksum;
uint8_t doi_compress;
uint8_t doi_pad[5];
- /* Values below are number of 512-byte blocks. */
- uint64_t doi_physical_blks; /* data + metadata */
- uint64_t doi_max_block_offset;
+ uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */
+ uint64_t doi_max_offset;
+ uint64_t doi_fill_count; /* number of non-empty blocks */
} dmu_object_info_t;
typedef void arc_byteswap_func_t(void *buf, size_t size);
@@ -623,9 +631,20 @@
* storage when the write completes this new data does not become a
* permanent part of the file until the associated transaction commits.
*/
-typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg);
-int dmu_sync(struct zio *zio, dmu_buf_t *db,
- struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg);
+
+/*
+ * {zfs,zvol,ztest}_get_done() args
+ */
+typedef struct zgd {
+ struct zilog *zgd_zilog;
+ struct blkptr *zgd_bp;
+ dmu_buf_t *zgd_db;
+ struct rl *zgd_rl;
+ void *zgd_private;
+} zgd_t;
+
+typedef void dmu_sync_cb_t(zgd_t *arg, int error);
+int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
/*
* Find the next hole or data block in file starting at *off
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h Sun Nov 01 14:14:46 2009 -0800
@@ -67,12 +67,16 @@
dnode_t *os_userused_dnode;
dnode_t *os_groupused_dnode;
zilog_t *os_zil;
- uint8_t os_checksum; /* can change, under dsl_dir's locks */
- uint8_t os_compress; /* can change, under dsl_dir's locks */
- uint8_t os_copies; /* can change, under dsl_dir's locks */
- uint8_t os_primary_cache; /* can change, under dsl_dir's locks */
- uint8_t os_secondary_cache; /* can change, under dsl_dir's locks */
- uint8_t os_logbias; /* can change, under dsl_dir's locks */
+
+ /* can change, under dsl_dir's locks: */
+ uint8_t os_checksum;
+ uint8_t os_compress;
+ uint8_t os_copies;
+ uint8_t os_dedup_checksum;
+ uint8_t os_dedup_verify;
+ uint8_t os_logbias;
+ uint8_t os_primary_cache;
+ uint8_t os_secondary_cache;
/* no lock needed: */
struct dmu_tx *os_synctx; /* XXX sketchy */
@@ -97,6 +101,7 @@
void *os_user_ptr;
};
+#define DMU_META_OBJSET 0
#define DMU_META_DNODE_OBJECT 0
#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
@@ -135,6 +140,7 @@
/* called from dsl */
void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
+boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg);
objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
--- a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h Sun Nov 01 14:14:46 2009 -0800
@@ -36,8 +36,9 @@
struct dnode_phys;
struct dsl_dataset;
+struct zilog;
-typedef int (blkptr_cb_t)(spa_t *spa, blkptr_t *bp,
+typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg);
#define TRAVERSE_PRE (1<<0)
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h Sun Nov 01 14:14:46 2009 -0800
@@ -213,9 +213,10 @@
void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
-void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
-int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
+void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
dmu_tx_t *tx);
+int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp,
+ dmu_tx_t *tx, boolean_t async);
boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h Sun Nov 01 14:14:46 2009 -0800
@@ -102,6 +102,7 @@
uint64_t dp_scrub_start_time;
kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */
boolean_t dp_scrub_restart;
+ boolean_t dp_scrub_ditto;
/* Has its own locking */
tx_state_t dp_tx;
@@ -124,7 +125,7 @@
void dsl_pool_close(dsl_pool_t *dp);
dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg);
void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
-void dsl_pool_zil_clean(dsl_pool_t *dp);
+void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
int dsl_pool_sync_context(dsl_pool_t *dp);
uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree);
@@ -132,8 +133,7 @@
void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
void dsl_pool_memory_pressure(dsl_pool_t *dp);
void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
-int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
- zio_done_func_t *done, void *private, uint32_t arc_flags);
+void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h Sun Nov 01 14:14:46 2009 -0800
@@ -36,9 +36,6 @@
extern "C" {
#endif
-typedef struct metaslab_class metaslab_class_t;
-typedef struct metaslab_group metaslab_group_t;
-
extern space_map_ops_t *zfs_metaslab_ops;
extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
@@ -64,6 +61,14 @@
extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
extern int metaslab_class_validate(metaslab_class_t *mc);
+extern void metaslab_class_space_update(metaslab_class_t *mc,
+ int64_t alloc_delta, int64_t defer_delta,
+ int64_t space_delta, int64_t dspace_delta);
+extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_space(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);
+
extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
vdev_t *vd);
extern void metaslab_group_destroy(metaslab_group_t *mg);
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h Sun Nov 01 14:14:46 2009 -0800
@@ -39,8 +39,12 @@
struct metaslab_class {
spa_t *mc_spa;
metaslab_group_t *mc_rotor;
- uint64_t mc_allocated;
space_map_ops_t *mc_ops;
+ uint64_t mc_aliquot;
+ uint64_t mc_alloc; /* total allocated space */
+ uint64_t mc_deferred; /* total deferred frees */
+ uint64_t mc_space; /* total space (alloc + free) */
+ uint64_t mc_dspace; /* total deflated space */
};
struct metaslab_group {
--- a/usr/src/uts/common/fs/zfs/sys/spa.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h Sun Nov 01 14:14:46 2009 -0800
@@ -43,8 +43,13 @@
typedef struct spa spa_t;
typedef struct vdev vdev_t;
typedef struct metaslab metaslab_t;
+typedef struct metaslab_group metaslab_group_t;
+typedef struct metaslab_class metaslab_class_t;
+typedef struct zio zio_t;
typedef struct zilog zilog_t;
typedef struct spa_aux_vdev spa_aux_vdev_t;
+typedef struct ddt ddt_t;
+typedef struct ddt_entry ddt_entry_t;
struct dsl_pool;
/*
@@ -134,15 +139,15 @@
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 5 |G| offset3 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE |
+ * 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 8 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * 9 | padding |
+ * 9 | physical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * a | birth txg |
+ * a | logical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* b | fill count |
* +-------+-------+-------+-------+-------+-------+-------+-------+
@@ -166,25 +171,29 @@
* cksum checksum function
* comp compression function
* G gang block indicator
- * E endianness
+ * B byteorder (endianness)
+ * D dedup
+ * X unused
+ * lvl level of indirection
* type DMU object type
- * lvl level of indirection
- * birth txg transaction group in which the block was born
+ * phys birth txg of block allocation; zero if same as logical birth txg
+ * log. birth transaction group in which the block was logically born
* fill count number of non-zero blocks under this bp
* checksum[4] 256-bit checksum of the data this bp describes
*/
-typedef struct blkptr {
- dva_t blk_dva[3]; /* 128-bit Data Virtual Address */
- uint64_t blk_prop; /* size, compression, type, etc */
- uint64_t blk_pad[3]; /* Extra space for the future */
- uint64_t blk_birth; /* transaction group at birth */
- uint64_t blk_fill; /* fill count */
- zio_cksum_t blk_cksum; /* 256-bit checksum */
-} blkptr_t;
-
#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
+typedef struct blkptr {
+ dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
+ uint64_t blk_prop; /* size, compression, type, etc */
+ uint64_t blk_pad[2]; /* Extra space for the future */
+ uint64_t blk_phys_birth; /* txg when block was allocated */
+ uint64_t blk_birth; /* transaction group at birth */
+ uint64_t blk_fill; /* fill count */
+ zio_cksum_t blk_cksum; /* 256-bit checksum */
+} blkptr_t;
+
/*
* Macros to get and set fields in a bp or DVA.
*/
@@ -209,7 +218,6 @@
#define BP_GET_LSIZE(bp) \
BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
-
#define BP_SET_LSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
@@ -218,20 +226,35 @@
#define BP_SET_PSIZE(bp, x) \
BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
-#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
-#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
+#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
+#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
+
+#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
+#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
-#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
-#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
+#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
+#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
+
+#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
+#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
+
+#define BP_GET_PROP_BIT_61(bp) BF64_GET((bp)->blk_prop, 61, 1)
+#define BP_SET_PROP_BIT_61(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x)
-#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
-#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
+#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
+#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
+
+#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
-#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
-#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
+#define BP_PHYSICAL_BIRTH(bp) \
+ ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
-#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
-#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
+#define BP_SET_BIRTH(bp, logical, physical) \
+{ \
+ (bp)->blk_birth = (logical); \
+ (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
+}
#define BP_GET_ASIZE(bp) \
(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
@@ -255,6 +278,12 @@
((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
(dva1)->dva_word[0] == (dva2)->dva_word[0])
+#define BP_EQUAL(bp1, bp2) \
+ (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \
+ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \
+ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \
+ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
+
#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \
(0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
((zc1).zc_word[1] - (zc2).zc_word[1]) | \
@@ -274,7 +303,6 @@
#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
-#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
#define BP_ZERO(bp) \
{ \
@@ -287,14 +315,12 @@
(bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \
- (bp)->blk_pad[2] = 0; \
+ (bp)->blk_phys_birth = 0; \
(bp)->blk_birth = 0; \
(bp)->blk_fill = 0; \
ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
}
-#define BLK_FILL_ALREADY_FREED (-1ULL)
-
/*
* Note: the byteorder is either 0 or -1, both of which are palindromes.
* This simplifies the endianness handling a bit.
@@ -309,14 +335,71 @@
#define BP_SPRINTF_LEN 320
+/*
+ * This macro allows code sharing between zfs, libzpool, and mdb.
+ * 'func' is snprintf() or mdb_snprintf(); 'buf' must hold BP_SPRINTF_LEN bytes.
+ * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
+ */
+#define SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress) \
+{ \
+ static const char *copyname[] = \
+ { "zero", "single", "double", "triple" }; \
+ int size = BP_SPRINTF_LEN; \
+ int len = 0; \
+ int copies = 0; \
+ \
+ if (bp == NULL) { \
+ len = func(buf + len, size - len, "<NULL>"); \
+ } else if (BP_IS_HOLE(bp)) { \
+ len = func(buf + len, size - len, "<hole>"); \
+ } else { \
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \
+ const dva_t *dva = &bp->blk_dva[d]; \
+ if (DVA_IS_VALID(dva)) \
+ copies++; \
+ len += func(buf + len, size - len, \
+ "DVA[%d]=<%llu:%llx:%llx>%c", d, \
+ (u_longlong_t)DVA_GET_VDEV(dva), \
+ (u_longlong_t)DVA_GET_OFFSET(dva), \
+ (u_longlong_t)DVA_GET_ASIZE(dva), \
+ ws); \
+ } \
+ if (BP_IS_GANG(bp) && \
+ DVA_GET_ASIZE(&bp->blk_dva[2]) <= \
+ DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \
+ copies--; \
+ len += func(buf + len, size - len, \
+ "[L%llu %s] %s %s %s %s %s %s%c" \
+ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \
+ "cksum=%llx:%llx:%llx:%llx", \
+ (u_longlong_t)BP_GET_LEVEL(bp), \
+ type, \
+ checksum, \
+ compress, \
+ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \
+ BP_IS_GANG(bp) ? "gang" : "contiguous", \
+ BP_GET_DEDUP(bp) ? "dedup" : "unique", \
+ copyname[copies], \
+ ws, \
+ (u_longlong_t)BP_GET_LSIZE(bp), \
+ (u_longlong_t)BP_GET_PSIZE(bp), \
+ (u_longlong_t)bp->blk_birth, \
+ (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \
+ (u_longlong_t)bp->blk_fill, \
+ ws, \
+ (u_longlong_t)bp->blk_cksum.zc_word[0], \
+ (u_longlong_t)bp->blk_cksum.zc_word[1], \
+ (u_longlong_t)bp->blk_cksum.zc_word[2], \
+ (u_longlong_t)bp->blk_cksum.zc_word[3]); \
+ } \
+ ASSERT(len < size); \
+}
+
#include <sys/dmu.h>
#define BP_GET_BUFC_TYPE(bp) \
(((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \
ARC_BUFC_METADATA : ARC_BUFC_DATA);
-/*
- * Routines found in spa.c
- */
/* state manipulation functions */
extern int spa_open(const char *pool, spa_t **, void *tag);
@@ -370,7 +453,6 @@
extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
extern void spa_l2cache_activate(vdev_t *vd);
extern void spa_l2cache_drop(spa_t *spa);
-extern void spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc);
/* scrubbing */
extern int spa_scrub(spa_t *spa, pool_scrub_type_t type);
@@ -379,6 +461,10 @@
extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
extern void spa_sync_allpools(void);
+#define SYNC_PASS_DEFERRED_FREE 1 /* defer frees after this pass */
+#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */
+#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */
+
/* spa namespace global mutex */
extern kmutex_t spa_namespace_lock;
@@ -396,7 +482,6 @@
extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
int getstats);
extern void spa_config_update(spa_t *spa, int what);
-extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot);
/*
* Miscellaneous SPA routines in spa_misc.c
@@ -442,6 +527,20 @@
extern void spa_vdev_state_enter(spa_t *spa, int oplock);
extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
+/* Log state */
+typedef enum spa_log_state {
+ SPA_LOG_UNKNOWN = 0, /* unknown log state */
+ SPA_LOG_MISSING, /* missing log(s) */
+ SPA_LOG_CLEAR, /* clear the log(s) */
+ SPA_LOG_GOOD, /* log(s) are good */
+} spa_log_state_t;
+
+extern spa_log_state_t spa_get_log_state(spa_t *spa);
+extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
+
+/* Log claim callback */
+extern void spa_claim_notify(zio_t *zio);
+
/* Accessor functions */
extern boolean_t spa_shutting_down(spa_t *spa);
extern struct dsl_pool *spa_get_dsl(spa_t *spa);
@@ -453,19 +552,23 @@
extern uint64_t spa_guid(spa_t *spa);
extern uint64_t spa_last_synced_txg(spa_t *spa);
extern uint64_t spa_first_txg(spa_t *spa);
+extern uint64_t spa_syncing_txg(spa_t *spa);
extern uint64_t spa_version(spa_t *spa);
extern pool_state_t spa_state(spa_t *spa);
extern uint64_t spa_freeze_txg(spa_t *spa);
-extern uint64_t spa_get_alloc(spa_t *spa);
-extern uint64_t spa_get_space(spa_t *spa);
-extern uint64_t spa_get_dspace(spa_t *spa);
-extern uint64_t spa_get_defers(spa_t *spa);
extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
extern uint64_t spa_version(spa_t *spa);
+extern boolean_t spa_deflate(spa_t *spa);
+extern metaslab_class_t *spa_normal_class(spa_t *spa);
+extern metaslab_class_t *spa_log_class(spa_t *spa);
extern int spa_max_replication(spa_t *spa);
extern int spa_busy(void);
extern uint8_t spa_get_failmode(spa_t *spa);
extern boolean_t spa_suspended(spa_t *spa);
+extern uint64_t spa_bootfs(spa_t *spa);
+extern uint64_t spa_delegation(spa_t *spa);
+extern objset_t *spa_meta_objset(spa_t *spa);
+extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
/* Miscellaneous support routines */
extern int spa_rename(const char *oldname, const char *newname);
@@ -473,14 +576,16 @@
extern char *spa_strdup(const char *);
extern void spa_strfree(char *);
extern uint64_t spa_get_random(uint64_t range);
-extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp);
+extern void sprintf_blkptr(char *buf, const blkptr_t *bp);
extern void spa_freeze(spa_t *spa);
extern void spa_upgrade(spa_t *spa, uint64_t version);
extern void spa_evict_all(void);
extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
boolean_t l2cache);
extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
-extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp);
+extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
+extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
+extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
extern boolean_t spa_has_slogs(spa_t *spa);
extern boolean_t spa_is_root(spa_t *spa);
extern boolean_t spa_writeable(spa_t *spa);
@@ -516,10 +621,9 @@
/* error handling */
struct zbookmark;
-struct zio;
-extern void spa_log_error(spa_t *spa, struct zio *zio);
+extern void spa_log_error(spa_t *spa, zio_t *zio);
extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
- struct zio *zio, uint64_t stateoroffset, uint64_t length);
+ zio_t *zio, uint64_t stateoroffset, uint64_t length);
extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
extern void zfs_post_state_change(spa_t *spa, vdev_t *vd);
extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
@@ -552,7 +656,7 @@
#define dprintf_bp(bp, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
- sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \
+ sprintf_blkptr(__blkbuf, (bp)); \
dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \
kmem_free(__blkbuf, BP_SPRINTF_LEN); \
} \
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h Sun Nov 01 14:14:46 2009 -0800
@@ -78,13 +78,6 @@
char *scd_path;
} spa_config_dirent_t;
-typedef enum spa_log_state {
- SPA_LOG_UNKNOWN = 0, /* unknown log state */
- SPA_LOG_MISSING, /* missing log(s) */
- SPA_LOG_CLEAR, /* clear the log(s) */
- SPA_LOG_GOOD, /* log(s) are good */
-} spa_log_state_t;
-
enum zio_taskq_type {
ZIO_TASKQ_ISSUE = 0,
ZIO_TASKQ_INTERRUPT,
@@ -114,6 +107,7 @@
uint64_t spa_final_txg; /* txg of export/destroy */
uint64_t spa_freeze_txg; /* freeze pool at this txg */
uint64_t spa_load_max_txg; /* best initial ub_txg */
+ uint64_t spa_claim_max_txg; /* highest claimed birth txg */
objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
vdev_t *spa_root_vdev; /* top-level vdev container */
@@ -125,8 +119,9 @@
uint64_t spa_config_object; /* MOS object for pool config */
uint64_t spa_config_generation; /* config generation number */
uint64_t spa_syncing_txg; /* txg currently syncing */
- uint64_t spa_sync_bplist_obj; /* object for deferred frees */
- bplist_t spa_sync_bplist; /* deferred-free bplist */
+ uint64_t spa_deferred_bplist_obj; /* object for deferred frees */
+ bplist_t spa_deferred_bplist; /* deferred-free bplist */
+ bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
uberblock_t spa_ubsync; /* last synced uberblock */
uberblock_t spa_uberblock; /* current uberblock */
boolean_t spa_extreme_rewind; /* rewind past deferred frees */
@@ -177,11 +172,16 @@
kmutex_t spa_suspend_lock; /* protects suspend_zio_root */
kcondvar_t spa_suspend_cv; /* notification of resume */
uint8_t spa_suspended; /* pool is suspended */
+ uint8_t spa_claiming; /* pool is doing zil_claim() */
boolean_t spa_is_root; /* pool is root */
int spa_minref; /* num refs when first opened */
int spa_mode; /* FREAD | FWRITE */
spa_log_state_t spa_log_state; /* log state */
uint64_t spa_autoexpand; /* lun expansion on/off */
+ ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */
+ uint64_t spa_ddt_stat_object; /* DDT statistics */
+ uint64_t spa_dedup_ditto; /* dedup ditto threshold */
+ uint64_t spa_dedup_checksum; /* default dedup checksum */
boolean_t spa_autoreplace; /* autoreplace set in open */
int spa_vdev_locks; /* locks grabbed */
/*
@@ -196,12 +196,6 @@
extern const char *spa_config_path;
-#define BOOTFS_COMPRESS_VALID(compress) \
- ((compress) == ZIO_COMPRESS_LZJB || \
- ((compress) == ZIO_COMPRESS_ON && \
- ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \
- (compress) == ZIO_COMPRESS_OFF)
-
#ifdef __cplusplus
}
#endif
--- a/usr/src/uts/common/fs/zfs/sys/txg.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/txg.h Sun Nov 01 14:14:46 2009 -0800
@@ -73,8 +73,6 @@
extern void txg_rele_to_quiesce(txg_handle_t *txghp);
extern void txg_rele_to_sync(txg_handle_t *txghp);
extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
-extern void txg_suspend(struct dsl_pool *dp);
-extern void txg_resume(struct dsl_pool *dp);
/*
* Delay the caller by the specified number of ticks or until
--- a/usr/src/uts/common/fs/zfs/sys/txg_impl.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/txg_impl.h Sun Nov 01 14:14:46 2009 -0800
@@ -44,7 +44,6 @@
typedef struct tx_state {
tx_cpu_t *tx_cpu; /* protects right to enter txg */
kmutex_t tx_sync_lock; /* protects tx_state_t */
- krwlock_t tx_suspend;
uint64_t tx_open_txg; /* currently open txg id */
uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
uint64_t tx_syncing_txg; /* currently syncing txg id */
--- a/usr/src/uts/common/fs/zfs/sys/uberblock.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/uberblock.h Sun Nov 01 14:14:46 2009 -0800
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,19 +19,16 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_UBERBLOCK_H
#define _SYS_UBERBLOCK_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/spa.h>
#include <sys/vdev.h>
#include <sys/zio.h>
-#include <sys/zio_checksum.h>
#ifdef __cplusplus
extern "C" {
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h Sun Nov 01 14:14:46 2009 -0800
@@ -84,8 +84,8 @@
extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
vdev_aux_t aux);
-extern void vdev_space_update(vdev_t *vd, int64_t space_delta,
- int64_t alloc_delta, int64_t defer_delta, boolean_t update_root);
+extern void vdev_space_update(vdev_t *vd,
+ int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h Sun Nov 01 14:14:46 2009 -0800
@@ -198,6 +198,8 @@
kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */
};
+#define VDEV_RAIDZ_MAXPARITY 3
+
#define VDEV_PAD_SIZE (8 << 10)
/* 2 padding areas (vl_pad1 and vl_pad2) to skip */
#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2
--- a/usr/src/uts/common/fs/zfs/sys/zap.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zap.h Sun Nov 01 14:14:46 2009 -0800
@@ -101,6 +101,18 @@
MT_FIRST
} matchtype_t;
+typedef enum zap_flags {
+ /* Use 64-bit hash value (serialized cursors will always use 64-bits) */
+ ZAP_FLAG_HASH64 = 1 << 0,
+ /* Key is binary, not string (zap_add_uint64() can be used) */
+ ZAP_FLAG_UINT64_KEY = 1 << 1,
+ /*
+ * First word of key (which must be an array of uint64) is
+ * already randomly distributed.
+ */
+ ZAP_FLAG_PRE_HASHED_KEY = 1 << 2,
+} zap_flags_t;
+
/*
* Create a new zapobj with no attributes and return its object number.
* MT_EXACT will cause the zap object to only support MT_EXACT lookups,
@@ -118,6 +130,9 @@
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
/*
* Create a new zapobj with no attributes from the given (unallocated)
@@ -180,6 +195,8 @@
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *normalization_conflictp);
+int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf);
int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
int add, uint64_t *towrite, uint64_t *tooverwrite);
@@ -190,9 +207,12 @@
* If an attribute with the given name already exists, the call will
* fail and return EEXIST.
*/
-int zap_add(objset_t *ds, uint64_t zapobj, const char *name,
+int zap_add(objset_t *ds, uint64_t zapobj, const char *key,
int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);
+int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key,
+ int key_numints, int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx);
/*
* Set the attribute with the given name to the given value. If an
@@ -204,6 +224,9 @@
*/
int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
/*
* Get the length (in integers) and the integer size of the specified
@@ -214,6 +237,8 @@
*/
int zap_length(objset_t *ds, uint64_t zapobj, const char *name,
uint64_t *integer_size, uint64_t *num_integers);
+int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t *integer_size, uint64_t *num_integers);
/*
* Remove the specified attribute.
@@ -224,6 +249,8 @@
int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx);
int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
matchtype_t mt, dmu_tx_t *tx);
+int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, dmu_tx_t *tx);
/*
* Returns (in *count) the number of attributes in the specified zap
@@ -266,6 +293,7 @@
struct zap *zc_zap;
struct zap_leaf *zc_leaf;
uint64_t zc_zapobj;
+ uint64_t zc_serialized;
uint64_t zc_hash;
uint32_t zc_cd;
} zap_cursor_t;
--- a/usr/src/uts/common/fs/zfs/sys/zap_impl.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h Sun Nov 01 14:14:46 2009 -0800
@@ -40,13 +40,13 @@
#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift)
-#define ZAP_MAXCD (uint32_t)(-1)
-#define ZAP_HASHBITS 28
#define MZAP_ENT_LEN 64
#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT
#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT)
+#define ZAP_NEED_CD (-1U)
+
typedef struct mzap_ent_phys {
uint64_t mze_value;
uint32_t mze_cd;
@@ -70,7 +70,6 @@
mzap_ent_phys_t mze_phys;
} mzap_ent_t;
-
/*
* The (fat) zap is stored in one object. It is an array of
* 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
@@ -127,6 +126,7 @@
uint64_t zap_num_entries; /* number of entries */
uint64_t zap_salt; /* salt to stir into hash function */
uint64_t zap_normflags; /* flags for u8_textprep_str() */
+ uint64_t zap_flags; /* zap_flags_t */
/*
* This structure is followed by padding, and then the embedded
* pointer table. The embedded pointer table takes up second
@@ -168,10 +168,13 @@
typedef struct zap_name {
zap_t *zn_zap;
- const char *zn_name_orij;
+ int zn_key_intlen;
+ const void *zn_key_orig;
+ int zn_key_orig_len;
+ const void *zn_key_norm;
+ int zn_key_norm_len;
uint64_t zn_hash;
matchtype_t zn_matchtype;
- const char *zn_name_norm;
char zn_normbuf[ZAP_MAXNAMELEN];
} zap_name_t;
@@ -183,8 +186,11 @@
krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
void zap_unlockdir(zap_t *zap);
void zap_evict(dmu_buf_t *db, void *vmzap);
-zap_name_t *zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt);
+zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
void zap_name_free(zap_name_t *zn);
+int zap_hashbits(zap_t *zap);
+uint32_t zap_maxcd(zap_t *zap);
+uint64_t zap_getflags(zap_t *zap);
#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
@@ -209,7 +215,7 @@
int fzap_add_cd(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
const void *val, uint32_t cd, dmu_tx_t *tx);
-void fzap_upgrade(zap_t *zap, dmu_tx_t *tx);
+void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn);
#ifdef __cplusplus
--- a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h Sun Nov 01 14:14:46 2009 -0800
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_ZAP_LEAF_H
#define _SYS_ZAP_LEAF_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -132,7 +130,7 @@
uint8_t le_int_size; /* size of ints */
uint16_t le_next; /* next entry in hash chain */
uint16_t le_name_chunk; /* first chunk of the name */
- uint16_t le_name_length; /* bytes in name, incl null */
+ uint16_t le_name_length; /* ints in name (incl null) */
uint16_t le_value_chunk; /* first chunk of the value */
uint16_t le_value_length; /* value length in ints */
uint32_t le_cd; /* collision differentiator */
@@ -193,10 +191,10 @@
* num_integers in the attribute.
*/
extern int zap_entry_read(const zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, void *buf);
+ uint8_t integer_size, uint64_t num_integers, void *buf);
-extern int zap_entry_read_name(const zap_entry_handle_t *zeh,
- uint16_t buflen, char *buf);
+extern int zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh,
+ uint16_t buflen, char *buf);
/*
* Replace the value of an existing entry.
@@ -204,7 +202,7 @@
* zap_entry_update may fail if it runs out of space (ENOSPC).
*/
extern int zap_entry_update(zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, const void *buf);
+ uint8_t integer_size, uint64_t num_integers, const void *buf);
/*
* Remove an entry.
@@ -216,10 +214,9 @@
* belong in this leaf (according to its hash value). Fills in the
* entry handle on success. Returns 0 on success or ENOSPC on failure.
*/
-extern int zap_entry_create(zap_leaf_t *l,
- const char *name, uint64_t h, uint32_t cd,
- uint8_t integer_size, uint64_t num_integers, const void *buf,
- zap_entry_handle_t *zeh);
+extern int zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
+ uint8_t integer_size, uint64_t num_integers, const void *buf,
+ zap_entry_handle_t *zeh);
/*
* Return true if there are additional entries with the same normalized
--- a/usr/src/uts/common/fs/zfs/sys/zil.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h Sun Nov 01 14:14:46 2009 -0800
@@ -55,15 +55,17 @@
uint64_t zh_claim_txg; /* txg in which log blocks were claimed */
uint64_t zh_replay_seq; /* highest replayed sequence number */
blkptr_t zh_log; /* log chain */
- uint64_t zh_claim_seq; /* highest claimed sequence number */
+ uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */
uint64_t zh_flags; /* header flags */
- uint64_t zh_pad[4];
+ uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */
+ uint64_t zh_pad[3];
} zil_header_t;
/*
* zh_flags bit settings
*/
-#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */
+#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */
+#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */
/*
* Log block trailer - structure at the end of the header and each log block
@@ -150,6 +152,20 @@
#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */
/*
+ * Transactions for write, write2, truncate, setattr, acl_v0, and acl can be
+ * logged out of order. For convenience in the code, all such records must
+ * have lr_foid at the same offset.
+ */
+#define TX_OOO(txtype) \
+ ((txtype) == TX_WRITE || \
+ (txtype) == TX_TRUNCATE || \
+ (txtype) == TX_SETATTR || \
+ (txtype) == TX_ACL_V0 || \
+ (txtype) == TX_ACL || \
+ (txtype) == TX_WRITE2)
+
+
+/*
* Format of log records.
* The fields are carefully defined to allow them to be aligned
* and sized the same on sparc & intel architectures.
@@ -169,6 +185,14 @@
} lr_t;
/*
+ * Common start of all out-of-order record types (TX_OOO() above).
+ */
+typedef struct {
+ lr_t lr_common; /* common portion of log record */
+ uint64_t lr_foid; /* object id */
+} lr_ooo_t;
+
+/*
* Handle option extended vattr attributes.
*
* Whenever new attributes are added the version number
@@ -258,7 +282,7 @@
uint64_t lr_foid; /* file object to write */
uint64_t lr_offset; /* offset to write to */
uint64_t lr_length; /* user data length to write */
- uint64_t lr_blkoff; /* offset represented by lr_blkptr */
+ uint64_t lr_blkoff; /* no longer used */
blkptr_t lr_blkptr; /* spa block pointer for replay */
/* write data will follow for small writes */
} lr_write_t;
@@ -333,6 +357,7 @@
/* and put blkptr in log, rather than actual data) */
WR_COPIED, /* immediate - data is copied into lr_write_t */
WR_NEED_COPY, /* immediate - data needs to be copied if pushed */
+ WR_NUM_STATES /* number of states */
} itx_wr_state_t;
typedef struct itx {
@@ -345,26 +370,14 @@
/* followed by type-specific part of lr_xx_t and its immediate data */
} itx_t;
-
-/*
- * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done()
- * to handle the cleanup of the dmu_sync() buffer write
- */
-typedef struct {
- zilog_t *zgd_zilog; /* zilog */
- blkptr_t *zgd_bp; /* block pointer */
- struct rl *zgd_rl; /* range lock */
-} zgd_t;
-
-
-typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
+typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
uint64_t txg);
-typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
+typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
uint64_t txg);
typedef int zil_replay_func_t();
typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
-extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
extern void zil_init(void);
@@ -378,10 +391,12 @@
extern void zil_replay(objset_t *os, void *arg,
zil_replay_func_t *replay_func[TX_MAX_TYPE]);
+extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);
extern void zil_destroy(zilog_t *zilog, boolean_t keep_first);
extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize);
+extern void zil_itx_destroy(itx_t *itx);
extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid);
@@ -391,14 +406,14 @@
extern int zil_check_log_chain(char *osname, void *txarg);
extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
extern void zil_clean(zilog_t *zilog);
-extern int zil_is_committed(zilog_t *zilog);
extern int zil_suspend(zilog_t *zilog);
extern void zil_resume(zilog_t *zilog);
-extern void zil_add_block(zilog_t *zilog, blkptr_t *bp);
+extern void zil_add_block(zilog_t *zilog, const blkptr_t *bp);
+extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);
+
extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval);
-extern void zil_get_replay_data(zilog_t *zilog, lr_write_t *lr);
extern int zil_disable;
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h Sun Nov 01 14:14:46 2009 -0800
@@ -43,8 +43,8 @@
int lwb_sz; /* size of block and buffer */
char *lwb_buf; /* log write buffer */
zio_t *lwb_zio; /* zio for this buffer */
+ dmu_tx_t *lwb_tx; /* tx for log block allocation */
uint64_t lwb_max_txg; /* highest txg in this lwb */
- txg_handle_t lwb_txgh; /* txg handle for txg_exit() */
list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
} lwb_t;
@@ -68,9 +68,10 @@
objset_t *zl_os; /* object set we're logging */
zil_get_data_t *zl_get_data; /* callback to get object content */
zio_t *zl_root_zio; /* log writer root zio */
- uint64_t zl_itx_seq; /* next itx sequence number */
+ uint64_t zl_itx_seq; /* next in-core itx sequence number */
+ uint64_t zl_lr_seq; /* on-disk log record sequence number */
uint64_t zl_commit_seq; /* committed upto this number */
- uint64_t zl_lr_seq; /* log record sequence number */
+ uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */
uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
uint64_t zl_replaying_seq; /* current replay seq number */
@@ -82,8 +83,12 @@
uint8_t zl_replay; /* replaying records while set */
uint8_t zl_stop_sync; /* for debugging */
uint8_t zl_writer; /* boolean: write setup in progress */
- uint8_t zl_log_error; /* boolean: log write error */
uint8_t zl_logbias; /* latency or throughput */
+ int zl_parse_error; /* last zil_parse() error */
+ uint64_t zl_parse_blk_seq; /* highest blk seq on last parse */
+ uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */
+ uint64_t zl_parse_blk_count; /* number of blocks parsed */
+ uint64_t zl_parse_lr_count; /* number of log records parsed */
list_t zl_itx_list; /* in-memory itx list */
uint64_t zl_itx_list_sz; /* total size of records on list */
uint64_t zl_cur_used; /* current commit log size used */
@@ -92,15 +97,16 @@
kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */
avl_tree_t zl_vdev_tree; /* vdevs to flush in zil_commit() */
taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */
- avl_tree_t zl_dva_tree; /* track DVAs during log parse */
+ avl_tree_t zl_bp_tree; /* track bps during log parse */
clock_t zl_replay_time; /* lbolt of when replay started */
uint64_t zl_replay_blks; /* number of log blocks replayed */
+ zil_header_t zl_old_header; /* debugging aid */
};
-typedef struct zil_dva_node {
+typedef struct zil_bp_node {
dva_t zn_dva;
avl_node_t zn_node;
-} zil_dva_node_t;
+} zil_bp_node_t;
#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
sizeof (lr_write_t))
--- a/usr/src/uts/common/fs/zfs/sys/zio.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h Sun Nov 01 14:14:46 2009 -0800
@@ -79,6 +79,12 @@
#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
+#define ZIO_CHECKSUM_MASK 0xffULL
+#define ZIO_CHECKSUM_VERIFY (1 << 8)
+
+#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256
+#define ZIO_DEDUPDITTO_MIN 100
+
enum zio_compress {
ZIO_COMPRESS_INHERIT = 0,
ZIO_COMPRESS_ON,
@@ -94,12 +100,19 @@
ZIO_COMPRESS_GZIP_7,
ZIO_COMPRESS_GZIP_8,
ZIO_COMPRESS_GZIP_9,
+ ZIO_COMPRESS_ZLE,
ZIO_COMPRESS_FUNCTIONS
};
#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
+#define BOOTFS_COMPRESS_VALID(compress) \
+ ((compress) == ZIO_COMPRESS_LZJB || \
+ ((compress) == ZIO_COMPRESS_ON && \
+ ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \
+ (compress) == ZIO_COMPRESS_OFF)
+
#define ZIO_FAILURE_MODE_WAIT 0
#define ZIO_FAILURE_MODE_CONTINUE 1
#define ZIO_FAILURE_MODE_PANIC 2
@@ -116,75 +129,78 @@
#define ZIO_PRIORITY_SCRUB (zio_priority_table[9])
#define ZIO_PRIORITY_TABLE_SIZE 10
-#define ZIO_FLAG_MUSTSUCCEED 0x000000
-#define ZIO_FLAG_CANFAIL 0x000001
-#define ZIO_FLAG_SPECULATIVE 0x000002
-#define ZIO_FLAG_CONFIG_WRITER 0x000004
-#define ZIO_FLAG_DONT_RETRY 0x000008
-
-#define ZIO_FLAG_DONT_CACHE 0x000010
-#define ZIO_FLAG_DONT_QUEUE 0x000020
-#define ZIO_FLAG_DONT_AGGREGATE 0x000040
-#define ZIO_FLAG_DONT_PROPAGATE 0x000080
-
-#define ZIO_FLAG_IO_BYPASS 0x000100
-#define ZIO_FLAG_IO_REPAIR 0x000200
-#define ZIO_FLAG_IO_RETRY 0x000400
-#define ZIO_FLAG_IO_REWRITE 0x000800
-
-#define ZIO_FLAG_SELF_HEAL 0x001000
-#define ZIO_FLAG_RESILVER 0x002000
-#define ZIO_FLAG_SCRUB 0x004000
-#define ZIO_FLAG_SCRUB_THREAD 0x008000
-
-#define ZIO_FLAG_PROBE 0x010000
-#define ZIO_FLAG_GANG_CHILD 0x020000
-#define ZIO_FLAG_RAW 0x040000
-#define ZIO_FLAG_GODFATHER 0x080000
-
-#define ZIO_FLAG_TRYHARD 0x100000
-#define ZIO_FLAG_NODATA 0x200000
-#define ZIO_FLAG_OPTIONAL 0x400000
-
-#define ZIO_FLAG_GANG_INHERIT \
- (ZIO_FLAG_CANFAIL | \
- ZIO_FLAG_SPECULATIVE | \
- ZIO_FLAG_CONFIG_WRITER | \
- ZIO_FLAG_DONT_RETRY | \
- ZIO_FLAG_DONT_CACHE | \
- ZIO_FLAG_DONT_AGGREGATE | \
- ZIO_FLAG_SELF_HEAL | \
- ZIO_FLAG_RESILVER | \
- ZIO_FLAG_SCRUB | \
- ZIO_FLAG_SCRUB_THREAD)
-
-#define ZIO_FLAG_VDEV_INHERIT \
- (ZIO_FLAG_GANG_INHERIT | \
- ZIO_FLAG_IO_REPAIR | \
- ZIO_FLAG_IO_RETRY | \
- ZIO_FLAG_PROBE | \
- ZIO_FLAG_TRYHARD | \
- ZIO_FLAG_NODATA | \
- ZIO_FLAG_OPTIONAL)
-
-#define ZIO_FLAG_AGG_INHERIT \
- (ZIO_FLAG_DONT_AGGREGATE | \
- ZIO_FLAG_IO_REPAIR | \
- ZIO_FLAG_SELF_HEAL | \
- ZIO_FLAG_RESILVER | \
- ZIO_FLAG_SCRUB | \
- ZIO_FLAG_SCRUB_THREAD)
-
#define ZIO_PIPELINE_CONTINUE 0x100
#define ZIO_PIPELINE_STOP 0x101
+enum zio_flag {
+ /*
+ * Flags inherited by gang, ddt, and vdev children,
+ * and that must be equal for two zios to aggregate
+ */
+ ZIO_FLAG_DONT_AGGREGATE = 1 << 0,
+ ZIO_FLAG_IO_REPAIR = 1 << 1,
+ ZIO_FLAG_SELF_HEAL = 1 << 2,
+ ZIO_FLAG_RESILVER = 1 << 3,
+ ZIO_FLAG_SCRUB = 1 << 4,
+ ZIO_FLAG_SCRUB_THREAD = 1 << 5,
+
+#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)
+
+ /*
+ * Flags inherited by ddt, gang, and vdev children.
+ */
+ ZIO_FLAG_CANFAIL = 1 << 6, /* must be first for INHERIT */
+ ZIO_FLAG_SPECULATIVE = 1 << 7,
+ ZIO_FLAG_CONFIG_WRITER = 1 << 8,
+ ZIO_FLAG_DONT_RETRY = 1 << 9,
+ ZIO_FLAG_DONT_CACHE = 1 << 10,
+ ZIO_FLAG_NODATA = 1 << 11,
+ ZIO_FLAG_INDUCE_DAMAGE = 1 << 12,
+
+#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
+#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
+
+ /*
+ * Flags inherited by vdev children.
+ */
+ ZIO_FLAG_IO_RETRY = 1 << 13, /* must be first for INHERIT */
+ ZIO_FLAG_PROBE = 1 << 14,
+ ZIO_FLAG_TRYHARD = 1 << 15,
+ ZIO_FLAG_OPTIONAL = 1 << 16,
+
+#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
+
+ /*
+ * Flags not inherited by any children.
+ */
+ ZIO_FLAG_DONT_QUEUE = 1 << 17, /* must be first for INHERIT */
+ ZIO_FLAG_DONT_PROPAGATE = 1 << 18,
+ ZIO_FLAG_IO_BYPASS = 1 << 19,
+ ZIO_FLAG_IO_REWRITE = 1 << 20,
+ ZIO_FLAG_RAW = 1 << 21,
+ ZIO_FLAG_GANG_CHILD = 1 << 22,
+ ZIO_FLAG_DDT_CHILD = 1 << 23,
+ ZIO_FLAG_GODFATHER = 1 << 24
+};
+
+#define ZIO_FLAG_MUSTSUCCEED 0
+
+#define ZIO_DDT_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \
+ ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)
+
#define ZIO_GANG_CHILD_FLAGS(zio) \
(((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \
ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)
+#define ZIO_VDEV_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \
+ ZIO_FLAG_CANFAIL)
+
enum zio_child {
ZIO_CHILD_VDEV = 0,
ZIO_CHILD_GANG,
+ ZIO_CHILD_DDT,
ZIO_CHILD_LOGICAL,
ZIO_CHILD_TYPES
};
@@ -202,7 +218,6 @@
#define ECKSUM EBADE
#define EFRAGS EBADR
-typedef struct zio zio_t;
typedef void zio_done_func_t(zio_t *zio);
extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
@@ -211,18 +226,15 @@
/*
* A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
* identifies any block in the pool. By convention, the meta-objset (MOS)
- * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is
- * level -1 of the meta-dnode, and intent log blocks (which are chained
- * off the root block) have blkid == sequence number. In summary:
+ * is objset 0, and the meta-dnode is object 0. This covers all blocks
+ * except root blocks and ZIL blocks, which are defined as follows:
*
- * mos is objset 0
- * meta-dnode is object 0
- * root block is <objset, 0, -1, 0>
- * intent log is <objset, 0, -1, ZIL sequence number>
+ * Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>.
+ * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
+ * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
*
- * Note: this structure is called a bookmark because its first purpose was
- * to remember where to resume a pool-wide traverse. The absolute ordering
- * for block visitation during traversal is defined in compare_bookmark().
+ * Note: this structure is called a bookmark because its original purpose
+ * was to remember where to resume a pool-wide traverse.
*
* Note: this structure is passed between userland and the kernel.
* Therefore it must not change size or alignment between 32/64 bit
@@ -235,12 +247,31 @@
uint64_t zb_blkid;
} zbookmark_t;
+#define SET_BOOKMARK(zb, objset, object, level, blkid) \
+{ \
+ (zb)->zb_objset = (objset); \
+ (zb)->zb_object = (object); \
+ (zb)->zb_level = (level); \
+ (zb)->zb_blkid = (blkid); \
+}
+
+#define ZB_DESTROYED_OBJSET (-1ULL)
+
+#define ZB_ROOT_OBJECT (0ULL)
+#define ZB_ROOT_LEVEL (-1LL)
+#define ZB_ROOT_BLKID (0ULL)
+
+#define ZB_ZIL_OBJECT (0ULL)
+#define ZB_ZIL_LEVEL (-2LL)
+
typedef struct zio_prop {
enum zio_checksum zp_checksum;
enum zio_compress zp_compress;
dmu_object_type_t zp_type;
uint8_t zp_level;
- uint8_t zp_ndvas;
+ uint8_t zp_copies;
+ uint8_t zp_dedup;
+ uint8_t zp_dedup_verify;
} zio_prop_t;
typedef struct zio_cksum_report zio_cksum_report_t;
@@ -255,9 +286,9 @@
struct zio_cksum_report *zcr_next;
nvlist_t *zcr_ereport;
nvlist_t *zcr_detector;
-
void *zcr_cbdata;
size_t zcr_cbinfo; /* passed to zcr_free() */
+ uint64_t zcr_align;
uint64_t zcr_length;
zio_cksum_finish_f *zcr_finish;
zio_cksum_free_f *zcr_free;
@@ -326,6 +357,7 @@
uint64_t io_txg;
spa_t *io_spa;
blkptr_t *io_bp;
+ blkptr_t *io_bp_override;
blkptr_t io_bp_copy;
list_t io_parent_list;
list_t io_child_list;
@@ -337,11 +369,14 @@
zio_done_func_t *io_ready;
zio_done_func_t *io_done;
void *io_private;
+ int64_t io_prev_space_delta; /* DMU private */
blkptr_t io_bp_orig;
/* Data represented by this I/O */
void *io_data;
+ void *io_orig_data;
uint64_t io_size;
+ uint64_t io_orig_size;
/* Stuff for the vdev stack */
vdev_t *io_vd;
@@ -355,15 +390,17 @@
avl_tree_t *io_vdev_tree;
/* Internal pipeline state */
- int io_flags;
- zio_stage_t io_stage;
- uint32_t io_pipeline;
- int io_orig_flags;
- zio_stage_t io_orig_stage;
- uint32_t io_orig_pipeline;
+ enum zio_flag io_flags;
+ enum zio_stage io_stage;
+ enum zio_stage io_pipeline;
+ enum zio_flag io_orig_flags;
+ enum zio_stage io_orig_stage;
+ enum zio_stage io_orig_pipeline;
int io_error;
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
+ uint64_t io_child_count;
+ uint64_t io_parent_count;
uint64_t *io_stall;
zio_t *io_gang_leader;
zio_gang_node_t *io_gang_tree;
@@ -378,48 +415,51 @@
};
extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
- zio_done_func_t *done, void *private, int flags);
+ zio_done_func_t *done, void *private, enum zio_flag flags);
extern zio_t *zio_root(spa_t *spa,
- zio_done_func_t *done, void *private, int flags);
+ zio_done_func_t *done, void *private, enum zio_flag flags);
extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
uint64_t size, zio_done_func_t *done, void *private,
- int priority, int flags, const zbookmark_t *zb);
+ int priority, enum zio_flag flags, const zbookmark_t *zb);
extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t size, zio_prop_t *zp,
+ void *data, uint64_t size, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *done, void *private,
- int priority, int flags, const zbookmark_t *zb);
+ int priority, enum zio_flag flags, const zbookmark_t *zb);
extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *private,
- int priority, int flags, zbookmark_t *zb);
+ int priority, enum zio_flag flags, zbookmark_t *zb);
-extern void zio_skip_write(zio_t *zio);
+extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies);
-extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, int flags);
+extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
-extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, int flags);
+extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
+ const blkptr_t *bp,
+ zio_done_func_t *done, void *private, enum zio_flag flags);
extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio_done_func_t *done, void *private, int priority, int flags);
+ zio_done_func_t *done, void *private, int priority, enum zio_flag flags);
extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,
- zio_done_func_t *done, void *private, int priority, int flags,
+ zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
boolean_t labels);
extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,
- zio_done_func_t *done, void *private, int priority, int flags,
+ zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
boolean_t labels);
-extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp,
- blkptr_t *old_bp, uint64_t txg, boolean_t use_slog);
-extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg);
+extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
+ const blkptr_t *bp, enum zio_flag flags);
+
+extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
+ blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
+extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
extern void zio_flush(zio_t *zio, vdev_t *vd);
extern int zio_wait(zio_t *zio);
@@ -441,11 +481,11 @@
extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
uint64_t offset, void *data, uint64_t size, int type, int priority,
- int flags, zio_done_func_t *done, void *private);
+ enum zio_flag flags, zio_done_func_t *done, void *private);
extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
void *data, uint64_t size, int type, int priority,
- int flags, zio_done_func_t *done, void *private);
+ enum zio_flag flags, zio_done_func_t *done, void *private);
extern void zio_vdev_io_bypass(zio_t *zio);
extern void zio_vdev_io_reissue(zio_t *zio);
@@ -454,8 +494,12 @@
extern void zio_checksum_verified(zio_t *zio);
extern int zio_worst_error(int e1, int e2);
-extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
-extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
+extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
+ enum zio_checksum parent);
+extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
+ enum zio_checksum child, enum zio_checksum parent);
+extern enum zio_compress zio_compress_select(enum zio_compress child,
+ enum zio_compress parent);
extern void zio_suspend(spa_t *spa, zio_t *zio);
extern int zio_resume(spa_t *spa);
--- a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h Sun Nov 01 14:14:46 2009 -0800
@@ -27,6 +27,7 @@
#define _SYS_ZIO_CHECKSUM_H
#include <sys/zio.h>
+#include <zfs_fletcher.h>
#ifdef __cplusplus
extern "C" {
@@ -44,6 +45,7 @@
zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
int ci_correctable; /* number of correctable bits */
int ci_zbt; /* uses zio block tail? */
+ int ci_dedup; /* strong enough for dedup? */
char *ci_name; /* descriptive name */
} zio_checksum_info_t;
@@ -61,14 +63,6 @@
/*
* Checksum routines.
*/
-extern zio_checksum_t fletcher_2_native;
-extern zio_checksum_t fletcher_4_native;
-extern zio_checksum_t fletcher_4_incremental_native;
-
-extern zio_checksum_t fletcher_2_byteswap;
-extern zio_checksum_t fletcher_4_byteswap;
-extern zio_checksum_t fletcher_4_incremental_byteswap;
-
extern zio_checksum_t zio_checksum_SHA256;
extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
--- a/usr/src/uts/common/fs/zfs/sys/zio_compress.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zio_compress.h Sun Nov 01 14:14:46 2009 -0800
@@ -20,15 +20,13 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_ZIO_COMPRESS_H
#define _SYS_ZIO_COMPRESS_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zio.h>
#ifdef __cplusplus
@@ -66,14 +64,18 @@
int level);
extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
+extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
+extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len,
+ int level);
/*
* Compress and decompress data if necessary.
*/
-extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize,
- void **destp, uint64_t *destsizep, uint64_t *destbufsizep);
-extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
- void *dest, uint64_t destsize);
+extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst,
+ size_t s_len);
+extern int zio_decompress_data(enum zio_compress c, void *src, void *dst,
+ size_t s_len, size_t d_len);
#ifdef __cplusplus
}
--- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h Sun Nov 01 14:14:46 2009 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -34,104 +34,136 @@
#endif
/*
- * I/O Groups: pipeline stage definitions.
+ * zio pipeline stage definitions
*/
-typedef enum zio_stage {
- ZIO_STAGE_OPEN = 0, /* RWFCI */
+enum zio_stage {
+ ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */
- ZIO_STAGE_ISSUE_ASYNC, /* -W--- */
+ ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */
+ ZIO_STAGE_FREE_BP_INIT = 1 << 2, /* --F-- */
+ ZIO_STAGE_ISSUE_ASYNC = 1 << 3, /* RWF-- */
+ ZIO_STAGE_WRITE_BP_INIT = 1 << 4, /* -W--- */
- ZIO_STAGE_READ_BP_INIT, /* R---- */
- ZIO_STAGE_WRITE_BP_INIT, /* -W--- */
+ ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */
- ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */
+ ZIO_STAGE_DDT_READ_START = 1 << 6, /* R---- */
+ ZIO_STAGE_DDT_READ_DONE = 1 << 7, /* R---- */
+ ZIO_STAGE_DDT_WRITE = 1 << 8, /* -W--- */
+ ZIO_STAGE_DDT_FREE = 1 << 9, /* --F-- */
- ZIO_STAGE_GANG_ASSEMBLE, /* RWFC- */
- ZIO_STAGE_GANG_ISSUE, /* RWFC- */
+ ZIO_STAGE_GANG_ASSEMBLE = 1 << 10, /* RWFC- */
+ ZIO_STAGE_GANG_ISSUE = 1 << 11, /* RWFC- */
- ZIO_STAGE_DVA_ALLOCATE, /* -W--- */
- ZIO_STAGE_DVA_FREE, /* --F-- */
- ZIO_STAGE_DVA_CLAIM, /* ---C- */
+ ZIO_STAGE_DVA_ALLOCATE = 1 << 12, /* -W--- */
+ ZIO_STAGE_DVA_FREE = 1 << 13, /* --F-- */
+ ZIO_STAGE_DVA_CLAIM = 1 << 14, /* ---C- */
- ZIO_STAGE_READY, /* RWFCI */
+ ZIO_STAGE_READY = 1 << 15, /* RWFCI */
- ZIO_STAGE_VDEV_IO_START, /* RW--I */
- ZIO_STAGE_VDEV_IO_DONE, /* RW--I */
- ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */
+ ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */
+ ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--I */
+ ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */
- ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */
+ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */
- ZIO_STAGE_DONE, /* RWFCI */
- ZIO_STAGES
-} zio_stage_t;
+ ZIO_STAGE_DONE = 1 << 20 /* RWFCI */
+};
-#define ZIO_INTERLOCK_STAGES \
- ((1U << ZIO_STAGE_READY) | \
- (1U << ZIO_STAGE_DONE))
+#define ZIO_INTERLOCK_STAGES \
+ (ZIO_STAGE_READY | \
+ ZIO_STAGE_DONE)
-#define ZIO_INTERLOCK_PIPELINE \
+#define ZIO_INTERLOCK_PIPELINE \
ZIO_INTERLOCK_STAGES
-#define ZIO_VDEV_IO_STAGES \
- ((1U << ZIO_STAGE_VDEV_IO_START) | \
- (1U << ZIO_STAGE_VDEV_IO_DONE) | \
- (1U << ZIO_STAGE_VDEV_IO_ASSESS))
+#define ZIO_VDEV_IO_STAGES \
+ (ZIO_STAGE_VDEV_IO_START | \
+ ZIO_STAGE_VDEV_IO_DONE | \
+ ZIO_STAGE_VDEV_IO_ASSESS)
-#define ZIO_VDEV_CHILD_PIPELINE \
- (ZIO_VDEV_IO_STAGES | \
- (1U << ZIO_STAGE_DONE))
+#define ZIO_VDEV_CHILD_PIPELINE \
+ (ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_DONE)
-#define ZIO_READ_COMMON_STAGES \
- (ZIO_INTERLOCK_STAGES | \
- ZIO_VDEV_IO_STAGES | \
- (1U << ZIO_STAGE_CHECKSUM_VERIFY))
+#define ZIO_READ_COMMON_STAGES \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_CHECKSUM_VERIFY)
-#define ZIO_READ_PHYS_PIPELINE \
+#define ZIO_READ_PHYS_PIPELINE \
ZIO_READ_COMMON_STAGES
-#define ZIO_READ_PIPELINE \
- (ZIO_READ_COMMON_STAGES | \
- (1U << ZIO_STAGE_READ_BP_INIT))
+#define ZIO_READ_PIPELINE \
+ (ZIO_READ_COMMON_STAGES | \
+ ZIO_STAGE_READ_BP_INIT)
+
+#define ZIO_DDT_CHILD_READ_PIPELINE \
+ ZIO_READ_COMMON_STAGES
-#define ZIO_WRITE_COMMON_STAGES \
- (ZIO_INTERLOCK_STAGES | \
- ZIO_VDEV_IO_STAGES | \
- (1U << ZIO_STAGE_ISSUE_ASYNC) | \
- (1U << ZIO_STAGE_CHECKSUM_GENERATE))
+#define ZIO_DDT_READ_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_READ_BP_INIT | \
+ ZIO_STAGE_DDT_READ_START | \
+ ZIO_STAGE_DDT_READ_DONE)
-#define ZIO_WRITE_PHYS_PIPELINE \
+#define ZIO_WRITE_COMMON_STAGES \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_CHECKSUM_GENERATE)
+
+#define ZIO_WRITE_PHYS_PIPELINE \
ZIO_WRITE_COMMON_STAGES
-#define ZIO_REWRITE_PIPELINE \
- (ZIO_WRITE_COMMON_STAGES | \
- (1U << ZIO_STAGE_WRITE_BP_INIT))
+#define ZIO_REWRITE_PIPELINE \
+ (ZIO_WRITE_COMMON_STAGES | \
+ ZIO_STAGE_WRITE_BP_INIT)
+
+#define ZIO_WRITE_PIPELINE \
+ (ZIO_WRITE_COMMON_STAGES | \
+ ZIO_STAGE_WRITE_BP_INIT | \
+ ZIO_STAGE_DVA_ALLOCATE)
-#define ZIO_WRITE_PIPELINE \
- (ZIO_WRITE_COMMON_STAGES | \
- (1U << ZIO_STAGE_WRITE_BP_INIT) | \
- (1U << ZIO_STAGE_DVA_ALLOCATE))
+#define ZIO_DDT_CHILD_WRITE_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_DVA_ALLOCATE)
-#define ZIO_GANG_STAGES \
- ((1U << ZIO_STAGE_GANG_ASSEMBLE) | \
- (1U << ZIO_STAGE_GANG_ISSUE))
+#define ZIO_DDT_WRITE_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_WRITE_BP_INIT | \
+ ZIO_STAGE_CHECKSUM_GENERATE | \
+ ZIO_STAGE_DDT_WRITE)
+
+#define ZIO_GANG_STAGES \
+ (ZIO_STAGE_GANG_ASSEMBLE | \
+ ZIO_STAGE_GANG_ISSUE)
-#define ZIO_FREE_PIPELINE \
- (ZIO_INTERLOCK_STAGES | \
- (1U << ZIO_STAGE_DVA_FREE))
+#define ZIO_FREE_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_FREE_BP_INIT | \
+ ZIO_STAGE_DVA_FREE)
-#define ZIO_CLAIM_PIPELINE \
- (ZIO_INTERLOCK_STAGES | \
- (1U << ZIO_STAGE_DVA_CLAIM))
+#define ZIO_DDT_FREE_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_FREE_BP_INIT | \
+ ZIO_STAGE_ISSUE_ASYNC | \
+ ZIO_STAGE_DDT_FREE)
-#define ZIO_IOCTL_PIPELINE \
- (ZIO_INTERLOCK_STAGES | \
- (1U << ZIO_STAGE_VDEV_IO_START) | \
- (1U << ZIO_STAGE_VDEV_IO_ASSESS))
+#define ZIO_CLAIM_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_DVA_CLAIM)
-#define ZIO_CONFIG_LOCK_BLOCKING_STAGES \
- ((1U << ZIO_STAGE_VDEV_IO_START) | \
- (1U << ZIO_STAGE_DVA_ALLOCATE) | \
- (1U << ZIO_STAGE_DVA_CLAIM))
+#define ZIO_IOCTL_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_VDEV_IO_START | \
+ ZIO_STAGE_VDEV_IO_ASSESS)
+
+#define ZIO_BLOCKING_STAGES \
+ (ZIO_STAGE_DVA_ALLOCATE | \
+ ZIO_STAGE_DVA_CLAIM | \
+ ZIO_STAGE_VDEV_IO_START)
extern void zio_inject_init(void);
extern void zio_inject_fini(void);
--- a/usr/src/uts/common/fs/zfs/txg.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/txg.c Sun Nov 01 14:14:46 2009 -0800
@@ -64,7 +64,6 @@
}
}
- rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL);
mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
@@ -87,7 +86,6 @@
ASSERT(tx->tx_threads == 0);
- rw_destroy(&tx->tx_suspend);
mutex_destroy(&tx->tx_sync_lock);
cv_destroy(&tx->tx_sync_more_cv);
@@ -397,8 +395,6 @@
if (tx->tx_exiting)
txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
- rw_enter(&tx->tx_suspend, RW_WRITER);
-
/*
* Consume the quiesced txg which has been handed off to
* us. This may cause the quiescing thread to now be
@@ -408,7 +404,6 @@
tx->tx_quiesced_txg = 0;
tx->tx_syncing_txg = txg;
cv_broadcast(&tx->tx_quiesce_more_cv);
- rw_exit(&tx->tx_suspend);
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
@@ -419,10 +414,8 @@
delta = lbolt - start;
mutex_enter(&tx->tx_sync_lock);
- rw_enter(&tx->tx_suspend, RW_WRITER);
tx->tx_synced_txg = txg;
tx->tx_syncing_txg = 0;
- rw_exit(&tx->tx_suspend);
cv_broadcast(&tx->tx_sync_done_cv);
/*
@@ -514,7 +507,7 @@
mutex_enter(&tx->tx_sync_lock);
ASSERT(tx->tx_threads == 2);
if (txg == 0)
- txg = tx->tx_open_txg;
+ txg = tx->tx_open_txg + TXG_DEFER_SIZE;
if (tx->tx_sync_txg_waiting < txg)
tx->tx_sync_txg_waiting = txg;
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
@@ -565,21 +558,6 @@
tx->tx_quiesced_txg != 0);
}
-void
-txg_suspend(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- /* XXX some code paths suspend when they are already suspended! */
- rw_enter(&tx->tx_suspend, RW_READER);
-}
-
-void
-txg_resume(dsl_pool_t *dp)
-{
- tx_state_t *tx = &dp->dp_tx;
- rw_exit(&tx->tx_suspend);
-}
-
/*
* Per-txg object lists.
*/
--- a/usr/src/uts/common/fs/zfs/vdev.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/vdev.c Sun Nov 01 14:14:46 2009 -0800
@@ -409,10 +409,7 @@
if (ops == &vdev_raidz_ops) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
&nparity) == 0) {
- /*
- * Currently, we can only support 3 parity devices.
- */
- if (nparity == 0 || nparity > 3)
+ if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
return (EINVAL);
/*
* Previous versions could only support 1 or 2 parity
@@ -567,6 +564,7 @@
vdev_close(vd);
ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
+ ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
/*
* Free all children.
@@ -816,9 +814,9 @@
ASSERT(oldc <= newc);
if (vd->vdev_islog)
- mc = spa->spa_log_class;
+ mc = spa_log_class(spa);
else
- mc = spa->spa_normal_class;
+ mc = spa_normal_class(spa);
if (vd->vdev_mg == NULL)
vd->vdev_mg = metaslab_group_create(mc, vd);
@@ -1573,7 +1571,7 @@
vdev_dtl_reassess(vd->vdev_child[c], txg,
scrub_txg, scrub_done);
- if (vd == spa->spa_root_vdev || vd->vdev_ishole)
+ if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
return;
if (vd->vdev_ops->vdev_op_leaf) {
@@ -2171,11 +2169,10 @@
return (spa_vdev_state_exit(spa, NULL, EBUSY));
/*
- * If the top-level is a slog and it's had allocations
- * then proceed. We check that the vdev's metaslab
- * grop is not NULL since it's possible that we may
- * have just added this vdev and have not yet initialized
- * it's metaslabs.
+ * If the top-level is a slog and it has had allocations
+ * then proceed. We check that the vdev's metaslab group
+ * is not NULL since it's possible that we may have just
+ * added this vdev but not yet initialized its metaslabs.
*/
if (tvd->vdev_islog && mg != NULL) {
/*
@@ -2496,14 +2493,17 @@
if (type == ZIO_TYPE_WRITE && txg != 0 &&
(!(flags & ZIO_FLAG_IO_REPAIR) ||
- (flags & ZIO_FLAG_SCRUB_THREAD))) {
+ (flags & ZIO_FLAG_SCRUB_THREAD) ||
+ spa->spa_claiming)) {
/*
- * This is either a normal write (not a repair), or it's a
- * repair induced by the scrub thread. In the normal case,
- * we commit the DTL change in the same txg as the block
- * was born. In the scrub-induced repair case, we know that
- * scrubs run in first-pass syncing context, so we commit
- * the DTL change in spa->spa_syncing_txg.
+ * This is either a normal write (not a repair), or it's
+ * a repair induced by the scrub thread, or it's a repair
+ * made by zil_claim() during spa_load() in the first txg.
+ * In the normal case, we commit the DTL change in the same
+ * txg as the block was born. In the scrub-induced repair
+ * case, we know that scrubs run in first-pass syncing context,
+ * so we commit the DTL change in spa_syncing_txg(spa).
+ * In the zil_claim() case, we commit in spa_first_txg(spa).
*
* We currently do not make DTL entries for failed spontaneous
* self-healing writes triggered by normal (non-scrubbing)
@@ -2516,9 +2516,12 @@
ASSERT(flags & ZIO_FLAG_IO_REPAIR);
ASSERT(spa_sync_pass(spa) == 1);
vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
- commit_txg = spa->spa_syncing_txg;
+ commit_txg = spa_syncing_txg(spa);
+ } else if (spa->spa_claiming) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ commit_txg = spa_first_txg(spa);
}
- ASSERT(commit_txg >= spa->spa_syncing_txg);
+ ASSERT(commit_txg >= spa_syncing_txg(spa));
if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
return;
for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
@@ -2560,15 +2563,18 @@
}
/*
- * Update the in-core space usage stats for this vdev and the root vdev.
+ * Update the in-core space usage stats for this vdev, its metaslab class,
+ * and the root vdev.
*/
void
-vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
- int64_t defer_delta, boolean_t update_root)
+vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
+ int64_t space_delta)
{
int64_t dspace_delta = space_delta;
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
+ metaslab_group_t *mg = vd->vdev_mg;
+ metaslab_class_t *mc = mg ? mg->mg_class : NULL;
ASSERT(vd == vd->vdev_top);
@@ -2584,29 +2590,25 @@
vd->vdev_deflate_ratio;
mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_alloc += alloc_delta;
vd->vdev_stat.vs_space += space_delta;
- vd->vdev_stat.vs_alloc += alloc_delta;
vd->vdev_stat.vs_dspace += dspace_delta;
- vd->vdev_stat.vs_defer += defer_delta;
mutex_exit(&vd->vdev_stat_lock);
- if (update_root) {
+ if (mc == spa_normal_class(spa)) {
+ mutex_enter(&rvd->vdev_stat_lock);
+ rvd->vdev_stat.vs_alloc += alloc_delta;
+ rvd->vdev_stat.vs_space += space_delta;
+ rvd->vdev_stat.vs_dspace += dspace_delta;
+ mutex_exit(&rvd->vdev_stat_lock);
+ }
+
+ if (mc != NULL) {
ASSERT(rvd == vd->vdev_parent);
ASSERT(vd->vdev_ms_count != 0);
- /*
- * Don't count non-normal (e.g. intent log) space as part of
- * the pool's capacity.
- */
- if (vd->vdev_islog)
- return;
-
- mutex_enter(&rvd->vdev_stat_lock);
- rvd->vdev_stat.vs_space += space_delta;
- rvd->vdev_stat.vs_alloc += alloc_delta;
- rvd->vdev_stat.vs_dspace += dspace_delta;
- rvd->vdev_stat.vs_defer += defer_delta;
- mutex_exit(&rvd->vdev_stat_lock);
+ metaslab_class_space_update(mc,
+ alloc_delta, defer_delta, space_delta, dspace_delta);
}
}
@@ -2722,7 +2724,7 @@
(dsl_pool_sync_context(spa_get_dsl(spa)) &&
spa_config_held(spa, SCL_STATE, RW_READER)));
- if (!list_link_active(&vd->vdev_state_dirty_node))
+ if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
list_insert_head(&spa->spa_state_dirty_list, vd);
}
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c Sun Nov 01 14:14:46 2009 -0800
@@ -214,7 +214,7 @@
uint64_t txg = zio->io_txg;
int i, c;
- ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
+ ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
/*
* Try to find a child whose DTL doesn't contain the block to read.
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c Sun Nov 01 14:14:46 2009 -0800
@@ -24,7 +24,6 @@
*/
#include <sys/zfs_context.h>
-#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/avl.h>
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c Sun Nov 01 14:14:46 2009 -0800
@@ -129,7 +129,6 @@
#define VDEV_RAIDZ_P 0
#define VDEV_RAIDZ_Q 1
#define VDEV_RAIDZ_R 2
-#define VDEV_RAIDZ_MAXPARITY 3
#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
@@ -1524,7 +1523,6 @@
vdev_t *vd = zio->io_vd;
vdev_t *tvd = vd->vdev_top;
vdev_t *cvd;
- blkptr_t *bp = zio->io_bp;
raidz_map_t *rm;
raidz_col_t *rc;
int c, i;
@@ -1585,7 +1583,7 @@
rc->rc_skipped = 1;
continue;
}
- if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
+ if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
if (c >= rm->rm_firstdatacol)
rm->rm_missingdata++;
else
--- a/usr/src/uts/common/fs/zfs/zap.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zap.c Sun Nov 01 14:14:46 2009 -0800
@@ -70,7 +70,7 @@
}
void
-fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
+fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
{
dmu_buf_t *db;
zap_leaf_t *l;
@@ -102,6 +102,7 @@
zp->zap_num_entries = 0;
zp->zap_salt = zap->zap_salt;
zp->zap_normflags = zap->zap_normflags;
+ zp->zap_flags = flags;
/* block 1 will be the first leaf */
for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
@@ -315,8 +316,13 @@
static int
zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
{
- /* In case things go horribly wrong. */
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2)
+ /*
+ * The pointer table should never use more hash bits than we
+ * have (otherwise we'd be using useless zero bits to index it).
+ * If we are within 2 bits of running out, stop growing, since
+ * this is already an aberrant condition.
+ */
+ if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
return (ENOSPC);
if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
@@ -702,9 +708,9 @@
static int
-fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers)
+fzap_checksize(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
{
- if (name && strlen(name) > ZAP_MAXNAMELEN)
+ if (zn->zn_key_orig_len > ZAP_MAXNAMELEN)
return (E2BIG);
/* Only integer sizes supported by C */
@@ -736,7 +742,7 @@
int err;
zap_entry_handle_t zeh;
- err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
+ err = fzap_checksize(zn, integer_size, num_integers);
if (err != 0)
return (err);
@@ -746,7 +752,7 @@
err = zap_leaf_lookup(l, zn, &zeh);
if (err == 0) {
err = zap_entry_read(&zeh, integer_size, num_integers, buf);
- (void) zap_entry_read_name(&zeh, rn_len, realname);
+ (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
if (ncp) {
*ncp = zap_entry_normalization_conflict(&zeh,
zn, NULL, zn->zn_zap);
@@ -769,8 +775,7 @@
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
ASSERT(!zap->zap_ismicro);
- ASSERT(fzap_checksize(zn->zn_name_orij,
- integer_size, num_integers) == 0);
+ ASSERT(fzap_checksize(zn, integer_size, num_integers) == 0);
err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
if (err != 0)
@@ -784,7 +789,7 @@
if (err != ENOENT)
goto out;
- err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, cd,
+ err = zap_entry_create(l, zn, cd,
integer_size, num_integers, val, &zeh);
if (err == 0) {
@@ -807,12 +812,12 @@
uint64_t integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx)
{
- int err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
+ int err = fzap_checksize(zn, integer_size, num_integers);
if (err != 0)
return (err);
return (fzap_add_cd(zn, integer_size, num_integers,
- val, ZAP_MAXCD, tx));
+ val, ZAP_NEED_CD, tx));
}
int
@@ -825,7 +830,7 @@
zap_t *zap = zn->zn_zap;
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers);
+ err = fzap_checksize(zn, integer_size, num_integers);
if (err != 0)
return (err);
@@ -838,8 +843,8 @@
ASSERT(err == 0 || err == ENOENT);
if (create) {
- err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash,
- ZAP_MAXCD, integer_size, num_integers, val, &zeh);
+ err = zap_entry_create(l, zn, ZAP_NEED_CD,
+ integer_size, num_integers, val, &zeh);
if (err == 0)
zap_increment_num_entries(zap, 1, tx);
} else {
@@ -1013,6 +1018,8 @@
zap_entry_handle_t zeh;
zap_leaf_t *l;
+ /* memset(za, 0xba, sizeof (zap_attribute_t)); */
+
/* retrieve the next entry at or after zc_hash/zc_cd */
/* if no entry, return ENOENT */
@@ -1063,7 +1070,7 @@
err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
ASSERT(err == 0 || err == EOVERFLOW);
}
- err = zap_entry_read_name(&zeh,
+ err = zap_entry_read_name(zap, &zeh,
sizeof (za->za_name), za->za_name);
ASSERT(err == 0);
@@ -1109,7 +1116,7 @@
zap_leaf_t *l;
zap_entry_handle_t zeh;
- if (zn->zn_name_orij && strlen(zn->zn_name_orij) > ZAP_MAXNAMELEN)
+ if (zn->zn_key_orig_len > ZAP_MAXNAMELEN)
return (E2BIG);
err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l);
--- a/usr/src/uts/common/fs/zfs/zap_leaf.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zap_leaf.c Sun Nov 01 14:14:46 2009 -0800
@@ -29,6 +29,7 @@
* the names are stored null-terminated.
*/
+#include <sys/zio.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zfs_context.h>
@@ -272,11 +273,12 @@
static void
zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
- char *buf)
+ void *buf)
{
int len = MIN(array_len, buf_len);
int byten = 0;
uint64_t value = 0;
+ char *p = buf;
ASSERT3U(array_int_len, <=, buf_int_len);
@@ -284,7 +286,7 @@
if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
uint8_t *ip = la->la_array;
- uint64_t *buf64 = (uint64_t *)buf;
+ uint64_t *buf64 = buf;
*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
(uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
@@ -299,8 +301,8 @@
while (chunk != CHAIN_END) {
struct zap_leaf_array *la =
&ZAP_LEAF_CHUNK(l, chunk).l_array;
- bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES);
- buf += ZAP_LEAF_ARRAY_BYTES;
+ bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
+ p += ZAP_LEAF_ARRAY_BYTES;
chunk = la->la_next;
}
return;
@@ -315,12 +317,12 @@
value = (value << 8) | la->la_array[i];
byten++;
if (byten == array_int_len) {
- stv(buf_int_len, buf, value);
+ stv(buf_int_len, p, value);
byten = 0;
len--;
if (len == 0)
return;
- buf += buf_int_len;
+ p += buf_int_len;
}
}
chunk = la->la_next;
@@ -328,7 +330,6 @@
}
/*
- * Only to be used on 8-bit arrays.
* array_len is actual len in bytes (not encoded le_value_length).
* namenorm is null-terminated.
*/
@@ -337,23 +338,44 @@
{
int bseen = 0;
+ if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
+ uint64_t *thiskey;
+ boolean_t match;
+
+ ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
+ thiskey = kmem_alloc(array_len * sizeof (*thiskey), KM_SLEEP);
+
+ zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_len,
+ sizeof (*thiskey), array_len, thiskey);
+ match = bcmp(thiskey, zn->zn_key_orig,
+ array_len * sizeof (*thiskey)) == 0;
+ kmem_free(thiskey, array_len * sizeof (*thiskey));
+ return (match);
+ }
+
if (zn->zn_matchtype == MT_FIRST) {
char *thisname = kmem_alloc(array_len, KM_SLEEP);
boolean_t match;
- zap_leaf_array_read(l, chunk, 1, array_len, 1,
- array_len, thisname);
+ zap_leaf_array_read(l, chunk, sizeof (char), array_len,
+ sizeof (char), array_len, thisname);
match = zap_match(zn, thisname);
kmem_free(thisname, array_len);
return (match);
}
- /* Fast path for exact matching */
+ /*
+ * Fast path for exact matching.
+ * First check that the lengths match, so that we don't read
+ * past the end of the zn_key_orig array.
+ */
+ if (array_len != zn->zn_key_orig_len)
+ return (B_FALSE);
while (bseen < array_len) {
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- if (bcmp(la->la_array, zn->zn_name_orij + bseen, toread))
+ if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread))
break;
chunk = la->la_next;
bseen += toread;
@@ -426,7 +448,7 @@
{
uint16_t chunk;
uint64_t besth = -1ULL;
- uint32_t bestcd = ZAP_MAXCD;
+ uint32_t bestcd = -1U;
uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
uint16_t lh;
struct zap_leaf_entry *le;
@@ -459,7 +481,7 @@
}
}
- return (bestcd == ZAP_MAXCD ? ENOENT : 0);
+ return (bestcd == -1U ? ENOENT : 0);
}
int
@@ -483,14 +505,20 @@
}
int
-zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf)
+zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
+ char *buf)
{
struct zap_leaf_entry *le =
ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
- zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
- le->le_name_length, 1, buflen, buf);
+ if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
+ le->le_name_length, 8, buflen / 8, buf);
+ } else {
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
+ le->le_name_length, 1, buflen, buf);
+ }
if (le->le_name_length > buflen)
return (EOVERFLOW);
return (0);
@@ -549,26 +577,26 @@
}
int
-zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd,
+zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
uint8_t integer_size, uint64_t num_integers, const void *buf,
zap_entry_handle_t *zeh)
{
uint16_t chunk;
uint16_t *chunkp;
struct zap_leaf_entry *le;
- uint64_t namelen, valuelen;
+ uint64_t valuelen;
int numchunks;
+ uint64_t h = zn->zn_hash;
valuelen = integer_size * num_integers;
- namelen = strlen(name) + 1;
- ASSERT(namelen >= 2);
- numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(namelen) +
+ numchunks = 1 +
+ ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_len * zn->zn_key_intlen) +
ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
return (E2BIG);
- if (cd == ZAP_MAXCD) {
+ if (cd == ZAP_NEED_CD) {
/* find the lowest unused cd */
if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
cd = 0;
@@ -585,7 +613,7 @@
}
} else {
/* old unsorted format; do it the O(n^2) way */
- for (cd = 0; cd < ZAP_MAXCD; cd++) {
+ for (cd = 0; ; cd++) {
for (chunk = *LEAF_HASH_ENTPTR(l, h);
chunk != CHAIN_END; chunk = le->le_next) {
le = ZAP_LEAF_ENTRY(l, chunk);
@@ -600,10 +628,10 @@
}
}
/*
- * we would run out of space in a block before we could
- * have ZAP_MAXCD entries
+ * We would run out of space in a block before we could
+ * store enough entries to run out of CD values.
*/
- ASSERT3U(cd, <, ZAP_MAXCD);
+ ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
}
if (l->l_phys->l_hdr.lh_nfree < numchunks)
@@ -613,8 +641,9 @@
chunk = zap_leaf_chunk_alloc(l);
le = ZAP_LEAF_ENTRY(l, chunk);
le->le_type = ZAP_CHUNK_ENTRY;
- le->le_name_chunk = zap_leaf_array_create(l, name, 1, namelen);
- le->le_name_length = namelen;
+ le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
+ zn->zn_key_intlen, zn->zn_key_orig_len);
+ le->le_name_length = zn->zn_key_orig_len;
le->le_value_chunk =
zap_leaf_array_create(l, buf, integer_size, num_integers);
le->le_value_length = num_integers;
--- a/usr/src/uts/common/fs/zfs/zap_micro.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zap_micro.c Sun Nov 01 14:14:46 2009 -0800
@@ -23,6 +23,7 @@
* Use is subject to license terms.
*/
+#include <sys/zio.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zfs_context.h>
@@ -36,33 +37,74 @@
#include <sys/sunddi.h>
#endif
-static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx);
+static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
+
+uint64_t
+zap_getflags(zap_t *zap)
+{
+ if (zap->zap_ismicro)
+ return (0);
+ return (zap->zap_u.zap_fat.zap_phys->zap_flags);
+}
+int
+zap_hashbits(zap_t *zap)
+{
+ if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+ return (48);
+ else
+ return (28);
+}
+
+uint32_t
+zap_maxcd(zap_t *zap)
+{
+ if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+ return ((1<<16)-1);
+ else
+ return (-1U);
+}
static uint64_t
-zap_hash(zap_t *zap, const char *normname)
+zap_hash(zap_name_t *zn)
{
- const uint8_t *cp;
- uint8_t c;
- uint64_t crc = zap->zap_salt;
+ zap_t *zap = zn->zn_zap;
+ uint64_t h = 0;
- /* NB: name must already be normalized, if necessary */
+ if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
+ ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
+ h = *(uint64_t *)zn->zn_key_orig;
+ } else {
+ const uint8_t *cp = (const uint8_t *)zn->zn_key_norm;
+ int i, len;
+
+ h = zap->zap_salt;
+ ASSERT(h != 0);
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+ len = zn->zn_key_norm_len;
- ASSERT(crc != 0);
- ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
- for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) {
- crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
+ /*
+ * Because we previously stored the terminating null on
+ * disk, but didn't hash it, we need to continue to not
+ * hash it. (The zn_key_*_len includes the terminating
+ * null for non-binary keys.)
+ */
+ if (!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY))
+ len--;
+
+ for (i = 0; i < len; cp++, i++)
+ h = (h >> 8) ^ zfs_crc64_table[(h ^ *cp) & 0xFF];
+
}
-
/*
- * Only use 28 bits, since we need 4 bits in the cookie for the
- * collision differentiator. We MUST use the high bits, since
- * those are the ones that we first pay attention to when
+ * Don't use all 64 bits, since we need some in the cookie for
+ * the collision differentiator. We MUST use the high bits,
+ * since those are the ones that we first pay attention to when
* chosing the bucket.
*/
- crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+ h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
- return (crc);
+ return (h);
}
static int
@@ -71,6 +113,8 @@
size_t inlen, outlen;
int err;
+ ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
+
inlen = strlen(name) + 1;
outlen = ZAP_MAXNAMELEN;
@@ -85,16 +129,18 @@
boolean_t
zap_match(zap_name_t *zn, const char *matchname)
{
+ ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
+
if (zn->zn_matchtype == MT_FIRST) {
char norm[ZAP_MAXNAMELEN];
if (zap_normalize(zn->zn_zap, matchname, norm) != 0)
return (B_FALSE);
- return (strcmp(zn->zn_name_norm, norm) == 0);
+ return (strcmp(zn->zn_key_norm, norm) == 0);
} else {
/* MT_BEST or MT_EXACT */
- return (strcmp(zn->zn_name_orij, matchname) == 0);
+ return (strcmp(zn->zn_key_orig, matchname) == 0);
}
}
@@ -104,30 +150,49 @@
kmem_free(zn, sizeof (zap_name_t));
}
-/* XXX combine this with zap_lockdir()? */
zap_name_t *
-zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt)
+zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
{
zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
zn->zn_zap = zap;
- zn->zn_name_orij = name;
+ zn->zn_key_intlen = sizeof (*key);
+ zn->zn_key_orig = key;
+ zn->zn_key_orig_len = strlen(zn->zn_key_orig) + 1;
zn->zn_matchtype = mt;
if (zap->zap_normflags) {
- if (zap_normalize(zap, name, zn->zn_normbuf) != 0) {
+ if (zap_normalize(zap, key, zn->zn_normbuf) != 0) {
zap_name_free(zn);
return (NULL);
}
- zn->zn_name_norm = zn->zn_normbuf;
+ zn->zn_key_norm = zn->zn_normbuf;
+ zn->zn_key_norm_len = strlen(zn->zn_key_norm) + 1;
} else {
if (mt != MT_EXACT) {
zap_name_free(zn);
return (NULL);
}
- zn->zn_name_norm = zn->zn_name_orij;
+ zn->zn_key_norm = zn->zn_key_orig;
+ zn->zn_key_norm_len = zn->zn_key_orig_len;
}
- zn->zn_hash = zap_hash(zap, zn->zn_name_norm);
+ zn->zn_hash = zap_hash(zn);
+ return (zn);
+}
+
+zap_name_t *
+zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
+{
+ zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
+
+ ASSERT(zap->zap_normflags == 0);
+ zn->zn_zap = zap;
+ zn->zn_key_intlen = sizeof (*key);
+ zn->zn_key_orig = zn->zn_key_norm = key;
+ zn->zn_key_orig_len = zn->zn_key_norm_len = numints;
+ zn->zn_matchtype = MT_EXACT;
+
+ zn->zn_hash = zap_hash(zn);
return (zn);
}
@@ -186,7 +251,7 @@
ASSERT(zap->zap_ismicro);
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- ASSERT(mzep->mze_cd < ZAP_MAXCD);
+ ASSERT(mzep->mze_cd < zap_maxcd(zap));
mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
mze->mze_chunkid = chunkid;
@@ -206,9 +271,6 @@
ASSERT(zn->zn_zap->zap_ismicro);
ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
- if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name))
- return (NULL);
-
mze_tofind.mze_hash = zn->zn_hash;
mze_tofind.mze_phys.mze_cd = 0;
@@ -421,7 +483,7 @@
dprintf("upgrading obj %llu: num_entries=%u\n",
obj, zap->zap_m.zap_num_entries);
*zapp = zap;
- return (mzap_upgrade(zapp, tx));
+ return (mzap_upgrade(zapp, tx, 0));
}
err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
ASSERT3U(err, ==, 0);
@@ -441,10 +503,11 @@
}
static int
-mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
+mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
{
mzap_phys_t *mzp;
- int i, sz, nchunks, err;
+ int i, sz, nchunks;
+ int err = 0;
zap_t *zap = *zapp;
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
@@ -454,11 +517,13 @@
bcopy(zap->zap_dbuf->db_data, mzp, sz);
nchunks = zap->zap_m.zap_num_chunks;
- err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
- 1ULL << fzap_default_block_shift, 0, tx);
- if (err) {
- kmem_free(mzp, sz);
- return (err);
+ if (!flags) {
+ err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
+ 1ULL << fzap_default_block_shift, 0, tx);
+ if (err) {
+ kmem_free(mzp, sz);
+ return (err);
+ }
}
dprintf("upgrading obj=%llu with %u chunks\n",
@@ -466,10 +531,9 @@
/* XXX destroy the avl later, so we can use the stored hash value */
mze_destroy(zap);
- fzap_upgrade(zap, tx);
+ fzap_upgrade(zap, tx, flags);
for (i = 0; i < nchunks; i++) {
- int err;
mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
zap_name_t *zn;
if (mze->mze_name[0] == 0)
@@ -489,7 +553,8 @@
}
static void
-mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx)
+mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
+ dmu_tx_t *tx)
{
dmu_buf_t *db;
mzap_phys_t *zp;
@@ -510,6 +575,15 @@
zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
zp->mz_normflags = normflags;
dmu_buf_rele(db, FTAG);
+
+ if (flags != 0) {
+ zap_t *zap;
+ /* Only fat zap supports flags; upgrade immediately. */
+ VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER,
+ B_FALSE, B_FALSE, &zap));
+ VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags));
+ zap_unlockdir(zap);
+ }
}
int
@@ -530,7 +604,7 @@
err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
if (err != 0)
return (err);
- mzap_create_impl(os, obj, normflags, tx);
+ mzap_create_impl(os, obj, normflags, 0, tx);
return (0);
}
@@ -547,7 +621,26 @@
{
uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
- mzap_create_impl(os, obj, normflags, tx);
+ mzap_create_impl(os, obj, normflags, 0, tx);
+ return (obj);
+}
+
+uint64_t
+zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
+
+ ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
+ leaf_blockshift <= SPA_MAXBLOCKSHIFT &&
+ indirect_blockshift >= SPA_MINBLOCKSHIFT &&
+ indirect_blockshift <= SPA_MAXBLOCKSHIFT);
+
+ VERIFY(dmu_object_set_blocksize(os, obj,
+ 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
+
+ mzap_create_impl(os, obj, normflags, flags, tx);
return (obj);
}
@@ -699,6 +792,30 @@
}
int
+zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ zap_t *zap;
+ int err;
+ zap_name_t *zn;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap);
+ return (ENOTSUP);
+ }
+
+ err = fzap_lookup(zn, integer_size, num_integers, buf,
+ NULL, 0, NULL);
+ zap_name_free(zn);
+ zap_unlockdir(zap);
+ return (err);
+}
+
+int
zap_length(objset_t *os, uint64_t zapobj, const char *name,
uint64_t *integer_size, uint64_t *num_integers)
{
@@ -733,6 +850,28 @@
return (err);
}
+int
+zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_t *zap;
+ int err;
+ zap_name_t *zn;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap);
+ return (ENOTSUP);
+ }
+ err = fzap_length(zn, integer_size, num_integers);
+ zap_name_free(zn);
+ zap_unlockdir(zap);
+ return (err);
+}
+
static void
mzap_addent(zap_name_t *zn, uint64_t value)
{
@@ -741,20 +880,18 @@
int start = zap->zap_m.zap_alloc_next;
uint32_t cd;
- dprintf("obj=%llu %s=%llu\n", zap->zap_object,
- zn->zn_name_orij, value);
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
#ifdef ZFS_DEBUG
for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
- ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0);
+ ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
}
#endif
cd = mze_find_unused_cd(zap, zn->zn_hash);
/* given the limited size of the microzap, this can't happen */
- ASSERT(cd != ZAP_MAXCD);
+ ASSERT(cd < zap_maxcd(zap));
again:
for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
@@ -762,7 +899,7 @@
if (mze->mze_name[0] == 0) {
mze->mze_value = value;
mze->mze_cd = cd;
- (void) strcpy(mze->mze_name, zn->zn_name_orij);
+ (void) strcpy(mze->mze_name, zn->zn_key_orig);
zap->zap_m.zap_num_entries++;
zap->zap_m.zap_alloc_next = i+1;
if (zap->zap_m.zap_alloc_next ==
@@ -780,7 +917,7 @@
}
int
-zap_add(objset_t *os, uint64_t zapobj, const char *name,
+zap_add(objset_t *os, uint64_t zapobj, const char *key,
int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx)
{
@@ -793,7 +930,7 @@
err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
if (err)
return (err);
- zn = zap_name_alloc(zap, name, MT_EXACT);
+ zn = zap_name_alloc(zap, key, MT_EXACT);
if (zn == NULL) {
zap_unlockdir(zap);
return (ENOTSUP);
@@ -802,10 +939,8 @@
err = fzap_add(zn, integer_size, num_integers, val, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
} else if (integer_size != 8 || num_integers != 1 ||
- strlen(name) >= MZAP_NAME_LEN) {
- dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
- zapobj, integer_size, num_integers, name);
- err = mzap_upgrade(&zn->zn_zap, tx);
+ strlen(key) >= MZAP_NAME_LEN) {
+ err = mzap_upgrade(&zn->zn_zap, tx, 0);
if (err == 0)
err = fzap_add(zn, integer_size, num_integers, val, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
@@ -825,6 +960,31 @@
}
int
+zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+ zap_name_t *zn;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap);
+ return (ENOTSUP);
+ }
+ err = fzap_add(zn, integer_size, num_integers, val, tx);
+ zap = zn->zn_zap; /* fzap_add() may change zap */
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_add() failed */
+ zap_unlockdir(zap);
+ return (err);
+}
+
+int
zap_update(objset_t *os, uint64_t zapobj, const char *name,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
@@ -849,7 +1009,7 @@
strlen(name) >= MZAP_NAME_LEN) {
dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
zapobj, integer_size, num_integers, name);
- err = mzap_upgrade(&zn->zn_zap, tx);
+ err = mzap_upgrade(&zn->zn_zap, tx, 0);
if (err == 0)
err = fzap_update(zn, integer_size, num_integers,
val, tx);
@@ -872,6 +1032,31 @@
}
int
+zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ zap_name_t *zn;
+ int err;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap);
+ return (ENOTSUP);
+ }
+ err = fzap_update(zn, integer_size, num_integers, val, tx);
+ zap = zn->zn_zap; /* fzap_update() may change zap */
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_update() failed */
+ zap_unlockdir(zap);
+ return (err);
+}
+
+int
zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
{
return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx));
@@ -912,17 +1097,32 @@
return (err);
}
+int
+zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+ zap_name_t *zn;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap);
+ return (ENOTSUP);
+ }
+ err = fzap_remove(zn, tx);
+ zap_name_free(zn);
+ zap_unlockdir(zap);
+ return (err);
+}
+
/*
* Routines for iterating over the attributes.
*/
-/*
- * We want to keep the high 32 bits of the cursor zero if we can, so
- * that 32-bit programs can access this. So use a small hash value so
- * we can fit 4 bits of cd into the 32-bit cursor.
- *
- * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ]
- */
void
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
uint64_t serialized)
@@ -931,15 +1131,9 @@
zc->zc_zap = NULL;
zc->zc_leaf = NULL;
zc->zc_zapobj = zapobj;
- if (serialized == -1ULL) {
- zc->zc_hash = -1ULL;
- zc->zc_cd = 0;
- } else {
- zc->zc_hash = serialized << (64-ZAP_HASHBITS);
- zc->zc_cd = serialized >> ZAP_HASHBITS;
- if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */
- zc->zc_cd = 0;
- }
+ zc->zc_serialized = serialized;
+ zc->zc_hash = 0;
+ zc->zc_cd = 0;
}
void
@@ -969,10 +1163,21 @@
{
if (zc->zc_hash == -1ULL)
return (-1ULL);
- ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0);
- ASSERT(zc->zc_cd < ZAP_MAXCD);
- return ((zc->zc_hash >> (64-ZAP_HASHBITS)) |
- ((uint64_t)zc->zc_cd << ZAP_HASHBITS));
+ if (zc->zc_zap == NULL)
+ return (zc->zc_serialized);
+ ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
+ ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
+
+ /*
+ * We want to keep the high 32 bits of the cursor zero if we can, so
+ * that 32-bit programs can access this. So usually use a small
+ * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
+ * of the cursor.
+ *
+ * [ collision differentiator | zap_hashbits()-bit hash value ]
+ */
+ return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
+ ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
}
int
@@ -987,10 +1192,23 @@
return (ENOENT);
if (zc->zc_zap == NULL) {
+ int hb;
err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
RW_READER, TRUE, FALSE, &zc->zc_zap);
if (err)
return (err);
+
+ /*
+ * To support the sequence zap_cursor_init_serialized(),
+ * zap_cursor_advance(), zap_cursor_retrieve(), we must add to the
+ * existing zc_cd, which zap_cursor_advance() may have set to 1.
+ */
+ ASSERT(zc->zc_hash == 0);
+ hb = zap_hashbits(zc->zc_zap);
+ zc->zc_hash = zc->zc_serialized << (64 - hb);
+ zc->zc_cd += zc->zc_serialized >> hb;
+ if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
+ zc->zc_cd = 0;
} else {
rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
}
@@ -1035,12 +1253,6 @@
if (zc->zc_hash == -1ULL)
return;
zc->zc_cd++;
- if (zc->zc_cd >= ZAP_MAXCD) {
- zc->zc_cd = 0;
- zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS);
- if (zc->zc_hash == 0) /* EOF */
- zc->zc_hash = -1ULL;
- }
}
int
--- a/usr/src/uts/common/fs/zfs/zfs_fm.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_fm.c Sun Nov 01 14:14:46 2009 -0800
@@ -700,6 +700,7 @@
bcopy(info, report->zcr_ckinfo, sizeof (*info));
}
+ report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
report->zcr_length = length;
#ifdef _KERNEL
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c Sun Nov 01 14:14:46 2009 -0800
@@ -41,7 +41,6 @@
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev.h>
-#include <sys/vdev_impl.h>
#include <sys/dmu.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
@@ -179,7 +178,7 @@
if (dmu_objset_hold(name, FTAG, &os) == 0) {
boolean_t ret;
- ret = (dmu_objset_id(os) == dmu_objset_spa(os)->spa_bootfs);
+ ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os)));
dmu_objset_rele(os, FTAG);
return (ret);
}
@@ -1219,7 +1218,7 @@
*
* l2cache and spare devices are ok to be added to a rootpool.
*/
- if (spa->spa_bootfs != 0 && nl2cache == 0 && nspares == 0) {
+ if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) {
spa_close(spa, FTAG);
return (EDOM);
}
@@ -1665,6 +1664,11 @@
SPA_VERSION_GZIP_COMPRESSION))
return (ENOTSUP);
+ if (intval == ZIO_COMPRESS_ZLE &&
+ zfs_earlier_version(name,
+ SPA_VERSION_ZLE_COMPRESSION))
+ return (ENOTSUP);
+
/*
* If this is a bootable dataset then
* verify that the compression algorithm
@@ -1683,6 +1687,11 @@
return (ENOTSUP);
break;
+ case ZFS_PROP_DEDUP:
+ if (zfs_earlier_version(name, SPA_VERSION_DEDUP))
+ return (ENOTSUP);
+ break;
+
case ZFS_PROP_SHARESMB:
if (zpl_earlier_version(name, ZPL_VERSION_FUID))
return (ENOTSUP);
@@ -2978,9 +2987,9 @@
mutex_exit(&spa_namespace_lock);
return (EIO);
}
- if (spa->spa_log_state == SPA_LOG_MISSING) {
+ if (spa_get_log_state(spa) == SPA_LOG_MISSING) {
/* we need to let spa_open/spa_load clear the chains */
- spa->spa_log_state = SPA_LOG_CLEAR;
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
}
spa->spa_last_open_failed = 0;
mutex_exit(&spa_namespace_lock);
--- a/usr/src/uts/common/fs/zfs/zfs_log.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_log.c Sun Nov 01 14:14:46 2009 -0800
@@ -47,14 +47,6 @@
#include <sys/ddi.h>
#include <sys/dsl_dataset.h>
-#define ZFS_HANDLE_REPLAY(zilog, tx) \
- if (zilog->zl_replay) { \
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); \
- zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = \
- zilog->zl_replaying_seq; \
- return; \
- }
-
/*
* These zfs_log_* functions must be called within a dmu tx, in one
* of 2 contexts depending on zilog->z_replay:
@@ -251,11 +243,9 @@
size_t namesize = strlen(name) + 1;
size_t fuidsz = 0;
- if (zilog == NULL)
+ if (zil_replaying(zilog, tx))
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
/*
* If we have FUIDs present then add in space for
* domains and ACE fuid's if any.
@@ -356,11 +346,9 @@
lr_remove_t *lr;
size_t namesize = strlen(name) + 1;
- if (zilog == NULL)
+ if (zil_replaying(zilog, tx))
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
lr = (lr_remove_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
@@ -382,11 +370,9 @@
lr_link_t *lr;
size_t namesize = strlen(name) + 1;
- if (zilog == NULL)
+ if (zil_replaying(zilog, tx))
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
lr = (lr_link_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
@@ -411,11 +397,9 @@
size_t namesize = strlen(name) + 1;
size_t linksize = strlen(link) + 1;
- if (zilog == NULL)
+ if (zil_replaying(zilog, tx))
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
lr = (lr_create_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
@@ -447,11 +431,9 @@
size_t snamesize = strlen(sname) + 1;
size_t dnamesize = strlen(dname) + 1;
- if (zilog == NULL)
+ if (zil_replaying(zilog, tx))
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
lr = (lr_rename_t *)&itx->itx_lr;
lr->lr_sdoid = sdzp->z_id;
@@ -479,11 +461,9 @@
uintptr_t fsync_cnt;
ssize_t immediate_write_sz;
- if (zilog == NULL || zp->z_unlinked)
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
? 0 : zfs_immediate_write_sz;
@@ -518,8 +498,7 @@
lr = (lr_write_t *)&itx->itx_lr;
if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
- kmem_free(itx, offsetof(itx_t, itx_lr) +
- itx->itx_lr.lrc_reclen);
+ zil_itx_destroy(itx);
itx = zil_itx_create(txtype, sizeof (*lr));
lr = (lr_write_t *)&itx->itx_lr;
write_state = WR_NEED_COPY;
@@ -560,11 +539,9 @@
uint64_t seq;
lr_truncate_t *lr;
- if (zilog == NULL || zp->z_unlinked)
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
itx = zil_itx_create(txtype, sizeof (*lr));
lr = (lr_truncate_t *)&itx->itx_lr;
lr->lr_foid = zp->z_id;
@@ -590,12 +567,9 @@
size_t recsize = sizeof (lr_setattr_t);
void *start;
-
- if (zilog == NULL || zp->z_unlinked)
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
/*
* If XVATTR set, then log record size needs to allow
* for lr_attr_t + xvattr mask, mapsize and create time
@@ -659,11 +633,9 @@
size_t txsize;
size_t aclbytes = vsecp->vsa_aclentsz;
- if (zilog == NULL || zp->z_unlinked)
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
return;
- ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */
-
txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ?
TX_ACL_V0 : TX_ACL;
--- a/usr/src/uts/common/fs/zfs/zfs_replay.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_replay.c Sun Nov 01 14:14:46 2009 -0800
@@ -625,7 +625,7 @@
znode_t *zp;
int error;
ssize_t resid;
- uint64_t orig_eof, eod;
+ uint64_t orig_eof, eod, offset, length;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
@@ -640,15 +640,24 @@
error = 0;
return (error);
}
+
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+ eod = offset + length; /* end of data for this write */
+
orig_eof = zp->z_phys->zp_size;
- eod = lr->lr_offset + lr->lr_length; /* end of data for this write */
- /* If it's a dmu_sync() block get the data and write the whole block */
- if (lr->lr_common.lrc_reclen == sizeof (lr_write_t))
- zil_get_replay_data(zfsvfs->z_log, lr);
+ /* If it's a dmu_sync() block, write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ if (length < blocksize) {
+ offset -= offset % blocksize;
+ length = blocksize;
+ }
+ }
- error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length,
- lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+ error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset,
+ UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
/*
* This may be a write from a dmu_sync() for a whole block,
@@ -682,16 +691,8 @@
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log writes out of order, it's possible the
- * file has been removed. In this case just drop the write
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
- }
end = lr->lr_offset + lr->lr_length;
if (end > zp->z_phys->zp_size) {
@@ -714,16 +715,8 @@
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log truncates out of order, it's possible the
- * file has been removed. In this case just drop the truncate
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
- }
bzero(&fl, sizeof (fl));
fl.l_type = F_WRLCK;
@@ -757,16 +750,8 @@
zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
}
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log setattrs out of order, it's possible the
- * file has been removed. In this case just drop the setattr
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
- }
zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode,
lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
@@ -812,16 +797,8 @@
zfs_oldace_byteswap(ace, lr->lr_aclcnt);
}
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log acls out of order, it's possible the
- * file has been removed. In this case just drop the acl
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
- }
bzero(&vsa, sizeof (vsa));
vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
@@ -869,16 +846,8 @@
}
}
- if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
- /*
- * As we can log acls out of order, it's possible the
- * file has been removed. In this case just drop the acl
- * and return success.
- */
- if (error == ENOENT)
- error = 0;
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
- }
bzero(&vsa, sizeof (vsa));
vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c Sun Nov 01 14:14:46 2009 -0800
@@ -837,21 +837,25 @@
}
void
-zfs_get_done(dmu_buf_t *db, void *vzgd)
+zfs_get_done(zgd_t *zgd, int error)
{
- zgd_t *zgd = (zgd_t *)vzgd;
- rl_t *rl = zgd->zgd_rl;
- vnode_t *vp = ZTOV(rl->r_zp);
- objset_t *os = rl->r_zp->z_zfsvfs->z_os;
-
- dmu_buf_rele(db, vzgd);
- zfs_range_unlock(rl);
+ znode_t *zp = zgd->zgd_private;
+ objset_t *os = zp->z_zfsvfs->z_os;
+
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ zfs_range_unlock(zgd->zgd_rl);
+
/*
* Release the vnode asynchronously as we currently have the
* txg stopped from syncing.
*/
- VN_RELE_ASYNC(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
- zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
+
+ if (error == 0 && zgd->zgd_bp)
+ zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+
kmem_free(zgd, sizeof (zgd_t));
}
@@ -868,20 +872,21 @@
zfsvfs_t *zfsvfs = arg;
objset_t *os = zfsvfs->z_os;
znode_t *zp;
- uint64_t off = lr->lr_offset;
+ uint64_t object = lr->lr_foid;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length;
+ blkptr_t *bp = &lr->lr_blkptr;
dmu_buf_t *db;
- rl_t *rl;
zgd_t *zgd;
- int dlen = lr->lr_length; /* length of user data */
int error = 0;
- ASSERT(zio);
- ASSERT(dlen != 0);
+ ASSERT(zio != NULL);
+ ASSERT(size != 0);
/*
* Nothing to do if the file has been removed
*/
- if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
+ if (zfs_zget(zfsvfs, object, &zp) != 0)
return (ENOENT);
if (zp->z_unlinked) {
/*
@@ -893,6 +898,10 @@
return (ENOENT);
}
+ zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_zilog = zfsvfs->z_log;
+ zgd->zgd_private = zp;
+
/*
* Write records come in two flavors: immediate and indirect.
* For small writes it's cheaper to store the data with the
@@ -901,17 +910,16 @@
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
- rl = zfs_range_lock(zp, off, dlen, RL_READER);
+ zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
/* test for truncation needs to be done while range locked */
- if (off >= zp->z_phys->zp_size) {
+ if (offset >= zp->z_phys->zp_size) {
error = ENOENT;
- goto out;
+ } else {
+ error = dmu_read(os, object, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
}
- VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf,
- DMU_READ_NO_PREFETCH));
+ ASSERT(error == 0 || error == ENOENT);
} else { /* indirect write */
- uint64_t boff; /* block starting offset */
-
/*
* Have to lock the whole block to ensure when it's
* written out and it's checksum is being calculated
@@ -919,80 +927,58 @@
* blocksize after we get the lock in case it's changed!
*/
for (;;) {
- if (ISP2(zp->z_blksz)) {
- boff = P2ALIGN_TYPED(off, zp->z_blksz,
- uint64_t);
- } else {
- boff = 0;
- }
- dlen = zp->z_blksz;
- rl = zfs_range_lock(zp, boff, dlen, RL_READER);
- if (zp->z_blksz == dlen)
+ uint64_t blkoff;
+ size = zp->z_blksz;
+ blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
+ offset -= blkoff;
+ zgd->zgd_rl = zfs_range_lock(zp, offset, size,
+ RL_READER);
+ if (zp->z_blksz == size)
break;
- zfs_range_unlock(rl);
+ offset += blkoff;
+ zfs_range_unlock(zgd->zgd_rl);
}
/* test for truncation needs to be done while range locked */
- if (off >= zp->z_phys->zp_size) {
+ if (offset >= zp->z_phys->zp_size)
error = ENOENT;
- goto out;
- }
- zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_rl = rl;
- zgd->zgd_zilog = zfsvfs->z_log;
- zgd->zgd_bp = &lr->lr_blkptr;
#ifdef DEBUG
if (zil_fault_io) {
error = EIO;
zil_fault_io = 0;
- } else {
- error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db);
}
-#else
- error = dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db);
#endif
- if (error != 0) {
- kmem_free(zgd, sizeof (zgd_t));
- goto out;
- }
-
- ASSERT(boff == db->db_offset);
- lr->lr_blkoff = off - boff;
- error = dmu_sync(zio, db, &lr->lr_blkptr,
- lr->lr_common.lrc_txg, zfs_get_done, zgd);
- ASSERT((error && error != EINPROGRESS) ||
- lr->lr_length <= zp->z_blksz);
+ if (error == 0)
+ error = dmu_buf_hold(os, object, offset, zgd, &db);
+
if (error == 0) {
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ zfs_get_done, zgd);
+ ASSERT(error || lr->lr_length <= zp->z_blksz);
+
/*
- * dmu_sync() can compress a block of zeros to a null
- * blkptr but the block size still needs to be passed
- * through to replay.
+ * On success, we need to wait for the write I/O
+ * initiated by dmu_sync() to complete before we can
+ * release this dbuf. We will finish everything up
+ * in the zfs_get_done() callback.
*/
- BP_SET_LSIZE(&lr->lr_blkptr, db->db_size);
- zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
+ if (error == 0)
+ return (0);
+
+ if (error == EALREADY) {
+ lr->lr_common.lrc_txtype = TX_WRITE2;
+ error = 0;
+ }
}
-
- /*
- * If we get EINPROGRESS, then we need to wait for a
- * write IO initiated by dmu_sync() to complete before
- * we can release this dbuf. We will finish everything
- * up in the zfs_get_done() callback.
- */
- if (error == EINPROGRESS) {
- return (0);
- } else if (error == EALREADY) {
- lr->lr_common.lrc_txtype = TX_WRITE2;
- error = 0;
- }
- dmu_buf_rele(db, zgd);
- kmem_free(zgd, sizeof (zgd_t));
}
-out:
- zfs_range_unlock(rl);
- /*
- * Release the vnode asynchronously as we currently have the
- * txg stopped from syncing.
- */
- VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
+
+ zfs_get_done(zgd, error);
+
return (error);
}
--- a/usr/src/uts/common/fs/zfs/zil.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zil.c Sun Nov 01 14:14:46 2009 -0800
@@ -25,7 +25,6 @@
#include <sys/zfs_context.h>
#include <sys/spa.h>
-#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
@@ -80,10 +79,10 @@
static boolean_t zil_empty(zilog_t *zilog);
static int
-zil_dva_compare(const void *x1, const void *x2)
+zil_bp_compare(const void *x1, const void *x2)
{
- const dva_t *dva1 = x1;
- const dva_t *dva2 = x2;
+ const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
+ const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
return (-1);
@@ -99,34 +98,37 @@
}
static void
-zil_dva_tree_init(avl_tree_t *t)
+zil_bp_tree_init(zilog_t *zilog)
{
- avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t),
- offsetof(zil_dva_node_t, zn_node));
+ avl_create(&zilog->zl_bp_tree, zil_bp_compare,
+ sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
}
static void
-zil_dva_tree_fini(avl_tree_t *t)
+zil_bp_tree_fini(zilog_t *zilog)
{
- zil_dva_node_t *zn;
+ avl_tree_t *t = &zilog->zl_bp_tree;
+ zil_bp_node_t *zn;
void *cookie = NULL;
while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
- kmem_free(zn, sizeof (zil_dva_node_t));
+ kmem_free(zn, sizeof (zil_bp_node_t));
avl_destroy(t);
}
-static int
-zil_dva_tree_add(avl_tree_t *t, dva_t *dva)
+int
+zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
{
- zil_dva_node_t *zn;
+ avl_tree_t *t = &zilog->zl_bp_tree;
+ const dva_t *dva = BP_IDENTITY(bp);
+ zil_bp_node_t *zn;
avl_index_t where;
if (avl_find(t, dva, &where) != NULL)
return (EEXIST);
- zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP);
+ zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
zn->zn_dva = *dva;
avl_insert(t, zn, where);
@@ -151,37 +153,38 @@
}
/*
- * Read a log block, make sure it's valid, and byteswap it if necessary.
+ * Read a log block and make sure it's valid.
*/
static int
-zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
+zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst)
{
- blkptr_t blk = *bp;
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ uint32_t aflags = ARC_WAIT;
+ arc_buf_t *abuf = NULL;
zbookmark_t zb;
- uint32_t aflags = ARC_WAIT;
int error;
- zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+ if (zilog->zl_header->zh_claim_txg == 0)
+ zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
- *abufpp = NULL;
+ if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
+ zio_flags |= ZIO_FLAG_SPECULATIVE;
- /*
- * We shouldn't be doing any scrubbing while we're doing log
- * replay, it's OK to not lock.
- */
- error = arc_read_nolock(NULL, zilog->zl_spa, &blk,
- arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
- ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb);
+ SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
if (error == 0) {
- char *data = (*abufpp)->b_data;
- uint64_t blksz = BP_GET_LSIZE(bp);
- zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1;
+ char *data = abuf->b_data;
+ uint64_t size = BP_GET_LSIZE(bp);
+ zil_trailer_t *ztp = (zil_trailer_t *)(data + size) - 1;
zio_cksum_t cksum = bp->blk_cksum;
+ bcopy(data, dst, size);
+ *nbp = ztp->zit_next_blk;
+
/*
* Validate the checksummed log block.
*
@@ -194,41 +197,76 @@
if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum,
sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) ||
- (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))) {
+ (ztp->zit_nused > (size - sizeof (zil_trailer_t))))
error = ECKSUM;
- }
- if (error) {
- VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1);
- *abufpp = NULL;
- }
+ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
}
- dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid);
+ return (error);
+}
+
+/*
+ * Read a TX_WRITE log data block.
+ */
+static int
+zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
+{
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ const blkptr_t *bp = &lr->lr_blkptr;
+ uint32_t aflags = ARC_WAIT;
+ arc_buf_t *abuf = NULL;
+ zbookmark_t zb;
+ int error;
+
+ if (BP_IS_HOLE(bp)) {
+ if (wbuf != NULL)
+ bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
+ return (0);
+ }
+
+ if (zilog->zl_header->zh_claim_txg == 0)
+ zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
+
+ SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
+ ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
+
+ error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+
+ if (error == 0) {
+ if (wbuf != NULL)
+ bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
+ (void) arc_buf_remove_ref(abuf, &abuf);
+ }
return (error);
}
/*
* Parse the intent log, and call parse_func for each valid record within.
- * Return the highest sequence number.
*/
-uint64_t
+int
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
{
const zil_header_t *zh = zilog->zl_header;
- uint64_t claim_seq = zh->zh_claim_seq;
- uint64_t seq = 0;
- uint64_t max_seq = 0;
- blkptr_t blk = zh->zh_log;
- arc_buf_t *abuf;
+ boolean_t claimed = !!zh->zh_claim_txg;
+ uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
+ uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
+ uint64_t max_blk_seq = 0;
+ uint64_t max_lr_seq = 0;
+ uint64_t blk_count = 0;
+ uint64_t lr_count = 0;
+ blkptr_t blk, next_blk;
char *lrbuf, *lrp;
- zil_trailer_t *ztp;
- int reclen, error;
+ int error = 0;
- if (BP_IS_HOLE(&blk))
- return (max_seq);
+ /*
+ * Old logs didn't record the maximum zh_claim_lr_seq.
+ */
+ if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
+ claim_lr_seq = UINT64_MAX;
/*
* Starting at the block pointed to by zh_log we read the log chain.
@@ -239,95 +277,121 @@
* If the log has been claimed, stop if we encounter a sequence
* number greater than the highest claimed sequence number.
*/
- zil_dva_tree_init(&zilog->zl_dva_tree);
- for (;;) {
- seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+ lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
+ zil_bp_tree_init(zilog);
+
+ for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
+ zil_trailer_t *ztp =
+ (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
+ uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+ int reclen;
- if (claim_seq != 0 && seq > claim_seq)
+ if (blk_seq > claim_blk_seq)
+ break;
+ if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
+ break;
+ ASSERT(max_blk_seq < blk_seq);
+ max_blk_seq = blk_seq;
+ blk_count++;
+
+ if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
break;
- ASSERT(max_seq < seq);
- max_seq = seq;
-
- error = zil_read_log_block(zilog, &blk, &abuf);
-
- if (parse_blk_func != NULL)
- parse_blk_func(zilog, &blk, arg, txg);
-
+ error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf);
if (error)
break;
- lrbuf = abuf->b_data;
- ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
- blk = ztp->zit_next_blk;
-
- if (parse_lr_func == NULL) {
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
- continue;
- }
-
for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
lr_t *lr = (lr_t *)lrp;
reclen = lr->lrc_reclen;
ASSERT3U(reclen, >=, sizeof (lr_t));
- parse_lr_func(zilog, lr, arg, txg);
+ if (lr->lrc_seq > claim_lr_seq)
+ goto done;
+ if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
+ goto done;
+ ASSERT(max_lr_seq < lr->lrc_seq);
+ max_lr_seq = lr->lrc_seq;
+ lr_count++;
}
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
}
- zil_dva_tree_fini(&zilog->zl_dva_tree);
+done:
+ zilog->zl_parse_error = error;
+ zilog->zl_parse_blk_seq = max_blk_seq;
+ zilog->zl_parse_lr_seq = max_lr_seq;
+ zilog->zl_parse_blk_count = blk_count;
+ zilog->zl_parse_lr_count = lr_count;
+
+ ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
+ (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
+
+ zil_bp_tree_fini(zilog);
+ zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
+
+ return (error);
+}
+
+static int
+zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
+{
+ /*
+ * Claim log block if not already committed and not already claimed.
+ * If tx == NULL, just verify that the block is claimable.
+ */
+ if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0)
+ return (0);
- return (max_seq);
+ return (zio_wait(zio_claim(NULL, zilog->zl_spa,
+ tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
+}
+
+static int
+zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
+{
+ lr_write_t *lr = (lr_write_t *)lrc;
+ int error;
+
+ if (lrc->lrc_txtype != TX_WRITE)
+ return (0);
+
+ /*
+ * If the block is not readable, don't claim it. This can happen
+ * in normal operation when a log block is written to disk before
+ * some of the dmu_sync() blocks it points to. In this case, the
+ * transaction cannot have been committed to anyone (we would have
+ * waited for all writes to be stable first), so it is semantically
+ * correct to declare this the end of the log.
+ */
+ if (lr->lr_blkptr.blk_birth >= first_txg &&
+ (error = zil_read_log_data(zilog, lr, NULL)) != 0)
+ return (error);
+
+ return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
}
/* ARGSUSED */
-static void
-zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
+static int
+zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
{
- spa_t *spa = zilog->zl_spa;
- int err;
+ zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
- /*
- * Claim log block if not already committed and not already claimed.
- */
- if (bp->blk_birth >= first_txg &&
- zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) {
- err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL,
- ZIO_FLAG_MUSTSUCCEED));
- ASSERT(err == 0);
- }
+ return (0);
}
-static void
-zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
-{
- if (lrc->lrc_txtype == TX_WRITE) {
- lr_write_t *lr = (lr_write_t *)lrc;
- zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg);
- }
-}
-
-/* ARGSUSED */
-static void
-zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
-{
- zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx));
-}
-
-static void
+static int
zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
{
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+
/*
* If we previously claimed it, we need to free it.
*/
- if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) {
- lr_write_t *lr = (lr_write_t *)lrc;
- blkptr_t *bp = &lr->lr_blkptr;
- if (bp->blk_birth >= claim_txg &&
- !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) {
- (void) arc_free(NULL, zilog->zl_spa,
- dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT);
- }
- }
+ if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
+ bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0)
+ zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+
+ return (0);
}
/*
@@ -359,17 +423,17 @@
*/
if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
tx = dmu_tx_create(zilog->zl_os);
- (void) dmu_tx_assign(tx, TXG_WAIT);
+ VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
txg = dmu_tx_get_txg(tx);
if (!BP_IS_HOLE(&blk)) {
- zio_free_blk(zilog->zl_spa, &blk, txg);
+ zio_free_zil(zilog->zl_spa, txg, &blk);
BP_ZERO(&blk);
}
- error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
- NULL, txg, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+ error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
+ ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
if (error == 0)
zil_init_log_chain(zilog, &blk);
@@ -387,6 +451,7 @@
lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
lwb->lwb_max_txg = txg;
lwb->lwb_zio = NULL;
+ lwb->lwb_tx = NULL;
mutex_enter(&zilog->zl_lock);
list_insert_tail(&zilog->zl_lwb_list, lwb);
@@ -428,11 +493,13 @@
*/
txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+ zilog->zl_old_header = *zh; /* debugging aid */
+
if (BP_IS_HOLE(&zh->zh_log))
return;
tx = dmu_tx_create(zilog->zl_os);
- (void) dmu_tx_assign(tx, TXG_WAIT);
+ VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
txg = dmu_tx_get_txg(tx);
@@ -440,61 +507,27 @@
ASSERT3U(zilog->zl_destroy_txg, <, txg);
zilog->zl_destroy_txg = txg;
+ zilog->zl_keep_first = keep_first;
if (!list_is_empty(&zilog->zl_lwb_list)) {
ASSERT(zh->zh_claim_txg == 0);
- zilog->zl_keep_first = B_FALSE;
+ ASSERT(!keep_first);
while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
list_remove(&zilog->zl_lwb_list, lwb);
if (lwb->lwb_buf != NULL)
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
- zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
+ zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk);
kmem_cache_free(zil_lwb_cache, lwb);
}
- } else {
- zilog->zl_keep_first = keep_first;
- if (zh->zh_flags & ZIL_REPLAY_NEEDED) {
- ASSERT(!keep_first);
- (void) zil_parse(zilog, zil_free_log_block,
- zil_free_log_record, tx, zh->zh_claim_txg);
- } else {
- /*
- * Would like to assert zil_empty() but that
- * would force us to read the log chain which
- * requires us to do I/O to the log. This is
- * overkill since we really just want to destroy
- * the chain anyway.
- */
- if (!keep_first) {
- blkptr_t bp = zh->zh_log;
- zio_free_blk(zilog->zl_spa, &bp, txg);
- }
- }
+ } else if (!keep_first) {
+ (void) zil_parse(zilog, zil_free_log_block,
+ zil_free_log_record, tx, zh->zh_claim_txg);
}
mutex_exit(&zilog->zl_lock);
dmu_tx_commit(tx);
}
-/*
- * return true if the initial log block is not valid
- */
-static boolean_t
-zil_empty(zilog_t *zilog)
-{
- const zil_header_t *zh = zilog->zl_header;
- arc_buf_t *abuf = NULL;
-
- if (BP_IS_HOLE(&zh->zh_log))
- return (B_TRUE);
-
- if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
- return (B_TRUE);
-
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
- return (B_FALSE);
-}
-
int
zil_claim(char *osname, void *txarg)
{
@@ -514,9 +547,9 @@
zilog = dmu_objset_zil(os);
zh = zil_header_in_syncing_context(zilog);
- if (zilog->zl_spa->spa_log_state == SPA_LOG_CLEAR) {
+ if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) {
if (!BP_IS_HOLE(&zh->zh_log))
- zio_free_blk(zilog->zl_spa, &zh->zh_log, first_txg);
+ zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
BP_ZERO(&zh->zh_log);
dsl_dataset_dirty(dmu_objset_ds(os), tx);
dmu_objset_rele(os, FTAG);
@@ -524,23 +557,6 @@
}
/*
- * Record here whether the zil has any records to replay.
- * If the header block pointer is null or the block points
- * to the stubby then we know there are no valid log records.
- * We use the header to store this state as the the zilog gets
- * freed later in dmu_objset_close().
- * The flags (and the rest of the header fields) are cleared in
- * zil_sync() as a result of a zil_destroy(), after replaying the log.
- *
- * Note, the intent log can be empty but still need the
- * stubby to be claimed.
- */
- if (!zil_empty(zilog)) {
- zh->zh_flags |= ZIL_REPLAY_NEEDED;
- dsl_dataset_dirty(dmu_objset_ds(os), tx);
- }
-
- /*
* Claim all log blocks if we haven't already done so, and remember
* the highest claimed sequence number. This ensures that if we can
* read only part of the log now (e.g. due to a missing device),
@@ -549,9 +565,14 @@
*/
ASSERT3U(zh->zh_claim_txg, <=, first_txg);
if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
+ (void) zil_parse(zilog, zil_claim_log_block,
+ zil_claim_log_record, tx, first_txg);
zh->zh_claim_txg = first_txg;
- zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block,
- zil_claim_log_record, tx, first_txg);
+ zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
+ zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
+ if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
+ zh->zh_flags |= ZIL_REPLAY_NEEDED;
+ zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
dsl_dataset_dirty(dmu_objset_ds(os), tx);
}
@@ -565,19 +586,15 @@
* Checksum errors are ok as they indicate the end of the chain.
* Any other error (no device or read failure) returns an error.
*/
-/* ARGSUSED */
int
-zil_check_log_chain(char *osname, void *txarg)
+zil_check_log_chain(char *osname, void *tx)
{
zilog_t *zilog;
- zil_header_t *zh;
- blkptr_t blk;
- arc_buf_t *abuf;
objset_t *os;
- char *lrbuf;
- zil_trailer_t *ztp;
int error;
+ ASSERT(tx == NULL);
+
error = dmu_objset_hold(osname, FTAG, &os);
if (error) {
cmn_err(CE_WARN, "can't open objset for %s", osname);
@@ -585,26 +602,20 @@
}
zilog = dmu_objset_zil(os);
- zh = zil_header_in_syncing_context(zilog);
- blk = zh->zh_log;
- if (BP_IS_HOLE(&blk)) {
- dmu_objset_rele(os, FTAG);
- return (0); /* no chain */
- }
- for (;;) {
- error = zil_read_log_block(zilog, &blk, &abuf);
- if (error)
- break;
- lrbuf = abuf->b_data;
- ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
- blk = ztp->zit_next_blk;
- VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
- }
+ /*
+ * Because tx == NULL, zil_claim_log_block() will not actually claim
+ * any blocks, but just determine whether it is possible to do so.
+ * In addition to checking the log chain, zil_claim_log_block()
+ * will invoke zio_claim() with a done func of spa_claim_notify(),
+ * which will update spa_max_claim_txg. See spa_load() for details.
+ */
+ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
+ zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
+
dmu_objset_rele(os, FTAG);
- if (error == ECKSUM)
- return (0); /* normal end of chain */
- return (error);
+
+ return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}
static int
@@ -622,7 +633,7 @@
}
void
-zil_add_block(zilog_t *zilog, blkptr_t *bp)
+zil_add_block(zilog_t *zilog, const blkptr_t *bp)
{
avl_tree_t *t = &zilog->zl_vdev_tree;
avl_index_t where;
@@ -698,6 +709,7 @@
{
lwb_t *lwb = zio->io_private;
zilog_t *zilog = lwb->lwb_zilog;
+ dmu_tx_t *tx = lwb->lwb_tx;
ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
ASSERT(BP_GET_CHECKSUM(zio->io_bp) == ZIO_CHECKSUM_ZILOG);
@@ -719,17 +731,15 @@
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
mutex_enter(&zilog->zl_lock);
lwb->lwb_buf = NULL;
- if (zio->io_error)
- zilog->zl_log_error = B_TRUE;
+ lwb->lwb_tx = NULL;
+ mutex_exit(&zilog->zl_lock);
/*
* Now that we've written this log block, we have a stable pointer
* to the next block in the chain, so it's OK to let the txg in
- * which we allocated the next block sync. We still have the
- * zl_lock to ensure zil_sync doesn't kmem free the lwb.
+ * which we allocated the next block sync.
*/
- txg_rele_to_sync(&lwb->lwb_txgh);
- mutex_exit(&zilog->zl_lock);
+ dmu_tx_commit(tx);
}
/*
@@ -740,10 +750,9 @@
{
zbookmark_t zb;
- zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
- zb.zb_object = 0;
- zb.zb_level = -1;
- zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+ SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+ lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
if (zilog->zl_root_zio == NULL) {
zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
@@ -778,6 +787,7 @@
zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
spa_t *spa = zilog->zl_spa;
blkptr_t *bp = &ztp->zit_next_blk;
+ dmu_tx_t *tx;
uint64_t txg;
uint64_t zil_blksz;
int error;
@@ -789,10 +799,16 @@
* before writing it in order to establish the log chain.
* Note that if the allocation of nlwb synced before we wrote
* the block that points at it (lwb), we'd leak it if we crashed.
- * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
+ * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
+ * We dirty the dataset to ensure that zil_sync() will be called
+ * to clean up in the event of allocation failure or I/O failure.
*/
- txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
- txg_rele_to_quiesce(&lwb->lwb_txgh);
+ tx = dmu_tx_create(zilog->zl_os);
+ VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ lwb->lwb_tx = tx;
/*
* Pick a ZIL blocksize. We request a size that is the
@@ -808,22 +824,11 @@
BP_ZERO(bp);
/* pass the old blkptr in order to spread log blocks across devs */
- error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg,
+ error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
USE_SLOG(zilog));
if (error) {
- dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg);
-
/*
- * We dirty the dataset to ensure that zil_sync() will
- * be called to remove this lwb from our zl_lwb_list.
- * Failing to do so, may leave an lwb with a NULL lwb_buf
- * hanging around on the zl_lwb_list.
- */
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- dmu_tx_commit(tx);
-
- /*
- * Since we've just experienced an allocation failure so we
+ * Since we've just experienced an allocation failure,
* terminate the current lwb and send it on its way.
*/
ztp->zit_pad = 0;
@@ -848,7 +853,6 @@
* Allocate a new log write buffer (lwb).
*/
nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
-
nlwb->lwb_zilog = zilog;
nlwb->lwb_blk = *bp;
nlwb->lwb_nused = 0;
@@ -856,6 +860,7 @@
nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
nlwb->lwb_max_txg = txg;
nlwb->lwb_zio = NULL;
+ nlwb->lwb_tx = NULL;
/*
* Put new lwb at the end of the log chain
@@ -870,7 +875,6 @@
/*
* kick off the write for the old log block
*/
- dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
ASSERT(lwb->lwb_zio);
zio_nowait(lwb->lwb_zio);
@@ -881,20 +885,20 @@
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
lr_t *lrc = &itx->itx_lr; /* common log record */
- lr_write_t *lr = (lr_write_t *)lrc;
+ lr_write_t *lrw = (lr_write_t *)lrc;
+ char *lr_buf;
uint64_t txg = lrc->lrc_txg;
uint64_t reclen = lrc->lrc_reclen;
- uint64_t dlen;
+ uint64_t dlen = 0;
if (lwb == NULL)
return (NULL);
+
ASSERT(lwb->lwb_buf != NULL);
if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
dlen = P2ROUNDUP_TYPED(
- lr->lr_length, sizeof (uint64_t), uint64_t);
- else
- dlen = 0;
+ lrw->lr_length, sizeof (uint64_t), uint64_t);
zilog->zl_cur_used += (reclen + dlen);
@@ -915,12 +919,10 @@
}
}
- /*
- * Update the lrc_seq, to be log record sequence number. See zil.h
- * Then copy the record to the log buffer.
- */
- lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
- bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);
+ lr_buf = lwb->lwb_buf + lwb->lwb_nused;
+ bcopy(lrc, lr_buf, reclen);
+ lrc = (lr_t *)lr_buf;
+ lrw = (lr_write_t *)lrc;
/*
* If it's a write, fetch the data or get its blkptr as appropriate.
@@ -932,18 +934,16 @@
char *dbuf;
int error;
- /* alignment is guaranteed */
- lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused);
if (dlen) {
ASSERT(itx->itx_wr_state == WR_NEED_COPY);
- dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen;
- lr->lr_common.lrc_reclen += dlen;
+ dbuf = lr_buf + reclen;
+ lrw->lr_common.lrc_reclen += dlen;
} else {
ASSERT(itx->itx_wr_state == WR_INDIRECT);
dbuf = NULL;
}
error = zilog->zl_get_data(
- itx->itx_private, lr, dbuf, lwb->lwb_zio);
+ itx->itx_private, lrw, dbuf, lwb->lwb_zio);
if (error == EIO) {
txg_wait_synced(zilog->zl_dmu_pool, txg);
return (lwb);
@@ -956,6 +956,13 @@
}
}
+ /*
+ * We're actually making an entry, so update lrc_seq to be the
+ * log record sequence number. Note that this is generally not
+ * equal to the itx sequence number because not all transactions
+ * are synchronous, and sometimes spa_sync() gets there first.
+ */
+ lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
lwb->lwb_nused += reclen + dlen;
lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb));
@@ -980,12 +987,19 @@
return (itx);
}
+/*
+ * Free an in-memory intent log transaction (itx). The size handed back
+ * to kmem_free() is the offset of itx_lr within itx_t plus the record
+ * length stored in itx_lr.lrc_reclen.
+ */
+void
+zil_itx_destroy(itx_t *itx)
+{
+ kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
+}
+
uint64_t
zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
{
uint64_t seq;
ASSERT(itx->itx_lr.lrc_seq == 0);
+ ASSERT(!zilog->zl_replay);
mutex_enter(&zilog->zl_lock);
list_insert_tail(&zilog->zl_itx_list, itx);
@@ -1034,8 +1048,7 @@
/* destroy sync'd log transactions */
while ((itx = list_head(&clean_list)) != NULL) {
list_remove(&clean_list, itx);
- kmem_free(itx, offsetof(itx_t, itx_lr)
- + itx->itx_lr.lrc_reclen);
+ zil_itx_destroy(itx);
}
list_destroy(&clean_list);
}
@@ -1064,9 +1077,10 @@
{
uint64_t txg;
uint64_t commit_seq = 0;
- itx_t *itx, *itx_next = (itx_t *)-1;
+ itx_t *itx, *itx_next;
lwb_t *lwb;
spa_t *spa;
+ int error = 0;
zilog->zl_writer = B_TRUE;
ASSERT(zilog->zl_root_zio == NULL);
@@ -1094,69 +1108,56 @@
/* Loop through in-memory log transactions filling log blocks. */
DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
- for (;;) {
+
+ for (itx = list_head(&zilog->zl_itx_list); itx; itx = itx_next) {
/*
- * Find the next itx to push:
- * Push all transactions related to specified foid and all
- * other transactions except TX_WRITE, TX_TRUNCATE,
- * TX_SETATTR and TX_ACL for all other files.
+ * Save the next pointer. Even though we drop zl_lock below,
+ * all threads that can remove itx list entries (other writers
+ * and zil_itx_clean()) can't do so until they have zl_writer.
*/
- if (itx_next != (itx_t *)-1)
- itx = itx_next;
- else
- itx = list_head(&zilog->zl_itx_list);
- for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) {
- if (foid == 0) /* push all foids? */
- break;
- if (itx->itx_sync) /* push all O_[D]SYNC */
- break;
- switch (itx->itx_lr.lrc_txtype) {
- case TX_SETATTR:
- case TX_WRITE:
- case TX_TRUNCATE:
- case TX_ACL:
- /* lr_foid is same offset for these records */
- if (((lr_write_t *)&itx->itx_lr)->lr_foid
- != foid) {
- continue; /* skip this record */
- }
- }
- break;
- }
- if (itx == NULL)
- break;
+ itx_next = list_next(&zilog->zl_itx_list, itx);
+
+ /*
+ * Determine whether to push this itx.
+ * Push all transactions related to specified foid and
+ * all other transactions except those that can be logged
+ * out of order (TX_WRITE, TX_TRUNCATE, TX_SETATTR, TX_ACL)
+ * for all other files.
+ *
+ * If foid == 0 (meaning "push all foids") or
+ * itx->itx_sync is set (meaning O_[D]SYNC), push regardless.
+ */
+ if (foid != 0 && !itx->itx_sync &&
+ TX_OOO(itx->itx_lr.lrc_txtype) &&
+ ((lr_ooo_t *)&itx->itx_lr)->lr_foid != foid)
+ continue; /* skip this record */
if ((itx->itx_lr.lrc_seq > seq) &&
((lwb == NULL) || (lwb->lwb_nused == 0) ||
- (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb)))) {
+ (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb))))
break;
- }
- /*
- * Save the next pointer. Even though we soon drop
- * zl_lock all threads that may change the list
- * (another writer or zil_itx_clean) can't do so until
- * they have zl_writer.
- */
- itx_next = list_next(&zilog->zl_itx_list, itx);
list_remove(&zilog->zl_itx_list, itx);
zilog->zl_itx_list_sz -= itx->itx_sod;
+
mutex_exit(&zilog->zl_lock);
+
txg = itx->itx_lr.lrc_txg;
ASSERT(txg);
if (txg > spa_last_synced_txg(spa) ||
txg > spa_freeze_txg(spa))
lwb = zil_lwb_commit(zilog, itx, lwb);
- kmem_free(itx, offsetof(itx_t, itx_lr)
- + itx->itx_lr.lrc_reclen);
+
+ zil_itx_destroy(itx);
+
mutex_enter(&zilog->zl_lock);
}
DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
/* determine commit sequence number */
itx = list_head(&zilog->zl_itx_list);
if (itx)
- commit_seq = itx->itx_lr.lrc_seq;
+ commit_seq = itx->itx_lr.lrc_seq - 1;
else
commit_seq = zilog->zl_itx_seq;
mutex_exit(&zilog->zl_lock);
@@ -1173,22 +1174,28 @@
*/
if (zilog->zl_root_zio) {
DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
- (void) zio_wait(zilog->zl_root_zio);
+ error = zio_wait(zilog->zl_root_zio);
zilog->zl_root_zio = NULL;
DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
zil_flush_vdevs(zilog);
}
- if (zilog->zl_log_error || lwb == NULL) {
- zilog->zl_log_error = 0;
+ if (error || lwb == NULL)
txg_wait_synced(zilog->zl_dmu_pool, 0);
- }
mutex_enter(&zilog->zl_lock);
zilog->zl_writer = B_FALSE;
ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
zilog->zl_commit_seq = commit_seq;
+
+ /*
+ * Remember the highest committed log sequence number for ztest.
+ * We only update this value when all the log writes succeeded,
+ * because ztest wants to ASSERT that it got the whole log chain.
+ */
+ if (error == 0 && lwb != NULL)
+ zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
}
/*
@@ -1208,7 +1215,7 @@
while (zilog->zl_writer) {
cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
- if (seq < zilog->zl_commit_seq) {
+ if (seq <= zilog->zl_commit_seq) {
mutex_exit(&zilog->zl_lock);
return;
}
@@ -1220,6 +1227,33 @@
}
/*
+ * Report whether all transactions are committed.
+ */
+static boolean_t
+zil_is_committed(zilog_t *zilog)
+{
+ lwb_t *lwb;
+ boolean_t committed;
+
+ mutex_enter(&zilog->zl_lock);
+
+ /* Block on zl_cv_writer until zl_writer is clear. */
+ while (zilog->zl_writer)
+ cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+
+ /*
+ * Committed means: the itx list is empty, and the lwb list is
+ * either empty or holds exactly one entry.
+ */
+ if (!list_is_empty(&zilog->zl_itx_list))
+ committed = B_FALSE; /* unpushed transactions */
+ else if ((lwb = list_head(&zilog->zl_lwb_list)) == NULL)
+ committed = B_TRUE; /* intent log never used */
+ else if (list_next(&zilog->zl_lwb_list, lwb) != NULL)
+ committed = B_FALSE; /* zil_sync() not done yet */
+ else
+ committed = B_TRUE; /* everything synced */
+
+ mutex_exit(&zilog->zl_lock);
+ return (committed);
+}
+
+/*
* Called in syncing context to free committed log blocks and update log header.
*/
void
@@ -1228,6 +1262,7 @@
zil_header_t *zh = zil_header_in_syncing_context(zilog);
uint64_t txg = dmu_tx_get_txg(tx);
spa_t *spa = zilog->zl_spa;
+ uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
lwb_t *lwb;
/*
@@ -1241,7 +1276,11 @@
ASSERT(zilog->zl_stop_sync == 0);
- zh->zh_replay_seq = zilog->zl_replayed_seq[txg & TXG_MASK];
+ if (*replayed_seq != 0) {
+ ASSERT(zh->zh_replay_seq < *replayed_seq);
+ zh->zh_replay_seq = *replayed_seq;
+ *replayed_seq = 0;
+ }
if (zilog->zl_destroy_txg == txg) {
blkptr_t blk = zh->zh_log;
@@ -1270,7 +1309,7 @@
if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
break;
list_remove(&zilog->zl_lwb_list, lwb);
- zio_free_blk(spa, &lwb->lwb_blk, txg);
+ zio_free_zil(spa, txg, &lwb->lwb_blk);
kmem_cache_free(zil_lwb_cache, lwb);
/*
@@ -1393,7 +1432,7 @@
if (!zil_is_committed(zilog)) {
uint64_t txg;
dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
- (void) dmu_tx_assign(tx, TXG_WAIT);
+ VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
txg = dmu_tx_get_txg(tx);
dmu_tx_commit(tx);
@@ -1466,102 +1505,89 @@
mutex_exit(&zilog->zl_lock);
}
-/*
- * Read in the data for the dmu_sync()ed block, and change the log
- * record to write this whole block.
- */
-void
-zil_get_replay_data(zilog_t *zilog, lr_write_t *lr)
-{
- blkptr_t *wbp = &lr->lr_blkptr;
- char *wbuf = (char *)(lr + 1); /* data follows lr_write_t */
- uint64_t blksz;
-
- if (BP_IS_HOLE(wbp)) { /* compressed to a hole */
- blksz = BP_GET_LSIZE(&lr->lr_blkptr);
- /*
- * If the blksz is zero then we must be replaying a log
- * from an version prior to setting the blksize of null blocks.
- * So we just zero the actual write size reqeusted.
- */
- if (blksz == 0) {
- bzero(wbuf, lr->lr_length);
- return;
- }
- bzero(wbuf, blksz);
- } else {
- /*
- * A subsequent write may have overwritten this block, in which
- * case wbp may have been been freed and reallocated, and our
- * read of wbp may fail with a checksum error. We can safely
- * ignore this because the later write will provide the
- * correct data.
- */
- zbookmark_t zb;
-
- zb.zb_objset = dmu_objset_id(zilog->zl_os);
- zb.zb_object = lr->lr_foid;
- zb.zb_level = 0;
- zb.zb_blkid = -1; /* unknown */
-
- blksz = BP_GET_LSIZE(&lr->lr_blkptr);
- (void) zio_wait(zio_read(NULL, zilog->zl_spa, wbp, wbuf, blksz,
- NULL, NULL, ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
- }
- lr->lr_offset -= lr->lr_offset % blksz;
- lr->lr_length = blksz;
-}
-
+/*
+ * State handed through zil_parse() to zil_replay_log_record():
+ * zr_replay - replay functions, indexed by txtype
+ * zr_arg - first argument passed to each replay function
+ * zr_byteswap - BP_SHOULD_BYTESWAP() of the log header's zh_log
+ * zr_lr - scratch buffer (2 * SPA_MAXBLOCKSIZE bytes) that receives
+ * the copy of the record being replayed
+ */
typedef struct zil_replay_arg {
- objset_t *zr_os;
zil_replay_func_t **zr_replay;
void *zr_arg;
boolean_t zr_byteswap;
- char *zr_lrbuf;
+ char *zr_lr;
} zil_replay_arg_t;
-static void
+/*
+ * Report a log record that could not be replayed. Decrements
+ * zl_replaying_seq (which zil_replay_log_record() sets to lr->lrc_seq
+ * before the attempt), emits a CE_WARN message carrying the error, the
+ * dataset name, the record's sequence number and its txtype (TX_CI flag
+ * stripped, with a "CI" suffix when the flag was set), and returns
+ * 'error' unchanged so callers can return this function's result directly.
+ */
+static int
+zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
+{
+ char name[MAXNAMELEN];
+
+ zilog->zl_replaying_seq--; /* didn't actually replay this one */
+
+ dmu_objset_name(zilog->zl_os, name);
+
+ cmn_err(CE_WARN, "ZFS replay transaction error %d, "
+ "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
+ (u_longlong_t)lr->lrc_seq,
+ (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
+ (lr->lrc_txtype & TX_CI) ? "CI" : "");
+
+ return (error);
+}
+
+static int
zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
{
zil_replay_arg_t *zr = zra;
const zil_header_t *zh = zilog->zl_header;
uint64_t reclen = lr->lrc_reclen;
uint64_t txtype = lr->lrc_txtype;
- char *name;
- int pass, error;
+ int error = 0;
- if (!zilog->zl_replay) /* giving up */
- return;
+ zilog->zl_replaying_seq = lr->lrc_seq;
+
+ if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
+ return (0);
if (lr->lrc_txg < claim_txg) /* already committed */
- return;
-
- if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
- return;
+ return (0);
/* Strip case-insensitive bit, still present in log record */
txtype &= ~TX_CI;
- if (txtype == 0 || txtype >= TX_MAX_TYPE) {
- error = EINVAL;
- goto bad;
+ if (txtype == 0 || txtype >= TX_MAX_TYPE)
+ return (zil_replay_error(zilog, lr, EINVAL));
+
+ /*
+ * If this record type can be logged out of order, the object
+ * (lr_foid) may no longer exist. That's legitimate, not an error.
+ */
+ if (TX_OOO(txtype)) {
+ error = dmu_object_info(zilog->zl_os,
+ ((lr_ooo_t *)lr)->lr_foid, NULL);
+ if (error == ENOENT || error == EEXIST)
+ return (0);
}
/*
* Make a copy of the data so we can revise and extend it.
*/
- bcopy(lr, zr->zr_lrbuf, reclen);
+ bcopy(lr, zr->zr_lr, reclen);
+
+ /*
+ * If this is a TX_WRITE with a blkptr, suck in the data.
+ */
+ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
+ error = zil_read_log_data(zilog, (lr_write_t *)lr,
+ zr->zr_lr + reclen);
+ if (error)
+ return (zil_replay_error(zilog, lr, error));
+ }
/*
* The log block containing this lr may have been byteswapped
* so that we can easily examine common fields like lrc_txtype.
- * However, the log is a mix of different data types, and only the
+ * However, the log is a mix of different record types, and only the
* replay vectors know how to byteswap their records. Therefore, if
* the lr was byteswapped, undo it before invoking the replay vector.
*/
if (zr->zr_byteswap)
- byteswap_uint64_array(zr->zr_lrbuf, reclen);
+ byteswap_uint64_array(zr->zr_lr, reclen);
/*
* We must now do two things atomically: replay this log record,
@@ -1569,42 +1595,30 @@
* we did so. At the end of each replay function the sequence number
* is updated if we are in replay mode.
*/
- for (pass = 1; pass <= 2; pass++) {
- zilog->zl_replaying_seq = lr->lrc_seq;
- /* Only byteswap (if needed) on the 1st pass. */
- error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
- zr->zr_byteswap && pass == 1);
-
- if (!error)
- return;
-
+ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
+ if (error) {
/*
* The DMU's dnode layer doesn't see removes until the txg
* commits, so a subsequent claim can spuriously fail with
* EEXIST. So if we receive any error we try syncing out
- * any removes then retry the transaction.
+ * any removes then retry the transaction. Note that we
+ * specify B_FALSE for byteswap now, so we don't do it twice.
*/
- if (pass == 1)
- txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
+ if (error)
+ return (zil_replay_error(zilog, lr, error));
}
-
-bad:
- ASSERT(error);
- name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
- dmu_objset_name(zr->zr_os, name);
- cmn_err(CE_WARN, "ZFS replay transaction error %d, "
- "dataset %s, seq 0x%llx, txtype %llu %s\n",
- error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype,
- (lr->lrc_txtype & TX_CI) ? "CI" : "");
- zilog->zl_replay = B_FALSE;
- kmem_free(name, MAXNAMELEN);
+ return (0);
}
+/*
+ * Passed to zil_parse() by zil_replay() alongside zil_replay_log_record().
+ * Bumps zl_replay_blks and always returns 0; bp, arg and claim_txg are
+ * not used.
+ */
/* ARGSUSED */
-static void
+static int
zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
zilog->zl_replay_blks++;
+
+ return (0);
}
/*
@@ -1622,11 +1636,10 @@
return;
}
- zr.zr_os = os;
zr.zr_replay = replay_func;
zr.zr_arg = arg;
zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
- zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
+ zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
/*
* Wait for in-progress removes to sync before starting replay.
@@ -1638,54 +1651,27 @@
ASSERT(zilog->zl_replay_blks == 0);
(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
zh->zh_claim_txg);
- kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
+ kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
zil_destroy(zilog, B_FALSE);
txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
zilog->zl_replay = B_FALSE;
}
-/*
- * Report whether all transactions are committed
- */
-int
-zil_is_committed(zilog_t *zilog)
+/*
+ * Report whether the zilog is in replay mode. A NULL zilog also yields
+ * B_TRUE. In the replay case the dataset is dirtied in tx and
+ * zl_replaying_seq is stored in the zl_replayed_seq slot for tx's txg
+ * (txg & TXG_MASK) before B_TRUE is returned; zil_sync() reads that slot
+ * to update zh_replay_seq. Otherwise returns B_FALSE.
+ */
+boolean_t
+zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
{
- lwb_t *lwb;
- int ret;
+ if (zilog == NULL)
+ return (B_TRUE);
- mutex_enter(&zilog->zl_lock);
- while (zilog->zl_writer)
- cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
-
- /* recent unpushed intent log transactions? */
- if (!list_is_empty(&zilog->zl_itx_list)) {
- ret = B_FALSE;
- goto out;
+ if (zilog->zl_replay) {
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
+ zilog->zl_replaying_seq;
+ return (B_TRUE);
}
- /* intent log never used? */
- lwb = list_head(&zilog->zl_lwb_list);
- if (lwb == NULL) {
- ret = B_TRUE;
- goto out;
- }
-
- /*
- * more than 1 log buffer means zil_sync() hasn't yet freed
- * entries after a txg has committed
- */
- if (list_next(&zilog->zl_lwb_list, lwb)) {
- ret = B_FALSE;
- goto out;
- }
-
- ASSERT(zil_empty(zilog));
- ret = B_TRUE;
-out:
- cv_broadcast(&zilog->zl_cv_writer);
- mutex_exit(&zilog->zl_lock);
- return (ret);
+ return (B_FALSE);
}
/* ARGSUSED */
--- a/usr/src/uts/common/fs/zfs/zio.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zio.c Sun Nov 01 14:14:46 2009 -0800
@@ -32,6 +32,9 @@
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/ddt.h>
/*
* ==========================================================================
@@ -59,10 +62,6 @@
char *zio_type_name[ZIO_TYPES] = {
"null", "read", "write", "free", "claim", "ioctl" };
-#define SYNC_PASS_DEFERRED_FREE 1 /* defer frees after this pass */
-#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */
-#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */
-
/*
* ==========================================================================
* I/O kmem caches
@@ -81,8 +80,13 @@
* An allocating zio is one that either currently has the DVA allocate
* stage set or will have it later in its lifetime.
*/
-#define IO_IS_ALLOCATING(zio) \
- ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
+#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
+
+/*
+ * Size threshold used when the zio_buf_* and zio_data_buf_* kmem caches
+ * are created: only caches whose buffer size exceeds this limit are
+ * given KMC_NODEBUG. It is 16384 when ZFS_DEBUG is defined and 0
+ * otherwise.
+ */
+#ifdef ZFS_DEBUG
+int zio_buf_debug_limit = 16384;
+#else
+int zio_buf_debug_limit = 0;
+#endif
void
zio_init(void)
@@ -124,12 +128,13 @@
char name[36];
(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
zio_buf_cache[c] = kmem_cache_create(name, size,
- align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
+ align, NULL, NULL, NULL, NULL, NULL,
+ size > zio_buf_debug_limit ? KMC_NODEBUG : 0);
(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
zio_data_buf_cache[c] = kmem_cache_create(name, size,
align, NULL, NULL, NULL, NULL, data_alloc_arena,
- KMC_NODEBUG);
+ size > zio_buf_debug_limit ? KMC_NODEBUG : 0);
}
}
@@ -264,7 +269,8 @@
zt->zt_transform(zio,
zt->zt_orig_data, zt->zt_orig_size);
- zio_buf_free(zio->io_data, zt->zt_bufsize);
+ if (zt->zt_bufsize != 0)
+ zio_buf_free(zio->io_data, zt->zt_bufsize);
zio->io_data = zt->zt_orig_data;
zio->io_size = zt->zt_orig_size;
@@ -293,7 +299,7 @@
{
if (zio->io_error == 0 &&
zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
- zio->io_data, zio->io_size, data, size) != 0)
+ zio->io_data, data, zio->io_size, size) != 0)
zio->io_error = EIO;
}
@@ -378,6 +384,9 @@
list_insert_head(&pio->io_child_list, zl);
list_insert_head(&cio->io_parent_list, zl);
+ pio->io_child_count++;
+ cio->io_parent_count++;
+
mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
}
@@ -394,6 +403,9 @@
list_remove(&pio->io_child_list, zl);
list_remove(&cio->io_parent_list, zl);
+ pio->io_child_count--;
+ cio->io_parent_count--;
+
mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
@@ -409,7 +421,7 @@
mutex_enter(&zio->io_lock);
ASSERT(zio->io_stall == NULL);
if (*countp != 0) {
- zio->io_stage--;
+ zio->io_stage >>= 1;
zio->io_stall = countp;
waiting = B_TRUE;
}
@@ -451,10 +463,11 @@
* ==========================================================================
*/
static zio_t *
-zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *private,
- zio_type_t type, int priority, int flags, vdev_t *vd, uint64_t offset,
- const zbookmark_t *zb, uint8_t stage, uint32_t pipeline)
+ zio_type_t type, int priority, enum zio_flag flags,
+ vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
+ enum zio_stage stage, enum zio_stage pipeline)
{
zio_t *zio;
@@ -481,14 +494,17 @@
zio->io_child_type = ZIO_CHILD_VDEV;
else if (flags & ZIO_FLAG_GANG_CHILD)
zio->io_child_type = ZIO_CHILD_GANG;
+ else if (flags & ZIO_FLAG_DDT_CHILD)
+ zio->io_child_type = ZIO_CHILD_DDT;
else
zio->io_child_type = ZIO_CHILD_LOGICAL;
if (bp != NULL) {
- zio->io_bp = bp;
+ zio->io_bp = (blkptr_t *)bp;
zio->io_bp_copy = *bp;
zio->io_bp_orig = *bp;
- if (type != ZIO_TYPE_WRITE)
+ if (type != ZIO_TYPE_WRITE ||
+ zio->io_child_type == ZIO_CHILD_DDT)
zio->io_bp = &zio->io_bp_copy; /* so caller can free */
if (zio->io_child_type == ZIO_CHILD_LOGICAL)
zio->io_logical = zio;
@@ -498,14 +514,14 @@
zio->io_spa = spa;
zio->io_txg = txg;
- zio->io_data = data;
- zio->io_size = size;
zio->io_done = done;
zio->io_private = private;
zio->io_type = type;
zio->io_priority = priority;
zio->io_vd = vd;
zio->io_offset = offset;
+ zio->io_orig_data = zio->io_data = data;
+ zio->io_orig_size = zio->io_size = size;
zio->io_orig_flags = zio->io_flags = flags;
zio->io_orig_stage = zio->io_stage = stage;
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
@@ -539,7 +555,7 @@
zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
- void *private, int flags)
+ void *private, enum zio_flag flags)
{
zio_t *zio;
@@ -551,7 +567,7 @@
}
+/*
+ * Shorthand for zio_null() with a NULL parent zio and a NULL vdev;
+ * done, private and flags are passed through unchanged.
+ */
zio_t *
-zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
+zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
return (zio_null(NULL, spa, NULL, done, private, flags));
}
@@ -559,33 +575,24 @@
+/*
+ * Create (but do not issue) a ZIO_TYPE_READ zio for bp. The zio's txg
+ * is BP_PHYSICAL_BIRTH(bp) and it starts at ZIO_STAGE_OPEN. The
+ * pipeline is ZIO_DDT_CHILD_READ_PIPELINE when flags contains
+ * ZIO_FLAG_DDT_CHILD, and ZIO_READ_PIPELINE otherwise. Returns the
+ * zio produced by zio_create().
+ */
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *private,
- int priority, int flags, const zbookmark_t *zb)
+ int priority, enum zio_flag flags, const zbookmark_t *zb)
{
zio_t *zio;
- zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp,
+ zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
data, size, done, private,
ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
- ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
+ ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+ ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
return (zio);
}
-void
-zio_skip_write(zio_t *zio)
-{
- ASSERT(zio->io_type == ZIO_TYPE_WRITE);
- ASSERT(zio->io_stage == ZIO_STAGE_READY);
- ASSERT(!BP_IS_GANG(zio->io_bp));
-
- zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
-}
-
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t size, zio_prop_t *zp,
+ void *data, uint64_t size, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *done, void *private,
- int priority, int flags, const zbookmark_t *zb)
+ int priority, enum zio_flag flags, const zbookmark_t *zb)
{
zio_t *zio;
@@ -595,13 +602,15 @@
zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
zp->zp_type < DMU_OT_NUMTYPES &&
zp->zp_level < 32 &&
- zp->zp_ndvas > 0 &&
- zp->zp_ndvas <= spa_max_replication(spa));
- ASSERT(ready != NULL);
+ zp->zp_copies > 0 &&
+ zp->zp_copies <= spa_max_replication(spa) &&
+ zp->zp_dedup <= 1 &&
+ zp->zp_dedup_verify <= 1);
zio = zio_create(pio, spa, txg, bp, data, size, done, private,
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
- ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
+ ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+ ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
zio->io_ready = ready;
zio->io_prop = *zp;
@@ -612,7 +621,7 @@
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
uint64_t size, zio_done_func_t *done, void *private, int priority,
- int flags, zbookmark_t *zb)
+ enum zio_flag flags, zbookmark_t *zb)
{
zio_t *zio;
@@ -623,33 +632,44 @@
return (zio);
}
+/*
+ * Attach an override block pointer to a write zio. The zio must be a
+ * logical write that is still at ZIO_STAGE_OPEN and belongs to the
+ * currently syncing txg (all asserted). Stores 'copies' in
+ * io_prop.zp_copies and 'bp' in io_bp_override; the pointer itself is
+ * stored, not a copy of the blkptr.
+ */
+void
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
+{
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
+ ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
+
+ zio->io_prop.zp_copies = copies;
+ zio->io_bp_override = bp;
+}
+
+/*
+ * Queue bp on the pool's free bplist for the given txg (slot
+ * txg & TXG_MASK of spa_free_bplist) via bplist_enqueue_deferred().
+ * No zio is created here and nothing is returned.
+ */
+void
+zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
+{
+ bplist_enqueue_deferred(&spa->spa_free_bplist[txg & TXG_MASK], bp);
+}
+
+/*
+ * Create a ZIO_TYPE_FREE zio for bp. Asserts that bp is not a hole,
+ * that txg is the pool's syncing txg, and that the sync pass is no
+ * greater than SYNC_PASS_DEFERRED_FREE. The zio has no done callback or
+ * private data, covers BP_GET_PSIZE(bp) bytes, and starts at
+ * ZIO_STAGE_OPEN on ZIO_FREE_PIPELINE. Returns the new zio.
+ */
zio_t *
-zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, int flags)
+zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+ enum zio_flag flags)
{
zio_t *zio;
ASSERT(!BP_IS_HOLE(bp));
-
- if (bp->blk_fill == BLK_FILL_ALREADY_FREED)
- return (zio_null(pio, spa, NULL, NULL, NULL, flags));
-
- if (txg == spa->spa_syncing_txg &&
- spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) {
- bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
- return (zio_null(pio, spa, NULL, NULL, NULL, flags));
- }
+ ASSERT(spa_syncing_txg(spa) == txg);
+ ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
- done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
+ NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
return (zio);
}
zio_t *
-zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- zio_done_func_t *done, void *private, int flags)
+zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+ zio_done_func_t *done, void *private, enum zio_flag flags)
{
zio_t *zio;
@@ -663,9 +683,11 @@
*
* All claims *must* be resolved in the first txg -- before the SPA
* starts allocating blocks -- so that nothing is allocated twice.
+ * If txg == 0 we just verify that the block is claimable.
*/
ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
- ASSERT3U(spa_first_txg(spa), <=, txg);
+ ASSERT(txg == spa_first_txg(spa) || txg == 0);
+ ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
@@ -676,7 +698,7 @@
zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
- zio_done_func_t *done, void *private, int priority, int flags)
+ zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
{
zio_t *zio;
int c;
@@ -701,7 +723,7 @@
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
void *data, int checksum, zio_done_func_t *done, void *private,
- int priority, int flags, boolean_t labels)
+ int priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;
@@ -722,7 +744,7 @@
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
void *data, int checksum, zio_done_func_t *done, void *private,
- int priority, int flags, boolean_t labels)
+ int priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;
@@ -757,10 +779,10 @@
*/
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
- void *data, uint64_t size, int type, int priority, int flags,
+ void *data, uint64_t size, int type, int priority, enum zio_flag flags,
zio_done_func_t *done, void *private)
{
- uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
+ enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
zio_t *zio;
ASSERT(vd->vdev_parent ==
@@ -773,26 +795,33 @@
* detection as close to the leaves as possible and
* eliminates redundant checksums in the interior nodes.
*/
- pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
- pio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
+ pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}
if (vd->vdev_children == 0)
offset += VDEV_LABEL_START_SIZE;
+ flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
+
+ /*
+ * If we've decided to do a repair, the write is not speculative --
+ * even if the original read was.
+ */
+ if (flags & ZIO_FLAG_IO_REPAIR)
+ flags &= ~ZIO_FLAG_SPECULATIVE;
+
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
- done, private, type, priority,
- (pio->io_flags & ZIO_FLAG_VDEV_INHERIT) |
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | flags,
- vd, offset, &pio->io_bookmark,
- ZIO_STAGE_VDEV_IO_START - 1, pipeline);
+ done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
+ ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
return (zio);
}
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
- int type, int priority, int flags, zio_done_func_t *done, void *private)
+ int type, int priority, enum zio_flag flags,
+ zio_done_func_t *done, void *private)
{
zio_t *zio;
@@ -802,7 +831,7 @@
data, size, done, private, type, priority,
flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
vd, offset, NULL,
- ZIO_STAGE_VDEV_IO_START - 1, ZIO_VDEV_CHILD_PIPELINE);
+ ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
return (zio);
}
@@ -829,28 +858,30 @@
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
zio->io_child_type == ZIO_CHILD_LOGICAL &&
!(zio->io_flags & ZIO_FLAG_RAW)) {
- uint64_t csize = BP_GET_PSIZE(bp);
- void *cbuf = zio_buf_alloc(csize);
-
- zio_push_transform(zio, cbuf, csize, csize, zio_decompress);
+ uint64_t psize = BP_GET_PSIZE(bp);
+ void *cbuf = zio_buf_alloc(psize);
+
+ zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
}
if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+ if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
+ zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
+
return (ZIO_PIPELINE_CONTINUE);
}
static int
zio_write_bp_init(zio_t *zio)
{
+ spa_t *spa = zio->io_spa;
zio_prop_t *zp = &zio->io_prop;
- int compress = zp->zp_compress;
+ enum zio_compress compress = zp->zp_compress;
blkptr_t *bp = zio->io_bp;
- void *cbuf;
uint64_t lsize = zio->io_size;
- uint64_t csize = lsize;
- uint64_t cbufsize = 0;
+ uint64_t psize = lsize;
int pass = 1;
/*
@@ -864,7 +895,29 @@
if (!IO_IS_ALLOCATING(zio))
return (ZIO_PIPELINE_CONTINUE);
- ASSERT(compress != ZIO_COMPRESS_INHERIT);
+ ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+
+ if (zio->io_bp_override) {
+ ASSERT(bp->blk_birth != zio->io_txg);
+ ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
+
+ *bp = *zio->io_bp_override;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ if (BP_IS_HOLE(bp) || !zp->zp_dedup)
+ return (ZIO_PIPELINE_CONTINUE);
+
+ ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
+ zp->zp_dedup_verify);
+
+ if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
+ BP_SET_DEDUP(bp, 1);
+ zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+ zio->io_bp_override = NULL;
+ BP_ZERO(bp);
+ }
if (bp->blk_birth == zio->io_txg) {
/*
@@ -876,22 +929,29 @@
* convergence take longer. Therefore, after the first
* few passes, stop compressing to ensure convergence.
*/
- pass = spa_sync_pass(zio->io_spa);
+ pass = spa_sync_pass(spa);
+
+ ASSERT(zio->io_txg == spa_syncing_txg(spa));
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!BP_GET_DEDUP(bp));
if (pass > SYNC_PASS_DONT_COMPRESS)
compress = ZIO_COMPRESS_OFF;
/* Make sure someone doesn't change their mind on overwrites */
- ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp),
- spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp));
+ ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
+ spa_max_replication(spa)) == BP_GET_NDVAS(bp));
}
if (compress != ZIO_COMPRESS_OFF) {
- if (!zio_compress_data(compress, zio->io_data, zio->io_size,
- &cbuf, &csize, &cbufsize)) {
+ void *cbuf = zio_buf_alloc(lsize);
+ psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+ if (psize == 0 || psize == lsize) {
compress = ZIO_COMPRESS_OFF;
- } else if (csize != 0) {
- zio_push_transform(zio, cbuf, csize, cbufsize, NULL);
+ zio_buf_free(cbuf, lsize);
+ } else {
+ ASSERT(psize < lsize);
+ zio_push_transform(zio, cbuf, psize, lsize, NULL);
}
}
@@ -903,10 +963,10 @@
* spa_sync() to allocate new blocks, but force rewrites after that.
* There should only be a handful of blocks after pass 1 in any case.
*/
- if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
+ if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
pass > SYNC_PASS_REWRITE) {
- ASSERT(csize != 0);
- uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
+ ASSERT(psize != 0);
+ enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
zio->io_flags |= ZIO_FLAG_IO_REWRITE;
} else {
@@ -914,17 +974,38 @@
zio->io_pipeline = ZIO_WRITE_PIPELINE;
}
- if (csize == 0) {
+ if (psize == 0) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
} else {
ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
BP_SET_LSIZE(bp, lsize);
- BP_SET_PSIZE(bp, csize);
+ BP_SET_PSIZE(bp, psize);
BP_SET_COMPRESS(bp, compress);
BP_SET_CHECKSUM(bp, zp->zp_checksum);
BP_SET_TYPE(bp, zp->zp_type);
BP_SET_LEVEL(bp, zp->zp_level);
+ BP_SET_DEDUP(bp, zp->zp_dedup);
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+ if (zp->zp_dedup) {
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+ zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
+ }
+ }
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_free_bp_init(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
+ if (BP_GET_DEDUP(bp))
+ zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
+ else
+ arc_free(zio->io_spa, bp);
}
if (zio_injection_enabled &&
@@ -1003,7 +1084,7 @@
* There's no locking on io_stage because there's no legitimate way
* for multiple threads to be attempting to process the same I/O.
*/
-static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES];
+static zio_pipe_stage_t *zio_pipeline[];
void
zio_execute(zio_t *zio)
@@ -1011,32 +1092,34 @@
zio->io_executor = curthread;
while (zio->io_stage < ZIO_STAGE_DONE) {
- uint32_t pipeline = zio->io_pipeline;
- zio_stage_t stage = zio->io_stage;
+ enum zio_stage pipeline = zio->io_pipeline;
+ enum zio_stage stage = zio->io_stage;
int rv;
ASSERT(!MUTEX_HELD(&zio->io_lock));
-
- while (((1U << ++stage) & pipeline) == 0)
- continue;
+ ASSERT(ISP2(stage));
+ ASSERT(zio->io_stall == NULL);
+
+ do {
+ stage <<= 1;
+ } while ((stage & pipeline) == 0);
ASSERT(stage <= ZIO_STAGE_DONE);
- ASSERT(zio->io_stall == NULL);
/*
* If we are in interrupt context and this pipeline stage
* will grab a config lock that is held across I/O,
- * issue async to avoid deadlock.
+ * or may wait for an I/O that needs an interrupt thread
+ * to complete, issue async to avoid deadlock.
*/
- if (((1U << stage) & ZIO_CONFIG_LOCK_BLOCKING_STAGES) &&
- zio->io_vd == NULL &&
+ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
return;
}
zio->io_stage = stage;
- rv = zio_pipeline[stage](zio);
+ rv = zio_pipeline[highbit(stage) - 1](zio);
if (rv == ZIO_PIPELINE_STOP)
return;
@@ -1119,19 +1202,8 @@
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
pio->io_child_error[c] = 0;
- if (IO_IS_ALLOCATING(pio)) {
- /*
- * Remember the failed bp so that the io_ready() callback
- * can update its accounting upon reexecution. The block
- * was already freed in zio_done(); we indicate this with
- * a fill count of -1 so that zio_free() knows to skip it.
- */
- blkptr_t *bp = pio->io_bp;
- ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg);
- bp->blk_fill = BLK_FILL_ALREADY_FREED;
- pio->io_bp_orig = *bp;
- BP_ZERO(bp);
- }
+ if (IO_IS_ALLOCATING(pio))
+ BP_ZERO(pio->io_bp);
/*
* As we reexecute pio's children, new children could be created.
@@ -1319,6 +1391,12 @@
zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
data, BP_GET_PSIZE(bp));
}
+ /*
+ * If we are here to damage data for testing purposes,
+ * leave the GBH alone so that we can detect the damage.
+ */
+ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
} else {
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
@@ -1332,8 +1410,8 @@
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
- return (zio_free(pio, pio->io_spa, pio->io_txg, bp,
- NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
+ return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
+ ZIO_GANG_CHILD_FLAGS(pio)));
}
/* ARGSUSED */
@@ -1417,7 +1495,7 @@
blkptr_t *bp = zio->io_bp;
ASSERT(gio == zio_unique_parent(zio));
- ASSERT(zio_walk_children(zio) == NULL);
+ ASSERT(zio->io_child_count == 0);
if (zio->io_error)
return;
@@ -1523,9 +1601,9 @@
ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
- ASSERT3U(zio->io_prop.zp_ndvas, ==, gio->io_prop.zp_ndvas);
- ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
- ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
+ ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
+ ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
+ ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
mutex_enter(&pio->io_lock);
@@ -1550,13 +1628,13 @@
uint64_t txg = pio->io_txg;
uint64_t resid = pio->io_size;
uint64_t lsize;
- int ndvas = gio->io_prop.zp_ndvas;
- int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
+ int copies = gio->io_prop.zp_copies;
+ int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
zio_prop_t zp;
int error;
- error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE,
- bp, gbh_ndvas, txg, pio == gio ? NULL : gio->io_bp,
+ error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
+ bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
if (error) {
pio->io_error = error;
@@ -1592,7 +1670,9 @@
zp.zp_compress = ZIO_COMPRESS_OFF;
zp.zp_type = DMU_OT_NONE;
zp.zp_level = 0;
- zp.zp_ndvas = gio->io_prop.zp_ndvas;
+ zp.zp_copies = gio->io_prop.zp_copies;
+ zp.zp_dedup = 0;
+ zp.zp_dedup_verify = 0;
zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
(char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
@@ -1613,15 +1693,380 @@
/*
* ==========================================================================
+ * Dedup
+ * ==========================================================================
+ */
+static void
+zio_ddt_child_read_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp;
+ zio_t *pio = zio_unique_parent(zio);
+
+ mutex_enter(&pio->io_lock);
+ ddp = ddt_phys_select(dde, bp);
+ if (zio->io_error == 0)
+ ddt_phys_clear(ddp); /* this ddp doesn't need repair */
+ if (zio->io_error == 0 && dde->dde_repair_data == NULL)
+ dde->dde_repair_data = zio->io_data;
+ else
+ zio_buf_free(zio->io_data, zio->io_size);
+ mutex_exit(&pio->io_lock);
+}
+
+static int
+zio_ddt_read_start(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (zio->io_child_error[ZIO_CHILD_DDT]) {
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = ddt_repair_start(ddt, bp);
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
+ blkptr_t blk;
+
+ ASSERT(zio->io_vsd == NULL);
+ zio->io_vsd = dde;
+
+ if (ddp_self == NULL)
+ return (ZIO_PIPELINE_CONTINUE);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
+ continue;
+ ddt_bp_create(ddt, &dde->dde_key, ddp, &blk);
+ zio_nowait(zio_read(zio, zio->io_spa, &blk,
+ zio_buf_alloc(zio->io_size), zio->io_size,
+ zio_ddt_child_read_done, dde, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
+ &zio->io_bookmark));
+ }
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ zio_nowait(zio_read(zio, zio->io_spa, bp,
+ zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_ddt_read_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
+ return (ZIO_PIPELINE_STOP);
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (zio->io_child_error[ZIO_CHILD_DDT]) {
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = zio->io_vsd;
+ if (ddt == NULL) {
+ ASSERT(zio->io_spa->spa_load_state != SPA_LOAD_NONE);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+ if (dde == NULL) {
+ zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
+ return (ZIO_PIPELINE_STOP);
+ }
+ if (dde->dde_repair_data != NULL) {
+ bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
+ zio->io_child_error[ZIO_CHILD_DDT] = 0;
+ }
+ ddt_repair_done(ddt, dde);
+ zio->io_vsd = NULL;
+ }
+
+ ASSERT(zio->io_vsd == NULL);
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+static boolean_t
+zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
+{
+ spa_t *spa = zio->io_spa;
+
+ /*
+ * Note: we compare the original data, not the transformed data,
+ * because when zio->io_bp is an override bp, we will not have
+ * pushed the I/O transforms. That's an important optimization
+ * because otherwise we'd compress/encrypt all dmu_sync() data twice.
+ */
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ zio_t *lio = dde->dde_lead_zio[p];
+
+ if (lio != NULL) {
+ return (lio->io_orig_size != zio->io_orig_size ||
+ bcmp(zio->io_orig_data, lio->io_orig_data,
+ zio->io_orig_size) != 0);
+ }
+ }
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+
+ if (ddp->ddp_phys_birth != 0) {
+ arc_buf_t *abuf = NULL;
+ uint32_t aflags = ARC_WAIT;
+ blkptr_t blk = *zio->io_bp;
+ int error;
+
+ ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+
+ ddt_exit(ddt);
+
+ error = arc_read_nolock(NULL, spa, &blk,
+ arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &aflags, &zio->io_bookmark);
+
+ if (error == 0) {
+ if (arc_buf_size(abuf) != zio->io_orig_size ||
+ bcmp(abuf->b_data, zio->io_orig_data,
+ zio->io_orig_size) != 0)
+ error = EEXIST;
+ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+ }
+
+ ddt_enter(ddt);
+ return (error != 0);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+static void
+zio_ddt_child_write_ready(zio_t *zio)
+{
+ int p = zio->io_prop.zp_copies;
+ ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ zio_t *pio;
+
+ if (zio->io_error)
+ return;
+
+ ddt_enter(ddt);
+
+ ASSERT(dde->dde_lead_zio[p] == zio);
+
+ ddt_phys_fill(ddp, zio->io_bp);
+
+ while ((pio = zio_walk_parents(zio)) != NULL)
+ ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
+
+ ddt_exit(ddt);
+}
+
+static void
+zio_ddt_child_write_done(zio_t *zio)
+{
+ int p = zio->io_prop.zp_copies;
+ ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+
+ ddt_enter(ddt);
+
+ ASSERT(ddp->ddp_refcnt == 0);
+ ASSERT(dde->dde_lead_zio[p] == zio);
+ dde->dde_lead_zio[p] = NULL;
+
+ if (zio->io_error == 0) {
+ while (zio_walk_parents(zio) != NULL)
+ ddt_phys_addref(ddp);
+ } else {
+ ddt_phys_clear(ddp);
+ }
+
+ ddt_exit(ddt);
+}
+
+static void
+zio_ddt_ditto_write_done(zio_t *zio)
+{
+ int p = DDT_PHYS_DITTO;
+ zio_prop_t *zp = &zio->io_prop;
+ blkptr_t *bp = zio->io_bp;
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ ddt_key_t *ddk = &dde->dde_key;
+
+ ddt_enter(ddt);
+
+ ASSERT(ddp->ddp_refcnt == 0);
+ ASSERT(dde->dde_lead_zio[p] == zio);
+ dde->dde_lead_zio[p] = NULL;
+
+ if (zio->io_error == 0) {
+ ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
+ ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
+ ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
+ if (ddp->ddp_phys_birth != 0)
+ ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
+ ddt_phys_fill(ddp, bp);
+ }
+
+ ddt_exit(ddt);
+}
+
+static int
+zio_ddt_write(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t txg = zio->io_txg;
+ zio_prop_t *zp = &zio->io_prop;
+ int p = zp->zp_copies;
+ int ditto_copies;
+ zio_t *cio = NULL;
+ zio_t *dio = NULL;
+ ddt_t *ddt = ddt_select(spa, bp);
+ ddt_entry_t *dde;
+ ddt_phys_t *ddp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
+ ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
+
+ ddt_enter(ddt);
+ dde = ddt_lookup(ddt, bp, B_TRUE);
+ ddp = &dde->dde_phys[p];
+
+ if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
+ /*
+ * If we're using a weak checksum, upgrade to a strong checksum
+ * and try again. If we're already using a strong checksum,
+ * we can't resolve it, so just convert to an ordinary write.
+ * (And automatically e-mail a paper to Nature?)
+ */
+ if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
+ zp->zp_checksum = spa_dedup_checksum(spa);
+ zio_pop_transforms(zio);
+ zio->io_stage = ZIO_STAGE_OPEN;
+ BP_ZERO(bp);
+ } else {
+ zp->zp_dedup = 0;
+ }
+ zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ ddt_exit(ddt);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
+ ASSERT(ditto_copies < SPA_DVAS_PER_BP);
+
+ if (ditto_copies > ddt_ditto_copies_present(dde) &&
+ dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
+ zio_prop_t czp = *zp;
+
+ czp.zp_copies = ditto_copies;
+
+ /*
+ * If we arrived here with an override bp, we won't have run
+ * the transform stack, so we won't have the data we need to
+ * generate a child i/o. So, toss the override bp and restart.
+ * This is safe, because using the override bp is just an
+ * optimization; and it's rare, so the cost doesn't matter.
+ */
+ if (zio->io_bp_override) {
+ zio_pop_transforms(zio);
+ zio->io_stage = ZIO_STAGE_OPEN;
+ zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ zio->io_bp_override = NULL;
+ BP_ZERO(bp);
+ ddt_exit(ddt);
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+ zio->io_orig_size, &czp, NULL,
+ zio_ddt_ditto_write_done, dde, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+ zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
+ dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
+ }
+
+ if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
+ if (ddp->ddp_phys_birth != 0)
+ ddt_bp_fill(ddp, bp, txg);
+ if (dde->dde_lead_zio[p] != NULL)
+ zio_add_child(zio, dde->dde_lead_zio[p]);
+ else
+ ddt_phys_addref(ddp);
+ } else if (zio->io_bp_override) {
+ ASSERT(bp->blk_birth == txg);
+ ASSERT(BP_EQUAL(bp, zio->io_bp_override));
+ ddt_phys_fill(ddp, bp);
+ ddt_phys_addref(ddp);
+ } else {
+ cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+ zio->io_orig_size, zp, zio_ddt_child_write_ready,
+ zio_ddt_child_write_done, dde, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+ zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
+ dde->dde_lead_zio[p] = cio;
+ }
+
+ ddt_exit(ddt);
+
+ if (cio)
+ zio_nowait(cio);
+ if (dio)
+ zio_nowait(dio);
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_ddt_free(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ ddt_t *ddt = ddt_select(spa, bp);
+ ddt_entry_t *dde;
+ ddt_phys_t *ddp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ ddt_enter(ddt);
+ dde = ddt_lookup(ddt, bp, B_TRUE);
+ ddp = ddt_phys_select(dde, bp);
+ ddt_phys_decref(ddp);
+ ddt_exit(ddt);
+
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * ==========================================================================
* Allocate and free blocks
* ==========================================================================
*/
-
static int
zio_dva_allocate(zio_t *zio)
{
spa_t *spa = zio->io_spa;
- metaslab_class_t *mc = spa->spa_normal_class;
+ metaslab_class_t *mc = spa_normal_class(spa);
blkptr_t *bp = zio->io_bp;
int error;
@@ -1632,12 +2077,12 @@
ASSERT(BP_IS_HOLE(bp));
ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
- ASSERT3U(zio->io_prop.zp_ndvas, >, 0);
- ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa));
+ ASSERT3U(zio->io_prop.zp_copies, >, 0);
+ ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
error = metaslab_alloc(spa, mc, zio->io_size, bp,
- zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0);
+ zio->io_prop.zp_copies, zio->io_txg, NULL, 0);
if (error) {
if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
@@ -1676,36 +2121,11 @@
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
- spa_t *spa = zio->io_spa;
- boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE);
-
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
-
- if (zio->io_bp == bp && !now) {
- /*
- * This is a rewrite for sync-to-convergence.
- * We can't do a metaslab_free(NOW) because bp wasn't allocated
- * during this sync pass, which means that metaslab_sync()
- * already committed the allocation.
- */
- ASSERT(DVA_EQUAL(BP_IDENTITY(bp),
- BP_IDENTITY(&zio->io_bp_orig)));
- ASSERT(spa_sync_pass(spa) > 1);
-
- if (BP_IS_GANG(bp) && gn == NULL) {
- /*
- * This is a gang leader whose gang header(s) we
- * couldn't read now, so defer the free until later.
- * The block should still be intact because without
- * the headers, we'd never even start the rewrite.
- */
- bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
- return;
- }
- }
+ ASSERT(zio->io_bp_override == NULL);
if (!BP_IS_HOLE(bp))
- metaslab_free(spa, bp, bp->blk_birth, now);
+ metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
if (gn != NULL) {
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
@@ -1719,17 +2139,19 @@
* Try to allocate an intent log block. Return 0 on success, errno on failure.
*/
int
-zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
- uint64_t txg, boolean_t use_slog)
+zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
+ uint64_t size, boolean_t use_slog)
{
int error = 1;
+ ASSERT(txg > spa_syncing_txg(spa));
+
if (use_slog)
- error = metaslab_alloc(spa, spa->spa_log_class, size,
+ error = metaslab_alloc(spa, spa_log_class(spa), size,
new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
if (error)
- error = metaslab_alloc(spa, spa->spa_normal_class, size,
+ error = metaslab_alloc(spa, spa_normal_class(spa), size,
new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
if (error == 0) {
@@ -1739,6 +2161,7 @@
BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
BP_SET_LEVEL(new_bp, 0);
+ BP_SET_DEDUP(new_bp, 0);
BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
}
@@ -1746,15 +2169,15 @@
}
/*
- * Free an intent log block. We know it can't be a gang block, so there's
- * nothing to do except metaslab_free() it.
+ * Free an intent log block.
*/
void
-zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
+zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
{
+ ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
ASSERT(!BP_IS_GANG(bp));
- metaslab_free(spa, bp, txg, B_FALSE);
+ zio_free(spa, txg, bp);
}
/*
@@ -1938,7 +2361,7 @@
zio->io_error = 0;
zio->io_flags |= ZIO_FLAG_IO_RETRY |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
- zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
+ zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
return (ZIO_PIPELINE_STOP);
}
@@ -1971,7 +2394,7 @@
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
ASSERT(zio->io_error == 0);
- zio->io_stage--;
+ zio->io_stage >>= 1;
}
void
@@ -1979,7 +2402,7 @@
{
ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
- zio->io_stage--;
+ zio->io_stage >>= 1;
}
void
@@ -1989,7 +2412,7 @@
ASSERT(zio->io_error == 0);
zio->io_flags |= ZIO_FLAG_IO_BYPASS;
- zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
+ zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
}
/*
@@ -2032,10 +2455,11 @@
zio_checksum_verify(zio_t *zio)
{
zio_bad_cksum_t info;
-
blkptr_t *bp = zio->io_bp;
int error;
+ ASSERT(zio->io_vd != NULL);
+
if (bp == NULL) {
/*
* This is zio_read_phys().
@@ -2065,7 +2489,7 @@
void
zio_checksum_verified(zio_t *zio)
{
- zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
+ zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}
/*
@@ -2105,7 +2529,8 @@
blkptr_t *bp = zio->io_bp;
zio_t *pio, *pio_next;
- if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
+ zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
return (ZIO_PIPELINE_STOP);
if (zio->io_ready) {
@@ -2139,6 +2564,15 @@
zio_notify_parent(pio, zio, ZIO_WAIT_READY);
}
+ if (zio->io_flags & ZIO_FLAG_NODATA) {
+ if (BP_IS_GANG(bp)) {
+ zio->io_flags &= ~ZIO_FLAG_NODATA;
+ } else {
+ ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+ }
+ }
+
return (ZIO_PIPELINE_CONTINUE);
}
@@ -2158,6 +2592,7 @@
*/
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
+ zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
return (ZIO_PIPELINE_STOP);
@@ -2168,23 +2603,51 @@
if (bp != NULL) {
ASSERT(bp->blk_pad[0] == 0);
ASSERT(bp->blk_pad[1] == 0);
- ASSERT(bp->blk_pad[2] == 0);
ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
(bp == zio_unique_parent(zio)->io_bp));
if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
+ zio->io_bp_override == NULL &&
!(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
ASSERT(!BP_SHOULD_BYTESWAP(bp));
- ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp));
+ ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
ASSERT(BP_COUNT_GANG(bp) == 0 ||
(BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
}
}
/*
- * If there were child vdev or gang errors, they apply to us now.
+ * If there were child vdev/gang/ddt errors, they apply to us now.
*/
zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
+ zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
+
+ /*
+ * If the I/O on the transformed data was successful, generate any
+ * checksum reports now while we still have the transformed data.
+ */
+ if (zio->io_error == 0) {
+ while (zio->io_cksum_report != NULL) {
+ zio_cksum_report_t *zcr = zio->io_cksum_report;
+ uint64_t align = zcr->zcr_align;
+ uint64_t asize = P2ROUNDUP(psize, align);
+ char *abuf = zio->io_data;
+
+ if (asize != psize) {
+ abuf = zio_buf_alloc(asize);
+ bcopy(zio->io_data, abuf, psize);
+ bzero(abuf + psize, asize - psize);
+ }
+
+ zio->io_cksum_report = zcr->zcr_next;
+ zcr->zcr_next = NULL;
+ zcr->zcr_finish(zcr, abuf);
+ zfs_ereport_free_checksum(zcr);
+
+ if (asize != psize)
+ zio_buf_free(abuf, asize);
+ }
+ }
zio_pop_transforms(zio); /* note: may set zio->io_error */
@@ -2219,12 +2682,15 @@
* propagate all the way to the root via zio_notify_parent().
*/
ASSERT(vd == NULL && bp != NULL);
-
- if (IO_IS_ALLOCATING(zio))
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (IO_IS_ALLOCATING(zio) &&
+ !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
if (zio->io_error != ENOSPC)
zio->io_reexecute |= ZIO_REEXECUTE_NOW;
else
zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+ }
if ((zio->io_type == ZIO_TYPE_READ ||
zio->io_type == ZIO_TYPE_FREE) &&
@@ -2253,11 +2719,10 @@
*/
zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
- if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) &&
- zio->io_child_type == ZIO_CHILD_LOGICAL) {
- ASSERT(zio->io_child_type != ZIO_CHILD_GANG);
+ if ((zio->io_error || zio->io_reexecute) &&
+ IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
+ !(zio->io_flags & ZIO_FLAG_IO_REWRITE))
zio_dva_unallocate(zio, zio->io_gang_tree, bp);
- }
zio_gang_tree_free(&zio->io_gang_tree);
@@ -2335,22 +2800,19 @@
return (ZIO_PIPELINE_STOP);
}
- ASSERT(zio_walk_children(zio) == NULL);
+ ASSERT(zio->io_child_count == 0);
ASSERT(zio->io_reexecute == 0);
ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
- /* Report any checksum errors, since the IO is complete */
+ /*
+ * Report any checksum errors, since the I/O is complete.
+ */
while (zio->io_cksum_report != NULL) {
- zio_cksum_report_t *rpt = zio->io_cksum_report;
-
- zio->io_cksum_report = rpt->zcr_next;
- rpt->zcr_next = NULL;
-
- /* only pass in our data buffer if we've succeeded. */
- rpt->zcr_finish(rpt,
- (zio->io_error == 0) ? zio->io_data : NULL);
-
- zfs_ereport_free_checksum(rpt);
+ zio_cksum_report_t *zcr = zio->io_cksum_report;
+ zio->io_cksum_report = zcr->zcr_next;
+ zcr->zcr_next = NULL;
+ zcr->zcr_finish(zcr, NULL);
+ zfs_ereport_free_checksum(zcr);
}
/*
@@ -2389,12 +2851,17 @@
* I/O pipeline definition
* ==========================================================================
*/
-static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES] = {
+static zio_pipe_stage_t *zio_pipeline[] = {
NULL,
+ zio_read_bp_init,
+ zio_free_bp_init,
zio_issue_async,
- zio_read_bp_init,
zio_write_bp_init,
zio_checksum_generate,
+ zio_ddt_read_start,
+ zio_ddt_read_done,
+ zio_ddt_write,
+ zio_ddt_free,
zio_gang_assemble,
zio_gang_issue,
zio_dva_allocate,
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c Sun Nov 01 14:14:46 2009 -0800
@@ -49,13 +49,13 @@
* we want the ability to take advantage of that hardware.
*
* Of course, we don't want a checksum upgrade to invalidate existing
- * data, so we store the checksum *function* in five bits of the DVA.
- * This gives us room for up to 32 different checksum functions.
+ * data, so we store the checksum *function* in eight bits of the bp.
+ * This gives us room for up to 256 different checksum functions.
*
* When writing a block, we always checksum it with the latest-and-greatest
* checksum function of the appropriate strength. When reading a block,
* we compare the expected checksum against the actual checksum, which we
- * compute via the checksum function specified in the DVA encoding.
+ * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
*/
/*ARGSUSED*/
@@ -66,19 +66,19 @@
}
zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
- {{NULL, NULL}, 0, 0, "inherit"},
- {{NULL, NULL}, 0, 0, "on"},
- {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"},
- {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, "zilog"},
- {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"},
- {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"},
+ {{NULL, NULL}, 0, 0, 0, "inherit"},
+ {{NULL, NULL}, 0, 0, 0, "on"},
+ {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"},
+ {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"},
+ {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"},
+ {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"},
};
-uint8_t
-zio_checksum_select(uint8_t child, uint8_t parent)
+enum zio_checksum
+zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
{
ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
@@ -93,6 +93,29 @@
return (child);
}
+enum zio_checksum
+zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
+ enum zio_checksum parent)
+{
+ ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
+
+ if (child == ZIO_CHECKSUM_INHERIT)
+ return (parent);
+
+ if (child == ZIO_CHECKSUM_ON)
+ return (spa_dedup_checksum(spa));
+
+ if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
+ return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
+
+ ASSERT(zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_dedup ||
+ (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
+
+ return (child);
+}
+
/*
* Set the external verifier for a gang block based on <vdev, offset, txg>,
* a tuple which is guaranteed to be unique for the life of the pool.
@@ -101,7 +124,7 @@
zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp)
{
dva_t *dva = BP_IDENTITY(bp);
- uint64_t txg = bp->blk_birth;
+ uint64_t txg = BP_PHYSICAL_BIRTH(bp);
ASSERT(BP_IS_GANG(bp));
--- a/usr/src/uts/common/fs/zfs/zio_compress.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zio_compress.c Sun Nov 01 14:14:46 2009 -0800
@@ -20,12 +20,10 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/compress.h>
#include <sys/spa.h>
@@ -51,10 +49,11 @@
{gzip_compress, gzip_decompress, 7, "gzip-7"},
{gzip_compress, gzip_decompress, 8, "gzip-8"},
{gzip_compress, gzip_decompress, 9, "gzip-9"},
+ {zle_compress, zle_decompress, 64, "zle"},
};
-uint8_t
-zio_compress_select(uint8_t child, uint8_t parent)
+enum zio_compress
+zio_compress_select(enum zio_compress child, enum zio_compress parent)
{
ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
@@ -69,80 +68,65 @@
return (child);
}
-int
-zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp,
- uint64_t *destsizep, uint64_t *destbufsizep)
+size_t
+zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
{
uint64_t *word, *word_end;
- uint64_t ciosize, gapsize, destbufsize;
- zio_compress_info_t *ci = &zio_compress_table[cpfunc];
- char *dest;
- uint_t allzero;
+ size_t c_len, d_len, r_len;
+ zio_compress_info_t *ci = &zio_compress_table[c];
- ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
- ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
+ ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
/*
* If the data is all zeroes, we don't even need to allocate
- * a block for it. We indicate this by setting *destsizep = 0.
+ * a block for it. We indicate this by returning zero size.
*/
- allzero = 1;
- word = src;
- word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize);
- while (word < word_end) {
- if (*word++ != 0) {
- allzero = 0;
+ word_end = (uint64_t *)((char *)src + s_len);
+ for (word = src; word < word_end; word++)
+ if (*word != 0)
break;
- }
- }
- if (allzero) {
- *destp = NULL;
- *destsizep = 0;
- *destbufsizep = 0;
- return (1);
- }
- if (cpfunc == ZIO_COMPRESS_EMPTY)
+ if (word == word_end)
return (0);
+ if (c == ZIO_COMPRESS_EMPTY)
+ return (s_len);
+
/* Compress at least 12.5% */
- destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE);
- if (destbufsize == 0)
- return (0);
- dest = zio_buf_alloc(destbufsize);
+ d_len = P2ALIGN(s_len - (s_len >> 3), (size_t)SPA_MINBLOCKSIZE);
+ if (d_len == 0)
+ return (s_len);
+
+ c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
+
+ if (c_len > d_len)
+ return (s_len);
- ciosize = ci->ci_compress(src, dest, (size_t)srcsize,
- (size_t)destbufsize, ci->ci_level);
- if (ciosize > destbufsize) {
- zio_buf_free(dest, destbufsize);
- return (0);
+ /*
+ * Cool. We compressed at least as much as we were hoping to.
+ * For both security and repeatability, pad out the last sector.
+ */
+ r_len = P2ROUNDUP(c_len, (size_t)SPA_MINBLOCKSIZE);
+ if (r_len > c_len) {
+ bzero((char *)dst + c_len, r_len - c_len);
+ c_len = r_len;
}
- /* Cool. We compressed at least as much as we were hoping to. */
+ ASSERT3U(c_len, <=, d_len);
+ ASSERT(P2PHASE(c_len, (size_t)SPA_MINBLOCKSIZE) == 0);
- /* For security, make sure we don't write random heap crap to disk */
- gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;
- if (gapsize != 0) {
- bzero(dest + ciosize, gapsize);
- ciosize += gapsize;
- }
-
- ASSERT3U(ciosize, <=, destbufsize);
- ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0);
- *destp = dest;
- *destsizep = ciosize;
- *destbufsizep = destbufsize;
-
- return (1);
+ return (c_len);
}
int
-zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
- void *dest, uint64_t destsize)
+zio_decompress_data(enum zio_compress c, void *src, void *dst,
+ size_t s_len, size_t d_len)
{
- zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ zio_compress_info_t *ci = &zio_compress_table[c];
- ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
+ if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
+ return (EINVAL);
- return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
+ return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
}
--- a/usr/src/uts/common/fs/zfs/zio_inject.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zio_inject.c Sun Nov 01 14:14:46 2009 -0800
@@ -43,8 +43,8 @@
#include <sys/arc.h>
#include <sys/zio_impl.h>
#include <sys/zfs_ioctl.h>
-#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
+#include <sys/dmu_objset.h>
#include <sys/fs/zfs.h>
uint32_t zio_injection_enabled;
@@ -70,8 +70,9 @@
/*
* Check for a match against the MOS, which is based on type
*/
- if (zb->zb_objset == 0 && record->zi_objset == 0 &&
- record->zi_object == 0) {
+ if (zb->zb_objset == DMU_META_OBJSET &&
+ record->zi_objset == DMU_META_OBJSET &&
+ record->zi_object == DMU_META_DNODE_OBJECT) {
if (record->zi_type == DMU_OT_NONE ||
type == record->zi_type)
return (record->zi_freq == 0 ||
@@ -357,7 +358,7 @@
VERIFY(handler->zi_record.zi_timer == 0 ||
handler->zi_record.zi_timer -
handler->zi_record.zi_duration >=
- spa->spa_syncing_txg);
+ spa_syncing_txg(spa));
}
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/zle.c Sun Nov 01 14:14:46 2009 -0800
@@ -0,0 +1,86 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Zero-length encoding. This is a fast and simple algorithm to eliminate
+ * runs of zeroes. Each chunk of compressed data begins with a length byte, b.
+ * If b < n (where n is the compression parameter) then the next b + 1 bytes
+ * are literal values. If b >= n then the next (b - n + 1) bytes are zero.
+ */
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+
+size_t
+zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *s_end = src + s_len;
+ uchar_t *d_end = dst + d_len;
+
+ while (src < s_end && dst < d_end - 1) {
+ uchar_t *first = src;
+ uchar_t *len = dst++;
+ if (src[0] == 0) {
+ uchar_t *last = src + (256 - n);
+ while (src < MIN(last, s_end) && src[0] == 0)
+ src++;
+ *len = src - first - 1 + n;
+ } else {
+ uchar_t *last = src + n;
+ if (d_end - dst < n)
+ break;
+ while (src < MIN(last, s_end) - 1 && (src[0] | src[1]))
+ *dst++ = *src++;
+ if (src[0])
+ *dst++ = *src++;
+ *len = src - first - 1;
+ }
+ }
+ return (src == s_end ? dst - (uchar_t *)d_start : s_len);
+}
+
+int
+zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *s_end = src + s_len;
+ uchar_t *d_end = dst + d_len;
+
+ while (src < s_end && dst < d_end) {
+ int len = 1 + *src++;
+ if (len <= n) {
+ while (len-- != 0)
+ *dst++ = *src++;
+ } else {
+ len -= n;
+ while (len-- != 0)
+ *dst++ = 0;
+ }
+ }
+ return (dst == d_end ? 0 : -1);
+}
--- a/usr/src/uts/common/fs/zfs/zvol.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/fs/zfs/zvol.c Sun Nov 01 14:14:46 2009 -0800
@@ -246,8 +246,8 @@
/*ARGSUSED*/
static int
-zvol_map_block(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
- const dnode_phys_t *dnp, void *arg)
+zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
struct maparg *ma = arg;
zvol_extent_t *ze;
@@ -361,25 +361,32 @@
{
objset_t *os = zv->zv_objset;
char *data = (char *)(lr + 1); /* data follows lr_write_t */
- uint64_t off = lr->lr_offset;
- uint64_t len = lr->lr_length;
+ uint64_t offset, length;
dmu_tx_t *tx;
int error;
if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
- /* If it's a dmu_sync() block get the data and write the whole block */
- if (lr->lr_common.lrc_reclen == sizeof (lr_write_t))
- zil_get_replay_data(dmu_objset_zil(os), lr);
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+
+ /* If it's a dmu_sync() block, write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ if (length < blocksize) {
+ offset -= offset % blocksize;
+ length = blocksize;
+ }
+ }
tx = dmu_tx_create(os);
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
+ dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
} else {
- dmu_write(os, ZVOL_OBJ, off, len, data, tx);
+ dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
dmu_tx_commit(tx);
}
@@ -882,14 +889,16 @@
}
static void
-zvol_get_done(dmu_buf_t *db, void *vzgd)
+zvol_get_done(zgd_t *zgd, int error)
{
- zgd_t *zgd = (zgd_t *)vzgd;
- rl_t *rl = zgd->zgd_rl;
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
- dmu_buf_rele(db, vzgd);
- zfs_range_unlock(rl);
- zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+ zfs_range_unlock(zgd->zgd_rl);
+
+ if (error == 0 && zgd->zgd_bp)
+ zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
+
kmem_free(zgd, sizeof (zgd_t));
}
@@ -901,15 +910,20 @@
{
zvol_state_t *zv = arg;
objset_t *os = zv->zv_objset;
+ uint64_t object = ZVOL_OBJ;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length; /* length of user data */
+ blkptr_t *bp = &lr->lr_blkptr;
dmu_buf_t *db;
- rl_t *rl;
zgd_t *zgd;
- uint64_t boff; /* block starting offset */
- int dlen = lr->lr_length; /* length of user data */
int error;
- ASSERT(zio);
- ASSERT(dlen != 0);
+ ASSERT(zio != NULL);
+ ASSERT(size != 0);
+
+ zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_zilog = zv->zv_zilog;
+ zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
/*
* Write records come in two flavors: immediate and indirect.
@@ -918,49 +932,30 @@
* sync the data and get a pointer to it (indirect) so that
* we don't have to write the data twice.
*/
- if (buf != NULL) /* immediate write */
- return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf,
- DMU_READ_NO_PREFETCH));
-
- zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_zilog = zv->zv_zilog;
- zgd->zgd_bp = &lr->lr_blkptr;
+ if (buf != NULL) { /* immediate write */
+ error = dmu_read(os, object, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ } else {
+ size = zv->zv_volblocksize;
+ offset = P2ALIGN(offset, size);
+ error = dmu_buf_hold(os, object, offset, zgd, &db);
+ if (error == 0) {
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
- /*
- * Lock the range of the block to ensure that when the data is
- * written out and its checksum is being calculated that no other
- * thread can change the block.
- */
- boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t);
- rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize,
- RL_READER);
- zgd->zgd_rl = rl;
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
- VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ zvol_get_done, zgd);
- error = dmu_sync(zio, db, &lr->lr_blkptr,
- lr->lr_common.lrc_txg, zvol_get_done, zgd);
- if (error == 0) {
- /*
- * dmu_sync() can compress a block of zeros to a null blkptr
- * but the block size still needs to be passed through to
- * replay.
- */
- BP_SET_LSIZE(&lr->lr_blkptr, db->db_size);
- zil_add_block(zv->zv_zilog, &lr->lr_blkptr);
+ if (error == 0)
+ return (0);
+ }
}
- /*
- * If we get EINPROGRESS, then we need to wait for a
- * write IO initiated by dmu_sync() to complete before
- * we can release this dbuf. We will finish everything
- * up in the zvol_get_done() callback.
- */
- if (error == EINPROGRESS)
- return (0);
- dmu_buf_rele(db, zgd);
- zfs_range_unlock(rl);
- kmem_free(zgd, sizeof (zgd_t));
+ zvol_get_done(zgd, error);
+
return (error);
}
@@ -984,12 +979,8 @@
if (zil_disable)
return;
- if (zilog->zl_replay) {
- dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
- zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
- zilog->zl_replaying_seq;
+ if (zil_replaying(zilog, tx))
return;
- }
immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
? 0 : zvol_immediate_write_sz;
@@ -1024,8 +1015,7 @@
lr = (lr_write_t *)&itx->itx_lr;
if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
- kmem_free(itx, offsetof(itx_t, itx_lr) +
- itx->itx_lr.lrc_reclen);
+ zil_itx_destroy(itx);
itx = zil_itx_create(TX_WRITE, sizeof (*lr));
lr = (lr_write_t *)&itx->itx_lr;
write_state = WR_NEED_COPY;
@@ -1037,7 +1027,7 @@
lr->lr_foid = ZVOL_OBJ;
lr->lr_offset = off;
lr->lr_length = len;
- lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
+ lr->lr_blkoff = 0;
BP_ZERO(&lr->lr_blkptr);
itx->itx_private = zv;
@@ -1791,7 +1781,8 @@
dmu_tx_abort(tx);
return (error);
}
- (void) dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx);
+ if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
+ zv->zv_volblocksize = vbs;
dmu_tx_commit(tx);
return (0);
--- a/usr/src/uts/common/sys/avl.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/sys/avl.h Sun Nov 01 14:14:46 2009 -0800
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _AVL_H
#define _AVL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This is a private header file. Applications should not directly include
* this file.
@@ -163,7 +161,7 @@
* node - node that has the value being looked for
* where - position for use with avl_nearest() or avl_insert(), may be NULL
*/
-extern void *avl_find(avl_tree_t *tree, void *node, avl_index_t *where);
+extern void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where);
/*
* Insert a node into the tree.
--- a/usr/src/uts/common/sys/fs/zfs.h Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/common/sys/fs/zfs.h Sun Nov 01 14:14:46 2009 -0800
@@ -119,6 +119,7 @@
ZFS_PROP_LOGBIAS,
ZFS_PROP_UNIQUE, /* not exposed to the user */
ZFS_PROP_OBJSETID, /* not exposed to the user */
+ ZFS_PROP_DEDUP,
ZFS_NUM_PROPS
} zfs_prop_t;
@@ -155,6 +156,8 @@
ZPOOL_PROP_FAILUREMODE,
ZPOOL_PROP_LISTSNAPS,
ZPOOL_PROP_AUTOEXPAND,
+ ZPOOL_PROP_DEDUPDITTO,
+ ZPOOL_PROP_DEDUPRATIO,
ZPOOL_NUM_PROPS
} zpool_prop_t;
@@ -197,6 +200,7 @@
boolean_t zfs_prop_userquota(const char *name);
int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **);
int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *);
+uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed);
boolean_t zfs_prop_valid_for_type(int, zfs_type_t);
/*
@@ -209,6 +213,7 @@
boolean_t zpool_prop_readonly(zpool_prop_t);
int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **);
int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *);
+uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed);
/*
* Definitions for the Delegation.
@@ -296,14 +301,16 @@
#define SPA_VERSION_17 17ULL
#define SPA_VERSION_18 18ULL
#define SPA_VERSION_19 19ULL
+#define SPA_VERSION_20 20ULL
+#define SPA_VERSION_21 21ULL
/*
* When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
* format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
* and do the appropriate changes. Also bump the version number in
* usr/src/grub/capability.
*/
-#define SPA_VERSION SPA_VERSION_19
-#define SPA_VERSION_STRING "19"
+#define SPA_VERSION SPA_VERSION_21
+#define SPA_VERSION_STRING "21"
/*
* Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -344,6 +351,8 @@
#define SPA_VERSION_RAIDZ3 SPA_VERSION_17
#define SPA_VERSION_USERREFS SPA_VERSION_18
#define SPA_VERSION_HOLES SPA_VERSION_19
+#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20
+#define SPA_VERSION_DEDUP SPA_VERSION_21
/*
* ZPL version - rev'd whenever an incompatible on-disk format change
@@ -559,7 +568,6 @@
uint64_t vs_alloc; /* space allocated */
uint64_t vs_space; /* total capacity */
uint64_t vs_dspace; /* deflated capacity */
- uint64_t vs_defer; /* in-core deferred */
uint64_t vs_rsize; /* replaceable dev size */
uint64_t vs_ops[ZIO_TYPES]; /* operation count */
uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */
--- a/usr/src/uts/intel/zfs/spa_boot.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/intel/zfs/spa_boot.c Sun Nov 01 14:14:46 2009 -0800
@@ -20,12 +20,11 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
+#include <sys/zio.h>
#include <sys/spa.h>
#include <sys/sunddi.h>
--- a/usr/src/uts/sparc/zfs/spa_boot.c Fri Oct 30 18:47:17 2009 -0600
+++ b/usr/src/uts/sparc/zfs/spa_boot.c Sun Nov 01 14:14:46 2009 -0800
@@ -20,12 +20,11 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
+#include <sys/zio.h>
#include <sys/spa.h>
#include <sys/bootconf.h>