6899923 vdev_offline/vdev_add deadlock
6900497 zdb -S could be much faster
6900898 deduped blocks should be scrubbed/resilvered exactly once
--- a/usr/src/cmd/zdb/zdb.c Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/cmd/zdb/zdb.c Thu Nov 19 22:24:55 2009 -0800
@@ -94,14 +94,15 @@
usage(void)
{
(void) fprintf(stderr,
- "Usage: %s [-CumdibcsvhL] [-S user:cksumalg] "
+ "Usage: %s [-CumdibcsvhL] "
"poolname [object...]\n"
" %s [-div] dataset [object...]\n"
" %s -m [-L] poolname [vdev [metaslab...]]\n"
" %s -R poolname vdev:offset:size[:flags]\n"
+ " %s -S poolname\n"
" %s -l device\n"
" %s -C\n\n",
- cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
+ cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
(void) fprintf(stderr, " Dataset name must include at least one "
"separator character '/' or '@'\n");
@@ -549,7 +550,7 @@
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0)
continue;
- ddt_bp_create(ddt, ddk, ddp, &blk);
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
sprintf_blkptr(blkbuf, &blk);
(void) printf("index %llx refcnt %llu %s %s\n",
(u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
@@ -686,7 +687,7 @@
(void) printf("%s contents:\n\n", name);
- while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0)
+ while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
dump_dde(ddt, &dde, walk);
ASSERT(error == ENOENT);
@@ -1344,7 +1345,8 @@
nicenum(doi.doi_physical_blocks_512 << 9, asize);
nicenum(doi.doi_bonus_size, bonus_size);
(void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
- doi.doi_data_block_size / doi.doi_max_offset);
+ doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
+ doi.doi_max_offset);
aux[0] = '\0';
@@ -1865,26 +1867,28 @@
};
static void
-zdb_ddt_leak_init(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- zdb_cb_t *zcb)
+zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
- uint64_t walk = 0;
+ ddt_bookmark_t ddb = { 0 };
ddt_entry_t dde;
int error;
- if (class == DDT_CLASS_UNIQUE || !ddt_object_exists(ddt, type, class))
- return;
-
- while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0) {
+ while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
blkptr_t blk;
ddt_phys_t *ddp = dde.dde_phys;
+
+ if (ddb.ddb_class == DDT_CLASS_UNIQUE)
+ return;
+
ASSERT(ddt_phys_total_refcnt(&dde) > 1);
+
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0)
continue;
- ddt_bp_create(ddt, &dde.dde_key, ddp, &blk);
+ ddt_bp_create(ddb.ddb_checksum,
+ &dde.dde_key, ddp, &blk);
if (p == DDT_PHYS_DITTO) {
- zdb_count_block(ddt->ddt_spa, NULL, zcb, &blk,
+ zdb_count_block(spa, NULL, zcb, &blk,
ZDB_OT_DITTO);
} else {
zcb->zcb_dedup_asize +=
@@ -1893,6 +1897,7 @@
}
}
if (!dump_opt['L']) {
+ ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
ddt_enter(ddt);
VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
ddt_exit(ddt);
@@ -1924,12 +1929,7 @@
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
- for (enum ddt_type type = 0; type < DDT_TYPES; type++)
- for (enum ddt_class class = 0; class < DDT_CLASSES;
- class++)
- zdb_ddt_leak_init(spa->spa_ddt[c],
- type, class, zcb);
+ zdb_ddt_leak_init(spa, zcb);
spa_config_exit(spa, SCL_CONFIG, FTAG);
}
@@ -1957,6 +1957,7 @@
zdb_cb_t zcb = { 0 };
zdb_blkstats_t *zb, *tzb;
uint64_t norm_alloc, norm_space, total_alloc, total_found;
+ int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA;
int leaks = 0;
(void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
@@ -2000,7 +2001,10 @@
bplist_close(bpl);
}
- zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb, 0);
+ if (dump_opt['c'] > 1)
+ flags |= TRAVERSE_PREFETCH_DATA;
+
+ zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
if (zcb.zcb_haderrors) {
(void) printf("\nError counts:\n\n");
@@ -2170,7 +2174,8 @@
avl_numnodes(t));
}
- if (BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata)
+ if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
+ BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata)
return (0);
ddt_key_fill(&zdde_search.zdde_key, bp);
@@ -2205,7 +2210,8 @@
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
- (void) traverse_pool(spa, zdb_ddt_add_cb, &t, 0);
+ (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
+ zdb_ddt_add_cb, &t);
spa_config_exit(spa, SCL_CONFIG, FTAG);
--- a/usr/src/cmd/ztest/ztest.c Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/cmd/ztest/ztest.c Thu Nov 19 22:24:55 2009 -0800
@@ -837,7 +837,6 @@
return (error);
}
-#if 0
static int
ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop, uint64_t value)
{
@@ -860,7 +859,6 @@
return (error);
}
-#endif
static void
ztest_rll_init(rll_t *rll)
@@ -4134,10 +4132,8 @@
(void) rw_rdlock(&zs->zs_name_lock);
-#if 0
(void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO,
ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
-#endif
VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0);
--- a/usr/src/common/zfs/zpool_prop.c Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/common/zfs/zpool_prop.c Thu Nov 19 22:24:55 2009 -0800
@@ -90,6 +90,8 @@
/* default number properties */
register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION");
+ register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "<threshold (min 100)>", "DEDUPDITTO");
/* default index (boolean) properties */
register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, PROP_DEFAULT,
@@ -109,8 +111,6 @@
/* hidden properties */
register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
PROP_READONLY, ZFS_TYPE_POOL, "NAME");
- register_hidden(ZPOOL_PROP_DEDUPDITTO, "dedupditto", PROP_TYPE_NUMBER,
- PROP_READONLY, ZFS_TYPE_POOL, "DEDUPDITTO");
}
/*
--- a/usr/src/uts/common/fs/zfs/ddt.c Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/ddt.c Thu Nov 19 22:24:55 2009 -0800
@@ -32,6 +32,7 @@
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
+#include <sys/dsl_pool.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
@@ -158,7 +159,7 @@
int
ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
- ddt_entry_t *dde, uint64_t *walk)
+ uint64_t *walk, ddt_entry_t *dde)
{
ASSERT(ddt_object_exists(ddt, type, class));
@@ -212,8 +213,8 @@
}
void
-ddt_bp_create(const ddt_t *ddt, const ddt_key_t *ddk, const ddt_phys_t *ddp,
- blkptr_t *bp)
+ddt_bp_create(enum zio_checksum checksum,
+ const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
{
BP_ZERO(bp);
@@ -225,7 +226,7 @@
BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
- BP_SET_CHECKSUM(bp, ddt->ddt_checksum);
+ BP_SET_CHECKSUM(bp, checksum);
BP_SET_TYPE(bp, DMU_OT_NONE);
BP_SET_LEVEL(bp, 0);
BP_SET_DEDUP(bp, 0);
@@ -277,7 +278,7 @@
{
blkptr_t blk;
- ddt_bp_create(ddt, ddk, ddp, &blk);
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
ddt_phys_clear(ddp);
zio_free(ddt->ddt_spa, txg, &blk);
}
@@ -750,6 +751,30 @@
}
}
+boolean_t
+ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
+{
+ ddt_t *ddt;
+ ddt_entry_t dde;
+
+ if (!BP_GET_DEDUP(bp))
+ return (B_FALSE);
+
+ if (max_class == DDT_CLASS_UNIQUE)
+ return (B_TRUE);
+
+ ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
+
+ ddt_key_fill(&dde.dde_key, bp);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++)
+ for (enum ddt_class class = 0; class <= max_class; class++)
+ if (ddt_object_lookup(ddt, type, class, &dde) == 0)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
ddt_entry_t *
ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
{
@@ -820,7 +845,7 @@
ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
continue;
- ddt_bp_create(ddt, ddk, ddp, &blk);
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
@@ -845,7 +870,7 @@
rdde_next = AVL_NEXT(t, rdde);
avl_remove(&ddt->ddt_repair_tree, rdde);
ddt_exit(ddt);
- ddt_bp_create(ddt, &rdde->dde_key, NULL, &blk);
+ ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
dde = ddt_repair_start(ddt, &blk);
ddt_repair_entry(ddt, dde, rdde, rio);
ddt_repair_done(ddt, dde);
@@ -857,6 +882,7 @@
static void
ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
{
+ dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
ddt_phys_t *ddp = dde->dde_phys;
ddt_key_t *ddk = &dde->dde_key;
enum ddt_type otype = dde->dde_type;
@@ -905,6 +931,11 @@
if (!ddt_object_exists(ddt, ntype, nclass))
ddt_object_create(ddt, ntype, nclass, tx);
VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
+
+ if (dp->dp_scrub_func != SCRUB_FUNC_NONE &&
+ oclass > dp->dp_scrub_ddt_class_max &&
+ nclass <= dp->dp_scrub_ddt_class_max)
+ dsl_pool_scrub_ddt_entry(dp, ddt->ddt_checksum, dde);
}
}
@@ -968,3 +999,31 @@
dmu_tx_commit(tx);
}
+
+int
+ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
+{
+ do {
+ do {
+ do {
+ ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
+ int error = ENOENT;
+ if (ddt_object_exists(ddt, ddb->ddb_type,
+ ddb->ddb_class)) {
+ error = ddt_object_walk(ddt,
+ ddb->ddb_type, ddb->ddb_class,
+ &ddb->ddb_cursor, dde);
+ }
+ if (error == 0)
+ return (0);
+ if (error != ENOENT)
+ return (error);
+ ddb->ddb_cursor = 0;
+ } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
+ ddb->ddb_checksum = 0;
+ } while (++ddb->ddb_type < DDT_TYPES);
+ ddb->ddb_type = 0;
+ } while (++ddb->ddb_class < DDT_CLASSES);
+
+ return (ENOENT);
+}
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c Thu Nov 19 22:24:55 2009 -0800
@@ -375,7 +375,8 @@
* NB: pool must not be changing on-disk (eg, from zdb or sync context).
*/
int
-traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg, uint64_t txg_start)
+traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
+ blkptr_cb_t func, void *arg)
{
int err;
uint64_t obj;
@@ -384,7 +385,7 @@
/* visit the MOS */
err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
- txg_start, TRAVERSE_PRE | TRAVERSE_PREFETCH, func, arg);
+ txg_start, flags, func, arg);
if (err)
return (err);
@@ -408,8 +409,7 @@
return (err);
if (ds->ds_phys->ds_prev_snap_txg > txg)
txg = ds->ds_phys->ds_prev_snap_txg;
- err = traverse_dataset(ds, txg,
- TRAVERSE_PRE | TRAVERSE_PREFETCH, func, arg);
+ err = traverse_dataset(ds, txg, flags, func, arg);
dsl_dataset_rele(ds, FTAG);
if (err)
return (err);
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c Thu Nov 19 22:24:55 2009 -0800
@@ -171,11 +171,23 @@
if (err)
goto out;
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
+ DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
+ sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
&dp->dp_scrub_bookmark);
if (err)
goto out;
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
+ sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
+ &dp->dp_scrub_ddt_bookmark);
+ if (err && err != ENOENT)
+ goto out;
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
+ &dp->dp_scrub_ddt_class_max);
+ if (err && err != ENOENT)
+ goto out;
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
&spa->spa_scrub_errors);
if (err)
--- a/usr/src/uts/common/fs/zfs/dsl_scrub.c Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/dsl_scrub.c Thu Nov 19 22:24:55 2009 -0800
@@ -53,6 +53,7 @@
int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */
int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */
boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
extern int zfs_txg_timeout;
@@ -78,6 +79,7 @@
dp->dp_scrub_min_txg = 0;
dp->dp_scrub_max_txg = tx->tx_txg;
+ dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max;
if (*funcp == SCRUB_FUNC_CLEAN) {
vdev_t *rvd = dp->dp_spa->spa_root_vdev;
@@ -101,6 +103,14 @@
dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
POOL_SCRUB_EVERYTHING, B_FALSE);
+ /*
+ * If this is an incremental scrub, limit the DDT scrub phase
+ * to just the auto-ditto class (for correctness); the rest
+ * of the scrub should go faster using top-down pruning.
+ */
+ if (dp->dp_scrub_min_txg > TXG_INITIAL)
+ dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO;
+
dp->dp_spa->spa_scrub_started = B_TRUE;
}
@@ -119,8 +129,8 @@
dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
+ bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));
dp->dp_scrub_restart = B_FALSE;
- dp->dp_scrub_ditto = B_FALSE;
dp->dp_spa->spa_scrub_errors = 0;
VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
@@ -136,9 +146,17 @@
DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
&dp->dp_scrub_max_txg, tx));
VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
+ DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
+ sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
&dp->dp_scrub_bookmark, tx));
VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
+ sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
+ &dp->dp_scrub_ddt_bookmark, tx));
+ VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
+ &dp->dp_scrub_ddt_class_max, tx));
+ VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
&dp->dp_spa->spa_scrub_errors, tx));
@@ -186,6 +204,7 @@
dp->dp_scrub_queue_obj, tx));
dp->dp_scrub_queue_obj = 0;
bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
+ bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));
VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_SCRUB_QUEUE, tx));
@@ -200,6 +219,11 @@
VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_SCRUB_ERRORS, tx));
+ (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_DDT_BOOKMARK, tx);
+ (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_DDT_CLASS_MAX, tx);
+
spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
"complete=%u", *completep);
@@ -296,7 +320,7 @@
}
static boolean_t
-scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb)
+scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb)
{
int elapsed_ticks;
int mintime;
@@ -308,7 +332,7 @@
return (B_FALSE); /* we're resuming */
/* We only know how to resume from level-0 blocks. */
- if (zb->zb_level != 0)
+ if (zb != NULL && zb->zb_level != 0)
return (B_FALSE);
mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time :
@@ -316,11 +340,23 @@
elapsed_ticks = ddi_get_lbolt64() - dp->dp_scrub_start_time;
if (elapsed_ticks > hz * zfs_txg_timeout ||
(elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) {
- dprintf("pausing at %llx/%llx/%llx/%llx\n",
- (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object,
- (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid);
+ if (zb) {
+ dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ dp->dp_scrub_bookmark = *zb;
+ }
+ if (ddb) {
+ dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
+ (longlong_t)ddb->ddb_class,
+ (longlong_t)ddb->ddb_type,
+ (longlong_t)ddb->ddb_checksum,
+ (longlong_t)ddb->ddb_cursor);
+ ASSERT(&dp->dp_scrub_ddt_bookmark == ddb);
+ }
dp->dp_scrub_pausing = B_TRUE;
- dp->dp_scrub_bookmark = *zb;
return (B_TRUE);
}
return (B_FALSE);
@@ -423,7 +459,7 @@
if (bp->blk_birth <= dp->dp_scrub_min_txg)
return;
- if (scrub_pause(dp, zb))
+ if (scrub_pause(dp, zb, NULL))
return;
if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) {
@@ -525,7 +561,13 @@
}
}
- (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
+ /*
+ * If dsl_pool_scrub_ddt() has already scrubbed this block,
+ * don't scrub it again.
+ */
+ if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp))
+ (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
+
if (buf)
(void) arc_buf_remove_ref(buf, &buf);
}
@@ -542,7 +584,6 @@
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
}
-
}
static void
@@ -564,8 +605,8 @@
return;
if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
- SET_BOOKMARK(&dp->dp_scrub_bookmark, ZB_DESTROYED_OBJSET,
- 0, 0, 0);
+ SET_BOOKMARK(&dp->dp_scrub_bookmark,
+ ZB_DESTROYED_OBJSET, 0, 0, 0);
} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
ds->ds_object, tx) != 0) {
return;
@@ -771,34 +812,65 @@
return (0);
}
+/*
+ * Scrub/dedup interaction.
+ *
+ * If there are N references to a deduped block, we don't want to scrub it
+ * N times -- ideally, we should scrub it exactly once.
+ *
+ * To prevent excess scrubbing, the scrub begins by walking the DDT
+ * to find all blocks with refcnt > 1, and scrubs each of these once.
+ * Then the top-down scrub begins, only visiting blocks with refcnt == 1.
+ *
+ * There would be nothing more to say if a block's refcnt couldn't change
+ * during a scrub, but of course it can. There are two cases to consider.
+ *
+ * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
+ * when visited during the top-down scrub phase, it will be scrubbed twice.
+ * This negates our scrub optimization, but is otherwise harmless.
+ *
+ * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+ * on each visit during the top-down scrub phase, it will never be scrubbed.
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+ * reference count changes; if it transitions from refcnt == 1 to refcnt > 1
+ * while a scrub is in progress, it scrubs the block right then.
+ *
+ * The code does not actually use the refcnt directly, but rather uses the
+ * dde's replication class (enum ddt_class), which serves the same purpose.
+ */
static void
-dsl_pool_scrub_ddt(dsl_pool_t *dp, enum zio_checksum c, enum ddt_type type,
- enum ddt_class class)
+dsl_pool_scrub_ddt(dsl_pool_t *dp)
{
- ddt_t *ddt = ddt_select_by_checksum(dp->dp_spa, c);
+ ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark;
ddt_entry_t dde;
- blkptr_t blk;
- zbookmark_t zb = { 0 };
- uint64_t walk = 0;
int error;
- if (!ddt_object_exists(ddt, type, class))
- return;
-
- while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0) {
- int p = DDT_PHYS_DITTO;
- ddt_bp_create(ddt, &dde.dde_key, &dde.dde_phys[p], &blk);
- scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb);
+ while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) {
+ if (ddb->ddb_class > dp->dp_scrub_ddt_class_max)
+ return;
+ dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde);
+ if (scrub_pause(dp, NULL, ddb))
+ return;
}
ASSERT(error == ENOENT);
+ ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max);
}
-static void
-dsl_pool_scrub_ditto(dsl_pool_t *dp)
+void
+dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum,
+ const ddt_entry_t *dde)
{
- for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
- for (enum ddt_type type = 0; type < DDT_TYPES; type++)
- dsl_pool_scrub_ddt(dp, c, type, DDT_CLASS_DITTO);
+ const ddt_key_t *ddk = &dde->dde_key;
+ const ddt_phys_t *ddp = dde->dde_phys;
+ blkptr_t blk;
+ zbookmark_t zb = { 0 };
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+ ddt_bp_create(checksum, ddk, ddp, &blk);
+ scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb);
+ }
}
void
@@ -840,9 +912,10 @@
dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
spa->spa_scrub_active = B_TRUE;
- if (!dp->dp_scrub_ditto) {
- dsl_pool_scrub_ditto(dp);
- dp->dp_scrub_ditto = B_TRUE;
+ if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) {
+ dsl_pool_scrub_ddt(dp);
+ if (dp->dp_scrub_pausing)
+ goto out;
}
if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) {
@@ -895,12 +968,18 @@
dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx);
return;
out:
- VERIFY(0 == zap_update(dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
+ VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
+ sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
&dp->dp_scrub_bookmark, tx));
- VERIFY(0 == zap_update(dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
+ VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
+ sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
+ &dp->dp_scrub_ddt_bookmark, tx));
+ VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
+ &dp->dp_scrub_ddt_class_max, tx));
+ VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
&spa->spa_scrub_errors, tx));
--- a/usr/src/uts/common/fs/zfs/spa.c Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/spa.c Thu Nov 19 22:24:55 2009 -0800
@@ -1230,8 +1230,8 @@
rio = zio_root(spa, NULL, &sle,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
- error = traverse_pool(spa, spa_load_verify_cb, rio,
- spa->spa_verify_min_txg);
+ error = traverse_pool(spa, spa->spa_verify_min_txg,
+ TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
(void) zio_wait(rio);
--- a/usr/src/uts/common/fs/zfs/spa_misc.c Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c Thu Nov 19 22:24:55 2009 -0800
@@ -840,8 +840,8 @@
uint64_t
spa_vdev_enter(spa_t *spa)
{
+ mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
- mutex_enter(&spa->spa_vdev_top_lock);
return (spa_vdev_config_enter(spa));
}
@@ -937,8 +937,8 @@
spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
{
spa_vdev_config_exit(spa, vd, txg, error, FTAG);
+ mutex_exit(&spa_namespace_lock);
mutex_exit(&spa->spa_vdev_top_lock);
- mutex_exit(&spa_namespace_lock);
return (error);
}
--- a/usr/src/uts/common/fs/zfs/sys/ddt.h Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/sys/ddt.h Thu Nov 19 22:24:55 2009 -0800
@@ -153,6 +153,19 @@
avl_node_t ddt_node;
};
+/*
+ * In-core and on-disk bookmark for DDT walks
+ */
+typedef struct ddt_bookmark {
+ uint64_t ddb_class;
+ uint64_t ddb_type;
+ uint64_t ddb_checksum;
+ uint64_t ddb_cursor;
+} ddt_bookmark_t;
+
+/*
+ * Ops vector to access a specific DDT object type.
+ */
typedef struct ddt_ops {
char ddt_op_name[32];
int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx,
@@ -173,7 +186,7 @@
extern void ddt_object_name(ddt_t *ddt, enum ddt_type type,
enum ddt_class class, char *name);
extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type,
- enum ddt_class class, ddt_entry_t *dde, uint64_t *walk);
+ enum ddt_class class, uint64_t *walk, ddt_entry_t *dde);
extern uint64_t ddt_object_count(ddt_t *ddt, enum ddt_type type,
enum ddt_class class);
extern int ddt_object_info(ddt_t *ddt, enum ddt_type type,
@@ -183,7 +196,7 @@
extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
uint64_t txg);
-extern void ddt_bp_create(const ddt_t *ddt, const ddt_key_t *ddk,
+extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
const ddt_phys_t *ddp, blkptr_t *bp);
extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
@@ -214,13 +227,14 @@
extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len);
extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp);
-extern ddt_t *ddt_select_by_checksum(spa_t *spa, enum zio_checksum c);
-
extern void ddt_enter(ddt_t *ddt);
extern void ddt_exit(ddt_t *ddt);
extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
+extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class,
+ const blkptr_t *bp);
+
extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);
@@ -230,6 +244,7 @@
extern int ddt_load(spa_t *spa);
extern void ddt_unload(spa_t *spa);
extern void ddt_sync(spa_t *spa, uint64_t txg);
+extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
extern const ddt_ops_t ddt_zap_ops;
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h Thu Nov 19 22:24:55 2009 -0800
@@ -211,6 +211,10 @@
/* 4x8 zbookmark_t */
#define DMU_POOL_SCRUB_BOOKMARK "scrub_bookmark"
+/* 4x8 ddt_bookmark_t */
+#define DMU_POOL_SCRUB_DDT_BOOKMARK "scrub_ddt_bookmark"
+/* 1x8 max_class */
+#define DMU_POOL_SCRUB_DDT_CLASS_MAX "scrub_ddt_class_max"
/* 1x8 zap obj DMU_OT_SCRUB_QUEUE */
#define DMU_POOL_SCRUB_QUEUE "scrub_queue"
/* 1x8 txg */
--- a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h Thu Nov 19 22:24:55 2009 -0800
@@ -47,9 +47,10 @@
#define TRAVERSE_PREFETCH_DATA (1<<3)
#define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA)
-int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start,
- int flags, blkptr_cb_t func, void *arg);
-int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg, uint64_t txg_start);
+int traverse_dataset(struct dsl_dataset *ds,
+ uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
+int traverse_pool(spa_t *spa,
+ uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
#ifdef __cplusplus
}
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h Thu Nov 19 22:24:55 2009 -0800
@@ -32,6 +32,7 @@
#include <sys/zfs_context.h>
#include <sys/zio.h>
#include <sys/dnode.h>
+#include <sys/ddt.h>
#ifdef __cplusplus
extern "C" {
@@ -96,13 +97,14 @@
uint64_t dp_scrub_queue_obj;
uint64_t dp_scrub_min_txg;
uint64_t dp_scrub_max_txg;
+ uint64_t dp_scrub_start_time;
+ uint64_t dp_scrub_ddt_class_max;
zbookmark_t dp_scrub_bookmark;
+ ddt_bookmark_t dp_scrub_ddt_bookmark;
boolean_t dp_scrub_pausing;
boolean_t dp_scrub_isresilver;
- uint64_t dp_scrub_start_time;
+ boolean_t dp_scrub_restart;
kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */
- boolean_t dp_scrub_restart;
- boolean_t dp_scrub_ditto;
/* Has its own locking */
tx_state_t dp_tx;
@@ -145,6 +147,8 @@
int dsl_pool_scrub_clean(dsl_pool_t *dp);
void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx);
void dsl_pool_scrub_restart(dsl_pool_t *dp);
+void dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum,
+ const ddt_entry_t *dde);
taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
--- a/usr/src/uts/common/fs/zfs/zio.c Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/zio.c Thu Nov 19 22:24:55 2009 -0800
@@ -867,6 +867,9 @@
if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+ if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
@@ -1736,7 +1739,8 @@
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
continue;
- ddt_bp_create(ddt, &dde->dde_key, ddp, &blk);
+ ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
+ &blk);
zio_nowait(zio_read(zio, zio->io_spa, &blk,
zio_buf_alloc(zio->io_size), zio->io_size,
zio_ddt_child_read_done, dde, zio->io_priority,