--- a/usr/src/cmd/zdb/zdb.c Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/cmd/zdb/zdb.c Fri May 21 17:29:22 2010 -0700
@@ -900,9 +900,9 @@
}
static void
-sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp)
+sprintf_blkptr_compact(char *blkbuf, const blkptr_t *bp)
{
- dva_t *dva = bp->blk_dva;
+ const dva_t *dva = bp->blk_dva;
int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
if (dump_opt['b'] >= 5) {
@@ -1127,12 +1127,21 @@
(void) printf("\t\tbp = %s\n", blkbuf);
}
-static void
-dump_bplist(objset_t *mos, uint64_t object, char *name)
+/*
+ * bpobj iteration callback used by dump_bpobj(): format one block pointer
+ * in compact form and print it on its own tab-indented line.  Neither
+ * 'arg' nor 'tx' is consulted.  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
- bplist_t bpl = { 0 };
- blkptr_t blk, *bp = &blk;
- uint64_t itor = 0;
+	char text[BP_SPRINTF_LEN];
+
+	ASSERT(bp->blk_birth != 0);
+
+	sprintf_blkptr_compact(text, bp);
+	(void) printf("\t%s\n", text);
+
+	return (0);
+}
+
+static void
+dump_bpobj(bpobj_t *bpo, char *name)
+{
char bytes[32];
char comp[32];
char uncomp[32];
@@ -1140,45 +1149,59 @@
if (dump_opt['d'] < 3)
return;
- bplist_init(&bpl);
- VERIFY(0 == bplist_open(&bpl, mos, object));
- if (bplist_empty(&bpl)) {
- bplist_close(&bpl);
- bplist_fini(&bpl);
- return;
- }
-
- zdb_nicenum(bpl.bpl_phys->bpl_bytes, bytes);
- if (bpl.bpl_dbuf->db_size == sizeof (bplist_phys_t)) {
- zdb_nicenum(bpl.bpl_phys->bpl_comp, comp);
- zdb_nicenum(bpl.bpl_phys->bpl_uncomp, uncomp);
- (void) printf("\n %s: %llu entries, %s (%s/%s comp)\n",
- name, (u_longlong_t)bpl.bpl_phys->bpl_entries,
+ zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes);
+ if (bpo->bpo_havesubobj) {
+ zdb_nicenum(bpo->bpo_phys->bpo_comp, comp);
+ zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp);
+ (void) printf("\n %s: %llu local blkptrs, %llu subobjs, "
+ "%s (%s/%s comp)\n",
+ name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
bytes, comp, uncomp);
} else {
- (void) printf("\n %s: %llu entries, %s\n",
- name, (u_longlong_t)bpl.bpl_phys->bpl_entries, bytes);
+ (void) printf("\n %s: %llu blkptrs, %s\n",
+ name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, bytes);
}
- if (dump_opt['d'] < 5) {
- bplist_close(&bpl);
- bplist_fini(&bpl);
+ if (dump_opt['d'] < 5)
return;
- }
(void) printf("\n");
- while (bplist_iterate(&bpl, &itor, bp) == 0) {
- char blkbuf[BP_SPRINTF_LEN];
-
- ASSERT(bp->blk_birth != 0);
- sprintf_blkptr_compact(blkbuf, bp);
- (void) printf("\tItem %3llu: %s\n",
- (u_longlong_t)itor - 1, blkbuf);
+ (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
+}
+
+/*
+ * Print a dataset's deadlist.
+ *
+ * Output is gated on the -d verbosity level (dump_opt['d']):
+ *   below 3   nothing is printed
+ *   3         one summary line (used, compressed/uncompressed sizes)
+ *   4         additionally one "mintxg -> obj" line per deadlist entry
+ *   5 and up  additionally each entry's bpobj via dump_bpobj()
+ */
+static void
+dump_deadlist(dsl_deadlist_t *dl)
+{
+	dsl_deadlist_entry_t *dle;
+	char bytes[32];
+	char comp[32];
+	char uncomp[32];
+
+	if (dump_opt['d'] < 3)
+		return;
+
+	zdb_nicenum(dl->dl_phys->dl_used, bytes);
+	zdb_nicenum(dl->dl_phys->dl_comp, comp);
+	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp);
+	(void) printf("\n Deadlist: %s (%s/%s comp)\n",
+	    bytes, comp, uncomp);
+
+	if (dump_opt['d'] < 4)
+		return;
+
+	(void) printf("\n");
+
+	for (dle = avl_first(&dl->dl_tree); dle;
+	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
+		/*
+		 * %llu takes an unsigned argument; cast to u_longlong_t
+		 * like the other printf calls in this file.
+		 */
+		(void) printf(" mintxg %llu -> obj %llu\n",
+		    (u_longlong_t)dle->dle_mintxg,
+		    (u_longlong_t)dle->dle_bpobj.bpo_object);
+
+		if (dump_opt['d'] >= 5)
+			dump_bpobj(&dle->dle_bpobj, "");
}
-
- bplist_close(&bpl);
- bplist_fini(&bpl);
}
static avl_tree_t idx_tree;
@@ -1404,6 +1427,10 @@
dump_sa_layouts, /* SA attribute layouts */
dump_zap, /* DSL scrub translations */
dump_none, /* fake dedup BP */
+ dump_zap, /* deadlist */
+ dump_none, /* deadlist hdr */
+ dump_zap, /* dsl clones */
+ dump_none, /* bpobj subobjs */
dump_unknown, /* Unknown type, must be last */
};
@@ -1590,8 +1617,7 @@
dump_intent_log(dmu_objset_zil(os));
if (dmu_objset_ds(os) != NULL)
- dump_bplist(dmu_objset_pool(os)->dp_meta_objset,
- dmu_objset_ds(os)->ds_phys->ds_deadlist_obj, "Deadlist");
+ dump_deadlist(&dmu_objset_ds(os)->ds_deadlist);
if (verbosity < 2)
return;
@@ -1867,10 +1893,11 @@
uint64_t zcb_errors[256];
int zcb_readfails;
int zcb_haderrors;
+ spa_t *zcb_spa;
} zdb_cb_t;
static void
-zdb_count_block(spa_t *spa, zilog_t *zilog, zdb_cb_t *zcb, const blkptr_t *bp,
+zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
dmu_object_type_t type)
{
uint64_t refcnt = 0;
@@ -1898,7 +1925,7 @@
ddt_t *ddt;
ddt_entry_t *dde;
- ddt = ddt_select(spa, bp);
+ ddt = ddt_select(zcb->zcb_spa, bp);
ddt_enter(ddt);
dde = ddt_lookup(ddt, bp, B_FALSE);
@@ -1914,8 +1941,8 @@
ddt_exit(ddt);
}
- VERIFY3U(zio_wait(zio_claim(NULL, spa,
- refcnt ? 0 : spa_first_txg(spa),
+ VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
+ refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
}
@@ -1934,7 +1961,7 @@
type = BP_GET_TYPE(bp);
- zdb_count_block(spa, zilog, zcb, bp, type);
+ zdb_count_block(zcb, zilog, bp, type);
is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata);
@@ -2048,8 +2075,7 @@
ddt_bp_create(ddb.ddb_checksum,
&dde.dde_key, ddp, &blk);
if (p == DDT_PHYS_DITTO) {
- zdb_count_block(spa, NULL, zcb, &blk,
- ZDB_OT_DITTO);
+ zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
} else {
zcb->zcb_dedup_asize +=
BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
@@ -2070,6 +2096,8 @@
static void
zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
+ zcb->zcb_spa = spa;
+
if (!dump_opt['L']) {
vdev_t *rvd = spa->spa_root_vdev;
for (int c = 0; c < rvd->vdev_children; c++) {
@@ -2111,6 +2139,22 @@
}
}
+/*
+ * bpobj iteration callback used by dump_block_stats(): account one block
+ * pointer as ZDB_OT_DEFERRED in the zdb_cb_t passed through 'arg'.  At
+ * -b verbosity 4 or higher the block pointer is also printed, labelled
+ * "deferred free".  'tx' is unused.  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	if (dump_opt['b'] >= 4) {
+		char desc[BP_SPRINTF_LEN];
+
+		sprintf_blkptr(desc, bp);
+		(void) printf("[%s] %s\n", "deferred free", desc);
+	}
+
+	zdb_count_block((zdb_cb_t *)arg, NULL, bp, ZDB_OT_DEFERRED);
+
+	return (0);
+}
+
static int
dump_block_stats(spa_t *spa)
{
@@ -2140,26 +2184,10 @@
/*
* If there's a deferred-free bplist, process that first.
*/
- if (spa->spa_deferred_bplist_obj != 0) {
- bplist_t *bpl = &spa->spa_deferred_bplist;
- blkptr_t blk;
- uint64_t itor = 0;
-
- VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset,
- spa->spa_deferred_bplist_obj));
-
- while (bplist_iterate(bpl, &itor, &blk) == 0) {
- if (dump_opt['b'] >= 4) {
- char blkbuf[BP_SPRINTF_LEN];
- sprintf_blkptr(blkbuf, &blk);
- (void) printf("[%s] %s\n",
- "deferred free", blkbuf);
- }
- zdb_count_block(spa, NULL, &zcb, &blk, ZDB_OT_DEFERRED);
- }
-
- bplist_close(bpl);
- }
+ (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
+ count_block_cb, &zcb, NULL);
+ (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
+ count_block_cb, &zcb, NULL);
if (dump_opt['c'] > 1)
flags |= TRAVERSE_PREFETCH_DATA;
@@ -2438,8 +2466,11 @@
if (dump_opt['d'] || dump_opt['i']) {
dump_dir(dp->dp_meta_objset);
if (dump_opt['d'] >= 3) {
- dump_bplist(dp->dp_meta_objset,
- spa->spa_deferred_bplist_obj, "Deferred frees");
+ dump_bpobj(&spa->spa_deferred_bpobj, "Deferred frees");
+ if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+ dump_bpobj(&spa->spa_dsl_pool->dp_free_bpobj,
+ "Pool frees");
+ }
dump_dtl(spa->spa_root_vdev, 0);
}
(void) dmu_objset_find(spa_name(spa), dump_one_dir,
--- a/usr/src/cmd/zinject/translate.c Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/cmd/zinject/translate.c Fri May 21 17:29:22 2010 -0700
@@ -359,8 +359,8 @@
case TYPE_CONFIG:
record->zi_type = DMU_OT_PACKED_NVLIST;
break;
- case TYPE_BPLIST:
- record->zi_type = DMU_OT_BPLIST;
+ case TYPE_BPOBJ:
+ record->zi_type = DMU_OT_BPOBJ;
break;
case TYPE_SPACEMAP:
record->zi_type = DMU_OT_SPACE_MAP;
--- a/usr/src/cmd/zinject/zinject.c Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/cmd/zinject/zinject.c Fri May 21 17:29:22 2010 -0700
@@ -69,7 +69,7 @@
* mos Any data in the MOS
* mosdir object directory
* config pool configuration
- * bplist blkptr list
+ * bpobj blkptr list
* spacemap spacemap
* metaslab metaslab
* errlog persistent error log
@@ -163,7 +163,7 @@
"mosdir",
"metaslab",
"config",
- "bplist",
+ "bpobj",
"spacemap",
"errlog",
"uber",
@@ -193,8 +193,8 @@
return ("metaslab");
case DMU_OT_PACKED_NVLIST:
return ("config");
- case DMU_OT_BPLIST:
- return ("bplist");
+ case DMU_OT_BPOBJ:
+ return ("bpobj");
case DMU_OT_SPACE_MAP:
return ("spacemap");
case DMU_OT_ERROR_LOG:
@@ -285,7 +285,7 @@
"\t\t\ton a ZFS filesystem.\n"
"\n"
"\t-t <mos>\tInject errors into the MOS for objects of the given\n"
- "\t\t\ttype. Valid types are: mos, mosdir, config, bplist,\n"
+ "\t\t\ttype. Valid types are: mos, mosdir, config, bpobj,\n"
"\t\t\tspacemap, metaslab, errlog. The only valid <object> is\n"
"\t\t\tthe poolname.\n");
}
--- a/usr/src/cmd/zinject/zinject.h Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/cmd/zinject/zinject.h Fri May 21 17:29:22 2010 -0700
@@ -38,7 +38,7 @@
TYPE_MOSDIR, /* MOS object directory */
TYPE_METASLAB, /* metaslab objects */
TYPE_CONFIG, /* MOS config */
- TYPE_BPLIST, /* block pointer list */
+ TYPE_BPOBJ, /* block pointer list */
TYPE_SPACEMAP, /* space map objects */
TYPE_ERRLOG, /* persistent error log */
TYPE_LABEL_UBERBLOCK, /* label specific uberblock */
--- a/usr/src/cmd/zpool/zpool_main.c Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/cmd/zpool/zpool_main.c Fri May 21 17:29:22 2010 -0700
@@ -4007,6 +4007,8 @@
(void) printf(gettext(" 23 Slim ZIL\n"));
(void) printf(gettext(" 24 System attributes\n"));
(void) printf(gettext(" 25 Improved scrub stats\n"));
+ (void) printf(gettext(" 26 Improved snapshot deletion "
+ "performance\n"));
(void) printf(gettext("\nFor more information on a particular "
"version, including supported releases,\n"));
(void) printf(gettext("see the ZFS Administration Guide.\n\n"));
--- a/usr/src/cmd/ztest/ztest.c Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/cmd/ztest/ztest.c Fri May 21 17:29:22 2010 -0700
@@ -94,6 +94,7 @@
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_scan.h>
+#include <sys/zio_checksum.h>
#include <sys/refcount.h>
#include <stdio.h>
#include <stdio_ext.h>
@@ -3136,9 +3137,9 @@
fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
}
- error = dsl_dataset_own(snap1name, B_FALSE, FTAG, &ds);
+ error = dsl_dataset_own(snap2name, B_FALSE, FTAG, &ds);
if (error)
- fatal(0, "dsl_dataset_own(%s) = %d", snap1name, error);
+ fatal(0, "dsl_dataset_own(%s) = %d", snap2name, error);
error = dsl_dataset_promote(clone2name, NULL);
if (error != EBUSY)
fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
--- a/usr/src/grub/capability Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/grub/capability Fri May 21 17:29:22 2010 -0700
@@ -39,7 +39,7 @@
# This file and the associated version are Solaris specific and are
# not a part of the open source distribution of GRUB.
#
-VERSION=17
+VERSION=18
dboot
xVM
zfs
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h Fri May 21 17:29:22 2010 -0700
@@ -26,7 +26,7 @@
/*
* On-disk version number.
*/
-#define SPA_VERSION 25ULL
+#define SPA_VERSION 26ULL
/*
* The following are configuration names used in the nvlist describing a pool's
--- a/usr/src/uts/common/Makefile.files Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/Makefile.files Fri May 21 17:29:22 2010 -0700
@@ -1327,6 +1327,7 @@
ZFS_COMMON_OBJS += \
arc.o \
bplist.o \
+ bpobj.o \
dbuf.o \
ddt.o \
ddt_zap.o \
@@ -1340,6 +1341,7 @@
dnode_sync.o \
dsl_dir.o \
dsl_dataset.o \
+ dsl_deadlist.o \
dsl_pool.o \
dsl_synctask.o \
dmu_zfetch.o \
--- a/usr/src/uts/common/fs/zfs/bplist.c Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/bplist.c Fri May 21 17:29:22 2010 -0700
@@ -25,353 +25,45 @@
#include <sys/bplist.h>
#include <sys/zfs_context.h>
-void
-bplist_init(bplist_t *bpl)
-{
- bzero(bpl, sizeof (*bpl));
- mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&bpl->bpl_q_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&bpl->bpl_queue, sizeof (bplist_q_t),
- offsetof(bplist_q_t, bpq_node));
-}
void
-bplist_fini(bplist_t *bpl)
-{
- ASSERT(list_is_empty(&bpl->bpl_queue));
- list_destroy(&bpl->bpl_queue);
- mutex_destroy(&bpl->bpl_q_lock);
- mutex_destroy(&bpl->bpl_lock);
-}
-
-static int
-bplist_hold(bplist_t *bpl)
+bplist_create(bplist_t *bpl)
{
- ASSERT(MUTEX_HELD(&bpl->bpl_lock));
- if (bpl->bpl_dbuf == NULL) {
- int err = dmu_bonus_hold(bpl->bpl_mos,
- bpl->bpl_object, bpl, &bpl->bpl_dbuf);
- if (err)
- return (err);
- bpl->bpl_phys = bpl->bpl_dbuf->db_data;
- }
- return (0);
-}
-
-uint64_t
-bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
-{
- int size;
-
- size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ?
- BPLIST_SIZE_V0 : sizeof (bplist_phys_t);
-
- return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
- DMU_OT_BPLIST_HDR, size, tx));
+ mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&bpl->bpl_list, sizeof (bplist_entry_t),
+ offsetof(bplist_entry_t, bpe_node));
}
void
-bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
-{
- VERIFY(dmu_object_free(mos, object, tx) == 0);
-}
-
-int
-bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
+bplist_destroy(bplist_t *bpl)
{
- dmu_object_info_t doi;
- int err;
-
- err = dmu_object_info(mos, object, &doi);
- if (err)
- return (err);
-
- mutex_enter(&bpl->bpl_lock);
-
- ASSERT(bpl->bpl_dbuf == NULL);
- ASSERT(bpl->bpl_phys == NULL);
- ASSERT(bpl->bpl_cached_dbuf == NULL);
- ASSERT(list_is_empty(&bpl->bpl_queue));
- ASSERT(object != 0);
- ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST);
- ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR);
-
- bpl->bpl_mos = mos;
- bpl->bpl_object = object;
- bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
- bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
- bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t));
-
- mutex_exit(&bpl->bpl_lock);
- return (0);
+ list_destroy(&bpl->bpl_list);
+ mutex_destroy(&bpl->bpl_lock);
}
void
-bplist_close(bplist_t *bpl)
+bplist_append(bplist_t *bpl, const blkptr_t *bp)
{
- mutex_enter(&bpl->bpl_lock);
-
- ASSERT(list_is_empty(&bpl->bpl_queue));
+ bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP);
- if (bpl->bpl_cached_dbuf) {
- dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
- bpl->bpl_cached_dbuf = NULL;
- }
- if (bpl->bpl_dbuf) {
- dmu_buf_rele(bpl->bpl_dbuf, bpl);
- bpl->bpl_dbuf = NULL;
- bpl->bpl_phys = NULL;
- }
-
+ mutex_enter(&bpl->bpl_lock);
+ bpe->bpe_blk = *bp;
+ list_insert_tail(&bpl->bpl_list, bpe);
mutex_exit(&bpl->bpl_lock);
}
-boolean_t
-bplist_empty(bplist_t *bpl)
-{
- boolean_t rv;
-
- if (bpl->bpl_object == 0)
- return (B_TRUE);
-
- mutex_enter(&bpl->bpl_lock);
- VERIFY(0 == bplist_hold(bpl)); /* XXX */
- rv = (bpl->bpl_phys->bpl_entries == 0);
- mutex_exit(&bpl->bpl_lock);
-
- return (rv);
-}
-
-static int
-bplist_cache(bplist_t *bpl, uint64_t blkid)
+void
+bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
{
- int err = 0;
-
- if (bpl->bpl_cached_dbuf == NULL ||
- bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
- if (bpl->bpl_cached_dbuf != NULL)
- dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
- err = dmu_buf_hold(bpl->bpl_mos,
- bpl->bpl_object, blkid << bpl->bpl_blockshift,
- bpl, &bpl->bpl_cached_dbuf, DMU_READ_PREFETCH);
- ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
- 1ULL << bpl->bpl_blockshift);
- }
- return (err);
-}
-
-int
-bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
-{
- uint64_t blk, off;
- blkptr_t *bparray;
- int err;
+ bplist_entry_t *bpe;
mutex_enter(&bpl->bpl_lock);
-
- err = bplist_hold(bpl);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- do {
- if (*itorp >= bpl->bpl_phys->bpl_entries) {
- mutex_exit(&bpl->bpl_lock);
- return (ENOENT);
- }
-
- blk = *itorp >> bpl->bpl_bpshift;
- off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
-
- err = bplist_cache(bpl, blk);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- bparray = bpl->bpl_cached_dbuf->db_data;
- *bp = bparray[off];
- (*itorp)++;
- } while (bp->blk_birth == 0);
-
- mutex_exit(&bpl->bpl_lock);
- return (0);
-}
-
-int
-bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx)
-{
- uint64_t blk, off;
- blkptr_t *bparray;
- int err;
-
- ASSERT(!BP_IS_HOLE(bp));
- mutex_enter(&bpl->bpl_lock);
- err = bplist_hold(bpl);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
- off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
-
- err = bplist_cache(bpl, blk);
- if (err) {
+ while (bpe = list_head(&bpl->bpl_list)) {
+ list_remove(&bpl->bpl_list, bpe);
mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
- bparray = bpl->bpl_cached_dbuf->db_data;
- bparray[off] = *bp;
-
- /* We never need the fill count. */
- bparray[off].blk_fill = 0;
-
- /* The bplist will compress better if we can leave off the checksum */
- if (!BP_GET_DEDUP(&bparray[off]))
- bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
-
- dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
- bpl->bpl_phys->bpl_entries++;
- bpl->bpl_phys->bpl_bytes +=
- bp_get_dsize_sync(dmu_objset_spa(bpl->bpl_mos), bp);
- if (bpl->bpl_havecomp) {
- bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
- bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
- }
- mutex_exit(&bpl->bpl_lock);
-
- return (0);
-}
-
-void
-bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx)
-{
- VERIFY(bplist_enqueue(bpl, bp, tx) == 0);
-}
-
-/*
- * Deferred entry; will be processed later by bplist_sync().
- */
-void
-bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp)
-{
- bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
-
- ASSERT(!BP_IS_HOLE(bp));
- mutex_enter(&bpl->bpl_q_lock);
- bpq->bpq_blk = *bp;
- list_insert_tail(&bpl->bpl_queue, bpq);
- mutex_exit(&bpl->bpl_q_lock);
-}
-
-void
-bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func, void *arg, dmu_tx_t *tx)
-{
- bplist_q_t *bpq;
-
- mutex_enter(&bpl->bpl_q_lock);
- while (bpq = list_head(&bpl->bpl_queue)) {
- list_remove(&bpl->bpl_queue, bpq);
- mutex_exit(&bpl->bpl_q_lock);
- func(arg, &bpq->bpq_blk, tx);
- kmem_free(bpq, sizeof (*bpq));
- mutex_enter(&bpl->bpl_q_lock);
- }
- mutex_exit(&bpl->bpl_q_lock);
-}
-
-void
-bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
-{
- mutex_enter(&bpl->bpl_lock);
- ASSERT(list_is_empty(&bpl->bpl_queue));
- VERIFY(0 == bplist_hold(bpl));
- dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
- VERIFY(0 == dmu_free_range(bpl->bpl_mos,
- bpl->bpl_object, 0, -1ULL, tx));
- bpl->bpl_phys->bpl_entries = 0;
- bpl->bpl_phys->bpl_bytes = 0;
- if (bpl->bpl_havecomp) {
- bpl->bpl_phys->bpl_comp = 0;
- bpl->bpl_phys->bpl_uncomp = 0;
+ func(arg, &bpe->bpe_blk, tx);
+ kmem_free(bpe, sizeof (*bpe));
+ mutex_enter(&bpl->bpl_lock);
}
mutex_exit(&bpl->bpl_lock);
}
-
-int
-bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
-{
- int err;
-
- mutex_enter(&bpl->bpl_lock);
-
- err = bplist_hold(bpl);
- if (err) {
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- *usedp = bpl->bpl_phys->bpl_bytes;
- if (bpl->bpl_havecomp) {
- *compp = bpl->bpl_phys->bpl_comp;
- *uncompp = bpl->bpl_phys->bpl_uncomp;
- }
- mutex_exit(&bpl->bpl_lock);
-
- if (!bpl->bpl_havecomp) {
- uint64_t itor = 0, comp = 0, uncomp = 0;
- blkptr_t bp;
-
- while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
- comp += BP_GET_PSIZE(&bp);
- uncomp += BP_GET_UCSIZE(&bp);
- }
- if (err == ENOENT)
- err = 0;
- *compp = comp;
- *uncompp = uncomp;
- }
-
- return (err);
-}
-
-/*
- * Return (in *dsizep) the amount of space on the deadlist which is:
- * mintxg < blk_birth <= maxtxg
- */
-int
-bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg,
- uint64_t *dsizep)
-{
- uint64_t size = 0;
- uint64_t itor = 0;
- blkptr_t bp;
- int err;
-
- /*
- * As an optimization, if they want the whole txg range, just
- * get bpl_bytes rather than iterating over the bps.
- */
- if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) {
- mutex_enter(&bpl->bpl_lock);
- err = bplist_hold(bpl);
- if (err == 0)
- *dsizep = bpl->bpl_phys->bpl_bytes;
- mutex_exit(&bpl->bpl_lock);
- return (err);
- }
-
- while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
- if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) {
- size += bp_get_dsize(dmu_objset_spa(bpl->bpl_mos), &bp);
- }
- }
- if (err == ENOENT)
- err = 0;
- *dsizep = size;
- return (err);
-}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/bpobj.c Fri May 21 17:29:22 2010 -0700
@@ -0,0 +1,462 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+/*
+ * Allocate a new DMU_OT_BPOBJ object in 'os' with the given data block
+ * size and return its object number.  The size of the DMU_OT_BPOBJ_HDR
+ * bonus buffer depends on the pool version: BPOBJ_SIZE_V0 before
+ * SPA_VERSION_BPOBJ_ACCOUNT, BPOBJ_SIZE_V1 before SPA_VERSION_DEADLISTS,
+ * and the full bpobj_phys_t otherwise.
+ */
+uint64_t
+bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_objset_spa(os);
+	int bonuslen = sizeof (bpobj_phys_t);
+
+	if (spa_version(spa) < SPA_VERSION_BPOBJ_ACCOUNT)
+		bonuslen = BPOBJ_SIZE_V0;
+	else if (spa_version(spa) < SPA_VERSION_DEADLISTS)
+		bonuslen = BPOBJ_SIZE_V1;
+
+	return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
+	    DMU_OT_BPOBJ_HDR, bonuslen, tx));
+}
+
+/*
+ * Free the bpobj 'obj' in 'os', together with everything it references.
+ *
+ * If the bpobj has a subobj array, every object number stored in that
+ * array is freed by a recursive call to bpobj_free(), after which the
+ * array object itself is freed.  Finally 'obj' is freed.  Every DMU
+ * failure along the way trips a VERIFY; there is no error return.
+ */
+void
+bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+ int64_t i;
+ bpobj_t bpo;
+ dmu_object_info_t doi;
+ int epb;
+ dmu_buf_t *dbuf = NULL;
+
+ VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
+
+ mutex_enter(&bpo.bpo_lock);
+
+ /* Nothing nested to tear down: skip straight to freeing 'obj'. */
+ if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
+ goto out;
+
+ VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
+ /* Number of uint64_t object numbers per data block of the array. */
+ epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+ /* Walk the subobj array from its last entry down to its first. */
+ for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+ uint64_t *objarray;
+ uint64_t offset, blkoff;
+
+ offset = i * sizeof (uint64_t);
+ blkoff = P2PHASE(i, epb);
+
+ /*
+ * Because we iterate downward, a new buffer is needed only
+ * when the offset drops below the start of the one we hold.
+ */
+ if (dbuf == NULL || dbuf->db_offset > offset) {
+ if (dbuf)
+ dmu_buf_rele(dbuf, FTAG);
+ VERIFY3U(0, ==, dmu_buf_hold(os,
+ bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
+ }
+
+ ASSERT3U(offset, >=, dbuf->db_offset);
+ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+ objarray = dbuf->db_data;
+ bpobj_free(os, objarray[blkoff], tx);
+ }
+ if (dbuf) {
+ dmu_buf_rele(dbuf, FTAG);
+ dbuf = NULL;
+ }
+ /* All listed subobjs are gone; now free the array object itself. */
+ VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
+
+out:
+ mutex_exit(&bpo.bpo_lock);
+ bpobj_close(&bpo);
+
+ VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
+}
+
+/*
+ * Initialise the in-core handle 'bpo' for the on-disk bpobj 'object' in
+ * 'os' and hold its bonus buffer.
+ *
+ * Returns 0 on success; the caller must eventually call bpobj_close().
+ * Returns the error from dmu_object_info() or dmu_bonus_hold() on
+ * failure.  If dmu_object_info() fails, 'bpo' is not touched.  If
+ * dmu_bonus_hold() fails, the lock is destroyed again and 'bpo' is
+ * zeroed, so bpo_object is 0 and a later bpobj_close() is a no-op
+ * rather than a release of a NULL bonus buffer.
+ */
+int
+bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
+{
+	dmu_object_info_t doi;
+	int err;
+
+	err = dmu_object_info(os, object, &doi);
+	if (err)
+		return (err);
+
+	bzero(bpo, sizeof (*bpo));
+	mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	ASSERT(bpo->bpo_dbuf == NULL);
+	ASSERT(bpo->bpo_phys == NULL);
+	ASSERT(object != 0);
+	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
+	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
+
+	bpo->bpo_os = os;
+	bpo->bpo_object = object;
+	/* Number of block pointers that fit in one data block. */
+	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
+	/* The bonus size tells us which header fields exist on disk. */
+	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
+	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
+
+	err = dmu_bonus_hold(bpo->bpo_os,
+	    bpo->bpo_object, bpo, &bpo->bpo_dbuf);
+	if (err) {
+		/* Undo the partial initialisation; see the header comment. */
+		mutex_destroy(&bpo->bpo_lock);
+		bzero(bpo, sizeof (*bpo));
+		return (err);
+	}
+	bpo->bpo_phys = bpo->bpo_dbuf->db_data;
+	return (0);
+}
+
+/*
+ * Release the buffers held by 'bpo' (the bonus buffer and, if present,
+ * the cached data buffer), clear the corresponding pointers and destroy
+ * the lock.  A handle whose bpo_object is 0 is left untouched.
+ */
+void
+bpobj_close(bpobj_t *bpo)
+{
+	/* Lame workaround for closing a bpobj that was never opened. */
+	if (bpo->bpo_object != 0) {
+		dmu_buf_rele(bpo->bpo_dbuf, bpo);
+		bpo->bpo_dbuf = NULL;
+		bpo->bpo_phys = NULL;
+
+		if (bpo->bpo_cached_dbuf != NULL) {
+			dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+			bpo->bpo_cached_dbuf = NULL;
+		}
+
+		mutex_destroy(&bpo->bpo_lock);
+	}
+}
+
+/*
+ * Common implementation of bpobj_iterate() and bpobj_iterate_nofree().
+ *
+ * Calls func(arg, bp, tx) for every block pointer stored directly in
+ * 'bpo', from the last entry to the first, and then recurses into every
+ * sub-bpobj listed in the subobj array (also last to first).  Iteration
+ * stops at the first nonzero return from 'func' or the first DMU error,
+ * and that value is returned; 0 means everything was visited.
+ *
+ * When 'free' is set, each entry for which 'func' returned 0 is removed:
+ * the header's space accounting and entry counts are reduced, the
+ * visited tail of the object is freed with dmu_free_range(), and each
+ * fully iterated sub-bpobj object is freed.  The entry on which
+ * iteration stopped is not removed.
+ *
+ * bpo_lock is held for the duration of the call and is released on
+ * every return path.
+ */
+static int
+bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
+    boolean_t free)
+{
+	dmu_object_info_t doi;
+	int epb;
+	int64_t i;
+	int err = 0;
+	dmu_buf_t *dbuf = NULL;
+
+	mutex_enter(&bpo->bpo_lock);
+
+	if (free)
+		dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+
+	/* Pass 1: the block pointers stored directly in this object. */
+	for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
+		blkptr_t *bparray;
+		blkptr_t *bp;
+		uint64_t offset, blkoff;
+
+		offset = i * sizeof (blkptr_t);
+		blkoff = P2PHASE(i, bpo->bpo_epb);
+
+		/*
+		 * We walk downward, so a new buffer is needed only when
+		 * the offset drops below the start of the one we hold.
+		 */
+		if (dbuf == NULL || dbuf->db_offset > offset) {
+			if (dbuf)
+				dmu_buf_rele(dbuf, FTAG);
+			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
+			    FTAG, &dbuf, 0);
+			if (err)
+				break;
+		}
+
+		ASSERT3U(offset, >=, dbuf->db_offset);
+		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+		bparray = dbuf->db_data;
+		bp = &bparray[blkoff];
+		err = func(arg, bp, tx);
+		if (err)
+			break;
+		if (free) {
+			bpo->bpo_phys->bpo_bytes -=
+			    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+			ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+			if (bpo->bpo_havecomp) {
+				bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
+				bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
+			}
+			bpo->bpo_phys->bpo_num_blkptrs--;
+			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
+		}
+	}
+	if (dbuf) {
+		dmu_buf_rele(dbuf, FTAG);
+		dbuf = NULL;
+	}
+	if (free) {
+		/*
+		 * 'i' is the entry we stopped on (or -1 if all were
+		 * visited); everything after it has been processed.
+		 */
+		i++;
+		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
+		    i * sizeof (blkptr_t), -1ULL, tx));
+	}
+	if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
+		goto out;
+
+	/* Pass 2: recurse into each sub-bpobj listed in the subobj array. */
+	ASSERT(bpo->bpo_havecomp);
+	err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
+	if (err) {
+		/* Do not return with bpo_lock still held. */
+		mutex_exit(&bpo->bpo_lock);
+		return (err);
+	}
+	epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+	for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+		uint64_t *objarray;
+		uint64_t offset, blkoff;
+		bpobj_t sublist;
+		uint64_t used_before, comp_before, uncomp_before;
+		uint64_t used_after, comp_after, uncomp_after;
+
+		offset = i * sizeof (uint64_t);
+		blkoff = P2PHASE(i, epb);
+
+		if (dbuf == NULL || dbuf->db_offset > offset) {
+			if (dbuf)
+				dmu_buf_rele(dbuf, FTAG);
+			err = dmu_buf_hold(bpo->bpo_os,
+			    bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
+			if (err)
+				break;
+		}
+
+		ASSERT3U(offset, >=, dbuf->db_offset);
+		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+		objarray = dbuf->db_data;
+		err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
+		if (err)
+			break;
+		if (free) {
+			err = bpobj_space(&sublist,
+			    &used_before, &comp_before, &uncomp_before);
+			if (err) {
+				/* Drop the holds bpobj_open() took. */
+				bpobj_close(&sublist);
+				break;
+			}
+		}
+		err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
+		if (free) {
+			/*
+			 * Charge this bpobj only for what the sublist
+			 * actually gave up, even if it stopped early.
+			 */
+			VERIFY3U(0, ==, bpobj_space(&sublist,
+			    &used_after, &comp_after, &uncomp_after));
+			bpo->bpo_phys->bpo_bytes -= used_before - used_after;
+			ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+			bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
+			bpo->bpo_phys->bpo_uncomp -=
+			    uncomp_before - uncomp_after;
+		}
+
+		bpobj_close(&sublist);
+		if (err)
+			break;
+		if (free) {
+			err = dmu_object_free(bpo->bpo_os,
+			    objarray[blkoff], tx);
+			if (err)
+				break;
+			bpo->bpo_phys->bpo_num_subobjs--;
+			ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
+		}
+	}
+	if (dbuf) {
+		dmu_buf_rele(dbuf, FTAG);
+		dbuf = NULL;
+	}
+	if (free) {
+		/* Entries after the one we stopped on have been consumed. */
+		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
+		    bpo->bpo_phys->bpo_subobjs,
+		    (i + 1) * sizeof (uint64_t), -1ULL, tx));
+	}
+
+out:
+	/* If there are no entries, there should be no bytes. */
+	ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
+	    (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
+	    bpo->bpo_phys->bpo_bytes == 0);
+
+	mutex_exit(&bpo->bpo_lock);
+	return (err);
+}
+
+/*
+ * Visit every entry of 'bpo' with 'func', removing each entry that has
+ * been visited.  A nonzero return from 'func' ends the walk; the entry
+ * it was called on stays in the bpobj and that value is returned.
+ */
+int
+bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+	const boolean_t remove_entries = B_TRUE;
+
+	return (bpobj_iterate_impl(bpo, func, arg, tx, remove_entries));
+}
+
+/*
+ * Visit every entry of 'bpo' with 'func' without removing anything.
+ * A nonzero return from 'func' ends the walk and is returned.
+ */
+int
+bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+	const boolean_t remove_entries = B_FALSE;
+
+	return (bpobj_iterate_impl(bpo, func, arg, tx, remove_entries));
+}
+
+/*
+ * Append the bpobj 'subobj' to the subobj array of 'bpo' and add its
+ * space totals to the header of 'bpo'.
+ *
+ * A subobj that reports zero bytes used is not appended; it is freed
+ * with bpobj_free() instead.  The array object is allocated lazily the
+ * first time a subobj is appended.  'bpo' must have both the comp and
+ * subobj header fields (asserted).
+ */
+void
+bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
+{
+ bpobj_t subbpo;
+ uint64_t used, comp, uncomp;
+
+ ASSERT(bpo->bpo_havesubobj);
+ ASSERT(bpo->bpo_havecomp);
+
+ /* Open the subobj only long enough to read its space totals. */
+ VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
+ VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
+ bpobj_close(&subbpo);
+
+ if (used == 0) {
+ /* No point in having an empty subobj. */
+ bpobj_free(bpo->bpo_os, subobj, tx);
+ return;
+ }
+
+ /*
+ * NOTE(review): the header is dirtied and bpo_subobjs is tested and
+ * assigned before bpo_lock is taken -- confirm that callers are
+ * serialized, otherwise two callers could both allocate the array.
+ */
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+ if (bpo->bpo_phys->bpo_subobjs == 0) {
+ bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
+ DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+ }
+
+ mutex_enter(&bpo->bpo_lock);
+ /* Store the object number in the next free slot of the array. */
+ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+ sizeof (subobj), &subobj, tx);
+ bpo->bpo_phys->bpo_num_subobjs++;
+ bpo->bpo_phys->bpo_bytes += used;
+ bpo->bpo_phys->bpo_comp += comp;
+ bpo->bpo_phys->bpo_uncomp += uncomp;
+ mutex_exit(&bpo->bpo_lock);
+}
+
+/*
+ * Append a copy of the block pointer 'bp' to 'bpo' and add its sizes to
+ * the header's space accounting.  The stored copy has its fill count
+ * cleared and, unless the block is deduplicated, its checksum zeroed.
+ * The most recently used data buffer stays held in bpo_cached_dbuf so
+ * that consecutive appends to the same block reuse it.
+ */
+void
+bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	blkptr_t copy = *bp;
+	dmu_buf_t *cache;
+	blkptr_t *slots;
+	uint64_t byteoff;
+	int slot;
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	/* We never need the fill count. */
+	copy.blk_fill = 0;
+
+	/* The bpobj will compress better if we can leave off the checksum */
+	if (!BP_GET_DEDUP(bp))
+		bzero(&copy.blk_cksum, sizeof (copy.blk_cksum));
+
+	mutex_enter(&bpo->bpo_lock);
+
+	byteoff = bpo->bpo_phys->bpo_num_blkptrs * sizeof (copy);
+	slot = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
+
+	/* Drop the cached buffer if it does not cover the target offset. */
+	cache = bpo->bpo_cached_dbuf;
+	if (cache != NULL && (byteoff < cache->db_offset ||
+	    byteoff >= cache->db_offset + cache->db_size)) {
+		dmu_buf_rele(cache, bpo);
+		cache = NULL;
+	}
+	if (cache == NULL) {
+		VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
+		    byteoff, bpo, &bpo->bpo_cached_dbuf, 0));
+	}
+
+	dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
+	slots = bpo->bpo_cached_dbuf->db_data;
+	slots[slot] = copy;
+
+	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+	bpo->bpo_phys->bpo_num_blkptrs++;
+	bpo->bpo_phys->bpo_bytes +=
+	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+	if (bpo->bpo_havecomp) {
+		bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
+		bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
+	}
+
+	mutex_exit(&bpo->bpo_lock);
+}
+
+/*
+ * Callback state for space_range_cb(): the birth-txg window to match and
+ * the running totals of the block pointers that fell inside it.
+ */
+struct space_range_arg {
+ spa_t *spa; /* passed to bp_get_dsize_sync() */
+ uint64_t mintxg; /* exclusive lower bound on blk_birth */
+ uint64_t maxtxg; /* inclusive upper bound on blk_birth */
+ uint64_t used; /* sum of bp_get_dsize_sync() */
+ uint64_t comp; /* sum of BP_GET_PSIZE() */
+ uint64_t uncomp; /* sum of BP_GET_UCSIZE() */
+};
+
+/*
+ * bpobj iteration callback: add the sizes of 'bp' to the totals in the
+ * struct space_range_arg passed as 'arg', provided that
+ * mintxg < blk_birth <= maxtxg.  'tx' is unused.  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	struct space_range_arg *totals = arg;
+
+	/* Outside the requested birth window: nothing to account. */
+	if (bp->blk_birth <= totals->mintxg || bp->blk_birth > totals->maxtxg)
+		return (0);
+
+	totals->used += bp_get_dsize_sync(totals->spa, bp);
+	totals->comp += BP_GET_PSIZE(bp);
+	totals->uncomp += BP_GET_UCSIZE(bp);
+
+	return (0);
+}
+
+/*
+ * Report the total space referenced by 'bpo'.
+ *
+ * When the header carries compressed/uncompressed totals, all three
+ * values are read from it under bpo_lock and 0 is returned.  Otherwise
+ * the totals are computed by iterating over every entry through
+ * bpobj_space_range(), whose result is returned.
+ */
+int
+bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	boolean_t from_header;
+
+	mutex_enter(&bpo->bpo_lock);
+	*usedp = bpo->bpo_phys->bpo_bytes;
+	from_header = bpo->bpo_havecomp;
+	if (from_header) {
+		*compp = bpo->bpo_phys->bpo_comp;
+		*uncompp = bpo->bpo_phys->bpo_uncomp;
+	}
+	mutex_exit(&bpo->bpo_lock);
+
+	if (from_header)
+		return (0);
+
+	return (bpobj_space_range(bpo, 0, UINT64_MAX,
+	    usedp, compp, uncompp));
+}
+
+/*
+ * Compute the space in 'bpo' taken by block pointers whose birth txg
+ * satisfies
+ *	mintxg < blk_birth <= maxtxg
+ * and store the used, compressed and uncompressed totals through the
+ * three output pointers.  Returns 0 or the error from the iteration.
+ */
+int
+bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	struct space_range_arg totals = { 0 };
+	int err;
+
+	/*
+	 * Shortcut: a request for the whole txg range can be answered
+	 * from the header totals, when the header has them, without
+	 * walking the block pointers.
+	 */
+	if (bpo->bpo_havecomp && mintxg < TXG_INITIAL && maxtxg == UINT64_MAX)
+		return (bpobj_space(bpo, usedp, compp, uncompp));
+
+	totals.spa = dmu_objset_spa(bpo->bpo_os);
+	totals.mintxg = mintxg;
+	totals.maxtxg = maxtxg;
+
+	err = bpobj_iterate_nofree(bpo, space_range_cb, &totals, NULL);
+
+	*usedp = totals.used;
+	*compp = totals.comp;
+	*uncompp = totals.uncomp;
+
+	return (err);
+}
--- a/usr/src/uts/common/fs/zfs/dmu.c Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu.c Fri May 21 17:29:22 2010 -0700
@@ -51,8 +51,8 @@
{ byteswap_uint64_array, TRUE, "object array" },
{ byteswap_uint8_array, TRUE, "packed nvlist" },
{ byteswap_uint64_array, TRUE, "packed nvlist size" },
- { byteswap_uint64_array, TRUE, "bplist" },
- { byteswap_uint64_array, TRUE, "bplist header" },
+ { byteswap_uint64_array, TRUE, "bpobj" },
+ { byteswap_uint64_array, TRUE, "bpobj header" },
{ byteswap_uint64_array, TRUE, "SPA space map header" },
{ byteswap_uint64_array, TRUE, "SPA space map" },
{ byteswap_uint64_array, TRUE, "ZIL intent log" },
@@ -96,6 +96,10 @@
{ zap_byteswap, TRUE, "SA attr layouts" },
{ zap_byteswap, TRUE, "scan translations" },
{ byteswap_uint8_array, FALSE, "deduplicated block" },
+ { zap_byteswap, TRUE, "DSL deadlist map" },
+ { byteswap_uint64_array, TRUE, "DSL deadlist map hdr" },
+ { zap_byteswap, TRUE, "DSL dir clones" },
+ { byteswap_uint64_array, TRUE, "bpobj subobj" },
};
int
--- a/usr/src/uts/common/fs/zfs/dmu_send.c Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c Fri May 21 17:29:22 2010 -0700
@@ -39,6 +39,7 @@
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
+#include <zfs_fletcher.h>
#include <sys/avl.h>
#include <sys/ddt.h>
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c Fri May 21 17:29:22 2010 -0700
@@ -39,6 +39,7 @@
#include <sys/zfs_znode.h>
#include <sys/zvol.h>
#include <sys/dsl_scan.h>
+#include <sys/dsl_deadlist.h>
/*
* Enable/disable prefetching of dedup-ed blocks which are going to be freed.
@@ -51,6 +52,13 @@
static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
+#define SWITCH64(x, y) \
+ { \
+ uint64_t __tmp = (x); \
+ (x) = (y); \
+ (y) = __tmp; \
+ }
+
#define DS_REF_MAX (1ULL << 62)
#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
@@ -178,13 +186,13 @@
/*
* We are here as part of zio's write done callback,
* which means we're a zio interrupt thread. We can't
- * call bplist_enqueue() now because it may block
+ * call dsl_deadlist_insert() now because it may block
* waiting for I/O. Instead, put bp on the deferred
* queue and let dsl_pool_sync() finish the job.
*/
- bplist_enqueue_deferred(&ds->ds_deadlist, bp);
+ bplist_append(&ds->ds_pending_deadlist, bp);
} else {
- VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
+ dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
}
ASSERT3U(ds->ds_prev->ds_object, ==,
ds->ds_phys->ds_prev_snap_obj);
@@ -269,7 +277,13 @@
ds->ds_prev = NULL;
}
- bplist_close(&ds->ds_deadlist);
+ bplist_destroy(&ds->ds_pending_deadlist);
+ if (db != NULL) {
+ dsl_deadlist_close(&ds->ds_deadlist);
+ } else {
+ ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
+ ASSERT(!ds->ds_deadlist.dl_oldfmt);
+ }
if (ds->ds_dir)
dsl_dir_close(ds->ds_dir, ds);
@@ -280,7 +294,6 @@
mutex_destroy(&ds->ds_opening_lock);
rw_destroy(&ds->ds_rwlock);
cv_destroy(&ds->ds_exclusive_cv);
- bplist_fini(&ds->ds_deadlist);
kmem_free(ds, sizeof (dsl_dataset_t));
}
@@ -380,25 +393,23 @@
mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
rw_init(&ds->ds_rwlock, 0, 0, 0);
cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
- bplist_init(&ds->ds_deadlist);
-
- err = bplist_open(&ds->ds_deadlist,
+
+ bplist_create(&ds->ds_pending_deadlist);
+ dsl_deadlist_open(&ds->ds_deadlist,
mos, ds->ds_phys->ds_deadlist_obj);
+
if (err == 0) {
err = dsl_dir_open_obj(dp,
ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
}
if (err) {
- /*
- * we don't really need to close the blist if we
- * just opened it.
- */
mutex_destroy(&ds->ds_lock);
mutex_destroy(&ds->ds_recvlock);
mutex_destroy(&ds->ds_opening_lock);
rw_destroy(&ds->ds_rwlock);
cv_destroy(&ds->ds_exclusive_cv);
- bplist_fini(&ds->ds_deadlist);
+ bplist_destroy(&ds->ds_pending_deadlist);
+ dsl_deadlist_close(&ds->ds_deadlist);
kmem_free(ds, sizeof (dsl_dataset_t));
dmu_buf_rele(dbuf, tag);
return (err);
@@ -455,7 +466,8 @@
dsl_dataset_evict);
}
if (err || winner) {
- bplist_close(&ds->ds_deadlist);
+ bplist_destroy(&ds->ds_pending_deadlist);
+ dsl_deadlist_close(&ds->ds_deadlist);
if (ds->ds_prev)
dsl_dataset_drop_ref(ds->ds_prev, ds);
dsl_dir_close(ds->ds_dir, ds);
@@ -464,7 +476,6 @@
mutex_destroy(&ds->ds_opening_lock);
rw_destroy(&ds->ds_rwlock);
cv_destroy(&ds->ds_exclusive_cv);
- bplist_fini(&ds->ds_deadlist);
kmem_free(ds, sizeof (dsl_dataset_t));
if (err) {
dmu_buf_rele(dbuf, tag);
@@ -726,7 +737,7 @@
if (ds->ds_dbuf)
dsl_dataset_drop_ref(ds, tag);
else
- dsl_dataset_evict(ds->ds_dbuf, ds);
+ dsl_dataset_evict(NULL, ds);
}
boolean_t
@@ -788,10 +799,12 @@
DMU_OT_NONE, 0, tx);
dsphys->ds_creation_time = gethrestime_sec();
dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
- dsphys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
-
- if (origin) {
+
+ if (origin == NULL) {
+ dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
+ } else {
+ dsl_dataset_t *ohds;
+
dsphys->ds_prev_snap_obj = origin->ds_object;
dsphys->ds_prev_snap_txg =
origin->ds_phys->ds_creation_txg;
@@ -807,6 +820,12 @@
dmu_buf_will_dirty(origin->ds_dbuf, tx);
origin->ds_phys->ds_num_children++;
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+ origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
+ dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
+ dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
+ dsl_dataset_rele(ohds, FTAG);
+
if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
if (origin->ds_phys->ds_next_clones_obj == 0) {
origin->ds_phys->ds_next_clones_obj =
@@ -820,6 +839,16 @@
dmu_buf_will_dirty(dd->dd_dbuf, tx);
dd->dd_phys->dd_origin_obj = origin->ds_object;
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ if (origin->ds_dir->dd_phys->dd_clones == 0) {
+ dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+ origin->ds_dir->dd_phys->dd_clones =
+ zap_create(mos,
+ DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY3U(0, ==, zap_add_int(mos,
+ origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
+ }
}
if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
@@ -1201,8 +1230,7 @@
else
mrs_used = 0;
- VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp,
- &dluncomp));
+ dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
ASSERT3U(dlused, <=, mrs_used);
ds->ds_phys->ds_unique_bytes =
@@ -1462,6 +1490,103 @@
ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
}
+static void
+dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ /*
+ * If it is the old version, dd_clones doesn't exist so we can't
+ * find the clones, but deadlist_remove_key() is a no-op so it
+ * doesn't matter.
+ */
+ if (ds->ds_dir->dd_phys->dd_clones == 0)
+ return;
+ /* Drop key mintxg from each later-branching clone, recursively. */
+ for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_dataset_t *clone;
+ /* Each dd_clones entry's integer is held as a dataset object. */
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+ za.za_first_integer, FTAG, &clone));
+ if (clone->ds_dir->dd_origin_txg > mintxg) {
+ dsl_deadlist_remove_key(&clone->ds_deadlist,
+ mintxg, tx);
+ dsl_dataset_remove_clones_key(clone, mintxg, tx);
+ }
+ dsl_dataset_rele(clone, FTAG);
+ }
+ zap_cursor_fini(&zc);
+}
+
+struct process_old_arg {
+ dsl_dataset_t *ds; /* dataset whose deadlist receives older bps */
+ dsl_dataset_t *ds_prev; /* may be NULL; can gain unique bytes */
+ boolean_t after_branch_point; /* if set, ds_prev is not adjusted */
+ zio_t *pio; /* zio passed to dsl_free_sync() for each free */
+ uint64_t used, comp, uncomp; /* totals of the bps that were freed */
+};
+
+static int
+process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ struct process_old_arg *poa = arg;
+ dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
+ /* bps born at or before ds's prev snap move to ds; others are freed */
+ if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
+ dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
+ if (poa->ds_prev && !poa->after_branch_point &&
+ bp->blk_birth >
+ poa->ds_prev->ds_phys->ds_prev_snap_txg) {
+ poa->ds_prev->ds_phys->ds_unique_bytes +=
+ bp_get_dsize_sync(dp->dp_spa, bp);
+ }
+ } else {
+ poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
+ poa->comp += BP_GET_PSIZE(bp);
+ poa->uncomp += BP_GET_UCSIZE(bp);
+ dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
+ }
+ return (0);
+}
+
+static void
+process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
+ dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
+{
+ struct process_old_arg poa = { 0 };
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ /* Both deadlists are expected to be in the old (plain bpobj) format. */
+ ASSERT(ds->ds_deadlist.dl_oldfmt);
+ ASSERT(ds_next->ds_deadlist.dl_oldfmt);
+ /* Sort ds_next's dead bps: kept ones go to ds, the rest are freed. */
+ poa.ds = ds;
+ poa.ds_prev = ds_prev;
+ poa.after_branch_point = after_branch_point;
+ poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
+ process_old_cb, &poa, tx));
+ VERIFY3U(zio_wait(poa.pio), ==, 0);
+ ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
+
+ /* change snapused */
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+ -poa.used, -poa.comp, -poa.uncomp, tx);
+
+ /* swap next's deadlist to our deadlist */
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_close(&ds_next->ds_deadlist);
+ SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
+ ds->ds_phys->ds_deadlist_obj);
+ dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+ dsl_deadlist_open(&ds_next->ds_deadlist, mos,
+ ds_next->ds_phys->ds_deadlist_obj);
+}
+
void
dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
{
@@ -1566,12 +1691,9 @@
}
if (dsl_dataset_is_snapshot(ds)) {
- blkptr_t bp;
- zio_t *pio;
dsl_dataset_t *ds_next;
- uint64_t itor = 0;
uint64_t old_unique;
- int64_t used = 0, compressed = 0, uncompressed = 0;
+ uint64_t used = 0, comp = 0, uncomp = 0;
VERIFY(0 == dsl_dataset_hold_obj(dp,
ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
@@ -1587,53 +1709,49 @@
ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
- pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-
- /*
- * Transfer to our deadlist (which will become next's
- * new deadlist) any entries from next's current
- * deadlist which were born before prev, and free the
- * other entries.
- *
- * XXX we're doing this long task with the config lock held
- */
- while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) {
- if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
- VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
- &bp, tx));
- if (ds_prev && !after_branch_point &&
- bp.blk_birth >
- ds_prev->ds_phys->ds_prev_snap_txg) {
- ds_prev->ds_phys->ds_unique_bytes +=
- bp_get_dsize_sync(dp->dp_spa, &bp);
- }
- } else {
- used += bp_get_dsize_sync(dp->dp_spa, &bp);
- compressed += BP_GET_PSIZE(&bp);
- uncompressed += BP_GET_UCSIZE(&bp);
- dsl_free_sync(pio, dp, tx->tx_txg, &bp);
+
+ if (ds_next->ds_deadlist.dl_oldfmt) {
+ process_old_deadlist(ds, ds_prev, ds_next,
+ after_branch_point, tx);
+ } else {
+ /* Adjust prev's unique space. */
+ if (ds_prev && !after_branch_point) {
+ dsl_deadlist_space_range(&ds_next->ds_deadlist,
+ ds_prev->ds_phys->ds_prev_snap_txg,
+ ds->ds_phys->ds_prev_snap_txg,
+ &used, &comp, &uncomp);
+ ds_prev->ds_phys->ds_unique_bytes += used;
}
+
+ /* Adjust snapused. */
+ dsl_deadlist_space_range(&ds_next->ds_deadlist,
+ ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+ &used, &comp, &uncomp);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+ -used, -comp, -uncomp, tx);
+
+ /* Move blocks to be freed to pool's free list. */
+ dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
+ &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
+ tx);
+ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
+ DD_USED_HEAD, used, comp, uncomp, tx);
+ dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx);
+
+ /* Merge our deadlist into next's and free it. */
+ dsl_deadlist_merge(&ds_next->ds_deadlist,
+ ds->ds_phys->ds_deadlist_obj, tx);
}
- VERIFY3U(zio_wait(pio), ==, 0);
- ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
-
- /* change snapused */
- dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
- -used, -compressed, -uncompressed, tx);
-
- /* free next's deadlist */
- bplist_close(&ds_next->ds_deadlist);
- bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
-
- /* set next's deadlist to our deadlist */
- bplist_close(&ds->ds_deadlist);
- ds_next->ds_phys->ds_deadlist_obj =
- ds->ds_phys->ds_deadlist_obj;
- VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
- ds_next->ds_phys->ds_deadlist_obj));
- ds->ds_phys->ds_deadlist_obj = 0;
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
+
+ /* Collapse range in clone heads */
+ dsl_dataset_remove_clones_key(ds,
+ ds->ds_phys->ds_creation_txg, tx);
if (dsl_dataset_is_snapshot(ds_next)) {
+ dsl_dataset_t *ds_nextnext;
+
/*
* Update next's unique to include blocks which
* were previously shared by only this snapshot
@@ -1642,25 +1760,27 @@
* died after the next snap and before the one
* after that (ie. be on the snap after next's
* deadlist).
- *
- * XXX we're doing this long task with the
- * config lock held
*/
- dsl_dataset_t *ds_after_next;
- uint64_t space;
-
VERIFY(0 == dsl_dataset_hold_obj(dp,
ds_next->ds_phys->ds_next_snap_obj,
- FTAG, &ds_after_next));
-
- VERIFY(0 ==
- bplist_space_birthrange(&ds_after_next->ds_deadlist,
+ FTAG, &ds_nextnext));
+ dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
ds->ds_phys->ds_prev_snap_txg,
- ds->ds_phys->ds_creation_txg, &space));
- ds_next->ds_phys->ds_unique_bytes += space;
-
- dsl_dataset_rele(ds_after_next, FTAG);
+ ds->ds_phys->ds_creation_txg,
+ &used, &comp, &uncomp);
+ ds_next->ds_phys->ds_unique_bytes += used;
+ dsl_dataset_rele(ds_nextnext, FTAG);
ASSERT3P(ds_next->ds_prev, ==, NULL);
+
+ /* Collapse range in this head. */
+ dsl_dataset_t *hds;
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+ ds->ds_dir->dd_phys->dd_head_dataset_obj,
+ FTAG, &hds));
+ dsl_deadlist_remove_key(&hds->ds_deadlist,
+ ds->ds_phys->ds_creation_txg, tx);
+ dsl_dataset_rele(hds, FTAG);
+
} else {
ASSERT3P(ds_next->ds_prev, ==, ds);
dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
@@ -1700,9 +1820,8 @@
*/
struct killarg ka;
- ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
- bplist_close(&ds->ds_deadlist);
- bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
ds->ds_phys->ds_deadlist_obj = 0;
/*
@@ -1721,6 +1840,11 @@
ds->ds_phys->ds_unique_bytes == 0);
if (ds->ds_prev != NULL) {
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ VERIFY3U(0, ==, zap_remove_int(mos,
+ ds->ds_prev->ds_dir->dd_phys->dd_clones,
+ ds->ds_object, tx));
+ }
dsl_dataset_rele(ds->ds_prev, ds);
ds->ds_prev = ds_prev = NULL;
}
@@ -1935,20 +2059,24 @@
delta, 0, 0, tx);
}
- bplist_close(&ds->ds_deadlist);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
+ ds->ds_dir->dd_myname, snapname, dsobj,
+ ds->ds_phys->ds_prev_snap_txg);
+ ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
+ UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+ dsl_deadlist_add_key(&ds->ds_deadlist,
+ ds->ds_phys->ds_prev_snap_txg, tx);
+
ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
ds->ds_phys->ds_prev_snap_obj = dsobj;
ds->ds_phys->ds_prev_snap_txg = crtxg;
ds->ds_phys->ds_unique_bytes = 0;
if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
- ds->ds_phys->ds_deadlist_obj =
- bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
- ds->ds_phys->ds_deadlist_obj));
-
- dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
+
err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
snapname, 8, 1, &dsobj, tx);
ASSERT(err == 0);
@@ -2357,6 +2485,7 @@
struct promotenode *snap = list_head(&pa->shared_snaps);
dsl_dataset_t *origin_ds = snap->ds;
int err;
+ uint64_t unused;
/* Check that it is a real clone */
if (!dsl_dir_is_clone(hds->ds_dir))
@@ -2372,10 +2501,9 @@
/* compute origin's new unique space */
snap = list_tail(&pa->clone_snaps);
ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
- err = bplist_space_birthrange(&snap->ds->ds_deadlist,
- origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique);
- if (err)
- return (err);
+ dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+ origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+ &pa->unique, &unused, &unused);
/*
* Walk the snapshots that we are moving
@@ -2414,9 +2542,8 @@
if (ds->ds_phys->ds_prev_snap_obj == 0)
continue;
- if (err = bplist_space(&ds->ds_deadlist,
- &dlused, &dlcomp, &dluncomp))
- goto out;
+ dsl_deadlist_space(&ds->ds_deadlist,
+ &dlused, &dlcomp, &dluncomp);
pa->used += dlused;
pa->comp += dlcomp;
pa->uncomp += dluncomp;
@@ -2450,7 +2577,7 @@
/*
* Note, typically this will not be a clone of a clone,
* so dd_origin_txg will be < TXG_INITIAL, so
- * these snaplist_space() -> bplist_space_birthrange()
+ * these snaplist_space() -> dsl_deadlist_space_range()
* calls will be fast because they do not have to
* iterate over all bps.
*/
@@ -2530,6 +2657,26 @@
origin_head->ds_dir->dd_origin_txg =
origin_ds->ds_phys->ds_creation_txg;
+ /* change dd_clone entries */
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ odd->dd_phys->dd_clones, hds->ds_object, tx));
+ VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+ pa->origin_origin->ds_dir->dd_phys->dd_clones,
+ hds->ds_object, tx));
+
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ pa->origin_origin->ds_dir->dd_phys->dd_clones,
+ origin_head->ds_object, tx));
+ if (dd->dd_phys->dd_clones == 0) {
+ dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
+ DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+ dd->dd_phys->dd_clones, origin_head->ds_object, tx));
+
+ }
+
/* move snapshots to this dir */
for (snap = list_head(&pa->shared_snaps); snap;
snap = list_next(&pa->shared_snaps, snap)) {
@@ -2547,6 +2694,7 @@
VERIFY(0 == zap_add(dp->dp_meta_objset,
hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
8, 1, &ds->ds_object, tx));
+
/* change containing dsl_dir */
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
@@ -2556,6 +2704,40 @@
VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
NULL, ds, &ds->ds_dir));
+ /* move any clone references */
+ if (ds->ds_phys->ds_next_clones_obj &&
+ spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ ds->ds_phys->ds_next_clones_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_dataset_t *cnds;
+ uint64_t o;
+
+ if (za.za_first_integer == oldnext_obj) {
+ /*
+ * We've already moved the
+ * origin's reference.
+ */
+ continue;
+ }
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+ za.za_first_integer, FTAG, &cnds));
+ o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
+
+ VERIFY3U(zap_remove_int(dp->dp_meta_objset,
+ odd->dd_phys->dd_clones, o, tx), ==, 0);
+ VERIFY3U(zap_add_int(dp->dp_meta_objset,
+ dd->dd_phys->dd_clones, o, tx), ==, 0);
+ dsl_dataset_rele(cnds, FTAG);
+ }
+ zap_cursor_fini(&zc);
+ }
+
ASSERT3U(dsl_prop_numcb(ds), ==, 0);
}
@@ -2651,11 +2833,9 @@
*spacep = 0;
for (snap = list_head(l); snap; snap = list_next(l, snap)) {
- uint64_t used;
- int err = bplist_space_birthrange(&snap->ds->ds_deadlist,
- mintxg, UINT64_MAX, &used);
- if (err)
- return (err);
+ uint64_t used, comp, uncomp;
+ dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+ mintxg, UINT64_MAX, &used, &comp, &uncomp);
*spacep += used;
}
return (0);
@@ -2742,10 +2922,10 @@
if (err != 0)
goto out;
- if (dsl_dir_is_clone(snap->ds->ds_dir)) {
- err = dsl_dataset_own_obj(dp,
+ if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
+ err = dsl_dataset_hold_obj(dp,
snap->ds->ds_dir->dd_phys->dd_origin_obj,
- 0, FTAG, &pa.origin_origin);
+ FTAG, &pa.origin_origin);
if (err != 0)
goto out;
}
@@ -2770,7 +2950,7 @@
snaplist_destroy(&pa.clone_snaps, B_FALSE);
snaplist_destroy(&pa.origin_snaps, B_FALSE);
if (pa.origin_origin)
- dsl_dataset_disown(pa.origin_origin, FTAG);
+ dsl_dataset_rele(pa.origin_origin, FTAG);
dsl_dataset_rele(ds, FTAG);
return (err);
}
@@ -2860,10 +3040,12 @@
*/
if (csa->cds->ds_prev) {
dsl_dataset_t *origin = csa->cds->ds_prev;
+ uint64_t comp, uncomp;
+
dmu_buf_will_dirty(origin->ds_dbuf, tx);
- VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
+ dsl_deadlist_space_range(&csa->cds->ds_deadlist,
origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
- &origin->ds_phys->ds_unique_bytes));
+ &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
}
/* swap blkptrs */
@@ -2883,10 +3065,10 @@
ASSERT3U(csa->cds->ds_dir->dd_phys->
dd_used_breakdown[DD_USED_SNAP], ==, 0);
- VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used,
- &cdl_comp, &cdl_uncomp));
- VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used,
- &odl_comp, &odl_uncomp));
+ dsl_deadlist_space(&csa->cds->ds_deadlist,
+ &cdl_used, &cdl_comp, &cdl_uncomp);
+ dsl_deadlist_space(&csa->ohds->ds_deadlist,
+ &odl_used, &odl_comp, &odl_uncomp);
dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
(csa->ohds->ds_phys->ds_used_bytes + odl_used);
@@ -2907,21 +3089,16 @@
* deadlist (since that's the only thing that's
* changing that affects the snapused).
*/
- VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
- csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &cdl_used));
- VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist,
- csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &odl_used));
+ dsl_deadlist_space_range(&csa->cds->ds_deadlist,
+ csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
+ &cdl_used, &cdl_comp, &cdl_uncomp);
+ dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
+ csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
+ &odl_used, &odl_comp, &odl_uncomp);
dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
DD_USED_HEAD, DD_USED_SNAP, tx);
}
-#define SWITCH64(x, y) \
- { \
- uint64_t __tmp = (x); \
- (x) = (y); \
- (y) = __tmp; \
- }
-
/* swap ds_*_bytes */
SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
csa->cds->ds_phys->ds_used_bytes);
@@ -2936,15 +3113,17 @@
dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
csa->unused_refres_delta, 0, 0, tx);
- /* swap deadlists */
- bplist_close(&csa->cds->ds_deadlist);
- bplist_close(&csa->ohds->ds_deadlist);
+ /*
+ * Swap deadlists.
+ */
+ dsl_deadlist_close(&csa->cds->ds_deadlist);
+ dsl_deadlist_close(&csa->ohds->ds_deadlist);
SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
csa->cds->ds_phys->ds_deadlist_obj);
- VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
- csa->cds->ds_phys->ds_deadlist_obj));
- VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
- csa->ohds->ds_phys->ds_deadlist_obj));
+ dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
+ csa->cds->ds_phys->ds_deadlist_obj);
+ dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
+ csa->ohds->ds_phys->ds_deadlist_obj);
dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/dsl_deadlist.c Fri May 21 17:29:22 2010 -0700
@@ -0,0 +1,474 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/dsl_dataset.h>
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/dsl_pool.h>
+
+static int
+dsl_deadlist_compare(const void *arg1, const void *arg2)
+{
+	/* AVL comparator: order entries by ascending dle_mintxg. */
+	const dsl_deadlist_entry_t *lhs = arg1;
+	const dsl_deadlist_entry_t *rhs = arg2;
+
+	if (lhs->dle_mintxg < rhs->dle_mintxg)
+		return (-1);
+	if (lhs->dle_mintxg > rhs->dle_mintxg)
+		return (+1);
+	return (0);
+}
+
+static void
+dsl_deadlist_load_tree(dsl_deadlist_t *dl)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ /* Build dl_tree from the deadlist ZAP once; later calls return early. */
+ ASSERT(!dl->dl_oldfmt);
+ if (dl->dl_havetree)
+ return;
+ /* One entry per ZAP attribute: name parses to mintxg, value is a bpobj. */
+ avl_create(&dl->dl_tree, dsl_deadlist_compare,
+ sizeof (dsl_deadlist_entry_t),
+ offsetof(dsl_deadlist_entry_t, dle_node));
+ for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+ dle->dle_mintxg = strtonum(za.za_name, NULL);
+ VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os,
+ za.za_first_integer));
+ avl_add(&dl->dl_tree, dle);
+ }
+ zap_cursor_fini(&zc);
+ dl->dl_havetree = B_TRUE;
+}
+
+void
+dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
+{
+ dmu_object_info_t doi;
+ /* Open the deadlist stored in 'object'; its type selects the format. */
+ mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
+ dl->dl_os = os;
+ dl->dl_object = object;
+ VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
+ dmu_object_info_from_db(dl->dl_dbuf, &doi);
+ if (doi.doi_type == DMU_OT_BPOBJ) { /* old format: a plain bpobj */
+ dmu_buf_rele(dl->dl_dbuf, dl);
+ dl->dl_dbuf = NULL;
+ dl->dl_oldfmt = B_TRUE;
+ VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object));
+ return;
+ }
+ /* New format: keep the bonus buffer held; the AVL tree loads lazily. */
+ dl->dl_oldfmt = B_FALSE;
+ dl->dl_phys = dl->dl_dbuf->db_data;
+ dl->dl_havetree = B_FALSE;
+}
+
+void
+dsl_deadlist_close(dsl_deadlist_t *dl)
+{
+	void *cookie = NULL;
+	dsl_deadlist_entry_t *dle;
+	/* Undo dsl_deadlist_open(); dl_lock is set up for both formats. */
+	mutex_destroy(&dl->dl_lock);
+	if (dl->dl_oldfmt) {	/* old format holds only the bpobj */
+		dl->dl_oldfmt = B_FALSE;
+		bpobj_close(&dl->dl_bpobj);
+		return;
+	}
+
+	if (dl->dl_havetree) {	/* tear down the lazily built tree */
+		while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
+		    != NULL) {
+			bpobj_close(&dle->dle_bpobj);
+			kmem_free(dle, sizeof (*dle));
+		}
+		avl_destroy(&dl->dl_tree);
+	}
+	dmu_buf_rele(dl->dl_dbuf, dl);	/* bonus hold taken at open */
+	dl->dl_dbuf = NULL;
+	dl->dl_phys = NULL;
+}
+
+uint64_t
+dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
+{
+	if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_DEADLISTS)
+		return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
+		    sizeof (dsl_deadlist_phys_t), tx));
+	return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx));	/* older pools */
+}
+
+void
+dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
+{
+ dmu_object_info_t doi;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ /* Free deadlist object dlobj along with every bpobj it refers to. */
+ VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi));
+ if (doi.doi_type == DMU_OT_BPOBJ) {
+ bpobj_free(os, dlobj, tx);
+ return;
+ }
+ /* ZAP format: each attribute's value is a bpobj object to free. */
+ for (zap_cursor_init(&zc, os, dlobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc))
+ bpobj_free(os, za.za_first_integer, tx);
+ zap_cursor_fini(&zc);
+ VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
+}
+
+void
+dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+ /* Add bp to dl: update the totals, then enqueue on the right bpobj. */
+ if (dl->dl_oldfmt) {
+ bpobj_enqueue(&dl->dl_bpobj, bp, tx);
+ return;
+ }
+
+ dsl_deadlist_load_tree(dl);
+
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ mutex_enter(&dl->dl_lock);
+ dl->dl_phys->dl_used +=
+ bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
+ dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
+ dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
+ mutex_exit(&dl->dl_lock);
+ /* Use the entry with the largest dle_mintxg strictly below blk_birth. */
+ dle_tofind.dle_mintxg = bp->blk_birth;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+ else
+ dle = AVL_PREV(&dl->dl_tree, dle);
+ bpobj_enqueue(&dle->dle_bpobj, bp, tx);
+}
+
+/*
+ * Insert new key in deadlist, which must be > all current entries.
+ * mintxg is not inclusive.
+ */
+void
+dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+ uint64_t obj;
+ dsl_deadlist_entry_t *dle;
+ /* Old-format deadlists are left untouched. */
+ if (dl->dl_oldfmt)
+ return;
+
+ dsl_deadlist_load_tree(dl);
+ /* Allocate a new bpobj for the key and add its entry to the tree. */
+ dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+ dle->dle_mintxg = mintxg;
+ obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+ avl_add(&dl->dl_tree, dle);
+ /* Record the mintxg -> bpobj mapping in the deadlist ZAP. */
+ VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
+ mintxg, obj, tx));
+}
+
+/*
+ * Remove this key, merging its entries into the previous key.
+ */
+void
+dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle, *dle_prev;
+
+ if (dl->dl_oldfmt)
+ return;
+
+ dsl_deadlist_load_tree(dl);
+ /* NOTE(review): assumes mintxg is present and has a predecessor. */
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
+ dle_prev = AVL_PREV(&dl->dl_tree, dle);
+ /* The removed key's bpobj becomes a subobj of the previous entry. */
+ bpobj_enqueue_subobj(&dle_prev->dle_bpobj,
+ dle->dle_bpobj.bpo_object, tx);
+
+ avl_remove(&dl->dl_tree, dle);
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+ /* Remove the key from the deadlist ZAP as well. */
+ VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
+}
+
+/*
+ * Walk ds's snapshots to regenerate the ZAP & AVL.
+ */
+static void
+dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
+ uint64_t mrs_obj, dmu_tx_t *tx)
+{
+ dsl_deadlist_t dl;
+ dsl_pool_t *dp = dmu_objset_pool(os);
+ /* An old-format dlobj is opened and closed again without changes. */
+ dsl_deadlist_open(&dl, os, dlobj);
+ if (dl.dl_oldfmt) {
+ dsl_deadlist_close(&dl);
+ return;
+ }
+ /* Follow ds_prev_snap_obj links from mrs_obj, adding a key for each. */
+ while (mrs_obj != 0) {
+ dsl_dataset_t *ds;
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
+ dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx);
+ mrs_obj = ds->ds_phys->ds_prev_snap_obj;
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_deadlist_close(&dl);
+}
+
+uint64_t
+dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+ uint64_t mrs_obj, dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t *dle;
+ uint64_t newobj;
+ /* Make a new deadlist with dl's keys below maxtxg; return its object. */
+ newobj = dsl_deadlist_alloc(dl->dl_os, tx);
+ /* An old-format source has no keys: derive them from the snapshots. */
+ if (dl->dl_oldfmt) {
+ dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
+ return (newobj);
+ }
+
+ dsl_deadlist_load_tree(dl);
+
+ for (dle = avl_first(&dl->dl_tree); dle;
+ dle = AVL_NEXT(&dl->dl_tree, dle)) {
+ uint64_t obj;
+
+ if (dle->dle_mintxg >= maxtxg)
+ break;
+ /* Each copied key gets its own newly allocated bpobj. */
+ obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
+ dle->dle_mintxg, obj, tx));
+ }
+ return (newobj);
+}
+
+void
+dsl_deadlist_space(dsl_deadlist_t *dl,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	if (!dl->dl_oldfmt) {
+		/* New format: report the totals kept in dl_phys. */
+		mutex_enter(&dl->dl_lock);
+		*usedp = dl->dl_phys->dl_used;
+		*compp = dl->dl_phys->dl_comp;
+		*uncompp = dl->dl_phys->dl_uncomp;
+		mutex_exit(&dl->dl_lock);
+	} else {	/* old format: ask the single bpobj */
+		VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
+		    usedp, compp, uncompp));
+	}
+}
+
+/*
+ * return space used in the range (mintxg, maxtxg].
+ * Includes maxtxg, does not include mintxg.
+ * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
+ * UINT64_MAX).
+ */
+void
+dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+ /* Old format: the bpobj filters its block pointers by birth txg. */
+ if (dl->dl_oldfmt) {
+ VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
+ mintxg, maxtxg, usedp, compp, uncompp));
+ return;
+ }
+
+ dsl_deadlist_load_tree(dl);
+ *usedp = *compp = *uncompp = 0;
+
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ /*
+ * If we don't find this mintxg, there shouldn't be anything
+ * after it either.
+ */
+ ASSERT(dle != NULL ||
+ avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);
+ for (; dle && dle->dle_mintxg < maxtxg;
+ dle = AVL_NEXT(&dl->dl_tree, dle)) {
+ uint64_t used, comp, uncomp;
+ /* Add the whole bpobj of each key in [mintxg, maxtxg). */
+ VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
+ &used, &comp, &uncomp));
+
+ *usedp += used;
+ *compp += comp;
+ *uncompp += uncomp;
+ }
+}
+
+static void
+dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
+ dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+ uint64_t used, comp, uncomp;
+ bpobj_t bpo;
+ /* Measure the bpobj first so dl's totals can be raised to match. */
+ VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
+ VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
+ bpobj_close(&bpo);
+
+ dsl_deadlist_load_tree(dl);
+
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ mutex_enter(&dl->dl_lock);
+ dl->dl_phys->dl_used += used;
+ dl->dl_phys->dl_comp += comp;
+ dl->dl_phys->dl_uncomp += uncomp;
+ mutex_exit(&dl->dl_lock);
+ /* Attach obj as a subobj of the entry for 'birth', or the one before. */
+ dle_tofind.dle_mintxg = birth;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+ bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+}
+
+static int
+dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	/* Iteration callback: arg is the deadlist that receives bp. */
+	dsl_deadlist_insert((dsl_deadlist_t *)arg, bp, tx);
+	return (0);
+}
+
+/*
+ * Merge the deadlist pointed to by 'obj' into dl. obj will be left as
+ * an empty deadlist.
+ */
+void
+dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ dmu_buf_t *bonus;
+ dsl_deadlist_phys_t *dlp;
+ dmu_object_info_t doi;
+ /* An old-format source is re-inserted one block pointer at a time. */
+ VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
+ if (doi.doi_type == DMU_OT_BPOBJ) {
+ bpobj_t bpo;
+ VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
+ VERIFY3U(0, ==, bpobj_iterate(&bpo,
+ dsl_deadlist_insert_cb, dl, tx));
+ bpobj_close(&bpo);
+ return;
+ }
+ /* Otherwise hand over each key's bpobj whole and drop the key. */
+ for (zap_cursor_init(&zc, dl->dl_os, obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t mintxg = strtonum(za.za_name, NULL);
+ dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
+ VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
+ }
+ zap_cursor_fini(&zc);
+ /* Zero the source's bonus-buffer header (dsl_deadlist_phys_t). */
+ VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
+ dlp = bonus->db_data;
+ dmu_buf_will_dirty(bonus, tx);
+ bzero(dlp, sizeof (*dlp));
+ dmu_buf_rele(bonus, FTAG);
+}
+
+/*
+ * Remove entries on dl that are >= mintxg, and put them on the bpobj.
+ */
+void
+dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+ dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+
+ ASSERT(!dl->dl_oldfmt);
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ dsl_deadlist_load_tree(dl);
+ /* Start at mintxg's entry, or at the next higher key if it is absent. */
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
+ while (dle) {
+ uint64_t used, comp, uncomp;
+ dsl_deadlist_entry_t *dle_next;
+ /* Hand the entry's bpobj to the destination as a subobj. */
+ bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
+ /* Deduct this entry's space from dl's totals. */
+ VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
+ &used, &comp, &uncomp));
+ mutex_enter(&dl->dl_lock);
+ ASSERT3U(dl->dl_phys->dl_used, >=, used);
+ ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
+ ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
+ dl->dl_phys->dl_used -= used;
+ dl->dl_phys->dl_comp -= comp;
+ dl->dl_phys->dl_uncomp -= uncomp;
+ mutex_exit(&dl->dl_lock);
+
+ VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
+ dle->dle_mintxg, tx));
+ /* Fetch the successor before this entry is removed and freed. */
+ dle_next = AVL_NEXT(&dl->dl_tree, dle);
+ avl_remove(&dl->dl_tree, dle);
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+ dle = dle_next;
+ }
+}
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c Fri May 21 17:29:22 2010 -0700
@@ -412,7 +412,7 @@
{
objset_t *mos = dp->dp_meta_objset;
uint64_t ddobj;
- dsl_dir_phys_t *dsphys;
+ dsl_dir_phys_t *ddphys;
dmu_buf_t *dbuf;
ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
@@ -427,17 +427,17 @@
}
VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
- dsphys = dbuf->db_data;
+ ddphys = dbuf->db_data;
- dsphys->dd_creation_time = gethrestime_sec();
+ ddphys->dd_creation_time = gethrestime_sec();
if (pds)
- dsphys->dd_parent_obj = pds->dd_object;
- dsphys->dd_props_zapobj = zap_create(mos,
+ ddphys->dd_parent_obj = pds->dd_object;
+ ddphys->dd_props_zapobj = zap_create(mos,
DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
- dsphys->dd_child_dir_zapobj = zap_create(mos,
+ ddphys->dd_child_dir_zapobj = zap_create(mos,
DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
- dsphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
+ ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
dmu_buf_rele(dbuf, FTAG);
return (ddobj);
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c Fri May 21 17:29:22 2010 -0700
@@ -38,6 +38,7 @@
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
+#include <sys/dsl_deadlist.h>
int zfs_no_write_throttle = 0;
int zfs_write_limit_shift = 3; /* 1/8th of physical memory */
@@ -104,6 +105,7 @@
dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
dsl_dir_t *dd;
dsl_dataset_t *ds;
+ uint64_t obj;
rw_enter(&dp->dp_config_rwlock, RW_WRITER);
err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
@@ -143,6 +145,20 @@
goto out;
}
+ if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+ err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
+ &dp->dp_free_dir);
+ if (err)
+ goto out;
+
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
+ if (err)
+ goto out;
+ VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+ dp->dp_meta_objset, obj));
+ }
+
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
&dp->dp_tmp_userrefs_obj);
@@ -177,9 +193,13 @@
dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
if (dp->dp_mos_dir)
dsl_dir_close(dp->dp_mos_dir, dp);
+ if (dp->dp_free_dir)
+ dsl_dir_close(dp->dp_free_dir, dp);
if (dp->dp_root_dir)
dsl_dir_close(dp->dp_root_dir, dp);
+ bpobj_close(&dp->dp_free_bpobj);
+
/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
if (dp->dp_meta_objset)
dmu_objset_evict(dp->dp_meta_objset);
@@ -208,7 +228,7 @@
dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
objset_t *os;
dsl_dataset_t *ds;
- uint64_t dsobj;
+ uint64_t obj;
/* create and open the MOS (meta-objset) */
dp->dp_meta_objset = dmu_objset_create_impl(spa,
@@ -232,14 +252,29 @@
VERIFY(0 == dsl_pool_open_special_dir(dp,
MOS_DIR_NAME, &dp->dp_mos_dir));
+ if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+ /* create and open the free dir */
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+ FREE_DIR_NAME, tx);
+ VERIFY(0 == dsl_pool_open_special_dir(dp,
+ FREE_DIR_NAME, &dp->dp_free_dir));
+
+ /* create and open the free_bpobj */
+ obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
+ VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
+ VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+ dp->dp_meta_objset, obj));
+ }
+
if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
dsl_pool_create_origin(dp, tx);
/* create the root dataset */
- dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
+ obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
/* create the root objset */
- VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
os = dmu_objset_create_impl(dp->dp_spa, ds,
dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
#ifdef _KERNEL
@@ -252,6 +287,14 @@
return (dp);
}
+static int
+deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_deadlist_t *dl = arg;
+ dsl_deadlist_insert(dl, bp, tx);
+ return (0);
+}
+
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
@@ -315,13 +358,14 @@
err = zio_wait(zio);
/*
- * If anything was added to a deadlist during a zio done callback,
- * it had to be put on the deferred queue. Enqueue it for real now.
+ * Move dead blocks from the pending deadlist to the on-disk
+ * deadlist.
*/
for (ds = list_head(&dp->dp_synced_datasets); ds;
- ds = list_next(&dp->dp_synced_datasets, ds))
- bplist_sync(&ds->ds_deadlist,
- bplist_enqueue_cb, &ds->ds_deadlist, tx);
+ ds = list_next(&dp->dp_synced_datasets, ds)) {
+ bplist_iterate(&ds->ds_pending_deadlist,
+ deadlist_enqueue_cb, &ds->ds_deadlist, tx);
+ }
while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) {
/*
@@ -612,6 +656,65 @@
tx, DS_FIND_CHILDREN));
}
+/* ARGSUSED */
+static int
+upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+ dmu_tx_t *tx = arg;
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ objset_t *mos = dp->dp_meta_objset;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+ if (ds->ds_dir->dd_phys->dd_origin_obj) {
+ dsl_dataset_t *origin;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+ ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
+
+ if (origin->ds_dir->dd_phys->dd_clones == 0) {
+ dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+ origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
+ DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+ }
+
+ VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+ origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
+
+ dsl_dataset_rele(origin, FTAG);
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+void
+dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ uint64_t obj; /* object number of the new free bpobj */
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
+ VERIFY(0 == dsl_pool_open_special_dir(dp,
+ FREE_DIR_NAME, &dp->dp_free_dir));
+
+ /*
+ * We can't use bpobj_alloc(), because spa_version() still
+ * returns the old version, and we need a new-version bpobj with
+ * subobj support. So call dmu_object_alloc() directly.
+ */
+ obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
+ SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+ VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
+ VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+ dp->dp_meta_objset, obj));
+
+ VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL,
+ upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
+}
+
void
dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
{
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c Fri May 21 17:29:22 2010 -0700
@@ -57,6 +57,7 @@
static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
+int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */
@@ -599,7 +600,7 @@
* XXX need to make sure all of these arc_read() prefetches are
* done before setting xlateall (similar to dsl_read())
*/
- (void) arc_read(scn->scn_prefetch_zio_root, scn->scn_dp->dp_spa, bp,
+ (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
&flags, &czb);
}
@@ -1378,11 +1379,56 @@
zap_cursor_fini(&zc);
}
+static int
+dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg;
+ uint64_t elapsed_nanosecs;
+
+ elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+
+ if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+ (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
+ txg_sync_waiting(scn->scn_dp)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa))
+ return (ERESTART);
+
+ zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
+ dmu_tx_get_txg(tx), bp, 0));
+ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
+ -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
+ -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
+ scn->scn_visited_this_txg++;
+ return (0);
+}
+
+boolean_t
+dsl_scan_active(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+ uint64_t used = 0, comp, uncomp;
+
+ if (spa->spa_load_state != SPA_LOAD_NONE)
+ return (B_FALSE);
+ if (spa_shutting_down(spa))
+ return (B_FALSE);
+
+ if (scn->scn_phys.scn_state == DSS_SCANNING)
+ return (B_TRUE);
+
+ if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
+ &used, &comp, &uncomp);
+ }
+ return (used != 0);
+}
+
void
dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
{
dsl_scan_t *scn = dp->dp_scan;
spa_t *spa = dp->dp_spa;
+ int err;
/*
* Check for scn_restart_txg before checking spa_load_state, so
@@ -1400,10 +1446,9 @@
dsl_scan_setup_sync(scn, &func, tx);
}
- if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE ||
- spa_shutting_down(spa) ||
- spa_sync_pass(dp->dp_spa) > 1 ||
- scn->scn_phys.scn_state != DSS_SCANNING)
+
+ if (!dsl_scan_active(scn) ||
+ spa_sync_pass(dp->dp_spa) > 1)
return;
scn->scn_visited_this_txg = 0;
@@ -1411,6 +1456,40 @@
scn->scn_sync_start_time = gethrtime();
spa->spa_scrub_active = B_TRUE;
+ /*
+ * First process the free list. If we pause the free, don't do
+ * any scanning. This ensures that there is no free list when
+ * we are scanning, so the scan code doesn't have to worry about
+ * traversing it.
+ */
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_MUSTSUCCEED);
+ err = bpobj_iterate(&dp->dp_free_bpobj,
+ dsl_scan_free_cb, scn, tx);
+ VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+ if (scn->scn_visited_this_txg) {
+ zfs_dbgmsg("freed %llu blocks in %llums from "
+ "free_bpobj txg %llu",
+ (longlong_t)scn->scn_visited_this_txg,
+ (longlong_t)
+ (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
+ (longlong_t)tx->tx_txg);
+ scn->scn_visited_this_txg = 0;
+ /*
+ * Re-sync the ddt so that we can further modify
+ * it when doing bprewrite.
+ */
+ ddt_sync(spa, tx->tx_txg);
+ }
+ if (err == ERESTART)
+ return;
+ }
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+
if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
scn->scn_phys.scn_ddt_class_max) {
zfs_dbgmsg("doing scan sync txg %llu; "
@@ -1433,11 +1512,11 @@
(longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
}
- scn->scn_prefetch_zio_root = zio_root(dp->dp_spa, NULL,
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
NULL, ZIO_FLAG_CANFAIL);
dsl_scan_visit(scn, tx);
- (void) zio_wait(scn->scn_prefetch_zio_root);
- scn->scn_prefetch_zio_root = NULL;
+ (void) zio_wait(scn->scn_zio_root);
+ scn->scn_zio_root = NULL;
zfs_dbgmsg("visited %llu blocks in %llums",
(longlong_t)scn->scn_visited_this_txg,
--- a/usr/src/uts/common/fs/zfs/spa.c Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/spa.c Fri May 21 17:29:22 2010 -0700
@@ -943,6 +943,8 @@
spa->spa_async_zio_root = NULL;
}
+ bpobj_close(&spa->spa_deferred_bpobj);
+
/*
* Close the dsl pool.
*/
@@ -1662,6 +1664,7 @@
uint64_t config_cache_txg = spa->spa_config_txg;
int orig_mode = spa->spa_mode;
int parse;
+ uint64_t obj;
/*
* If this is an untrusted config, access the pool in read-only mode.
@@ -1840,8 +1843,10 @@
return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
}
- if (spa_dir_prop(spa, DMU_POOL_SYNC_BPLIST,
- &spa->spa_deferred_bplist_obj) != 0)
+ if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
+ if (error != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
/*
@@ -2687,7 +2692,7 @@
uint64_t txg = TXG_INITIAL;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
- uint64_t version;
+ uint64_t version, obj;
/*
* If this pool already exists, return failure.
@@ -2834,20 +2839,20 @@
}
/*
- * Create the deferred-free bplist object. Turn off compression
+ * Create the deferred-free bpobj. Turn off compression
* because sync-to-convergence takes longer if the blocksize
* keeps changing.
*/
- spa->spa_deferred_bplist_obj = bplist_create(spa->spa_meta_objset,
- 1 << 14, tx);
- dmu_object_set_compress(spa->spa_meta_objset,
- spa->spa_deferred_bplist_obj, ZIO_COMPRESS_OFF, tx);
-
+ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
+ dmu_object_set_compress(spa->spa_meta_objset, obj,
+ ZIO_COMPRESS_OFF, tx);
if (zap_add(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
- sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj, tx) != 0) {
- cmn_err(CE_PANIC, "failed to add bplist");
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
+ sizeof (uint64_t), 1, &obj, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add bpobj");
}
+ VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
+ spa->spa_meta_objset, obj));
/*
* Create the pool's history object.
@@ -4946,34 +4951,23 @@
* SPA syncing routines
* ==========================================================================
*/
-static void
-spa_sync_deferred_bplist(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx, uint64_t txg)
+
+static int
+bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
- blkptr_t blk;
- uint64_t itor = 0;
- uint8_t c = 1;
-
- while (bplist_iterate(bpl, &itor, &blk) == 0) {
- ASSERT(blk.blk_birth < txg);
- zio_free(spa, txg, &blk);
- }
-
- bplist_vacate(bpl, tx);
-
- /*
- * Pre-dirty the first block so we sync to convergence faster.
- * (Usually only the first block is needed.)
- */
- dmu_write(bpl->bpl_mos, spa->spa_deferred_bplist_obj, 0, 1, &c, tx);
+ bpobj_t *bpo = arg;
+ bpobj_enqueue(bpo, bp, tx);
+ return (0);
}
-static void
-spa_sync_free(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+static int
+spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
zio_t *zio = arg;
zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
zio->io_flags));
+ return (0);
}
static void
@@ -5204,6 +5198,42 @@
}
/*
+ * Perform one-time upgrade on-disk changes. spa_version() does not
+ * reflect the new version this txg, so there must be no changes this
+ * txg to anything that the upgrade code depends on after it executes.
+ * Therefore this must be called after dsl_pool_sync() does the sync
+ * tasks.
+ */
+static void
+spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+
+ ASSERT(spa->spa_sync_pass == 1);
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
+ dsl_pool_create_origin(dp, tx);
+
+ /* Keeping the origin open increases spa_minref */
+ spa->spa_minref += 3;
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
+ dsl_pool_upgrade_clones(dp, tx);
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
+ dsl_pool_upgrade_dir_clones(dp, tx);
+
+ /* Keeping the freedir open increases spa_minref */
+ spa->spa_minref += 3;
+ }
+}
+
+/*
* Sync the specified transaction group. New blocks may be dirtied as
* part of the process, so we iterate until it converges.
*/
@@ -5212,7 +5242,7 @@
{
dsl_pool_t *dp = spa->spa_dsl_pool;
objset_t *mos = spa->spa_meta_objset;
- bplist_t *defer_bpl = &spa->spa_deferred_bplist;
+ bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd;
@@ -5251,8 +5281,6 @@
}
spa_config_exit(spa, SCL_STATE, FTAG);
- VERIFY(0 == bplist_open(defer_bpl, mos, spa->spa_deferred_bplist_obj));
-
tx = dmu_tx_create_assigned(dp, txg);
/*
@@ -5276,19 +5304,6 @@
}
}
- if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
- spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
- dsl_pool_create_origin(dp, tx);
-
- /* Keeping the origin open increases spa_minref */
- spa->spa_minref += 3;
- }
-
- if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
- spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
- dsl_pool_upgrade_clones(dp, tx);
- }
-
/*
* If anything has changed in this txg, or if someone is waiting
* for this txg to sync (eg, spa_vdev_remove()), push the
@@ -5299,9 +5314,13 @@
if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
!txg_list_empty(&dp->dp_dirty_dirs, txg) ||
!txg_list_empty(&dp->dp_sync_tasks, txg) ||
- ((dp->dp_scan->scn_phys.scn_state == DSS_SCANNING ||
- txg_sync_waiting(dp)) && !spa_shutting_down(spa)))
- spa_sync_deferred_bplist(spa, defer_bpl, tx, txg);
+ ((dsl_scan_active(dp->dp_scan) ||
+ txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
+ zio_t *zio = zio_root(spa, NULL, NULL, 0);
+ VERIFY3U(bpobj_iterate(defer_bpo,
+ spa_free_sync_cb, zio, tx), ==, 0);
+ VERIFY3U(zio_wait(zio), ==, 0);
+ }
/*
* Iterate to convergence.
@@ -5319,10 +5338,12 @@
if (pass <= SYNC_PASS_DEFERRED_FREE) {
zio_t *zio = zio_root(spa, NULL, NULL, 0);
- bplist_sync(free_bpl, spa_sync_free, zio, tx);
+ bplist_iterate(free_bpl, spa_free_sync_cb,
+ zio, tx);
VERIFY(zio_wait(zio) == 0);
} else {
- bplist_sync(free_bpl, bplist_enqueue_cb, defer_bpl, tx);
+ bplist_iterate(free_bpl, bpobj_enqueue_cb,
+ defer_bpo, tx);
}
ddt_sync(spa, txg);
@@ -5331,12 +5352,11 @@
while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
vdev_sync(vd, txg);
+ if (pass == 1)
+ spa_sync_upgrades(spa, tx);
+
} while (dmu_objset_is_dirty(mos, txg));
- ASSERT(list_is_empty(&free_bpl->bpl_queue));
-
- bplist_close(defer_bpl);
-
/*
* Rewrite the vdev configuration (which includes the uberblock)
* to commit the transaction group.
@@ -5423,8 +5443,6 @@
ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
- ASSERT(list_is_empty(&defer_bpl->bpl_queue));
- ASSERT(list_is_empty(&free_bpl->bpl_queue));
spa->spa_sync_pass = 0;
--- a/usr/src/uts/common/fs/zfs/spa_misc.c Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c Fri May 21 17:29:22 2010 -0700
@@ -445,8 +445,7 @@
cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
for (int t = 0; t < TXG_SIZE; t++)
- bplist_init(&spa->spa_free_bplist[t]);
- bplist_init(&spa->spa_deferred_bplist);
+ bplist_create(&spa->spa_free_bplist[t]);
(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
spa->spa_state = POOL_STATE_UNINITIALIZED;
@@ -524,8 +523,7 @@
spa_config_lock_destroy(spa);
for (int t = 0; t < TXG_SIZE; t++)
- bplist_fini(&spa->spa_free_bplist[t]);
- bplist_fini(&spa->spa_deferred_bplist);
+ bplist_destroy(&spa->spa_free_bplist[t]);
cv_destroy(&spa->spa_async_cv);
cv_destroy(&spa->spa_proc_cv);
--- a/usr/src/uts/common/fs/zfs/sys/bplist.h Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/bplist.h Fri May 21 17:29:22 2010 -0700
@@ -19,76 +19,36 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_BPLIST_H
#define _SYS_BPLIST_H
-#include <sys/dmu.h>
+#include <sys/zfs_context.h>
#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/zio.h>
-#include <sys/zfs_context.h>
#ifdef __cplusplus
extern "C" {
#endif
-typedef struct bplist_phys {
- /*
- * This is the bonus buffer for the dead lists. The object's
- * contents is an array of bpl_entries blkptr_t's, representing
- * a total of bpl_bytes physical space.
- */
- uint64_t bpl_entries;
- uint64_t bpl_bytes;
- uint64_t bpl_comp;
- uint64_t bpl_uncomp;
-} bplist_phys_t;
-
-#define BPLIST_SIZE_V0 (2 * sizeof (uint64_t))
-
-typedef struct bplist_q {
- blkptr_t bpq_blk;
- list_node_t bpq_node;
-} bplist_q_t;
+typedef struct bplist_entry {
+ blkptr_t bpe_blk;
+ list_node_t bpe_node;
+} bplist_entry_t;
typedef struct bplist {
kmutex_t bpl_lock;
- objset_t *bpl_mos;
- uint64_t bpl_object;
- uint8_t bpl_blockshift;
- uint8_t bpl_bpshift;
- uint8_t bpl_havecomp;
- kmutex_t bpl_q_lock;
- list_t bpl_queue;
- bplist_phys_t *bpl_phys;
- dmu_buf_t *bpl_dbuf;
- dmu_buf_t *bpl_cached_dbuf;
+ list_t bpl_list;
} bplist_t;
-typedef void bplist_sync_cb_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
-extern void bplist_init(bplist_t *bpl);
-extern void bplist_fini(bplist_t *bpl);
-extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
-extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
-extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
-extern void bplist_close(bplist_t *bpl);
-extern boolean_t bplist_empty(bplist_t *bpl);
-extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
-extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx);
-extern void bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx);
-extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp);
-extern void bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func,
+void bplist_create(bplist_t *bpl);
+void bplist_destroy(bplist_t *bpl);
+void bplist_append(bplist_t *bpl, const blkptr_t *bp);
+void bplist_iterate(bplist_t *bpl, bplist_itor_t *func,
void *arg, dmu_tx_t *tx);
-extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
-extern int bplist_space(bplist_t *bpl,
- uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
-extern int bplist_space_birthrange(bplist_t *bpl,
- uint64_t mintxg, uint64_t maxtxg, uint64_t *dsizep);
#ifdef __cplusplus
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/bpobj.h Fri May 21 17:29:22 2010 -0700
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_BPOBJ_H
+#define _SYS_BPOBJ_H
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct bpobj_phys {
+ /*
+ * This is the bonus buffer for the dead lists. The object's
+ * contents are an array of bpo_num_blkptrs blkptr_t's, representing
+ * a total of bpo_bytes physical space.
+ */
+ uint64_t bpo_num_blkptrs;
+ uint64_t bpo_bytes;
+ uint64_t bpo_comp;
+ uint64_t bpo_uncomp;
+ uint64_t bpo_subobjs;
+ uint64_t bpo_num_subobjs;
+} bpobj_phys_t;
+
+#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t))
+#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t))
+
+typedef struct bpobj {
+ kmutex_t bpo_lock;
+ objset_t *bpo_os;
+ uint64_t bpo_object;
+ int bpo_epb;
+ uint8_t bpo_havecomp;
+ uint8_t bpo_havesubobj;
+ bpobj_phys_t *bpo_phys;
+ dmu_buf_t *bpo_dbuf;
+ dmu_buf_t *bpo_cached_dbuf;
+} bpobj_t;
+
+typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
+void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+
+int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
+void bpobj_close(bpobj_t *bpo);
+
+int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
+int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
+int bpobj_iterate_dbg(bpobj_t *bpo, uint64_t *itorp, blkptr_t *bp);
+
+void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
+void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
+
+int bpobj_space(bpobj_t *bpo,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BPOBJ_H */
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h Fri May 21 17:29:22 2010 -0700
@@ -77,8 +77,8 @@
DMU_OT_OBJECT_ARRAY, /* UINT64 */
DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */
DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */
- DMU_OT_BPLIST, /* UINT64 */
- DMU_OT_BPLIST_HDR, /* UINT64 */
+ DMU_OT_BPOBJ, /* UINT64 */
+ DMU_OT_BPOBJ_HDR, /* UINT64 */
/* spa: */
DMU_OT_SPACE_MAP_HEADER, /* UINT64 */
DMU_OT_SPACE_MAP, /* UINT64 */
@@ -130,6 +130,10 @@
DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */
DMU_OT_SCAN_XLATE, /* ZAP */
DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */
+ DMU_OT_DEADLIST, /* ZAP */
+ DMU_OT_DEADLIST_HDR, /* UINT64 */
+ DMU_OT_DSL_CLONES, /* ZAP */
+ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */
DMU_OT_NUMTYPES
} dmu_object_type_t;
@@ -211,7 +215,7 @@
#define DMU_POOL_DIRECTORY_OBJECT 1
#define DMU_POOL_CONFIG "config"
#define DMU_POOL_ROOT_DATASET "root_dataset"
-#define DMU_POOL_SYNC_BPLIST "sync_bplist"
+#define DMU_POOL_SYNC_BPOBJ "sync_bplist"
#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
#define DMU_POOL_ERRLOG_LAST "errlog_last"
#define DMU_POOL_SPARES "spares"
@@ -224,6 +228,7 @@
#define DMU_POOL_DDT_STATS "DDT-statistics"
#define DMU_POOL_CREATION_VERSION "creation_version"
#define DMU_POOL_SCAN "scan"
+#define DMU_POOL_FREE_BPOBJ "free_bpobj"
/*
* Allocate an object from this objset. The range of object numbers
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h Fri May 21 17:29:22 2010 -0700
@@ -32,6 +32,7 @@
#include <sys/bplist.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_context.h>
+#include <sys/dsl_deadlist.h>
#ifdef __cplusplus
extern "C" {
@@ -82,7 +83,7 @@
uint64_t ds_num_children; /* clone/snap children; ==0 for head */
uint64_t ds_creation_time; /* seconds since 1970 */
uint64_t ds_creation_txg;
- uint64_t ds_deadlist_obj; /* DMU_OT_BPLIST */
+ uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */
uint64_t ds_used_bytes;
uint64_t ds_compressed_bytes;
uint64_t ds_uncompressed_bytes;
@@ -114,7 +115,8 @@
struct dsl_dataset *ds_prev;
/* has internal locking: */
- bplist_t ds_deadlist;
+ dsl_deadlist_t ds_deadlist;
+ bplist_t ds_pending_deadlist;
/* to protect against multiple concurrent incremental recv */
kmutex_t ds_recvlock;
@@ -160,7 +162,7 @@
boolean_t need_prep; /* do we need to retry due to EBUSY? */
};
-#define dsl_dataset_is_snapshot(ds) \
+#define dsl_dataset_is_snapshot(ds) \
((ds)->ds_phys->ds_num_children != 0)
#define DS_UNIQUE_IS_ACCURATE(ds) \
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_deadlist.h Fri May 21 17:29:22 2010 -0700
@@ -0,0 +1,87 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_DEADLIST_H
+#define _SYS_DSL_DEADLIST_H
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dmu_buf;
+struct dsl_dataset;
+
+typedef struct dsl_deadlist_phys {
+ uint64_t dl_used;
+ uint64_t dl_comp;
+ uint64_t dl_uncomp;
+ uint64_t dl_pad[37]; /* pad out to 320b for future expansion */
+} dsl_deadlist_phys_t;
+
+typedef struct dsl_deadlist {
+ objset_t *dl_os;
+ uint64_t dl_object;
+ avl_tree_t dl_tree;
+ boolean_t dl_havetree;
+ struct dmu_buf *dl_dbuf;
+ dsl_deadlist_phys_t *dl_phys;
+ kmutex_t dl_lock;
+
+ /* if it's the old on-disk format: */
+ bpobj_t dl_bpobj;
+ boolean_t dl_oldfmt;
+} dsl_deadlist_t;
+
+typedef struct dsl_deadlist_entry {
+ avl_node_t dle_node;
+ uint64_t dle_mintxg;
+ bpobj_t dle_bpobj;
+} dsl_deadlist_entry_t;
+
+void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
+void dsl_deadlist_close(dsl_deadlist_t *dl);
+uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
+void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
+void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
+void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+ uint64_t mrs_obj, dmu_tx_t *tx);
+void dsl_deadlist_space(dsl_deadlist_t *dl,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_space_range(dsl_deadlist_t *dl,
+ uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
+void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+ dmu_tx_t *tx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DEADLIST_H */
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h Fri May 21 17:29:22 2010 -0700
@@ -69,7 +69,8 @@
uint64_t dd_deleg_zapobj; /* dataset delegation permissions */
uint64_t dd_flags;
uint64_t dd_used_breakdown[DD_USED_NUM];
- uint64_t dd_pad[14]; /* pad out to 256 bytes for good measure */
+ uint64_t dd_clones; /* dsl_dir objects */
+ uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */
} dsl_dir_phys_t;
struct dsl_dir {
@@ -143,6 +144,7 @@
#define MOS_DIR_NAME "$MOS"
#define ORIGIN_DIR_NAME "$ORIGIN"
#define XLATION_DIR_NAME "$XLATION"
+#define FREE_DIR_NAME "$FREE"
#ifdef ZFS_DEBUG
#define dprintf_dd(dd, fmt, ...) do { \
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h Fri May 21 17:29:22 2010 -0700
@@ -33,6 +33,7 @@
#include <sys/dnode.h>
#include <sys/ddt.h>
#include <sys/arc.h>
+#include <sys/bpobj.h>
#ifdef __cplusplus
extern "C" {
@@ -71,6 +72,7 @@
struct objset *dp_meta_objset;
struct dsl_dir *dp_root_dir;
struct dsl_dir *dp_mos_dir;
+ struct dsl_dir *dp_free_dir;
struct dsl_dataset *dp_origin_snap;
uint64_t dp_root_dir_obj;
struct taskq *dp_vnrele_taskq;
@@ -82,6 +84,7 @@
uint64_t dp_throughput; /* bytes per millisec */
uint64_t dp_write_limit;
uint64_t dp_tmp_userrefs_obj;
+ bpobj_t dp_free_bpobj;
struct dsl_scan *dp_scan;
@@ -130,6 +133,7 @@
uint32_t *arc_flags, const zbookmark_t *zb);
void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
+void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
--- a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h Fri May 21 17:29:22 2010 -0700
@@ -77,7 +77,7 @@
boolean_t scn_pausing;
uint64_t scn_restart_txg;
uint64_t scn_sync_start_time;
- zio_t *scn_prefetch_zio_root;
+ zio_t *scn_zio_root;
/* for debugging / information */
uint64_t scn_visited_this_txg;
@@ -99,6 +99,7 @@
void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
struct dmu_tx *tx);
+boolean_t dsl_scan_active(dsl_scan_t *scn);
#ifdef __cplusplus
}
--- a/usr/src/uts/common/fs/zfs/sys/spa.h Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h Fri May 21 17:29:22 2010 -0700
@@ -599,7 +599,6 @@
extern uint64_t spa_bootfs(spa_t *spa);
extern uint64_t spa_delegation(spa_t *spa);
extern objset_t *spa_meta_objset(spa_t *spa);
-extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
/* Miscellaneous support routines */
extern int spa_rename(const char *oldname, const char *newname);
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h Fri May 21 17:29:22 2010 -0700
@@ -35,6 +35,7 @@
#include <sys/avl.h>
#include <sys/refcount.h>
#include <sys/bplist.h>
+#include <sys/bpobj.h>
#ifdef __cplusplus
extern "C" {
@@ -140,8 +141,7 @@
uint64_t spa_config_object; /* MOS object for pool config */
uint64_t spa_config_generation; /* config generation number */
uint64_t spa_syncing_txg; /* txg currently syncing */
- uint64_t spa_deferred_bplist_obj; /* object for deferred frees */
- bplist_t spa_deferred_bplist; /* deferred-free bplist */
+ bpobj_t spa_deferred_bpobj; /* deferred-free bpobj */
bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
uberblock_t spa_ubsync; /* last synced uberblock */
uberblock_t spa_uberblock; /* current uberblock */
--- a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h Fri May 21 17:29:22 2010 -0700
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#ifndef _SYS_ZIO_CHECKSUM_H
#define _SYS_ZIO_CHECKSUM_H
#include <sys/zio.h>
-#include <zfs_fletcher.h>
#ifdef __cplusplus
extern "C" {
@@ -68,6 +66,7 @@
extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
void *data, uint64_t size);
extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
+extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
#ifdef __cplusplus
}
--- a/usr/src/uts/common/fs/zfs/txg.c Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/txg.c Fri May 21 17:29:22 2010 -0700
@@ -371,9 +371,7 @@
* us, or we have reached our timeout.
*/
timer = (delta >= timeout ? 0 : timeout - delta);
- while ((dp->dp_scan->scn_phys.scn_state != DSS_SCANNING ||
- spa_load_state(spa) != SPA_LOAD_NONE ||
- spa_shutting_down(spa)) &&
+ while (!dsl_scan_active(dp->dp_scan) &&
!tx->tx_exiting && timer > 0 &&
tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
tx->tx_quiesced_txg == 0) {
--- a/usr/src/uts/common/fs/zfs/zio.c Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zio.c Fri May 21 17:29:22 2010 -0700
@@ -652,7 +652,7 @@
void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
- bplist_enqueue_deferred(&spa->spa_free_bplist[txg & TXG_MASK], bp);
+ bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
}
zio_t *
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c Fri May 21 17:29:22 2010 -0700
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -28,6 +27,7 @@
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zil.h>
+#include <zfs_fletcher.h>
/*
* Checksum vectors.
--- a/usr/src/uts/common/sys/fs/zfs.h Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/sys/fs/zfs.h Fri May 21 17:29:22 2010 -0700
@@ -334,14 +334,15 @@
#define SPA_VERSION_23 23ULL
#define SPA_VERSION_24 24ULL
#define SPA_VERSION_25 25ULL
+#define SPA_VERSION_26 26ULL
/*
* When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
* format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
* and do the appropriate changes. Also bump the version number in
* usr/src/grub/capability.
*/
-#define SPA_VERSION SPA_VERSION_25
-#define SPA_VERSION_STRING "25"
+#define SPA_VERSION SPA_VERSION_26
+#define SPA_VERSION_STRING "26"
/*
* Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -358,7 +359,7 @@
#define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2
#define SPA_VERSION_SPARES SPA_VERSION_3
#define SPA_VERSION_RAIDZ2 SPA_VERSION_3
-#define SPA_VERSION_BPLIST_ACCOUNT SPA_VERSION_3
+#define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3
#define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3
#define SPA_VERSION_DNODE_BYTES SPA_VERSION_3
#define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4
@@ -388,6 +389,8 @@
#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23
#define SPA_VERSION_SA SPA_VERSION_24
#define SPA_VERSION_SCAN SPA_VERSION_25
+#define SPA_VERSION_DIR_CLONES SPA_VERSION_26
+#define SPA_VERSION_DEADLISTS SPA_VERSION_26
/*
* ZPL version - rev'd whenever an incompatible on-disk format change