# HG changeset patch # User Matthew Ahrens # Date 1274488162 25200 # Node ID 54258108784bbf0b7d8366ea7f1c4323bfa3e4da # Parent 1664b63fef3255a44ad17f23b0e7ecbeec5d2429 6948890 snapshot deletion can induce pathologically long spa_sync() times diff -r 1664b63fef32 -r 54258108784b usr/src/cmd/zdb/zdb.c --- a/usr/src/cmd/zdb/zdb.c Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/cmd/zdb/zdb.c Fri May 21 17:29:22 2010 -0700 @@ -900,9 +900,9 @@ } static void -sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp) +sprintf_blkptr_compact(char *blkbuf, const blkptr_t *bp) { - dva_t *dva = bp->blk_dva; + const dva_t *dva = bp->blk_dva; int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1; if (dump_opt['b'] >= 5) { @@ -1127,12 +1127,21 @@ (void) printf("\t\tbp = %s\n", blkbuf); } -static void -dump_bplist(objset_t *mos, uint64_t object, char *name) +/* ARGSUSED */ +static int +dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { - bplist_t bpl = { 0 }; - blkptr_t blk, *bp = &blk; - uint64_t itor = 0; + char blkbuf[BP_SPRINTF_LEN]; + + ASSERT(bp->blk_birth != 0); + sprintf_blkptr_compact(blkbuf, bp); + (void) printf("\t%s\n", blkbuf); + return (0); +} + +static void +dump_bpobj(bpobj_t *bpo, char *name) +{ char bytes[32]; char comp[32]; char uncomp[32]; @@ -1140,45 +1149,59 @@ if (dump_opt['d'] < 3) return; - bplist_init(&bpl); - VERIFY(0 == bplist_open(&bpl, mos, object)); - if (bplist_empty(&bpl)) { - bplist_close(&bpl); - bplist_fini(&bpl); - return; - } - - zdb_nicenum(bpl.bpl_phys->bpl_bytes, bytes); - if (bpl.bpl_dbuf->db_size == sizeof (bplist_phys_t)) { - zdb_nicenum(bpl.bpl_phys->bpl_comp, comp); - zdb_nicenum(bpl.bpl_phys->bpl_uncomp, uncomp); - (void) printf("\n %s: %llu entries, %s (%s/%s comp)\n", - name, (u_longlong_t)bpl.bpl_phys->bpl_entries, + zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes); + if (bpo->bpo_havesubobj) { + zdb_nicenum(bpo->bpo_phys->bpo_comp, comp); + zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp); + (void) printf("\n %s: %llu local blkptrs, %llu subobjs, " + "%s (%s/%s comp)\n", + name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, bytes, comp, uncomp); } else { - (void) printf("\n %s: %llu entries, %s\n", - name, (u_longlong_t)bpl.bpl_phys->bpl_entries, bytes); + (void) printf("\n %s: %llu blkptrs, %s\n", + name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, bytes); } - if (dump_opt['d'] < 5) { - bplist_close(&bpl); - bplist_fini(&bpl); + if (dump_opt['d'] < 5) return; - } (void) printf("\n"); - while (bplist_iterate(&bpl, &itor, bp) == 0) { - char blkbuf[BP_SPRINTF_LEN]; - - ASSERT(bp->blk_birth != 0); - sprintf_blkptr_compact(blkbuf, bp); - (void) printf("\tItem %3llu: %s\n", - (u_longlong_t)itor - 1, blkbuf); + (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); +} + +static void +dump_deadlist(dsl_deadlist_t *dl) +{ + dsl_deadlist_entry_t *dle; + char bytes[32]; + char comp[32]; + char uncomp[32]; + + if (dump_opt['d'] < 3) + return; + + zdb_nicenum(dl->dl_phys->dl_used, bytes); + zdb_nicenum(dl->dl_phys->dl_comp, comp); + zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp); + (void) printf("\n Deadlist: %s (%s/%s comp)\n", + bytes, comp, uncomp); + + if (dump_opt['d'] < 4) + return; + + (void) printf("\n"); + + for (dle = avl_first(&dl->dl_tree); dle; + dle = AVL_NEXT(&dl->dl_tree, dle)) { + (void) printf(" mintxg %llu -> obj %llu\n", + (longlong_t)dle->dle_mintxg, + (longlong_t)dle->dle_bpobj.bpo_object); + + if (dump_opt['d'] >= 5) + dump_bpobj(&dle->dle_bpobj, ""); } - - bplist_close(&bpl); - bplist_fini(&bpl); } static avl_tree_t idx_tree; @@ -1404,6 +1427,10 @@ dump_sa_layouts, /* SA attribute layouts */ dump_zap, /* DSL scrub translations */ dump_none, /* fake dedup BP */ + dump_zap, /* deadlist */ + dump_none, /* deadlist hdr */ + dump_zap, /* dsl clones */ + dump_none, /* bpobj subobjs */ dump_unknown, /* Unknown type, must be last */ }; @@ -1590,8 +1617,7 @@ dump_intent_log(dmu_objset_zil(os)); if (dmu_objset_ds(os) != NULL) - dump_bplist(dmu_objset_pool(os)->dp_meta_objset, - dmu_objset_ds(os)->ds_phys->ds_deadlist_obj, "Deadlist"); + dump_deadlist(&dmu_objset_ds(os)->ds_deadlist); if (verbosity < 2) return; @@ -1867,10 +1893,11 @@ uint64_t zcb_errors[256]; int zcb_readfails; int zcb_haderrors; + spa_t *zcb_spa; } zdb_cb_t; static void -zdb_count_block(spa_t *spa, zilog_t *zilog, zdb_cb_t *zcb, const blkptr_t *bp, +zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) { uint64_t refcnt = 0; @@ -1898,7 +1925,7 @@ ddt_t *ddt; ddt_entry_t *dde; - ddt = ddt_select(spa, bp); + ddt = ddt_select(zcb->zcb_spa, bp); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_FALSE); @@ -1914,8 +1941,8 @@ ddt_exit(ddt); } - VERIFY3U(zio_wait(zio_claim(NULL, spa, - refcnt ? 0 : spa_first_txg(spa), + VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, + refcnt ? 0 : spa_first_txg(zcb->zcb_spa), bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); } @@ -1934,7 +1961,7 @@ type = BP_GET_TYPE(bp); - zdb_count_block(spa, zilog, zcb, bp, type); + zdb_count_block(zcb, zilog, bp, type); is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata); @@ -2048,8 +2075,7 @@ ddt_bp_create(ddb.ddb_checksum, &dde.dde_key, ddp, &blk); if (p == DDT_PHYS_DITTO) { - zdb_count_block(spa, NULL, zcb, &blk, - ZDB_OT_DITTO); + zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); } else { zcb->zcb_dedup_asize += BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); @@ -2070,6 +2096,8 @@ static void zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) { + zcb->zcb_spa = spa; + if (!dump_opt['L']) { vdev_t *rvd = spa->spa_root_vdev; for (int c = 0; c < rvd->vdev_children; c++) { @@ -2111,6 +2139,22 @@ } } +/* ARGSUSED */ +static int +count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + zdb_cb_t *zcb = arg; + + if (dump_opt['b'] >= 4) { + char blkbuf[BP_SPRINTF_LEN]; + sprintf_blkptr(blkbuf, bp); + (void) printf("[%s] %s\n", + "deferred free", blkbuf); + } + zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); + return (0); +} + static int dump_block_stats(spa_t *spa) { @@ -2140,26 +2184,10 @@ /* * If there's a deferred-free bplist, process that first. */ - if (spa->spa_deferred_bplist_obj != 0) { - bplist_t *bpl = &spa->spa_deferred_bplist; - blkptr_t blk; - uint64_t itor = 0; - - VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset, - spa->spa_deferred_bplist_obj)); - - while (bplist_iterate(bpl, &itor, &blk) == 0) { - if (dump_opt['b'] >= 4) { - char blkbuf[BP_SPRINTF_LEN]; - sprintf_blkptr(blkbuf, &blk); - (void) printf("[%s] %s\n", - "deferred free", blkbuf); - } - zdb_count_block(spa, NULL, &zcb, &blk, ZDB_OT_DEFERRED); - } - - bplist_close(bpl); - } + (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, + count_block_cb, &zcb, NULL); + (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, + count_block_cb, &zcb, NULL); if (dump_opt['c'] > 1) flags |= TRAVERSE_PREFETCH_DATA; @@ -2438,8 +2466,11 @@ if (dump_opt['d'] || dump_opt['i']) { dump_dir(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { - dump_bplist(dp->dp_meta_objset, - spa->spa_deferred_bplist_obj, "Deferred frees"); + dump_bpobj(&spa->spa_deferred_bpobj, "Deferred frees"); + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + dump_bpobj(&spa->spa_dsl_pool->dp_free_bpobj, + "Pool frees"); + } dump_dtl(spa->spa_root_vdev, 0); } (void) dmu_objset_find(spa_name(spa), dump_one_dir, diff -r 1664b63fef32 -r 54258108784b usr/src/cmd/zinject/translate.c --- a/usr/src/cmd/zinject/translate.c Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/cmd/zinject/translate.c Fri May 21 17:29:22 2010 -0700 @@ -359,8 +359,8 @@ case TYPE_CONFIG: record->zi_type = DMU_OT_PACKED_NVLIST; break; - case TYPE_BPLIST: - record->zi_type = DMU_OT_BPLIST; + case TYPE_BPOBJ: + record->zi_type = DMU_OT_BPOBJ; break; case TYPE_SPACEMAP: record->zi_type = DMU_OT_SPACE_MAP; diff -r 1664b63fef32 -r 54258108784b usr/src/cmd/zinject/zinject.c --- a/usr/src/cmd/zinject/zinject.c Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/cmd/zinject/zinject.c Fri May 21 17:29:22 2010 -0700 @@ -69,7 +69,7 @@ * mos Any data in the MOS * mosdir object directory * config pool configuration - * bplist blkptr list + * bpobj blkptr list * spacemap spacemap * metaslab metaslab * errlog persistent error log @@ -163,7 +163,7 @@ "mosdir", "metaslab", "config", - "bplist", + "bpobj", "spacemap", "errlog", "uber", @@ -193,8 +193,8 @@ return ("metaslab"); case DMU_OT_PACKED_NVLIST: return ("config"); - case DMU_OT_BPLIST: - return ("bplist"); + case DMU_OT_BPOBJ: + return ("bpobj"); case DMU_OT_SPACE_MAP: return ("spacemap"); case DMU_OT_ERROR_LOG: @@ -285,7 +285,7 @@ "\t\t\ton a ZFS filesystem.\n" "\n" "\t-t \tInject errors into the MOS for objects of the given\n" - "\t\t\ttype. Valid types are: mos, mosdir, config, bplist,\n" + "\t\t\ttype. Valid types are: mos, mosdir, config, bpobj,\n" "\t\t\tspacemap, metaslab, errlog. The only valid is\n" "\t\t\tthe poolname.\n"); } diff -r 1664b63fef32 -r 54258108784b usr/src/cmd/zinject/zinject.h --- a/usr/src/cmd/zinject/zinject.h Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/cmd/zinject/zinject.h Fri May 21 17:29:22 2010 -0700 @@ -38,7 +38,7 @@ TYPE_MOSDIR, /* MOS object directory */ TYPE_METASLAB, /* metaslab objects */ TYPE_CONFIG, /* MOS config */ - TYPE_BPLIST, /* block pointer list */ + TYPE_BPOBJ, /* block pointer list */ TYPE_SPACEMAP, /* space map objects */ TYPE_ERRLOG, /* persistent error log */ TYPE_LABEL_UBERBLOCK, /* label specific uberblock */ diff -r 1664b63fef32 -r 54258108784b usr/src/cmd/zpool/zpool_main.c --- a/usr/src/cmd/zpool/zpool_main.c Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/cmd/zpool/zpool_main.c Fri May 21 17:29:22 2010 -0700 @@ -4007,6 +4007,8 @@ (void) printf(gettext(" 23 Slim ZIL\n")); (void) printf(gettext(" 24 System attributes\n")); (void) printf(gettext(" 25 Improved scrub stats\n")); + (void) printf(gettext(" 26 Improved snapshot deletion " + "performance\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf(gettext("see the ZFS Administration Guide.\n\n")); diff -r 1664b63fef32 -r 54258108784b usr/src/cmd/ztest/ztest.c --- a/usr/src/cmd/ztest/ztest.c Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/cmd/ztest/ztest.c Fri May 21 17:29:22 2010 -0700 @@ -94,6 +94,7 @@ #include #include #include +#include #include #include #include @@ -3136,9 +3137,9 @@ fatal(0, "dmu_objset_create(%s) = %d", clone2name, error); } - error = dsl_dataset_own(snap1name, B_FALSE, FTAG, &ds); + error = dsl_dataset_own(snap2name, B_FALSE, FTAG, &ds); if (error) - fatal(0, "dsl_dataset_own(%s) = %d", snap1name, error); + fatal(0, "dsl_dataset_own(%s) = %d", snap2name, error); error = dsl_dataset_promote(clone2name, NULL); if (error != EBUSY) fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, diff -r 1664b63fef32 -r 54258108784b usr/src/grub/capability --- a/usr/src/grub/capability Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/grub/capability Fri May 21 17:29:22 2010 -0700 @@ -39,7 +39,7 @@ # This file and the associated version are Solaris specific and are # not a part of the open source distribution of GRUB. # -VERSION=17 +VERSION=18 dboot xVM zfs diff -r 1664b63fef32 -r 54258108784b usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h --- a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h Fri May 21 17:29:22 2010 -0700 @@ -26,7 +26,7 @@ /* * On-disk version number. */ -#define SPA_VERSION 25ULL +#define SPA_VERSION 26ULL /* * The following are configuration names used in the nvlist describing a pool's diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/Makefile.files --- a/usr/src/uts/common/Makefile.files Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/Makefile.files Fri May 21 17:29:22 2010 -0700 @@ -1327,6 +1327,7 @@ ZFS_COMMON_OBJS += \ arc.o \ bplist.o \ + bpobj.o \ dbuf.o \ ddt.o \ ddt_zap.o \ @@ -1340,6 +1341,7 @@ dnode_sync.o \ dsl_dir.o \ dsl_dataset.o \ + dsl_deadlist.o \ dsl_pool.o \ dsl_synctask.o \ dmu_zfetch.o \ diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/bplist.c --- a/usr/src/uts/common/fs/zfs/bplist.c Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/bplist.c Fri May 21 17:29:22 2010 -0700 @@ -25,353 +25,45 @@ #include #include -void -bplist_init(bplist_t *bpl) -{ - bzero(bpl, sizeof (*bpl)); - mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&bpl->bpl_q_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&bpl->bpl_queue, sizeof (bplist_q_t), - offsetof(bplist_q_t, bpq_node)); -} void -bplist_fini(bplist_t *bpl) -{ - ASSERT(list_is_empty(&bpl->bpl_queue)); - list_destroy(&bpl->bpl_queue); - mutex_destroy(&bpl->bpl_q_lock); - mutex_destroy(&bpl->bpl_lock); -} - -static int -bplist_hold(bplist_t *bpl) +bplist_create(bplist_t *bpl) { - ASSERT(MUTEX_HELD(&bpl->bpl_lock)); - if (bpl->bpl_dbuf == NULL) { - int err = dmu_bonus_hold(bpl->bpl_mos, - bpl->bpl_object, bpl, &bpl->bpl_dbuf); - if (err) - return (err); - bpl->bpl_phys = bpl->bpl_dbuf->db_data; - } - return (0); -} - -uint64_t -bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx) -{ - int size; - - size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ? - BPLIST_SIZE_V0 : sizeof (bplist_phys_t); - - return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize, - DMU_OT_BPLIST_HDR, size, tx)); + mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&bpl->bpl_list, sizeof (bplist_entry_t), + offsetof(bplist_entry_t, bpe_node)); } void -bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx) -{ - VERIFY(dmu_object_free(mos, object, tx) == 0); -} - -int -bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object) +bplist_destroy(bplist_t *bpl) { - dmu_object_info_t doi; - int err; - - err = dmu_object_info(mos, object, &doi); - if (err) - return (err); - - mutex_enter(&bpl->bpl_lock); - - ASSERT(bpl->bpl_dbuf == NULL); - ASSERT(bpl->bpl_phys == NULL); - ASSERT(bpl->bpl_cached_dbuf == NULL); - ASSERT(list_is_empty(&bpl->bpl_queue)); - ASSERT(object != 0); - ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST); - ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR); - - bpl->bpl_mos = mos; - bpl->bpl_object = object; - bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1); - bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT; - bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t)); - - mutex_exit(&bpl->bpl_lock); - return (0); + list_destroy(&bpl->bpl_list); + mutex_destroy(&bpl->bpl_lock); } void -bplist_close(bplist_t *bpl) +bplist_append(bplist_t *bpl, const blkptr_t *bp) { - mutex_enter(&bpl->bpl_lock); - - ASSERT(list_is_empty(&bpl->bpl_queue)); + bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP); - if (bpl->bpl_cached_dbuf) { - dmu_buf_rele(bpl->bpl_cached_dbuf, bpl); - bpl->bpl_cached_dbuf = NULL; - } - if (bpl->bpl_dbuf) { - dmu_buf_rele(bpl->bpl_dbuf, bpl); - bpl->bpl_dbuf = NULL; - bpl->bpl_phys = NULL; - } - + mutex_enter(&bpl->bpl_lock); + bpe->bpe_blk = *bp; + list_insert_tail(&bpl->bpl_list, bpe); mutex_exit(&bpl->bpl_lock); } -boolean_t -bplist_empty(bplist_t *bpl) -{ - boolean_t rv; - - if (bpl->bpl_object == 0) - return (B_TRUE); - - mutex_enter(&bpl->bpl_lock); - VERIFY(0 == bplist_hold(bpl)); /* XXX */ - rv = (bpl->bpl_phys->bpl_entries == 0); - mutex_exit(&bpl->bpl_lock); - - return (rv); -} - -static int -bplist_cache(bplist_t *bpl, uint64_t blkid) +void +bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) { - int err = 0; - - if (bpl->bpl_cached_dbuf == NULL || - bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) { - if (bpl->bpl_cached_dbuf != NULL) - dmu_buf_rele(bpl->bpl_cached_dbuf, bpl); - err = dmu_buf_hold(bpl->bpl_mos, - bpl->bpl_object, blkid << bpl->bpl_blockshift, - bpl, &bpl->bpl_cached_dbuf, DMU_READ_PREFETCH); - ASSERT(err || bpl->bpl_cached_dbuf->db_size == - 1ULL << bpl->bpl_blockshift); - } - return (err); -} - -int -bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp) -{ - uint64_t blk, off; - blkptr_t *bparray; - int err; + bplist_entry_t *bpe; mutex_enter(&bpl->bpl_lock); - - err = bplist_hold(bpl); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } - - do { - if (*itorp >= bpl->bpl_phys->bpl_entries) { - mutex_exit(&bpl->bpl_lock); - return (ENOENT); - } - - blk = *itorp >> bpl->bpl_bpshift; - off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift); - - err = bplist_cache(bpl, blk); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } - - bparray = bpl->bpl_cached_dbuf->db_data; - *bp = bparray[off]; - (*itorp)++; - } while (bp->blk_birth == 0); - - mutex_exit(&bpl->bpl_lock); - return (0); -} - -int -bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx) -{ - uint64_t blk, off; - blkptr_t *bparray; - int err; - - ASSERT(!BP_IS_HOLE(bp)); - mutex_enter(&bpl->bpl_lock); - err = bplist_hold(bpl); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } - - blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift; - off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift); - - err = bplist_cache(bpl, blk); - if (err) { + while (bpe = list_head(&bpl->bpl_list)) { + list_remove(&bpl->bpl_list, bpe); mutex_exit(&bpl->bpl_lock); - return (err); - } - - dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx); - bparray = bpl->bpl_cached_dbuf->db_data; - bparray[off] = *bp; - - /* We never need the fill count. */ - bparray[off].blk_fill = 0; - - /* The bplist will compress better if we can leave off the checksum */ - if (!BP_GET_DEDUP(&bparray[off])) - bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum)); - - dmu_buf_will_dirty(bpl->bpl_dbuf, tx); - bpl->bpl_phys->bpl_entries++; - bpl->bpl_phys->bpl_bytes += - bp_get_dsize_sync(dmu_objset_spa(bpl->bpl_mos), bp); - if (bpl->bpl_havecomp) { - bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp); - bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp); - } - mutex_exit(&bpl->bpl_lock); - - return (0); -} - -void -bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx) -{ - VERIFY(bplist_enqueue(bpl, bp, tx) == 0); -} - -/* - * Deferred entry; will be processed later by bplist_sync(). - */ -void -bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp) -{ - bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP); - - ASSERT(!BP_IS_HOLE(bp)); - mutex_enter(&bpl->bpl_q_lock); - bpq->bpq_blk = *bp; - list_insert_tail(&bpl->bpl_queue, bpq); - mutex_exit(&bpl->bpl_q_lock); -} - -void -bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func, void *arg, dmu_tx_t *tx) -{ - bplist_q_t *bpq; - - mutex_enter(&bpl->bpl_q_lock); - while (bpq = list_head(&bpl->bpl_queue)) { - list_remove(&bpl->bpl_queue, bpq); - mutex_exit(&bpl->bpl_q_lock); - func(arg, &bpq->bpq_blk, tx); - kmem_free(bpq, sizeof (*bpq)); - mutex_enter(&bpl->bpl_q_lock); - } - mutex_exit(&bpl->bpl_q_lock); -} - -void -bplist_vacate(bplist_t *bpl, dmu_tx_t *tx) -{ - mutex_enter(&bpl->bpl_lock); - ASSERT(list_is_empty(&bpl->bpl_queue)); - VERIFY(0 == bplist_hold(bpl)); - dmu_buf_will_dirty(bpl->bpl_dbuf, tx); - VERIFY(0 == dmu_free_range(bpl->bpl_mos, - bpl->bpl_object, 0, -1ULL, tx)); - bpl->bpl_phys->bpl_entries = 0; - bpl->bpl_phys->bpl_bytes = 0; - if (bpl->bpl_havecomp) { - bpl->bpl_phys->bpl_comp = 0; - bpl->bpl_phys->bpl_uncomp = 0; + func(arg, &bpe->bpe_blk, tx); + kmem_free(bpe, sizeof (*bpe)); + mutex_enter(&bpl->bpl_lock); } mutex_exit(&bpl->bpl_lock); } - -int -bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - int err; - - mutex_enter(&bpl->bpl_lock); - - err = bplist_hold(bpl); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } - - *usedp = bpl->bpl_phys->bpl_bytes; - if (bpl->bpl_havecomp) { - *compp = bpl->bpl_phys->bpl_comp; - *uncompp = bpl->bpl_phys->bpl_uncomp; - } - mutex_exit(&bpl->bpl_lock); - - if (!bpl->bpl_havecomp) { - uint64_t itor = 0, comp = 0, uncomp = 0; - blkptr_t bp; - - while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { - comp += BP_GET_PSIZE(&bp); - uncomp += BP_GET_UCSIZE(&bp); - } - if (err == ENOENT) - err = 0; - *compp = comp; - *uncompp = uncomp; - } - - return (err); -} - -/* - * Return (in *dsizep) the amount of space on the deadlist which is: - * mintxg < blk_birth <= maxtxg - */ -int -bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg, - uint64_t *dsizep) -{ - uint64_t size = 0; - uint64_t itor = 0; - blkptr_t bp; - int err; - - /* - * As an optimization, if they want the whole txg range, just - * get bpl_bytes rather than iterating over the bps. - */ - if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) { - mutex_enter(&bpl->bpl_lock); - err = bplist_hold(bpl); - if (err == 0) - *dsizep = bpl->bpl_phys->bpl_bytes; - mutex_exit(&bpl->bpl_lock); - return (err); - } - - while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { - if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) { - size += bp_get_dsize(dmu_objset_spa(bpl->bpl_mos), &bp); - } - } - if (err == ENOENT) - err = 0; - *dsizep = size; - return (err); -} diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/bpobj.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/fs/zfs/bpobj.c Fri May 21 17:29:22 2010 -0700 @@ -0,0 +1,462 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include + +uint64_t +bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) +{ + int size; + + if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) + size = BPOBJ_SIZE_V0; + else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) + size = BPOBJ_SIZE_V1; + else + size = sizeof (bpobj_phys_t); + + return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, + DMU_OT_BPOBJ_HDR, size, tx)); +} + +void +bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) +{ + int64_t i; + bpobj_t bpo; + dmu_object_info_t doi; + int epb; + dmu_buf_t *dbuf = NULL; + + VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); + + mutex_enter(&bpo.bpo_lock); + + if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) + goto out; + + VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); + epb = doi.doi_data_block_size / sizeof (uint64_t); + + for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { + uint64_t *objarray; + uint64_t offset, blkoff; + + offset = i * sizeof (uint64_t); + blkoff = P2PHASE(i, epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + VERIFY3U(0, ==, dmu_buf_hold(os, + bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + objarray = dbuf->db_data; + bpobj_free(os, objarray[blkoff], tx); + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); + +out: + mutex_exit(&bpo.bpo_lock); + bpobj_close(&bpo); + + VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); +} + +int +bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) +{ + dmu_object_info_t doi; + int err; + + err = dmu_object_info(os, object, &doi); + if (err) + return (err); + + bzero(bpo, sizeof (*bpo)); + mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); + + ASSERT(bpo->bpo_dbuf == NULL); + ASSERT(bpo->bpo_phys == NULL); + ASSERT(object != 0); + ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); + + bpo->bpo_os = os; + bpo->bpo_object = object; + bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; + bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); + bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); + + err = dmu_bonus_hold(bpo->bpo_os, + bpo->bpo_object, bpo, &bpo->bpo_dbuf); + if (err) + return (err); + bpo->bpo_phys = bpo->bpo_dbuf->db_data; + return (0); +} + +void +bpobj_close(bpobj_t *bpo) +{ + /* Lame workaround for closing a bpobj that was never opened. */ + if (bpo->bpo_object == 0) + return; + + dmu_buf_rele(bpo->bpo_dbuf, bpo); + if (bpo->bpo_cached_dbuf != NULL) + dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); + bpo->bpo_dbuf = NULL; + bpo->bpo_phys = NULL; + bpo->bpo_cached_dbuf = NULL; + + mutex_destroy(&bpo->bpo_lock); +} + +static int +bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, + boolean_t free) +{ + dmu_object_info_t doi; + int epb; + int64_t i; + int err = 0; + dmu_buf_t *dbuf = NULL; + + mutex_enter(&bpo->bpo_lock); + + if (free) + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + + for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { + blkptr_t *bparray; + blkptr_t *bp; + uint64_t offset, blkoff; + + offset = i * sizeof (blkptr_t); + blkoff = P2PHASE(i, bpo->bpo_epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, + FTAG, &dbuf, 0); + if (err) + break; + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + bparray = dbuf->db_data; + bp = &bparray[blkoff]; + err = func(arg, bp, tx); + if (err) + break; + if (free) { + bpo->bpo_phys->bpo_bytes -= + bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); + ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); + if (bpo->bpo_havecomp) { + bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp); + bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp); + } + bpo->bpo_phys->bpo_num_blkptrs--; + ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); + } + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + if (free) { + i++; + VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object, + i * sizeof (blkptr_t), -1ULL, tx)); + } + if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0) + goto out; + + ASSERT(bpo->bpo_havecomp); + err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi); + if (err) + return (err); + epb = doi.doi_data_block_size / sizeof (uint64_t); + + for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { + uint64_t *objarray; + uint64_t offset, blkoff; + bpobj_t sublist; + uint64_t used_before, comp_before, uncomp_before; + uint64_t used_after, comp_after, uncomp_after; + + offset = i * sizeof (uint64_t); + blkoff = P2PHASE(i, epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + err = dmu_buf_hold(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0); + if (err) + break; + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + objarray = dbuf->db_data; + err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); + if (err) + break; + if (free) { + err = bpobj_space(&sublist, + &used_before, &comp_before, &uncomp_before); + if (err) + break; + } + err = bpobj_iterate_impl(&sublist, func, arg, tx, free); + if (free) { + VERIFY3U(0, ==, bpobj_space(&sublist, + &used_after, &comp_after, &uncomp_after)); + bpo->bpo_phys->bpo_bytes -= used_before - used_after; + ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); + bpo->bpo_phys->bpo_comp -= comp_before - used_after; + bpo->bpo_phys->bpo_uncomp -= + uncomp_before - uncomp_after; + } + + bpobj_close(&sublist); + if (err) + break; + if (free) { + err = dmu_object_free(bpo->bpo_os, + objarray[blkoff], tx); + if (err) + break; + bpo->bpo_phys->bpo_num_subobjs--; + ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0); + } + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + if (free) { + VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, + (i + 1) * sizeof (uint64_t), -1ULL, tx)); + } + +out: + /* If there are no entries, there should be no bytes. */ + ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 || + (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) || + bpo->bpo_phys->bpo_bytes == 0); + + mutex_exit(&bpo->bpo_lock); + return (err); +} + +/* + * Iterate and remove the entries. If func returns nonzero, iteration + * will stop and that entry will not be removed. + */ +int +bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) +{ + return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE)); +} + +/* + * Iterate the entries. If func returns nonzero, iteration will stop. + */ +int +bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) +{ + return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); +} + +void +bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) +{ + bpobj_t subbpo; + uint64_t used, comp, uncomp; + + ASSERT(bpo->bpo_havesubobj); + ASSERT(bpo->bpo_havecomp); + + VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); + VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); + bpobj_close(&subbpo); + + if (used == 0) { + /* No point in having an empty subobj. */ + bpobj_free(bpo->bpo_os, subobj, tx); + return; + } + + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + if (bpo->bpo_phys->bpo_subobjs == 0) { + bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, + DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); + } + + mutex_enter(&bpo->bpo_lock); + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + sizeof (subobj), &subobj, tx); + bpo->bpo_phys->bpo_num_subobjs++; + bpo->bpo_phys->bpo_bytes += used; + bpo->bpo_phys->bpo_comp += comp; + bpo->bpo_phys->bpo_uncomp += uncomp; + mutex_exit(&bpo->bpo_lock); +} + +void +bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) +{ + blkptr_t stored_bp = *bp; + uint64_t offset; + int blkoff; + blkptr_t *bparray; + + ASSERT(!BP_IS_HOLE(bp)); + + /* We never need the fill count. */ + stored_bp.blk_fill = 0; + + /* The bpobj will compress better if we can leave off the checksum */ + if (!BP_GET_DEDUP(bp)) + bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); + + mutex_enter(&bpo->bpo_lock); + + offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); + blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); + + if (bpo->bpo_cached_dbuf == NULL || + offset < bpo->bpo_cached_dbuf->db_offset || + offset >= bpo->bpo_cached_dbuf->db_offset + + bpo->bpo_cached_dbuf->db_size) { + if (bpo->bpo_cached_dbuf) + dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); + VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, + offset, bpo, &bpo->bpo_cached_dbuf, 0)); + } + + dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); + bparray = bpo->bpo_cached_dbuf->db_data; + bparray[blkoff] = stored_bp; + + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + bpo->bpo_phys->bpo_num_blkptrs++; + bpo->bpo_phys->bpo_bytes += + bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); + if (bpo->bpo_havecomp) { + bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp); + bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp); + } + mutex_exit(&bpo->bpo_lock); +} + +struct space_range_arg { + spa_t *spa; + uint64_t mintxg; + uint64_t maxtxg; + uint64_t used; + uint64_t comp; + uint64_t uncomp; +}; + +/* ARGSUSED */ +static int +space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + struct space_range_arg *sra = arg; + + if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { + sra->used += bp_get_dsize_sync(sra->spa, bp); + sra->comp += BP_GET_PSIZE(bp); + sra->uncomp += BP_GET_UCSIZE(bp); + } + return (0); +} + +int +bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + mutex_enter(&bpo->bpo_lock); + + *usedp = bpo->bpo_phys->bpo_bytes; + if (bpo->bpo_havecomp) { + *compp = bpo->bpo_phys->bpo_comp; + *uncompp = bpo->bpo_phys->bpo_uncomp; + mutex_exit(&bpo->bpo_lock); + return (0); + } else { + mutex_exit(&bpo->bpo_lock); + return (bpobj_space_range(bpo, 0, UINT64_MAX, + usedp, compp, uncompp)); + } +} + +/* + * Return the amount of space in the bpobj which is: + * mintxg < blk_birth <= maxtxg + */ +int +bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + struct space_range_arg sra = { 0 }; + int err; + + /* + * As an optimization, if they want the whole txg range, just + * get bpo_bytes rather than iterating over the bps. + */ + if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) + return (bpobj_space(bpo, usedp, compp, uncompp)); + + sra.spa = dmu_objset_spa(bpo->bpo_os); + sra.mintxg = mintxg; + sra.maxtxg = maxtxg; + + err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); + *usedp = sra.used; + *compp = sra.comp; + *uncompp = sra.uncomp; + return (err); +} diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/dmu.c --- a/usr/src/uts/common/fs/zfs/dmu.c Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu.c Fri May 21 17:29:22 2010 -0700 @@ -51,8 +51,8 @@ { byteswap_uint64_array, TRUE, "object array" }, { byteswap_uint8_array, TRUE, "packed nvlist" }, { byteswap_uint64_array, TRUE, "packed nvlist size" }, - { byteswap_uint64_array, TRUE, "bplist" }, - { byteswap_uint64_array, TRUE, "bplist header" }, + { byteswap_uint64_array, TRUE, "bpobj" }, + { byteswap_uint64_array, TRUE, "bpobj header" }, { byteswap_uint64_array, TRUE, "SPA space map header" }, { byteswap_uint64_array, TRUE, "SPA space map" }, { byteswap_uint64_array, TRUE, "ZIL intent log" }, @@ -96,6 +96,10 @@ { zap_byteswap, TRUE, "SA attr layouts" }, { zap_byteswap, TRUE, "scan translations" }, { byteswap_uint8_array, FALSE, "deduplicated block" }, + { zap_byteswap, TRUE, "DSL deadlist map" }, + { byteswap_uint64_array, TRUE, "DSL deadlist map hdr" }, + { zap_byteswap, TRUE, "DSL dir clones" }, + { byteswap_uint64_array, TRUE, "bpobj subobj" }, }; int diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/dmu_send.c --- a/usr/src/uts/common/fs/zfs/dmu_send.c Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu_send.c Fri May 21 17:29:22 2010 -0700 @@ -39,6 +39,7 @@ #include #include #include +#include #include #include diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/dsl_dataset.c --- a/usr/src/uts/common/fs/zfs/dsl_dataset.c Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c Fri May 21 17:29:22 2010 -0700 @@ -39,6 +39,7 @@ #include #include #include +#include /* * Enable/disable prefetching of dedup-ed blocks which are going to be freed. @@ -51,6 +52,13 @@ static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; static dsl_syncfunc_t dsl_dataset_set_reservation_sync; +#define SWITCH64(x, y) \ + { \ + uint64_t __tmp = (x); \ + (x) = (y); \ + (y) = __tmp; \ + } + #define DS_REF_MAX (1ULL << 62) #define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE @@ -178,13 +186,13 @@ /* * We are here as part of zio's write done callback, * which means we're a zio interrupt thread. We can't - * call bplist_enqueue() now because it may block + * call dsl_deadlist_insert() now because it may block * waiting for I/O. Instead, put bp on the deferred * queue and let dsl_pool_sync() finish the job. */ - bplist_enqueue_deferred(&ds->ds_deadlist, bp); + bplist_append(&ds->ds_pending_deadlist, bp); } else { - VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); + dsl_deadlist_insert(&ds->ds_deadlist, bp, tx); } ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj); @@ -269,7 +277,13 @@ ds->ds_prev = NULL; } - bplist_close(&ds->ds_deadlist); + bplist_destroy(&ds->ds_pending_deadlist); + if (db != NULL) { + dsl_deadlist_close(&ds->ds_deadlist); + } else { + ASSERT(ds->ds_deadlist.dl_dbuf == NULL); + ASSERT(!ds->ds_deadlist.dl_oldfmt); + } if (ds->ds_dir) dsl_dir_close(ds->ds_dir, ds); @@ -280,7 +294,6 @@ mutex_destroy(&ds->ds_opening_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); - bplist_fini(&ds->ds_deadlist); kmem_free(ds, sizeof (dsl_dataset_t)); } @@ -380,25 +393,23 @@ mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&ds->ds_rwlock, 0, 0, 0); cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); - bplist_init(&ds->ds_deadlist); - - err = bplist_open(&ds->ds_deadlist, + + bplist_create(&ds->ds_pending_deadlist); + dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + if (err == 0) { err = dsl_dir_open_obj(dp, ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); } if (err) { - /* - * we don't really need to close the blist if we - * just opened it. - */ mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); - bplist_fini(&ds->ds_deadlist); + bplist_destroy(&ds->ds_pending_deadlist); + dsl_deadlist_close(&ds->ds_deadlist); kmem_free(ds, sizeof (dsl_dataset_t)); dmu_buf_rele(dbuf, tag); return (err); @@ -455,7 +466,8 @@ dsl_dataset_evict); } if (err || winner) { - bplist_close(&ds->ds_deadlist); + bplist_destroy(&ds->ds_pending_deadlist); + dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_prev) dsl_dataset_drop_ref(ds->ds_prev, ds); dsl_dir_close(ds->ds_dir, ds); @@ -464,7 +476,6 @@ mutex_destroy(&ds->ds_opening_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); - bplist_fini(&ds->ds_deadlist); kmem_free(ds, sizeof (dsl_dataset_t)); if (err) { dmu_buf_rele(dbuf, tag); @@ -726,7 +737,7 @@ if (ds->ds_dbuf) dsl_dataset_drop_ref(ds, tag); else - dsl_dataset_evict(ds->ds_dbuf, ds); + dsl_dataset_evict(NULL, ds); } boolean_t @@ -788,10 +799,12 @@ DMU_OT_NONE, 0, tx); dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg; - dsphys->ds_deadlist_obj = - bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - - if (origin) { + + if (origin == NULL) { + dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); + } else { + dsl_dataset_t *ohds; + dsphys->ds_prev_snap_obj = origin->ds_object; dsphys->ds_prev_snap_txg = origin->ds_phys->ds_creation_txg; @@ -807,6 +820,12 @@ dmu_buf_will_dirty(origin->ds_dbuf, tx); origin->ds_phys->ds_num_children++; + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, + origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds)); + dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, + dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); + dsl_dataset_rele(ohds, FTAG); + if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { if (origin->ds_phys->ds_next_clones_obj == 0) { origin->ds_phys->ds_next_clones_obj = @@ -820,6 +839,16 @@ dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_origin_obj = origin->ds_object; + if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + if (origin->ds_dir->dd_phys->dd_clones == 0) { + dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); + origin->ds_dir->dd_phys->dd_clones = + zap_create(mos, + DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY3U(0, ==, zap_add_int(mos, + origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); + } } if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) @@ -1201,8 +1230,7 @@ else mrs_used = 0; - VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp, - &dluncomp)); + dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); ASSERT3U(dlused, <=, mrs_used); ds->ds_phys->ds_unique_bytes = @@ -1462,6 +1490,103 @@ ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2); } +static void +dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + zap_cursor_t zc; + zap_attribute_t za; + + /* + * If it is the old version, dd_clones doesn't exist so we can't + * find the clones, but deadlist_remove_key() is a no-op so it + * doesn't matter. + */ + if (ds->ds_dir->dd_phys->dd_clones == 0) + return; + + for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_dataset_t *clone; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool, + za.za_first_integer, FTAG, &clone)); + if (clone->ds_dir->dd_origin_txg > mintxg) { + dsl_deadlist_remove_key(&clone->ds_deadlist, + mintxg, tx); + dsl_dataset_remove_clones_key(clone, mintxg, tx); + } + dsl_dataset_rele(clone, FTAG); + } + zap_cursor_fini(&zc); +} + +struct process_old_arg { + dsl_dataset_t *ds; + dsl_dataset_t *ds_prev; + boolean_t after_branch_point; + zio_t *pio; + uint64_t used, comp, uncomp; +}; + +static int +process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + struct process_old_arg *poa = arg; + dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; + + if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { + dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); + if (poa->ds_prev && !poa->after_branch_point && + bp->blk_birth > + poa->ds_prev->ds_phys->ds_prev_snap_txg) { + poa->ds_prev->ds_phys->ds_unique_bytes += + bp_get_dsize_sync(dp->dp_spa, bp); + } + } else { + poa->used += bp_get_dsize_sync(dp->dp_spa, bp); + poa->comp += BP_GET_PSIZE(bp); + poa->uncomp += BP_GET_UCSIZE(bp); + dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); + } + return (0); +} + +static void +process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, + dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) +{ + struct process_old_arg poa = { 0 }; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + + ASSERT(ds->ds_deadlist.dl_oldfmt); + ASSERT(ds_next->ds_deadlist.dl_oldfmt); + + poa.ds = ds; + poa.ds_prev = ds_prev; + poa.after_branch_point = after_branch_point; + poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, + process_old_cb, &poa, tx)); + VERIFY3U(zio_wait(poa.pio), ==, 0); + ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes); + + /* change snapused */ + dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -poa.used, -poa.comp, -poa.uncomp, tx); + + /* swap next's deadlist to our deadlist */ + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_close(&ds_next->ds_deadlist); + SWITCH64(ds_next->ds_phys->ds_deadlist_obj, + ds->ds_phys->ds_deadlist_obj); + dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + dsl_deadlist_open(&ds_next->ds_deadlist, mos, + ds_next->ds_phys->ds_deadlist_obj); +} + void dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) { @@ -1566,12 +1691,9 @@ } if (dsl_dataset_is_snapshot(ds)) { - blkptr_t bp; - zio_t *pio; dsl_dataset_t *ds_next; - uint64_t itor = 0; uint64_t old_unique; - int64_t used = 0, compressed = 0, uncompressed = 0; + uint64_t used = 0, comp = 0, uncomp = 0; VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); @@ -1587,53 +1709,49 @@ ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); - pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - - /* - * Transfer to our deadlist (which will become next's - * new deadlist) any entries from next's current - * deadlist which were born before prev, and free the - * other entries. - * - * XXX we're doing this long task with the config lock held - */ - while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) { - if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) { - VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, - &bp, tx)); - if (ds_prev && !after_branch_point && - bp.blk_birth > - ds_prev->ds_phys->ds_prev_snap_txg) { - ds_prev->ds_phys->ds_unique_bytes += - bp_get_dsize_sync(dp->dp_spa, &bp); - } - } else { - used += bp_get_dsize_sync(dp->dp_spa, &bp); - compressed += BP_GET_PSIZE(&bp); - uncompressed += BP_GET_UCSIZE(&bp); - dsl_free_sync(pio, dp, tx->tx_txg, &bp); + + if (ds_next->ds_deadlist.dl_oldfmt) { + process_old_deadlist(ds, ds_prev, ds_next, + after_branch_point, tx); + } else { + /* Adjust prev's unique space. */ + if (ds_prev && !after_branch_point) { + dsl_deadlist_space_range(&ds_next->ds_deadlist, + ds_prev->ds_phys->ds_prev_snap_txg, + ds->ds_phys->ds_prev_snap_txg, + &used, &comp, &uncomp); + ds_prev->ds_phys->ds_unique_bytes += used; } + + /* Adjust snapused. */ + dsl_deadlist_space_range(&ds_next->ds_deadlist, + ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &used, &comp, &uncomp); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -used, -comp, -uncomp, tx); + + /* Move blocks to be freed to pool's free list. */ + dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, + &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg, + tx); + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, + DD_USED_HEAD, used, comp, uncomp, tx); + dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx); + + /* Merge our deadlist into next's and free it. */ + dsl_deadlist_merge(&ds_next->ds_deadlist, + ds->ds_phys->ds_deadlist_obj, tx); } - VERIFY3U(zio_wait(pio), ==, 0); - ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes); - - /* change snapused */ - dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, - -used, -compressed, -uncompressed, tx); - - /* free next's deadlist */ - bplist_close(&ds_next->ds_deadlist); - bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx); - - /* set next's deadlist to our deadlist */ - bplist_close(&ds->ds_deadlist); - ds_next->ds_phys->ds_deadlist_obj = - ds->ds_phys->ds_deadlist_obj; - VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos, - ds_next->ds_phys->ds_deadlist_obj)); - ds->ds_phys->ds_deadlist_obj = 0; + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); + + /* Collapse range in clone heads */ + dsl_dataset_remove_clones_key(ds, + ds->ds_phys->ds_creation_txg, tx); if (dsl_dataset_is_snapshot(ds_next)) { + dsl_dataset_t *ds_nextnext; + /* * Update next's unique to include blocks which * were previously shared by only this snapshot @@ -1642,25 +1760,27 @@ * died after the next snap and before the one * after that (ie. be on the snap after next's * deadlist). - * - * XXX we're doing this long task with the - * config lock held */ - dsl_dataset_t *ds_after_next; - uint64_t space; - VERIFY(0 == dsl_dataset_hold_obj(dp, ds_next->ds_phys->ds_next_snap_obj, - FTAG, &ds_after_next)); - - VERIFY(0 == - bplist_space_birthrange(&ds_after_next->ds_deadlist, + FTAG, &ds_nextnext)); + dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, ds->ds_phys->ds_prev_snap_txg, - ds->ds_phys->ds_creation_txg, &space)); - ds_next->ds_phys->ds_unique_bytes += space; - - dsl_dataset_rele(ds_after_next, FTAG); + ds->ds_phys->ds_creation_txg, + &used, &comp, &uncomp); + ds_next->ds_phys->ds_unique_bytes += used; + dsl_dataset_rele(ds_nextnext, FTAG); ASSERT3P(ds_next->ds_prev, ==, NULL); + + /* Collapse range in this head. */ + dsl_dataset_t *hds; + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_head_dataset_obj, + FTAG, &hds)); + dsl_deadlist_remove_key(&hds->ds_deadlist, + ds->ds_phys->ds_creation_txg, tx); + dsl_dataset_rele(hds, FTAG); + } else { ASSERT3P(ds_next->ds_prev, ==, ds); dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); @@ -1700,9 +1820,8 @@ */ struct killarg ka; - ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist)); - bplist_close(&ds->ds_deadlist); - bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); ds->ds_phys->ds_deadlist_obj = 0; /* @@ -1721,6 +1840,11 @@ ds->ds_phys->ds_unique_bytes == 0); if (ds->ds_prev != NULL) { + if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + VERIFY3U(0, ==, zap_remove_int(mos, + ds->ds_prev->ds_dir->dd_phys->dd_clones, + ds->ds_object, tx)); + } dsl_dataset_rele(ds->ds_prev, ds); ds->ds_prev = ds_prev = NULL; } @@ -1935,20 +2059,24 @@ delta, 0, 0, tx); } - bplist_close(&ds->ds_deadlist); dmu_buf_will_dirty(ds->ds_dbuf, tx); + zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu", + ds->ds_dir->dd_myname, snapname, dsobj, + ds->ds_phys->ds_prev_snap_txg); + ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, + UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx); + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + dsl_deadlist_add_key(&ds->ds_deadlist, + ds->ds_phys->ds_prev_snap_txg, tx); + ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg); ds->ds_phys->ds_prev_snap_obj = dsobj; ds->ds_phys->ds_prev_snap_txg = crtxg; ds->ds_phys->ds_unique_bytes = 0; if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; - ds->ds_phys->ds_deadlist_obj = - bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, - ds->ds_phys->ds_deadlist_obj)); - - dprintf("snap '%s' -> obj %llu\n", snapname, dsobj); + err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &dsobj, tx); ASSERT(err == 0); @@ -2357,6 +2485,7 @@ struct promotenode *snap = list_head(&pa->shared_snaps); dsl_dataset_t *origin_ds = snap->ds; int err; + uint64_t unused; /* Check that it is a real clone */ if (!dsl_dir_is_clone(hds->ds_dir)) @@ -2372,10 +2501,9 @@ /* compute origin's new unique space */ snap = list_tail(&pa->clone_snaps); ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); - err = bplist_space_birthrange(&snap->ds->ds_deadlist, - origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique); - if (err) - return (err); + dsl_deadlist_space_range(&snap->ds->ds_deadlist, + origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &pa->unique, &unused, &unused); /* * Walk the snapshots that we are moving @@ -2414,9 +2542,8 @@ if (ds->ds_phys->ds_prev_snap_obj == 0) continue; - if (err = bplist_space(&ds->ds_deadlist, - &dlused, &dlcomp, &dluncomp)) - goto out; + dsl_deadlist_space(&ds->ds_deadlist, + &dlused, &dlcomp, &dluncomp); pa->used += dlused; pa->comp += dlcomp; pa->uncomp += dluncomp; @@ -2450,7 +2577,7 @@ /* * Note, typically this will not be a clone of a clone, * so dd_origin_txg will be < TXG_INITIAL, so - * these snaplist_space() -> bplist_space_birthrange() + * these snaplist_space() -> dsl_deadlist_space_range() * calls will be fast because they do not have to * iterate over all bps. */ @@ -2530,6 +2657,26 @@ origin_head->ds_dir->dd_origin_txg = origin_ds->ds_phys->ds_creation_txg; + /* change dd_clone entries */ + if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + odd->dd_phys->dd_clones, hds->ds_object, tx)); + VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, + pa->origin_origin->ds_dir->dd_phys->dd_clones, + hds->ds_object, tx)); + + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + pa->origin_origin->ds_dir->dd_phys->dd_clones, + origin_head->ds_object, tx)); + if (dd->dd_phys->dd_clones == 0) { + dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset, + DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, + dd->dd_phys->dd_clones, origin_head->ds_object, tx)); + + } + /* move snapshots to this dir */ for (snap = list_head(&pa->shared_snaps); snap; snap = list_next(&pa->shared_snaps, snap)) { @@ -2547,6 +2694,7 @@ VERIFY(0 == zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); + /* change containing dsl_dir */ dmu_buf_will_dirty(ds->ds_dbuf, tx); ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); @@ -2556,6 +2704,40 @@ VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, NULL, ds, &ds->ds_dir)); + /* move any clone references */ + if (ds->ds_phys->ds_next_clones_obj && + spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + zap_cursor_t zc; + zap_attribute_t za; + + for (zap_cursor_init(&zc, dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_dataset_t *cnds; + uint64_t o; + + if (za.za_first_integer == oldnext_obj) { + /* + * We've already moved the + * origin's reference. + */ + continue; + } + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, + za.za_first_integer, FTAG, &cnds)); + o = cnds->ds_dir->dd_phys->dd_head_dataset_obj; + + VERIFY3U(zap_remove_int(dp->dp_meta_objset, + odd->dd_phys->dd_clones, o, tx), ==, 0); + VERIFY3U(zap_add_int(dp->dp_meta_objset, + dd->dd_phys->dd_clones, o, tx), ==, 0); + dsl_dataset_rele(cnds, FTAG); + } + zap_cursor_fini(&zc); + } + ASSERT3U(dsl_prop_numcb(ds), ==, 0); } @@ -2651,11 +2833,9 @@ *spacep = 0; for (snap = list_head(l); snap; snap = list_next(l, snap)) { - uint64_t used; - int err = bplist_space_birthrange(&snap->ds->ds_deadlist, - mintxg, UINT64_MAX, &used); - if (err) - return (err); + uint64_t used, comp, uncomp; + dsl_deadlist_space_range(&snap->ds->ds_deadlist, + mintxg, UINT64_MAX, &used, &comp, &uncomp); *spacep += used; } return (0); @@ -2742,10 +2922,10 @@ if (err != 0) goto out; - if (dsl_dir_is_clone(snap->ds->ds_dir)) { - err = dsl_dataset_own_obj(dp, + if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { + err = dsl_dataset_hold_obj(dp, snap->ds->ds_dir->dd_phys->dd_origin_obj, - 0, FTAG, &pa.origin_origin); + FTAG, &pa.origin_origin); if (err != 0) goto out; } @@ -2770,7 +2950,7 @@ snaplist_destroy(&pa.clone_snaps, B_FALSE); snaplist_destroy(&pa.origin_snaps, B_FALSE); if (pa.origin_origin) - dsl_dataset_disown(pa.origin_origin, FTAG); + dsl_dataset_rele(pa.origin_origin, FTAG); dsl_dataset_rele(ds, FTAG); return (err); } @@ -2860,10 +3040,12 @@ */ if (csa->cds->ds_prev) { dsl_dataset_t *origin = csa->cds->ds_prev; + uint64_t comp, uncomp; + dmu_buf_will_dirty(origin->ds_dbuf, tx); - VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, + dsl_deadlist_space_range(&csa->cds->ds_deadlist, origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, - &origin->ds_phys->ds_unique_bytes)); + &origin->ds_phys->ds_unique_bytes, &comp, &uncomp); } /* swap blkptrs */ @@ -2883,10 +3065,10 @@ ASSERT3U(csa->cds->ds_dir->dd_phys-> dd_used_breakdown[DD_USED_SNAP], ==, 0); - VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used, - &cdl_comp, &cdl_uncomp)); - VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used, - &odl_comp, &odl_uncomp)); + dsl_deadlist_space(&csa->cds->ds_deadlist, + &cdl_used, &cdl_comp, &cdl_uncomp); + dsl_deadlist_space(&csa->ohds->ds_deadlist, + &odl_used, &odl_comp, &odl_uncomp); dused = csa->cds->ds_phys->ds_used_bytes + cdl_used - (csa->ohds->ds_phys->ds_used_bytes + odl_used); @@ -2907,21 +3089,16 @@ * deadlist (since that's the only thing that's * changing that affects the snapused). */ - VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, - csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &cdl_used)); - VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist, - csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &odl_used)); + dsl_deadlist_space_range(&csa->cds->ds_deadlist, + csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, + &cdl_used, &cdl_comp, &cdl_uncomp); + dsl_deadlist_space_range(&csa->ohds->ds_deadlist, + csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, + &odl_used, &odl_comp, &odl_uncomp); dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, DD_USED_HEAD, DD_USED_SNAP, tx); } -#define SWITCH64(x, y) \ - { \ - uint64_t __tmp = (x); \ - (x) = (y); \ - (y) = __tmp; \ - } - /* swap ds_*_bytes */ SWITCH64(csa->ohds->ds_phys->ds_used_bytes, csa->cds->ds_phys->ds_used_bytes); @@ -2936,15 +3113,17 @@ dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, csa->unused_refres_delta, 0, 0, tx); - /* swap deadlists */ - bplist_close(&csa->cds->ds_deadlist); - bplist_close(&csa->ohds->ds_deadlist); + /* + * Swap deadlists. + */ + dsl_deadlist_close(&csa->cds->ds_deadlist); + dsl_deadlist_close(&csa->ohds->ds_deadlist); SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, csa->cds->ds_phys->ds_deadlist_obj); - VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, - csa->cds->ds_phys->ds_deadlist_obj)); - VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, - csa->ohds->ds_phys->ds_deadlist_obj)); + dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, + csa->cds->ds_phys->ds_deadlist_obj); + dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, + csa->ohds->ds_phys->ds_deadlist_obj); dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); } diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/dsl_deadlist.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/fs/zfs/dsl_deadlist.c Fri May 21 17:29:22 2010 -0700 @@ -0,0 +1,474 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +static int +dsl_deadlist_compare(const void *arg1, const void *arg2) +{ + const dsl_deadlist_entry_t *dle1 = arg1; + const dsl_deadlist_entry_t *dle2 = arg2; + + if (dle1->dle_mintxg < dle2->dle_mintxg) + return (-1); + else if (dle1->dle_mintxg > dle2->dle_mintxg) + return (+1); + else + return (0); +} + +static void +dsl_deadlist_load_tree(dsl_deadlist_t *dl) +{ + zap_cursor_t zc; + zap_attribute_t za; + + ASSERT(!dl->dl_oldfmt); + if (dl->dl_havetree) + return; + + avl_create(&dl->dl_tree, dsl_deadlist_compare, + sizeof (dsl_deadlist_entry_t), + offsetof(dsl_deadlist_entry_t, dle_node)); + for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); + dle->dle_mintxg = strtonum(za.za_name, NULL); + VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, + za.za_first_integer)); + avl_add(&dl->dl_tree, dle); + } + zap_cursor_fini(&zc); + dl->dl_havetree = B_TRUE; +} + +void +dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) +{ + dmu_object_info_t doi; + + mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); + dl->dl_os = os; + dl->dl_object = object; + VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); + dmu_object_info_from_db(dl->dl_dbuf, &doi); + if (doi.doi_type == DMU_OT_BPOBJ) { + dmu_buf_rele(dl->dl_dbuf, dl); + dl->dl_dbuf = NULL; + dl->dl_oldfmt = B_TRUE; + VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); + return; + } + + dl->dl_oldfmt = B_FALSE; + dl->dl_phys = dl->dl_dbuf->db_data; + dl->dl_havetree = B_FALSE; +} + +void +dsl_deadlist_close(dsl_deadlist_t *dl) +{ + void *cookie = NULL; + dsl_deadlist_entry_t *dle; + + if (dl->dl_oldfmt) { + dl->dl_oldfmt = B_FALSE; + bpobj_close(&dl->dl_bpobj); + return; + } + + if (dl->dl_havetree) { + while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) + != NULL) { + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + } + avl_destroy(&dl->dl_tree); + } + dmu_buf_rele(dl->dl_dbuf, dl); + mutex_destroy(&dl->dl_lock); + dl->dl_dbuf = NULL; + dl->dl_phys = NULL; +} + +uint64_t +dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) +{ + if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) + return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx)); + return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, + sizeof (dsl_deadlist_phys_t), tx)); +} + +void +dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) +{ + dmu_object_info_t doi; + zap_cursor_t zc; + zap_attribute_t za; + + VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); + if (doi.doi_type == DMU_OT_BPOBJ) { + bpobj_free(os, dlobj, tx); + return; + } + + for (zap_cursor_init(&zc, os, dlobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) + bpobj_free(os, za.za_first_integer, tx); + zap_cursor_fini(&zc); + VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); +} + +void +dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + + if (dl->dl_oldfmt) { + bpobj_enqueue(&dl->dl_bpobj, bp, tx); + return; + } + + dsl_deadlist_load_tree(dl); + + dmu_buf_will_dirty(dl->dl_dbuf, tx); + mutex_enter(&dl->dl_lock); + dl->dl_phys->dl_used += + bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); + dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); + dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); + mutex_exit(&dl->dl_lock); + + dle_tofind.dle_mintxg = bp->blk_birth; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); + else + dle = AVL_PREV(&dl->dl_tree, dle); + bpobj_enqueue(&dle->dle_bpobj, bp, tx); +} + +/* + * Insert new key in deadlist, which must be > all current entries. + * mintxg is not inclusive. + */ +void +dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) +{ + uint64_t obj; + dsl_deadlist_entry_t *dle; + + if (dl->dl_oldfmt) + return; + + dsl_deadlist_load_tree(dl); + + dle = kmem_alloc(sizeof (*dle), KM_SLEEP); + dle->dle_mintxg = mintxg; + obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx); + VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + avl_add(&dl->dl_tree, dle); + + VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object, + mintxg, obj, tx)); +} + +/* + * Remove this key, merging its entries into the previous key. + */ +void +dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle, *dle_prev; + + if (dl->dl_oldfmt) + return; + + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = mintxg; + dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); + dle_prev = AVL_PREV(&dl->dl_tree, dle); + + bpobj_enqueue_subobj(&dle_prev->dle_bpobj, + dle->dle_bpobj.bpo_object, tx); + + avl_remove(&dl->dl_tree, dle); + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + + VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); +} + +/* + * Walk ds's snapshots to regenerate generate ZAP & AVL. + */ +static void +dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, + uint64_t mrs_obj, dmu_tx_t *tx) +{ + dsl_deadlist_t dl; + dsl_pool_t *dp = dmu_objset_pool(os); + + dsl_deadlist_open(&dl, os, dlobj); + if (dl.dl_oldfmt) { + dsl_deadlist_close(&dl); + return; + } + + while (mrs_obj != 0) { + dsl_dataset_t *ds; + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); + dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx); + mrs_obj = ds->ds_phys->ds_prev_snap_obj; + dsl_dataset_rele(ds, FTAG); + } + dsl_deadlist_close(&dl); +} + +uint64_t +dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, + uint64_t mrs_obj, dmu_tx_t *tx) +{ + dsl_deadlist_entry_t *dle; + uint64_t newobj; + + newobj = dsl_deadlist_alloc(dl->dl_os, tx); + + if (dl->dl_oldfmt) { + dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); + return (newobj); + } + + dsl_deadlist_load_tree(dl); + + for (dle = avl_first(&dl->dl_tree); dle; + dle = AVL_NEXT(&dl->dl_tree, dle)) { + uint64_t obj; + + if (dle->dle_mintxg >= maxtxg) + break; + + obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx); + VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, + dle->dle_mintxg, obj, tx)); + } + return (newobj); +} + +void +dsl_deadlist_space(dsl_deadlist_t *dl, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + if (dl->dl_oldfmt) { + VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, + usedp, compp, uncompp)); + return; + } + + mutex_enter(&dl->dl_lock); + *usedp = dl->dl_phys->dl_used; + *compp = dl->dl_phys->dl_comp; + *uncompp = dl->dl_phys->dl_uncomp; + mutex_exit(&dl->dl_lock); +} + +/* + * return space used in the range (mintxg, maxtxg]. + * Includes maxtxg, does not include mintxg. + * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is + * UINT64_MAX). + */ +void +dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + + if (dl->dl_oldfmt) { + VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj, + mintxg, maxtxg, usedp, compp, uncompp)); + return; + } + + dsl_deadlist_load_tree(dl); + *usedp = *compp = *uncompp = 0; + + dle_tofind.dle_mintxg = mintxg; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + /* + * If we don't find this mintxg, there shouldn't be anything + * after it either. + */ + ASSERT(dle != NULL || + avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); + for (; dle && dle->dle_mintxg < maxtxg; + dle = AVL_NEXT(&dl->dl_tree, dle)) { + uint64_t used, comp, uncomp; + + VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, + &used, &comp, &uncomp)); + + *usedp += used; + *compp += comp; + *uncompp += uncomp; + } +} + +static void +dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, + dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + uint64_t used, comp, uncomp; + bpobj_t bpo; + + VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); + VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp)); + bpobj_close(&bpo); + + dsl_deadlist_load_tree(dl); + + dmu_buf_will_dirty(dl->dl_dbuf, tx); + mutex_enter(&dl->dl_lock); + dl->dl_phys->dl_used += used; + dl->dl_phys->dl_comp += comp; + dl->dl_phys->dl_uncomp += uncomp; + mutex_exit(&dl->dl_lock); + + dle_tofind.dle_mintxg = birth; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); + bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); +} + +static int +dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_t *dl = arg; + dsl_deadlist_insert(dl, bp, tx); + return (0); +} + +/* + * Merge the deadlist pointed to by 'obj' into dl. obj will be left as + * an empty deadlist. + */ +void +dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + dmu_buf_t *bonus; + dsl_deadlist_phys_t *dlp; + dmu_object_info_t doi; + + VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi)); + if (doi.doi_type == DMU_OT_BPOBJ) { + bpobj_t bpo; + VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); + VERIFY3U(0, ==, bpobj_iterate(&bpo, + dsl_deadlist_insert_cb, dl, tx)); + bpobj_close(&bpo); + return; + } + + for (zap_cursor_init(&zc, dl->dl_os, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t mintxg = strtonum(za.za_name, NULL); + dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); + VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx)); + } + zap_cursor_fini(&zc); + + VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); + dlp = bonus->db_data; + dmu_buf_will_dirty(bonus, tx); + bzero(dlp, sizeof (*dlp)); + dmu_buf_rele(bonus, FTAG); +} + +/* + * Remove entries on dl that are >= mintxg, and put them on the bpobj. + */ +void +dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, + dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + + ASSERT(!dl->dl_oldfmt); + dmu_buf_will_dirty(dl->dl_dbuf, tx); + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = mintxg; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); + while (dle) { + uint64_t used, comp, uncomp; + dsl_deadlist_entry_t *dle_next; + + bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); + + VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, + &used, &comp, &uncomp)); + mutex_enter(&dl->dl_lock); + ASSERT3U(dl->dl_phys->dl_used, >=, used); + ASSERT3U(dl->dl_phys->dl_comp, >=, comp); + ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); + dl->dl_phys->dl_used -= used; + dl->dl_phys->dl_comp -= comp; + dl->dl_phys->dl_uncomp -= uncomp; + mutex_exit(&dl->dl_lock); + + VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, + dle->dle_mintxg, tx)); + + dle_next = AVL_NEXT(&dl->dl_tree, dle); + avl_remove(&dl->dl_tree, dle); + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + dle = dle_next; + } +} diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/dsl_dir.c --- a/usr/src/uts/common/fs/zfs/dsl_dir.c Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c Fri May 21 17:29:22 2010 -0700 @@ -412,7 +412,7 @@ { objset_t *mos = dp->dp_meta_objset; uint64_t ddobj; - dsl_dir_phys_t *dsphys; + dsl_dir_phys_t *ddphys; dmu_buf_t *dbuf; ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, @@ -427,17 +427,17 @@ } VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; + ddphys = dbuf->db_data; - dsphys->dd_creation_time = gethrestime_sec(); + ddphys->dd_creation_time = gethrestime_sec(); if (pds) - dsphys->dd_parent_obj = pds->dd_object; - dsphys->dd_props_zapobj = zap_create(mos, + ddphys->dd_parent_obj = pds->dd_object; + ddphys->dd_props_zapobj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); - dsphys->dd_child_dir_zapobj = zap_create(mos, + ddphys->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) - dsphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; + ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; dmu_buf_rele(dbuf, FTAG); return (ddobj); diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/dsl_pool.c --- a/usr/src/uts/common/fs/zfs/dsl_pool.c Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c Fri May 21 17:29:22 2010 -0700 @@ -38,6 +38,7 @@ #include #include #include +#include int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ @@ -104,6 +105,7 @@ dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); dsl_dir_t *dd; dsl_dataset_t *ds; + uint64_t obj; rw_enter(&dp->dp_config_rwlock, RW_WRITER); err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, @@ -143,6 +145,20 @@ goto out; } + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, + &dp->dp_free_dir); + if (err) + goto out; + + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj); + if (err) + goto out; + VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, + dp->dp_meta_objset, obj)); + } + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj); @@ -177,9 +193,13 @@ dsl_dataset_drop_ref(dp->dp_origin_snap, dp); if (dp->dp_mos_dir) dsl_dir_close(dp->dp_mos_dir, dp); + if (dp->dp_free_dir) + dsl_dir_close(dp->dp_free_dir, dp); if (dp->dp_root_dir) dsl_dir_close(dp->dp_root_dir, dp); + bpobj_close(&dp->dp_free_bpobj); + /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ if (dp->dp_meta_objset) dmu_objset_evict(dp->dp_meta_objset); @@ -208,7 +228,7 @@ dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); objset_t *os; dsl_dataset_t *ds; - uint64_t dsobj; + uint64_t obj; /* create and open the MOS (meta-objset) */ dp->dp_meta_objset = dmu_objset_create_impl(spa, @@ -232,14 +252,29 @@ VERIFY(0 == dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir)); + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + /* create and open the free dir */ + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, + FREE_DIR_NAME, tx); + VERIFY(0 == dsl_pool_open_special_dir(dp, + FREE_DIR_NAME, &dp->dp_free_dir)); + + /* create and open the free_bplist */ + obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx); + VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); + VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, + dp->dp_meta_objset, obj)); + } + if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) dsl_pool_create_origin(dp, tx); /* create the root dataset */ - dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); + obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); /* create the root objset */ - VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); os = dmu_objset_create_impl(dp->dp_spa, ds, dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); #ifdef _KERNEL @@ -252,6 +287,14 @@ return (dp); } +static int +deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_t *dl = arg; + dsl_deadlist_insert(dl, bp, tx); + return (0); +} + void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) { @@ -315,13 +358,14 @@ err = zio_wait(zio); /* - * If anything was added to a deadlist during a zio done callback, - * it had to be put on the deferred queue. Enqueue it for real now. + * Move dead blocks from the pending deadlist to the on-disk + * deadlist. */ for (ds = list_head(&dp->dp_synced_datasets); ds; - ds = list_next(&dp->dp_synced_datasets, ds)) - bplist_sync(&ds->ds_deadlist, - bplist_enqueue_cb, &ds->ds_deadlist, tx); + ds = list_next(&dp->dp_synced_datasets, ds)) { + bplist_iterate(&ds->ds_pending_deadlist, + deadlist_enqueue_cb, &ds->ds_deadlist, tx); + } while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) { /* @@ -612,6 +656,65 @@ tx, DS_FIND_CHILDREN)); } +/* ARGSUSED */ +static int +upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +{ + dmu_tx_t *tx = arg; + dsl_dataset_t *ds; + dsl_pool_t *dp = spa_get_dsl(spa); + objset_t *mos = dp->dp_meta_objset; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + + if (ds->ds_dir->dd_phys->dd_origin_obj) { + dsl_dataset_t *origin; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin)); + + if (origin->ds_dir->dd_phys->dd_clones == 0) { + dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); + origin->ds_dir->dd_phys->dd_clones = zap_create(mos, + DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); + } + + VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, + origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); + + dsl_dataset_rele(origin, FTAG); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +void +dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + uint64_t obj; + + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); + VERIFY(0 == dsl_pool_open_special_dir(dp, + FREE_DIR_NAME, &dp->dp_free_dir)); + + /* + * We can't use bpobj_alloc(), because spa_version() still + * returns the old version, and we need a new-version bpobj with + * subobj support. So call dmu_object_alloc() directly. + */ + obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, + SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); + VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); + VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj, + dp->dp_meta_objset, obj)); + + VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, + upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN)); +} + void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) { diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/dsl_scan.c --- a/usr/src/uts/common/fs/zfs/dsl_scan.c Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/dsl_scan.c Fri May 21 17:29:22 2010 -0700 @@ -57,6 +57,7 @@ static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx); int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ +int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */ @@ -599,7 +600,7 @@ * XXX need to make sure all of these arc_read() prefetches are * done before setting xlateall (similar to dsl_read()) */ - (void) arc_read(scn->scn_prefetch_zio_root, scn->scn_dp->dp_spa, bp, + (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp, buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, &czb); } @@ -1378,11 +1379,56 @@ zap_cursor_fini(&zc); } +static int +dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg; + uint64_t elapsed_nanosecs; + + elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; + + if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || + (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms && + txg_sync_waiting(scn->scn_dp)) || + spa_shutting_down(scn->scn_dp->dp_spa)) + return (ERESTART); + + zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, + dmu_tx_get_txg(tx), bp, 0)); + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, + -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), + -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); + scn->scn_visited_this_txg++; + return (0); +} + +boolean_t +dsl_scan_active(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + uint64_t used = 0, comp, uncomp; + + if (spa->spa_load_state != SPA_LOAD_NONE) + return (B_FALSE); + if (spa_shutting_down(spa)) + return (B_FALSE); + + if (scn->scn_phys.scn_state == DSS_SCANNING) + return (B_TRUE); + + if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, + &used, &comp, &uncomp); + } + return (used != 0); +} + void dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; + int err; /* * Check for scn_restart_txg before checking spa_load_state, so @@ -1400,10 +1446,9 @@ dsl_scan_setup_sync(scn, &func, tx); } - if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE || - spa_shutting_down(spa) || - spa_sync_pass(dp->dp_spa) > 1 || - scn->scn_phys.scn_state != DSS_SCANNING) + + if (!dsl_scan_active(scn) || + spa_sync_pass(dp->dp_spa) > 1) return; scn->scn_visited_this_txg = 0; @@ -1411,6 +1456,40 @@ scn->scn_sync_start_time = gethrtime(); spa->spa_scrub_active = B_TRUE; + /* + * First process the free list. If we pause the free, don't do + * any scanning. This ensures that there is no free list when + * we are scanning, so the scan code doesn't have to worry about + * traversing it. + */ + if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_MUSTSUCCEED); + err = bpobj_iterate(&dp->dp_free_bpobj, + dsl_scan_free_cb, scn, tx); + VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); + if (scn->scn_visited_this_txg) { + zfs_dbgmsg("freed %llu blocks in %llums from " + "free_bpobj txg %llu", + (longlong_t)scn->scn_visited_this_txg, + (longlong_t) + (gethrtime() - scn->scn_sync_start_time) / MICROSEC, + (longlong_t)tx->tx_txg); + scn->scn_visited_this_txg = 0; + /* + * Re-sync the ddt so that we can further modify + * it when doing bprewrite. + */ + ddt_sync(spa, tx->tx_txg); + } + if (err == ERESTART) + return; + } + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { zfs_dbgmsg("doing scan sync txg %llu; " @@ -1433,11 +1512,11 @@ (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid); } - scn->scn_prefetch_zio_root = zio_root(dp->dp_spa, NULL, + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); dsl_scan_visit(scn, tx); - (void) zio_wait(scn->scn_prefetch_zio_root); - scn->scn_prefetch_zio_root = NULL; + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; zfs_dbgmsg("visited %llu blocks in %llums", (longlong_t)scn->scn_visited_this_txg, diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/spa.c --- a/usr/src/uts/common/fs/zfs/spa.c Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/spa.c Fri May 21 17:29:22 2010 -0700 @@ -943,6 +943,8 @@ spa->spa_async_zio_root = NULL; } + bpobj_close(&spa->spa_deferred_bpobj); + /* * Close the dsl pool. */ @@ -1662,6 +1664,7 @@ uint64_t config_cache_txg = spa->spa_config_txg; int orig_mode = spa->spa_mode; int parse; + uint64_t obj; /* * If this is an untrusted config, access the pool in read-only mode. @@ -1840,8 +1843,10 @@ return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); } - if (spa_dir_prop(spa, DMU_POOL_SYNC_BPLIST, - &spa->spa_deferred_bplist_obj) != 0) + if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); + if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* @@ -2687,7 +2692,7 @@ uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - uint64_t version; + uint64_t version, obj; /* * If this pool already exists, return failure. @@ -2834,20 +2839,20 @@ } /* - * Create the deferred-free bplist object. Turn off compression + * Create the deferred-free bpobj. Turn off compression * because sync-to-convergence takes longer if the blocksize * keeps changing. */ - spa->spa_deferred_bplist_obj = bplist_create(spa->spa_meta_objset, - 1 << 14, tx); - dmu_object_set_compress(spa->spa_meta_objset, - spa->spa_deferred_bplist_obj, ZIO_COMPRESS_OFF, tx); - + obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); + dmu_object_set_compress(spa->spa_meta_objset, obj, + ZIO_COMPRESS_OFF, tx); if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, - sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj, tx) != 0) { - cmn_err(CE_PANIC, "failed to add bplist"); + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, + sizeof (uint64_t), 1, &obj, tx) != 0) { + cmn_err(CE_PANIC, "failed to add bpobj"); } + VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, + spa->spa_meta_objset, obj)); /* * Create the pool's history object. @@ -4946,34 +4951,23 @@ * SPA syncing routines * ========================================================================== */ -static void -spa_sync_deferred_bplist(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx, uint64_t txg) + +static int +bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { - blkptr_t blk; - uint64_t itor = 0; - uint8_t c = 1; - - while (bplist_iterate(bpl, &itor, &blk) == 0) { - ASSERT(blk.blk_birth < txg); - zio_free(spa, txg, &blk); - } - - bplist_vacate(bpl, tx); - - /* - * Pre-dirty the first block so we sync to convergence faster. - * (Usually only the first block is needed.) - */ - dmu_write(bpl->bpl_mos, spa->spa_deferred_bplist_obj, 0, 1, &c, tx); + bpobj_t *bpo = arg; + bpobj_enqueue(bpo, bp, tx); + return (0); } -static void -spa_sync_free(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +static int +spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zio_t *zio = arg; zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, zio->io_flags)); + return (0); } static void @@ -5204,6 +5198,42 @@ } /* + * Perform one-time upgrade on-disk changes. spa_version() does not + * reflect the new version this txg, so there must be no changes this + * txg to anything that the upgrade code depends on after it executes. + * Therefore this must be called after dsl_pool_sync() does the sync + * tasks. + */ +static void +spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + + ASSERT(spa->spa_sync_pass == 1); + + if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && + spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { + dsl_pool_create_origin(dp, tx); + + /* Keeping the origin open increases spa_minref */ + spa->spa_minref += 3; + } + + if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && + spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { + dsl_pool_upgrade_clones(dp, tx); + } + + if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && + spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { + dsl_pool_upgrade_dir_clones(dp, tx); + + /* Keeping the freedir open increases spa_minref */ + spa->spa_minref += 3; + } +} + +/* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. */ @@ -5212,7 +5242,7 @@ { dsl_pool_t *dp = spa->spa_dsl_pool; objset_t *mos = spa->spa_meta_objset; - bplist_t *defer_bpl = &spa->spa_deferred_bplist; + bpobj_t *defer_bpo = &spa->spa_deferred_bpobj; bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; @@ -5251,8 +5281,6 @@ } spa_config_exit(spa, SCL_STATE, FTAG); - VERIFY(0 == bplist_open(defer_bpl, mos, spa->spa_deferred_bplist_obj)); - tx = dmu_tx_create_assigned(dp, txg); /* @@ -5276,19 +5304,6 @@ } } - if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && - spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { - dsl_pool_create_origin(dp, tx); - - /* Keeping the origin open increases spa_minref */ - spa->spa_minref += 3; - } - - if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && - spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { - dsl_pool_upgrade_clones(dp, tx); - } - /* * If anything has changed in this txg, or if someone is waiting * for this txg to sync (eg, spa_vdev_remove()), push the @@ -5299,9 +5314,13 @@ if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || !txg_list_empty(&dp->dp_dirty_dirs, txg) || !txg_list_empty(&dp->dp_sync_tasks, txg) || - ((dp->dp_scan->scn_phys.scn_state == DSS_SCANNING || - txg_sync_waiting(dp)) && !spa_shutting_down(spa))) - spa_sync_deferred_bplist(spa, defer_bpl, tx, txg); + ((dsl_scan_active(dp->dp_scan) || + txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { + zio_t *zio = zio_root(spa, NULL, NULL, 0); + VERIFY3U(bpobj_iterate(defer_bpo, + spa_free_sync_cb, zio, tx), ==, 0); + VERIFY3U(zio_wait(zio), ==, 0); + } /* * Iterate to convergence. @@ -5319,10 +5338,12 @@ if (pass <= SYNC_PASS_DEFERRED_FREE) { zio_t *zio = zio_root(spa, NULL, NULL, 0); - bplist_sync(free_bpl, spa_sync_free, zio, tx); + bplist_iterate(free_bpl, spa_free_sync_cb, + zio, tx); VERIFY(zio_wait(zio) == 0); } else { - bplist_sync(free_bpl, bplist_enqueue_cb, defer_bpl, tx); + bplist_iterate(free_bpl, bpobj_enqueue_cb, + defer_bpo, tx); } ddt_sync(spa, txg); @@ -5331,12 +5352,11 @@ while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) vdev_sync(vd, txg); + if (pass == 1) + spa_sync_upgrades(spa, tx); + } while (dmu_objset_is_dirty(mos, txg)); - ASSERT(list_is_empty(&free_bpl->bpl_queue)); - - bplist_close(defer_bpl); - /* * Rewrite the vdev configuration (which includes the uberblock) * to commit the transaction group. @@ -5423,8 +5443,6 @@ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); - ASSERT(list_is_empty(&defer_bpl->bpl_queue)); - ASSERT(list_is_empty(&free_bpl->bpl_queue)); spa->spa_sync_pass = 0; diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/spa_misc.c --- a/usr/src/uts/common/fs/zfs/spa_misc.c Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/spa_misc.c Fri May 21 17:29:22 2010 -0700 @@ -445,8 +445,7 @@ cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < TXG_SIZE; t++) - bplist_init(&spa->spa_free_bplist[t]); - bplist_init(&spa->spa_deferred_bplist); + bplist_create(&spa->spa_free_bplist[t]); (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); spa->spa_state = POOL_STATE_UNINITIALIZED; @@ -524,8 +523,7 @@ spa_config_lock_destroy(spa); for (int t = 0; t < TXG_SIZE; t++) - bplist_fini(&spa->spa_free_bplist[t]); - bplist_fini(&spa->spa_deferred_bplist); + bplist_destroy(&spa->spa_free_bplist[t]); cv_destroy(&spa->spa_async_cv); cv_destroy(&spa->spa_proc_cv); diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/sys/bplist.h --- a/usr/src/uts/common/fs/zfs/sys/bplist.h Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/bplist.h Fri May 21 17:29:22 2010 -0700 @@ -19,76 +19,36 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_BPLIST_H #define _SYS_BPLIST_H -#include +#include #include -#include -#include -#include #ifdef __cplusplus extern "C" { #endif -typedef struct bplist_phys { - /* - * This is the bonus buffer for the dead lists. The object's - * contents is an array of bpl_entries blkptr_t's, representing - * a total of bpl_bytes physical space. - */ - uint64_t bpl_entries; - uint64_t bpl_bytes; - uint64_t bpl_comp; - uint64_t bpl_uncomp; -} bplist_phys_t; - -#define BPLIST_SIZE_V0 (2 * sizeof (uint64_t)) - -typedef struct bplist_q { - blkptr_t bpq_blk; - list_node_t bpq_node; -} bplist_q_t; +typedef struct bplist_entry { + blkptr_t bpe_blk; + list_node_t bpe_node; +} bplist_entry_t; typedef struct bplist { kmutex_t bpl_lock; - objset_t *bpl_mos; - uint64_t bpl_object; - uint8_t bpl_blockshift; - uint8_t bpl_bpshift; - uint8_t bpl_havecomp; - kmutex_t bpl_q_lock; - list_t bpl_queue; - bplist_phys_t *bpl_phys; - dmu_buf_t *bpl_dbuf; - dmu_buf_t *bpl_cached_dbuf; + list_t bpl_list; } bplist_t; -typedef void bplist_sync_cb_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); +typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); -extern void bplist_init(bplist_t *bpl); -extern void bplist_fini(bplist_t *bpl); -extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx); -extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx); -extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object); -extern void bplist_close(bplist_t *bpl); -extern boolean_t bplist_empty(bplist_t *bpl); -extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp); -extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx); -extern void bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx); -extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp); -extern void bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func, +void bplist_create(bplist_t *bpl); +void bplist_destroy(bplist_t *bpl); +void bplist_append(bplist_t *bpl, const blkptr_t *bp); +void bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx); -extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx); -extern int bplist_space(bplist_t *bpl, - uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); -extern int bplist_space_birthrange(bplist_t *bpl, - uint64_t mintxg, uint64_t maxtxg, uint64_t *dsizep); #ifdef __cplusplus } diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/sys/bpobj.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/fs/zfs/sys/bpobj.h Fri May 21 17:29:22 2010 -0700 @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_BPOBJ_H +#define _SYS_BPOBJ_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct bpobj_phys { + /* + * This is the bonus buffer for the dead lists. The object's + * contents is an array of bpo_entries blkptr_t's, representing + * a total of bpo_bytes physical space. + */ + uint64_t bpo_num_blkptrs; + uint64_t bpo_bytes; + uint64_t bpo_comp; + uint64_t bpo_uncomp; + uint64_t bpo_subobjs; + uint64_t bpo_num_subobjs; +} bpobj_phys_t; + +#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t)) +#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t)) + +typedef struct bpobj { + kmutex_t bpo_lock; + objset_t *bpo_os; + uint64_t bpo_object; + int bpo_epb; + uint8_t bpo_havecomp; + uint8_t bpo_havesubobj; + bpobj_phys_t *bpo_phys; + dmu_buf_t *bpo_dbuf; + dmu_buf_t *bpo_cached_dbuf; +} bpobj_t; + +typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); + +uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx); +void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); + +int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object); +void bpobj_close(bpobj_t *bpo); + +int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx); +int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *); +int bpobj_iterate_dbg(bpobj_t *bpo, uint64_t *itorp, blkptr_t *bp); + +void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx); +void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx); + +int bpobj_space(bpobj_t *bpo, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BPOBJ_H */ diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/sys/dmu.h --- a/usr/src/uts/common/fs/zfs/sys/dmu.h Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h Fri May 21 17:29:22 2010 -0700 @@ -77,8 +77,8 @@ DMU_OT_OBJECT_ARRAY, /* UINT64 */ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ - DMU_OT_BPLIST, /* UINT64 */ - DMU_OT_BPLIST_HDR, /* UINT64 */ + DMU_OT_BPOBJ, /* UINT64 */ + DMU_OT_BPOBJ_HDR, /* UINT64 */ /* spa: */ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ DMU_OT_SPACE_MAP, /* UINT64 */ @@ -130,6 +130,10 @@ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ DMU_OT_SCAN_XLATE, /* ZAP */ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ + DMU_OT_DEADLIST, /* ZAP */ + DMU_OT_DEADLIST_HDR, /* UINT64 */ + DMU_OT_DSL_CLONES, /* ZAP */ + DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ DMU_OT_NUMTYPES } dmu_object_type_t; @@ -211,7 +215,7 @@ #define DMU_POOL_DIRECTORY_OBJECT 1 #define DMU_POOL_CONFIG "config" #define DMU_POOL_ROOT_DATASET "root_dataset" -#define DMU_POOL_SYNC_BPLIST "sync_bplist" +#define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" #define DMU_POOL_ERRLOG_LAST "errlog_last" #define DMU_POOL_SPARES "spares" @@ -224,6 +228,7 @@ #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" +#define DMU_POOL_FREE_BPOBJ "free_bpobj" /* * Allocate an object from this objset. The range of object numbers diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/sys/dsl_dataset.h --- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h Fri May 21 17:29:22 2010 -0700 @@ -32,6 +32,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -82,7 +83,7 @@ uint64_t ds_num_children; /* clone/snap children; ==0 for head */ uint64_t ds_creation_time; /* seconds since 1970 */ uint64_t ds_creation_txg; - uint64_t ds_deadlist_obj; /* DMU_OT_BPLIST */ + uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */ uint64_t ds_used_bytes; uint64_t ds_compressed_bytes; uint64_t ds_uncompressed_bytes; @@ -114,7 +115,8 @@ struct dsl_dataset *ds_prev; /* has internal locking: */ - bplist_t ds_deadlist; + dsl_deadlist_t ds_deadlist; + bplist_t ds_pending_deadlist; /* to protect against multiple concurrent incremental recv */ kmutex_t ds_recvlock; @@ -160,7 +162,7 @@ boolean_t need_prep; /* do we need to retry due to EBUSY? */ }; -#define dsl_dataset_is_snapshot(ds) \ +#define dsl_dataset_is_snapshot(ds) \ ((ds)->ds_phys->ds_num_children != 0) #define DS_UNIQUE_IS_ACCURATE(ds) \ diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/sys/dsl_deadlist.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/fs/zfs/sys/dsl_deadlist.h Fri May 21 17:29:22 2010 -0700 @@ -0,0 +1,87 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_DSL_DEADLIST_H +#define _SYS_DSL_DEADLIST_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct dmu_buf; +struct dsl_dataset; + +typedef struct dsl_deadlist_phys { + uint64_t dl_used; + uint64_t dl_comp; + uint64_t dl_uncomp; + uint64_t dl_pad[37]; /* pad out to 320b for future expansion */ +} dsl_deadlist_phys_t; + +typedef struct dsl_deadlist { + objset_t *dl_os; + uint64_t dl_object; + avl_tree_t dl_tree; + boolean_t dl_havetree; + struct dmu_buf *dl_dbuf; + dsl_deadlist_phys_t *dl_phys; + kmutex_t dl_lock; + + /* if it's the old on-disk format: */ + bpobj_t dl_bpobj; + boolean_t dl_oldfmt; +} dsl_deadlist_t; + +typedef struct dsl_deadlist_entry { + avl_node_t dle_node; + uint64_t dle_mintxg; + bpobj_t dle_bpobj; +} dsl_deadlist_entry_t; + +void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object); +void dsl_deadlist_close(dsl_deadlist_t *dl); +uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx); +void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx); +void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx); +void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); +void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); +uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, + uint64_t mrs_obj, dmu_tx_t *tx); +void dsl_deadlist_space(dsl_deadlist_t *dl, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +void dsl_deadlist_space_range(dsl_deadlist_t *dl, + uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx); +void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, + dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DEADLIST_H */ diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/sys/dsl_dir.h --- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h Fri May 21 17:29:22 2010 -0700 @@ -69,7 +69,8 @@ uint64_t dd_deleg_zapobj; /* dataset delegation permissions */ uint64_t dd_flags; uint64_t dd_used_breakdown[DD_USED_NUM]; - uint64_t dd_pad[14]; /* pad out to 256 bytes for good measure */ + uint64_t dd_clones; /* dsl_dir objects */ + uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */ } dsl_dir_phys_t; struct dsl_dir { @@ -143,6 +144,7 @@ #define MOS_DIR_NAME "$MOS" #define ORIGIN_DIR_NAME "$ORIGIN" #define XLATION_DIR_NAME "$XLATION" +#define FREE_DIR_NAME "$FREE" #ifdef ZFS_DEBUG #define dprintf_dd(dd, fmt, ...) do { \ diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/sys/dsl_pool.h --- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h Fri May 21 17:29:22 2010 -0700 @@ -33,6 +33,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -71,6 +72,7 @@ struct objset *dp_meta_objset; struct dsl_dir *dp_root_dir; struct dsl_dir *dp_mos_dir; + struct dsl_dir *dp_free_dir; struct dsl_dataset *dp_origin_snap; uint64_t dp_root_dir_obj; struct taskq *dp_vnrele_taskq; @@ -82,6 +84,7 @@ uint64_t dp_throughput; /* bytes per millisec */ uint64_t dp_write_limit; uint64_t dp_tmp_userrefs_obj; + bpobj_t dp_free_bpobj; struct dsl_scan *dp_scan; @@ -130,6 +133,7 @@ uint32_t *arc_flags, const zbookmark_t *zb); void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx); taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp); diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/sys/dsl_scan.h --- a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h Fri May 21 17:29:22 2010 -0700 @@ -77,7 +77,7 @@ boolean_t scn_pausing; uint64_t scn_restart_txg; uint64_t scn_sync_start_time; - zio_t *scn_prefetch_zio_root; + zio_t *scn_zio_root; /* for debugging / information */ uint64_t scn_visited_this_txg; @@ -99,6 +99,7 @@ void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, struct dmu_tx *tx); +boolean_t dsl_scan_active(dsl_scan_t *scn); #ifdef __cplusplus } diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/sys/spa.h --- a/usr/src/uts/common/fs/zfs/sys/spa.h Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/spa.h Fri May 21 17:29:22 2010 -0700 @@ -599,7 +599,6 @@ extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); -extern enum zio_checksum spa_dedup_checksum(spa_t *spa); /* Miscellaneous support routines */ extern int spa_rename(const char *oldname, const char *newname); diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/sys/spa_impl.h --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h Fri May 21 17:29:22 2010 -0700 @@ -35,6 +35,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -140,8 +141,7 @@ uint64_t spa_config_object; /* MOS object for pool config */ uint64_t spa_config_generation; /* config generation number */ uint64_t spa_syncing_txg; /* txg currently syncing */ - uint64_t spa_deferred_bplist_obj; /* object for deferred frees */ - bplist_t spa_deferred_bplist; /* deferred-free bplist */ + bpobj_t spa_deferred_bpobj; /* deferred-free bplist */ bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */ uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/sys/zio_checksum.h --- a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h Fri May 21 17:29:22 2010 -0700 @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_ZIO_CHECKSUM_H #define _SYS_ZIO_CHECKSUM_H #include -#include #ifdef __cplusplus extern "C" { @@ -68,6 +66,7 @@ extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, void *data, uint64_t size); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); +extern enum zio_checksum spa_dedup_checksum(spa_t *spa); #ifdef __cplusplus } diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/txg.c --- a/usr/src/uts/common/fs/zfs/txg.c Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/txg.c Fri May 21 17:29:22 2010 -0700 @@ -371,9 +371,7 @@ * us, or we have reached our timeout. */ timer = (delta >= timeout ? 0 : timeout - delta); - while ((dp->dp_scan->scn_phys.scn_state != DSS_SCANNING || - spa_load_state(spa) != SPA_LOAD_NONE || - spa_shutting_down(spa)) && + while (!dsl_scan_active(dp->dp_scan) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && tx->tx_quiesced_txg == 0) { diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/zio.c --- a/usr/src/uts/common/fs/zfs/zio.c Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/zio.c Fri May 21 17:29:22 2010 -0700 @@ -652,7 +652,7 @@ void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { - bplist_enqueue_deferred(&spa->spa_free_bplist[txg & TXG_MASK], bp); + bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } zio_t * diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/fs/zfs/zio_checksum.c --- a/usr/src/uts/common/fs/zfs/zio_checksum.c Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/zio_checksum.c Fri May 21 17:29:22 2010 -0700 @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -28,6 +27,7 @@ #include #include #include +#include /* * Checksum vectors. diff -r 1664b63fef32 -r 54258108784b usr/src/uts/common/sys/fs/zfs.h --- a/usr/src/uts/common/sys/fs/zfs.h Fri May 21 15:05:41 2010 -0700 +++ b/usr/src/uts/common/sys/fs/zfs.h Fri May 21 17:29:22 2010 -0700 @@ -334,14 +334,15 @@ #define SPA_VERSION_23 23ULL #define SPA_VERSION_24 24ULL #define SPA_VERSION_25 25ULL +#define SPA_VERSION_26 26ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*}, * and do the appropriate changes. Also bump the version number in * usr/src/grub/capability. */ -#define SPA_VERSION SPA_VERSION_25 -#define SPA_VERSION_STRING "25" +#define SPA_VERSION SPA_VERSION_26 +#define SPA_VERSION_STRING "26" /* * Symbolic names for the changes that caused a SPA_VERSION switch. @@ -358,7 +359,7 @@ #define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 #define SPA_VERSION_SPARES SPA_VERSION_3 #define SPA_VERSION_RAIDZ2 SPA_VERSION_3 -#define SPA_VERSION_BPLIST_ACCOUNT SPA_VERSION_3 +#define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3 #define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 #define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 #define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4 @@ -388,6 +389,8 @@ #define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 #define SPA_VERSION_SA SPA_VERSION_24 #define SPA_VERSION_SCAN SPA_VERSION_25 +#define SPA_VERSION_DIR_CLONES SPA_VERSION_26 +#define SPA_VERSION_DEADLISTS SPA_VERSION_26 /* * ZPL version - rev'd whenever an incompatible on-disk format change