6347493 tar of 25K empty directory entries in ZFS takes 30+ seconds ...
author ahrens
Thu, 10 Nov 2005 18:43:50 -0800
changeset 885 d925b21dba78
parent 884 662fba6e367d
child 886 26d4f03e059f
6347493 tar of 25K empty directory entries in ZFS takes 30+ seconds ...
6348409 'zfs rename' process hangs after assigning a very long name ...
6348464 a few DMU object type macros are misnamed
usr/src/cmd/zdb/zdb.c
usr/src/uts/common/fs/zfs/dmu_objset.c
usr/src/uts/common/fs/zfs/dsl_dataset.c
usr/src/uts/common/fs/zfs/dsl_dir.c
usr/src/uts/common/fs/zfs/dsl_prop.c
usr/src/uts/common/fs/zfs/sys/dmu.h
usr/src/uts/common/fs/zfs/sys/zap.h
usr/src/uts/common/fs/zfs/sys/zap_impl.h
usr/src/uts/common/fs/zfs/zap.c
usr/src/uts/common/fs/zfs/zap_leaf.c
usr/src/uts/common/fs/zfs/zap_micro.c
usr/src/uts/common/fs/zfs/zfs_dir.c
usr/src/uts/common/fs/zfs/zfs_ioctl.c
usr/src/uts/common/fs/zfs/zfs_vnops.c
--- a/usr/src/cmd/zdb/zdb.c	Thu Nov 10 17:51:31 2005 -0800
+++ b/usr/src/cmd/zdb/zdb.c	Thu Nov 10 18:43:50 2005 -0800
@@ -346,6 +346,7 @@
 		(void) printf("\n");
 		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
 	}
+	zap_cursor_fini(&zc);
 }
 
 static void
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Thu Nov 10 17:51:31 2005 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Thu Nov 10 18:43:50 2005 -0800
@@ -620,7 +620,7 @@
 
 int
 dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
-    uint64_t *id, uint64_t *offp)
+    uint64_t *idp, uint64_t *offp)
 {
 	dsl_dataset_t *ds = os->os->os_dsl_dataset;
 	zap_cursor_t cursor;
@@ -633,16 +633,62 @@
 	    ds->ds_dir->dd_pool->dp_meta_objset,
 	    ds->ds_phys->ds_snapnames_zapobj, *offp);
 
-	if (zap_cursor_retrieve(&cursor, &attr) != 0)
+	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
+		zap_cursor_fini(&cursor);
+		return (ENOENT);
+	}
+
+	if (strlen(attr.za_name) + 1 > namelen) {
+		zap_cursor_fini(&cursor);
+		return (ENAMETOOLONG);
+	}
+
+	(void) strcpy(name, attr.za_name);
+	if (idp)
+		*idp = attr.za_first_integer;
+	zap_cursor_advance(&cursor);
+	*offp = zap_cursor_serialize(&cursor);
+	zap_cursor_fini(&cursor);
+
+	return (0);
+}
+
+int
+dmu_dir_list_next(objset_t *os, int namelen, char *name,
+    uint64_t *idp, uint64_t *offp)
+{
+	dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir;
+	zap_cursor_t cursor;
+	zap_attribute_t attr;
+
+	if (dd->dd_phys->dd_child_dir_zapobj == 0)
 		return (ENOENT);
 
-	if (strlen(attr.za_name) + 1 > namelen)
+	/* there is no next dir on a snapshot! */
+	if (os->os->os_dsl_dataset->ds_object !=
+	    dd->dd_phys->dd_head_dataset_obj)
+		return (ENOENT);
+
+	zap_cursor_init_serialized(&cursor,
+	    dd->dd_pool->dp_meta_objset,
+	    dd->dd_phys->dd_child_dir_zapobj, *offp);
+
+	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
+		zap_cursor_fini(&cursor);
+		return (ENOENT);
+	}
+
+	if (strlen(attr.za_name) + 1 > namelen) {
+		zap_cursor_fini(&cursor);
 		return (ENAMETOOLONG);
+	}
 
 	(void) strcpy(name, attr.za_name);
-	*id = attr.za_first_integer;
+	if (idp)
+		*idp = attr.za_first_integer;
 	zap_cursor_advance(&cursor);
 	*offp = zap_cursor_serialize(&cursor);
+	zap_cursor_fini(&cursor);
 
 	return (0);
 }
@@ -689,6 +735,7 @@
 			dmu_objset_find(child, func, arg, flags);
 			kmem_free(child, MAXPATHLEN);
 		}
+		zap_cursor_fini(&zc);
 	}
 
 	/*
@@ -715,6 +762,7 @@
 			func(child, arg);
 			kmem_free(child, MAXPATHLEN);
 		}
+		zap_cursor_fini(&zc);
 	}
 
 	dsl_dir_close(dd, FTAG);
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c	Thu Nov 10 17:51:31 2005 -0800
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c	Thu Nov 10 18:43:50 2005 -0800
@@ -489,7 +489,7 @@
 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 	    sizeof (dsphys->ds_guid));
 	dsphys->ds_snapnames_zapobj =
-	    zap_create(mos, DMU_OT_DSL_OBJSET_SNAP_MAP, DMU_OT_NONE, 0, tx);
+	    zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = tx->tx_txg;
 	dsphys->ds_deadlist_obj =
@@ -553,7 +553,7 @@
 	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 	    sizeof (dsphys->ds_guid));
 	dsphys->ds_snapnames_zapobj =
-	    zap_create(mos, DMU_OT_DSL_OBJSET_SNAP_MAP, DMU_OT_NONE, 0, tx);
+	    zap_create(mos, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx);
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = tx->tx_txg;
 	dsphys->ds_deadlist_obj =
@@ -944,6 +944,8 @@
 	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) {
 		mutex_exit(&ds->ds_lock);
 		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+		if (drop_lock)
+			rw_exit(&dp->dp_config_rwlock);
 		return (EAGAIN);
 	}
 
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c	Thu Nov 10 17:51:31 2005 -0800
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c	Thu Nov 10 18:43:50 2005 -0800
@@ -305,6 +305,7 @@
 		err = getcomponent(next, buf, &nextnext);
 		if (err) {
 			dsl_dir_close(dd, tag);
+			rw_exit(&dp->dp_config_rwlock);
 			if (openedspa)
 				spa_close(spa, FTAG);
 			return (NULL);
@@ -377,7 +378,7 @@
 	if (pds->dd_phys->dd_child_dir_zapobj == 0) {
 		dmu_buf_will_dirty(pds->dd_dbuf, tx);
 		pds->dd_phys->dd_child_dir_zapobj = zap_create(mos,
-		    DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx);
+		    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
 	}
 
 	rw_enter(&pds->dd_pool->dp_config_rwlock, RW_WRITER);
@@ -405,7 +406,7 @@
 	dsphys->dd_props_zapobj = zap_create(mos,
 	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
 	dsphys->dd_child_dir_zapobj = zap_create(mos,
-	    DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx);
+	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
 	dmu_buf_rele(dbuf);
 
 	rw_exit(&pds->dd_pool->dp_config_rwlock);
@@ -519,7 +520,7 @@
 	dsp->dd_props_zapobj = zap_create(mos,
 	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
 	dsp->dd_child_dir_zapobj = zap_create(mos,
-	    DMU_OT_DSL_DATASET_CHILD_MAP, DMU_OT_NONE, 0, tx);
+	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
 
 	dmu_buf_rele(dbuf);
 }
--- a/usr/src/uts/common/fs/zfs/dsl_prop.c	Thu Nov 10 17:51:31 2005 -0800
+++ b/usr/src/uts/common/fs/zfs/dsl_prop.c	Thu Nov 10 18:43:50 2005 -0800
@@ -294,6 +294,7 @@
 			dsl_prop_changed_notify(dp, za.za_first_integer,
 			    propname, value, FALSE);
 		}
+		zap_cursor_fini(&zc);
 	}
 	dsl_dir_close(dd, FTAG);
 }
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Thu Nov 10 17:51:31 2005 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Thu Nov 10 18:43:50 2005 -0800
@@ -81,8 +81,8 @@
 	DMU_OT_OBJSET,			/* OBJSET */
 	/* dsl: */
 	DMU_OT_DSL_DATASET,		/* UINT64 */
-	DMU_OT_DSL_DATASET_CHILD_MAP,	/* ZAP */
-	DMU_OT_DSL_OBJSET_SNAP_MAP,	/* ZAP */
+	DMU_OT_DSL_DIR_CHILD_MAP,	/* ZAP */
+	DMU_OT_DSL_DS_SNAP_MAP,		/* ZAP */
 	DMU_OT_DSL_PROPS,		/* ZAP */
 	DMU_OT_DSL_OBJSET,		/* UINT64 */
 	/* zpl: */
@@ -586,11 +586,13 @@
 extern uint64_t dmu_objset_id(objset_t *os);
 extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
     uint64_t *id, uint64_t *offp);
+extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
+    uint64_t *idp, uint64_t *offp);
 
 /*
  * Return the txg number for the given assigned transaction.
  */
-uint64_t dmu_tx_get_txg(dmu_tx_t *tx); /* XXX */
+uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
 
 /*
  * Synchronous write.
--- a/usr/src/uts/common/fs/zfs/sys/zap.h	Thu Nov 10 17:51:31 2005 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zap.h	Thu Nov 10 18:43:50 2005 -0800
@@ -200,9 +200,13 @@
  */
 int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, char *name);
 
+struct zap;
+struct zap_leaf;
 typedef struct zap_cursor {
 	/* This structure is opaque! */
 	objset_t *zc_objset;
+	struct zap *zc_zap;
+	struct zap_leaf *zc_leaf;
 	uint64_t zc_zapobj;
 	uint64_t zc_hash;
 	uint32_t zc_cd;
@@ -224,9 +228,10 @@
 
 /*
  * Initialize a zap cursor, pointing to the "first" attribute of the
- * zapobj.
+ * zapobj.  You must call zap_cursor_fini() on the cursor when you are done.
  */
 void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
+void zap_cursor_fini(zap_cursor_t *zc);
 
 /*
  * Get the attribute currently pointed to by the cursor.  Returns
--- a/usr/src/uts/common/fs/zfs/sys/zap_impl.h	Thu Nov 10 17:51:31 2005 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h	Thu Nov 10 18:43:50 2005 -0800
@@ -177,6 +177,7 @@
 int fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx);
 int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za);
 void fzap_get_stats(zap_t *zap, zap_stats_t *zs);
+void zap_put_leaf(struct zap_leaf *l);
 
 int fzap_add_cd(zap_t *zap, const char *name,
     uint64_t integer_size, uint64_t num_integers,
--- a/usr/src/uts/common/fs/zfs/zap.c	Thu Nov 10 17:51:31 2005 -0800
+++ b/usr/src/uts/common/fs/zfs/zap.c	Thu Nov 10 18:43:50 2005 -0800
@@ -55,7 +55,6 @@
 static int zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx);
 static zap_leaf_t *zap_get_leaf_byblk(zap_t *zap, uint64_t blkid,
     dmu_tx_t *tx, krw_t lt);
-static void zap_put_leaf(zap_leaf_t *l);
 static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
 
 
@@ -422,7 +421,7 @@
  * Routines for obtaining zap_leaf_t's
  */
 
-static void
+void
 zap_put_leaf(zap_leaf_t *l)
 {
 	zap_leaf_t *nl = l->l_next;
@@ -893,6 +892,7 @@
 			break;
 		}
 	}
+	zap_cursor_fini(&zc);
 	kmem_free(za, sizeof (zap_attribute_t));
 	return (err);
 }
@@ -912,8 +912,22 @@
 	/* retrieve the next entry at or after zc_hash/zc_cd */
 	/* if no entry, return ENOENT */
 
+	if (zc->zc_leaf &&
+	    (ZAP_HASH_IDX(zc->zc_hash, zc->zc_leaf->lh_prefix_len) !=
+	    zc->zc_leaf->lh_prefix)) {
+		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+		zap_put_leaf(zc->zc_leaf);
+		zc->zc_leaf = NULL;
+	}
+
 again:
-	l = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER);
+	if (zc->zc_leaf == NULL) {
+		zc->zc_leaf = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER);
+	} else {
+		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+	}
+	l = zc->zc_leaf;
+
 	err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
 
 	if (err == ENOENT) {
@@ -923,7 +937,8 @@
 		if (l->lh_prefix_len == 0 || zc->zc_hash == 0) {
 			zc->zc_hash = -1ULL;
 		} else {
-			zap_put_leaf(l);
+			zap_put_leaf(zc->zc_leaf);
+			zc->zc_leaf = NULL;
 			goto again;
 		}
 	}
@@ -943,7 +958,7 @@
 		    sizeof (za->za_name), za->za_name);
 		ASSERT(err == 0);
 	}
-	zap_put_leaf(l);
+	rw_exit(&zc->zc_leaf->l_rwlock);
 	return (err);
 }
 
--- a/usr/src/uts/common/fs/zfs/zap_leaf.c	Thu Nov 10 17:51:31 2005 -0800
+++ b/usr/src/uts/common/fs/zfs/zap_leaf.c	Thu Nov 10 18:43:50 2005 -0800
@@ -314,6 +314,28 @@
 
 	ASSERT3U(array_int_len, <=, buf_int_len);
 
+	/* Fast path for one 8-byte integer */
+	if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
+		struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
+		uint64_t *buf64 = (uint64_t *)buf;
+		uint64_t val = *(uint64_t *)la->la_array;
+		*buf64 = BE_64(val);
+		return;
+	}
+
+	/* Fast path for an array of 1-byte integers (e.g. the entry name) */
+	if (array_int_len == 1 && buf_int_len == 1 &&
+	    buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
+		while (chunk != CHAIN_END) {
+			struct zap_leaf_array *la =
+			    &l->l_phys->l_chunk[chunk].l_array;
+			bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES);
+			buf += ZAP_LEAF_ARRAY_BYTES;
+			chunk = la->la_next;
+		}
+		return;
+	}
+
 	while (len > 0) {
 		struct zap_leaf_array *la = &l->l_phys->l_chunk[chunk].l_array;
 		int i;
@@ -408,15 +430,8 @@
 }
 
 /* Return (h1,cd1 >= h2,cd2) */
-static int
-hcd_gteq(uint64_t h1, uint32_t cd1, uint64_t h2, uint32_t cd2)
-{
-	if (h1 > h2)
-		return (TRUE);
-	if (h1 == h2 && cd1 >= cd2)
-		return (TRUE);
-	return (FALSE);
-}
+#define	HCD_GTEQ(h1, cd1, h2, cd2) \
+	((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
 
 int
 zap_leaf_lookup_closest(zap_leaf_t *l,
@@ -442,8 +457,8 @@
 			ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS);
 			ASSERT3U(le->le_type, ==, ZAP_LEAF_ENTRY);
 
-			if (hcd_gteq(le->le_hash, le->le_cd, h, cd) &&
-			    hcd_gteq(besth, bestcd, le->le_hash, le->le_cd)) {
+			if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
+			    HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
 				ASSERT3U(bestlh, >=, lh);
 				bestlh = lh;
 				besth = le->le_hash;
--- a/usr/src/uts/common/fs/zfs/zap_micro.c	Thu Nov 10 17:51:31 2005 -0800
+++ b/usr/src/uts/common/fs/zfs/zap_micro.c	Thu Nov 10 18:43:50 2005 -0800
@@ -31,6 +31,7 @@
 #include <sys/zfs_context.h>
 #include <sys/zap.h>
 #include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
 #include <sys/avl.h>
 
 
@@ -694,15 +695,6 @@
  * Routines for iterating over the attributes.
  */
 
-void
-zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
-{
-	zc->zc_objset = os;
-	zc->zc_zapobj = zapobj;
-	zc->zc_hash = 0;
-	zc->zc_cd = 0;
-}
-
 /*
  * We want to keep the high 32 bits of the cursor zero if we can, so
  * that 32-bit programs can access this.  So use a small hash value so
@@ -715,6 +707,8 @@
     uint64_t serialized)
 {
 	zc->zc_objset = os;
+	zc->zc_zap = NULL;
+	zc->zc_leaf = NULL;
 	zc->zc_zapobj = zapobj;
 	if (serialized == -1ULL) {
 		zc->zc_hash = -1ULL;
@@ -727,6 +721,28 @@
 	}
 }
 
+void
+zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
+{
+	zap_cursor_init_serialized(zc, os, zapobj, 0);
+}
+
+void
+zap_cursor_fini(zap_cursor_t *zc)
+{
+	if (zc->zc_zap) {
+		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+		zap_unlockdir(zc->zc_zap);
+		zc->zc_zap = NULL;
+	}
+	if (zc->zc_leaf) {
+		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+		zap_put_leaf(zc->zc_leaf);
+		zc->zc_leaf = NULL;
+	}
+	zc->zc_objset = NULL;
+}
+
 uint64_t
 zap_cursor_serialize(zap_cursor_t *zc)
 {
@@ -741,7 +757,6 @@
 int
 zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
 {
-	zap_t *zap;
 	int err;
 	avl_index_t idx;
 	mzap_ent_t mze_tofind;
@@ -750,25 +765,30 @@
 	if (zc->zc_hash == -1ULL)
 		return (ENOENT);
 
-	err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
-	    RW_READER, TRUE, &zap);
-	if (err)
-		return (err);
-	if (!zap->zap_ismicro) {
-		err = fzap_cursor_retrieve(zap, zc, za);
+	if (zc->zc_zap == NULL) {
+		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
+		    RW_READER, TRUE, &zc->zc_zap);
+		if (err)
+			return (err);
+	} else {
+		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+	}
+	if (!zc->zc_zap->zap_ismicro) {
+		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
 	} else {
 		err = ENOENT;
 
 		mze_tofind.mze_hash = zc->zc_hash;
 		mze_tofind.mze_phys.mze_cd = zc->zc_cd;
 
-		mze = avl_find(&zap->zap_m.zap_avl, &mze_tofind, &idx);
+		mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
 		ASSERT(mze == NULL || 0 == bcmp(&mze->mze_phys,
-		    &zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
+		    &zc->zc_zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
 		    sizeof (mze->mze_phys)));
-		if (mze == NULL)
-			mze = avl_nearest(&zap->zap_m.zap_avl, idx, AVL_AFTER);
-
+		if (mze == NULL) {
+			mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
+			    idx, AVL_AFTER);
+		}
 		if (mze) {
 			za->za_integer_length = 8;
 			za->za_num_integers = 1;
@@ -781,7 +801,7 @@
 			zc->zc_hash = -1ULL;
 		}
 	}
-	zap_unlockdir(zap);
+	rw_exit(&zc->zc_zap->zap_rwlock);
 	return (err);
 }
 
--- a/usr/src/uts/common/fs/zfs/zfs_dir.c	Thu Nov 10 17:51:31 2005 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_dir.c	Thu Nov 10 18:43:50 2005 -0800
@@ -337,6 +337,7 @@
 
 		VN_RELE(ZTOV(xzp));
 	}
+	zap_cursor_fini(&zc);
 	ASSERT(error == ENOENT);
 	return (skipped);
 }
@@ -397,6 +398,7 @@
 		VN_RELE(ZTOV(zp));
 		break;
 	}
+	zap_cursor_fini(&zc);
 }
 
 void
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Thu Nov 10 17:51:31 2005 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Thu Nov 10 18:43:50 2005 -0800
@@ -700,19 +700,28 @@
 static int
 zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
 {
-	dsl_dir_t *dd;
-	zap_cursor_t cursor;
-	zap_attribute_t attr;
+	objset_t *os;
 	int error;
 	char *p;
 
-	dd = dsl_dir_open(zc->zc_name, FTAG, NULL);
-	if (dd == NULL)
-		return (ESRCH);
-
-	if (dd->dd_phys->dd_child_dir_zapobj == 0) {
-		dsl_dir_close(dd, FTAG);
-		return (ESRCH);
+retry:
+	error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+	    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
+	if (error != 0) {
+		/*
+		 * This is ugly: dmu_objset_open() can return EBUSY if
+		 * the objset is held exclusively. Fortunately this hold is
+		 * only for a short while, so we retry here.
+		 * This avoids user code having to handle EBUSY,
+		 * for example for a "zfs list".
+		 */
+		if (error == EBUSY) {
+			delay(1);
+			goto retry;
+		}
+		if (error == ENOENT)
+			error = ESRCH;
+		return (error);
 	}
 
 	p = strrchr(zc->zc_name, '/');
@@ -721,103 +730,67 @@
 	p = zc->zc_name + strlen(zc->zc_name);
 
 	do {
-		zap_cursor_init_serialized(&cursor, dd->dd_pool->dp_meta_objset,
-		    dd->dd_phys->dd_child_dir_zapobj, zc->zc_cookie);
-
-		error = zap_cursor_retrieve(&cursor, &attr);
+		error = dmu_dir_list_next(os,
+		    sizeof (zc->zc_name) - (p - zc->zc_name), p,
+		    NULL, &zc->zc_cookie);
 		if (error == ENOENT)
 			error = ESRCH;
-		if (error != 0) {
-			dsl_dir_close(dd, FTAG);
-			*p = '\0';
-			return (error);
-		}
-
-		(void) strlcpy(p, attr.za_name, sizeof (zc->zc_name) -
-		    (p - zc->zc_name));
-
-		zap_cursor_advance(&cursor);
-		zc->zc_cookie = zap_cursor_serialize(&cursor);
-
-	} while (!INGLOBALZONE(curproc) &&
+	} while (error == 0 && !INGLOBALZONE(curproc) &&
 	    !zone_dataset_visible(zc->zc_name, NULL));
 
-	dsl_dir_close(dd, FTAG);
+	/*
+	 * If it's a hidden dataset (i.e. with a '$' in its name), don't
+	 * try to get stats for it.  Userland will skip over it.
+	 */
+	if (error == 0 && strchr(zc->zc_name, '$') == NULL)
+		error = zfs_ioc_objset_stats(zc); /* fill in the stats */
 
-	/*
-	 * If it's a hidden dataset, don't try to get stats for it.
-	 * User land will skip over it.
-	 */
-	if (strchr(zc->zc_name, '$') != NULL)
-		return (0);
-
-	error = zfs_ioc_objset_stats(zc); /* will just fill in the stats */
+	dmu_objset_close(os);
 	return (error);
 }
 
 static int
 zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
 {
-	zap_cursor_t cursor;
-	zap_attribute_t attr;
-	dsl_dataset_t *ds;
+	objset_t *os;
 	int error;
 
 retry:
-	error = dsl_dataset_open(zc->zc_name,
-	    DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds);
-	if (error) {
+	error = dmu_objset_open(zc->zc_name, DMU_OST_ANY,
+	    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
+	if (error != 0) {
 		/*
-		 * This is ugly: dsl_dataset_open() can return EBUSY if
+		 * This is ugly: dmu_objset_open() can return EBUSY if
 		 * the objset is held exclusively. Fortunately this hold is
 		 * only for a short while, so we retry here.
 		 * This avoids user code having to handle EBUSY,
-		 * for example for a "zfs list -s".
+		 * for example for a "zfs list".
 		 */
 		if (error == EBUSY) {
 			delay(1);
 			goto retry;
 		}
 		if (error == ENOENT)
-			return (ESRCH);
+			error = ESRCH;
 		return (error);
 	}
 
-	/*
-	 * If ds_snapnames_zapobj is 0, someone is trying to iterate over
-	 * snapshots of a snapshot.  In this case, pretend that it has no
-	 * snapshots; otherwise zap_cursor_retrieve() will blow up.
-	 */
-	if (ds->ds_phys->ds_snapnames_zapobj == 0) {
-		error = ESRCH;
-		goto out;
+	if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >=
+	    sizeof (zc->zc_name)) {
+		dmu_objset_close(os);
+		return (ENAMETOOLONG);
 	}
 
-	zap_cursor_init_serialized(&cursor,
-	    ds->ds_dir->dd_pool->dp_meta_objset,
-	    ds->ds_phys->ds_snapnames_zapobj, zc->zc_cookie);
-
-	error = zap_cursor_retrieve(&cursor, &attr);
+	error = dmu_snapshot_list_next(os,
+	    sizeof (zc->zc_name) - strlen(zc->zc_name),
+	    zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie);
 	if (error == ENOENT)
 		error = ESRCH;
-	if (error != 0)
-		goto out;
 
-	if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >=
-	    sizeof (zc->zc_name) ||
-	    strlcat(zc->zc_name, attr.za_name, sizeof (zc->zc_name)) >=
-	    sizeof (zc->zc_name)) {
-		error = ENAMETOOLONG;
-		goto out;
-	}
+	if (error == 0)
+		error = zfs_ioc_objset_stats(zc); /* fill in the stats */
 
-	zap_cursor_advance(&cursor);
-	zc->zc_cookie = zap_cursor_serialize(&cursor);
-
-	error = zfs_ioc_objset_stats(zc); /* will just fill in the stats */
-
-out:
-	dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+	dmu_objset_close(os);
 	return (error);
 }
 
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c	Thu Nov 10 17:51:31 2005 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c	Thu Nov 10 18:43:50 2005 -0800
@@ -1778,6 +1778,7 @@
 	}
 
 update:
+	zap_cursor_fini(&zc);
 	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
 		kmem_free(outbuf, bufsize);