6861581 ZFS frees in synching context during rollback
authorMatthew Ahrens <Matthew.Ahrens@Sun.COM>
Thu, 06 Aug 2009 22:16:07 -0700
changeset 10272 a0669934e974
parent 10271 7c80b70bb8de
child 10273 84bfc6d78f00
6861581 ZFS frees in synching context during rollback 6869470 panic from refcount_remove when destroy clone
usr/src/cmd/ztest/ztest.c
usr/src/lib/libzpool/common/kernel.c
usr/src/lib/libzpool/common/sys/zfs_context.h
usr/src/uts/common/fs/zfs/arc.c
usr/src/uts/common/fs/zfs/dmu_objset.c
usr/src/uts/common/fs/zfs/dmu_send.c
usr/src/uts/common/fs/zfs/dsl_dataset.c
usr/src/uts/common/fs/zfs/sys/dmu.h
usr/src/uts/common/fs/zfs/sys/dmu_objset.h
usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
usr/src/uts/common/fs/zfs/zfs_ioctl.c
usr/src/uts/common/fs/zfs/zvol.c
usr/src/uts/common/os/strext.c
usr/src/uts/common/sys/systm.h
--- a/usr/src/cmd/ztest/ztest.c	Thu Aug 06 17:39:39 2009 -0700
+++ b/usr/src/cmd/ztest/ztest.c	Thu Aug 06 22:16:07 2009 -0700
@@ -1477,7 +1477,7 @@
 	/*
 	 * Verify that we can create a new dataset.
 	 */
-	error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0,
+	error = dmu_objset_create(name, DMU_OST_OTHER, 0,
 	    ztest_create_cb, NULL);
 	if (error) {
 		if (error == ENOSPC) {
@@ -1533,7 +1533,7 @@
 	/*
 	 * Verify that we cannot create an existing dataset.
 	 */
-	error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0, NULL, NULL);
+	error = dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL);
 	if (error != EEXIST)
 		fatal(0, "created existing dataset, error = %d", error);
 
@@ -1675,8 +1675,7 @@
 	if (error)
 		fatal(0, "dmu_open_snapshot(%s) = %d", snap1name, error);
 
-	error = dmu_objset_create(clone1name, DMU_OST_OTHER, clone, 0,
-	    NULL, NULL);
+	error = dmu_objset_clone(clone1name, dmu_objset_ds(clone), 0);
 	dmu_objset_close(clone);
 	if (error) {
 		if (error == ENOSPC) {
@@ -1711,8 +1710,7 @@
 	if (error)
 		fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
 
-	error = dmu_objset_create(clone2name, DMU_OST_OTHER, clone, 0,
-	    NULL, NULL);
+	error = dmu_objset_clone(clone2name, dmu_objset_ds(clone), 0);
 	dmu_objset_close(clone);
 	if (error) {
 		if (error == ENOSPC) {
@@ -3796,7 +3794,7 @@
 			int test_future = FALSE;
 			(void) rw_rdlock(&ztest_shared->zs_name_lock);
 			(void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
-			error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0,
+			error = dmu_objset_create(name, DMU_OST_OTHER, 0,
 			    ztest_create_cb, NULL);
 			if (error == EEXIST) {
 				test_future = TRUE;
--- a/usr/src/lib/libzpool/common/kernel.c	Thu Aug 06 17:39:39 2009 -0700
+++ b/usr/src/lib/libzpool/common/kernel.c	Thu Aug 06 22:16:07 2009 -0700
@@ -884,3 +884,27 @@
 	spa_strfree(ksid->kd_name);
 	umem_free(ksid, sizeof (ksiddomain_t));
 }
+
+/*
+ * Do not change the length of the returned string; it must be freed
+ * with strfree().
+ */
+char *
+kmem_asprintf(const char *fmt, ...)
+{
+	int size;
+	va_list adx;
+	char *buf;
+
+	va_start(adx, fmt);
+	size = vsnprintf(NULL, 0, fmt, adx) + 1;
+	va_end(adx);
+
+	buf = kmem_alloc(size, KM_SLEEP);
+
+	va_start(adx, fmt);
+	size = vsnprintf(buf, size, fmt, adx);
+	va_end(adx);
+
+	return (buf);
+}
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h	Thu Aug 06 17:39:39 2009 -0700
+++ b/usr/src/lib/libzpool/common/sys/zfs_context.h	Thu Aug 06 22:16:07 2009 -0700
@@ -490,6 +490,9 @@
 #define	zone_dataset_visible(x, y)	(1)
 #define	INGLOBALZONE(z)			(1)
 
+extern char *kmem_asprintf(const char *fmt, ...);
+#define	strfree(str) kmem_free((str), strlen(str)+1)
+
 /*
  * Hostname information
  */
--- a/usr/src/uts/common/fs/zfs/arc.c	Thu Aug 06 17:39:39 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/arc.c	Thu Aug 06 22:16:07 2009 -0700
@@ -3160,11 +3160,14 @@
 			 * sync-to-convergence, because we remove
 			 * buffers from the hash table when we arc_free().
 			 */
-			ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE);
-			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
-			    BP_IDENTITY(zio->io_bp)));
-			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
-			    zio->io_bp->blk_birth);
+			if (!(zio->io_flags & ZIO_FLAG_IO_REWRITE) ||
+			    !DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
+			    BP_IDENTITY(zio->io_bp)) ||
+			    zio->io_bp_orig.blk_birth !=
+			    zio->io_bp->blk_birth) {
+				panic("bad overwrite, hdr=%p exists=%p",
+				    (void *)hdr, (void *)exists);
+			}
 
 			ASSERT(refcount_is_zero(&exists->b_refcnt));
 			arc_change_state(arc_anon, exists, hash_lock);
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Thu Aug 06 17:39:39 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Thu Aug 06 22:16:07 2009 -0700
@@ -564,7 +564,7 @@
 struct oscarg {
 	void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
 	void *userarg;
-	dsl_dataset_t *clone_parent;
+	dsl_dataset_t *clone_origin;
 	const char *lastname;
 	dmu_objset_type_t type;
 	uint64_t flags;
@@ -585,17 +585,13 @@
 	if (err != ENOENT)
 		return (err ? err : EEXIST);
 
-	if (oa->clone_parent != NULL) {
-		/*
-		 * You can't clone across pools.
-		 */
-		if (oa->clone_parent->ds_dir->dd_pool != dd->dd_pool)
+	if (oa->clone_origin != NULL) {
+		/* You can't clone across pools. */
+		if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool)
 			return (EXDEV);
 
-		/*
-		 * You can only clone snapshots, not the head datasets.
-		 */
-		if (oa->clone_parent->ds_phys->ds_num_children == 0)
+		/* You can only clone snapshots, not the head datasets. */
+		if (!dsl_dataset_is_snapshot(oa->clone_origin))
 			return (EINVAL);
 	}
 
@@ -607,37 +603,37 @@
 {
 	dsl_dir_t *dd = arg1;
 	struct oscarg *oa = arg2;
-	dsl_dataset_t *ds;
-	blkptr_t *bp;
 	uint64_t dsobj;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dsobj = dsl_dataset_create_sync(dd, oa->lastname,
-	    oa->clone_parent, oa->flags, cr, tx);
+	    oa->clone_origin, oa->flags, cr, tx);
 
-	VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, FTAG, &ds));
-	bp = dsl_dataset_get_blkptr(ds);
-	if (BP_IS_HOLE(bp)) {
+	if (oa->clone_origin == NULL) {
+		dsl_dataset_t *ds;
+		blkptr_t *bp;
 		objset_impl_t *osi;
 
-		/* This is an empty dmu_objset; not a clone. */
+		VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj,
+		    FTAG, &ds));
+		bp = dsl_dataset_get_blkptr(ds);
+		ASSERT(BP_IS_HOLE(bp));
+
 		osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
 		    ds, bp, oa->type, tx);
 
 		if (oa->userfunc)
 			oa->userfunc(&osi->os, oa->userarg, cr, tx);
+		dsl_dataset_rele(ds, FTAG);
 	}
 
 	spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa,
 	    tx, cr, "dataset = %llu", dsobj);
-
-	dsl_dataset_rele(ds, FTAG);
 }
 
 int
-dmu_objset_create(const char *name, dmu_objset_type_t type,
-    objset_t *clone_parent, uint64_t flags,
+dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
 {
 	dsl_dir_t *pdd;
@@ -654,24 +650,39 @@
 		return (EEXIST);
 	}
 
-	dprintf("name=%s\n", name);
-
 	oa.userfunc = func;
 	oa.userarg = arg;
 	oa.lastname = tail;
 	oa.type = type;
 	oa.flags = flags;
 
-	if (clone_parent != NULL) {
-		/*
-		 * You can't clone to a different type.
-		 */
-		if (clone_parent->os->os_phys->os_type != type) {
-			dsl_dir_close(pdd, FTAG);
-			return (EINVAL);
-		}
-		oa.clone_parent = clone_parent->os->os_dsl_dataset;
+	err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
+	    dmu_objset_create_sync, pdd, &oa, 5);
+	dsl_dir_close(pdd, FTAG);
+	return (err);
+}
+
+int
+dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags)
+{
+	dsl_dir_t *pdd;
+	const char *tail;
+	int err = 0;
+	struct oscarg oa = { 0 };
+
+	ASSERT(strchr(name, '@') == NULL);
+	err = dsl_dir_open(name, FTAG, &pdd, &tail);
+	if (err)
+		return (err);
+	if (tail == NULL) {
+		dsl_dir_close(pdd, FTAG);
+		return (EEXIST);
 	}
+
+	oa.lastname = tail;
+	oa.clone_origin = clone_origin;
+	oa.flags = flags;
+
 	err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
 	    dmu_objset_create_sync, pdd, &oa, 5);
 	dsl_dir_close(pdd, FTAG);
@@ -685,10 +696,11 @@
 	int error;
 
 	/*
-	 * If it looks like we'll be able to destroy it, and there's
-	 * an unplayed replay log sitting around, destroy the log.
-	 * It would be nicer to do this in dsl_dataset_destroy_sync(),
-	 * but the replay log objset is modified in open context.
+	 * dsl_dataset_destroy() can free any claimed-but-unplayed
+	 * intent log, but if there is an active log, it has blocks that
+	 * are allocated, but may not yet be reflected in the on-disk
+	 * structure.  Only the ZIL knows how to free them, so we have
+	 * to call into it here.
 	 */
 	error = dmu_objset_open(name, DMU_OST_ANY,
 	    DS_MODE_OWNER|DS_MODE_READONLY|DS_MODE_INCONSISTENT, &os);
@@ -697,43 +709,13 @@
 		zil_destroy(dmu_objset_zil(os), B_FALSE);
 
 		error = dsl_dataset_destroy(ds, os, defer);
-		/*
-		 * dsl_dataset_destroy() closes the ds.
-		 */
+		/* dsl_dataset_destroy() closes the ds. */
 		kmem_free(os, sizeof (objset_t));
 	}
 
 	return (error);
 }
 
-/*
- * This will close the objset.
- */
-int
-dmu_objset_rollback(objset_t *os)
-{
-	int err;
-	dsl_dataset_t *ds;
-
-	ds = os->os->os_dsl_dataset;
-
-	if (!dsl_dataset_tryown(ds, TRUE, os)) {
-		dmu_objset_close(os);
-		return (EBUSY);
-	}
-
-	err = dsl_dataset_rollback(ds, os->os->os_phys->os_type);
-
-	/*
-	 * NB: we close the objset manually because the rollback
-	 * actually implicitly called dmu_objset_evict(), thus freeing
-	 * the objset_impl_t.
-	 */
-	dsl_dataset_disown(ds, os);
-	kmem_free(os, sizeof (objset_t));
-	return (err);
-}
-
 struct snaparg {
 	dsl_sync_task_group_t *dstg;
 	char *snapname;
--- a/usr/src/uts/common/fs/zfs/dmu_send.c	Thu Aug 06 17:39:39 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c	Thu Aug 06 22:16:07 2009 -0700
@@ -321,31 +321,9 @@
 	dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
 };
 
-static dsl_dataset_t *
-recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type,
-    cred_t *cr, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds;
-
-	/* This should always work, since we just created it */
-	/* XXX - create should return an owned ds */
-	VERIFY(0 == dsl_dataset_own_obj(dp, dsobj,
-	    DS_MODE_INCONSISTENT, dmu_recv_tag, &ds));
-
-	if (type != DMU_OST_NONE) {
-		(void) dmu_objset_create_impl(dp->dp_spa,
-		    ds, &ds->ds_phys->ds_bp, type, tx);
-	}
-
-	spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC,
-	    dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
-
-	return (ds);
-}
-
 /* ARGSUSED */
 static int
-recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
+recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
 	struct recvbeginsyncarg *rbsa = arg2;
@@ -363,7 +341,7 @@
 		/* make sure it's a snap in the same pool */
 		if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
 			return (EXDEV);
-		if (rbsa->origin->ds_phys->ds_num_children == 0)
+		if (!dsl_dataset_is_snapshot(rbsa->origin))
 			return (EINVAL);
 		if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
 			return (ENODEV);
@@ -373,82 +351,31 @@
 }
 
 static void
-recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
 	struct recvbeginsyncarg *rbsa = arg2;
 	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
 	uint64_t dsobj;
 
+	/* Create and open new dataset. */
 	dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
 	    rbsa->origin, flags, cr, tx);
-
-	rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
-	    rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
-}
-
-static int
-recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	struct recvbeginsyncarg *rbsa = arg2;
-	int err;
-	struct dsl_ds_destroyarg dsda = {0};
-
-	/* must be a head ds */
-	if (ds->ds_phys->ds_next_snap_obj != 0)
-		return (EINVAL);
+	VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
+	    DS_MODE_INCONSISTENT, dmu_recv_tag, &rbsa->ds));
 
-	/* must not be a clone ds */
-	if (dsl_dir_is_clone(ds->ds_dir))
-		return (EINVAL);
-
-	dsda.ds = ds;
-	err = dsl_dataset_destroy_check(&dsda, rbsa->tag, tx);
-	if (err)
-		return (err);
-
-	if (rbsa->origin) {
-		/* make sure it's a snap in the same pool */
-		if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool)
-			return (EXDEV);
-		if (rbsa->origin->ds_phys->ds_num_children == 0)
-			return (EINVAL);
-		if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
-			return (ENODEV);
+	if (rbsa->origin == NULL) {
+		(void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
+		    rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
 	}
 
-	return (0);
-}
-
-static void
-recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	struct recvbeginsyncarg *rbsa = arg2;
-	dsl_dir_t *dd = ds->ds_dir;
-	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
-	uint64_t dsobj;
-	struct dsl_ds_destroyarg dsda = {0};
-
-	/*
-	 * NB: caller must provide an extra hold on the dsl_dir_t, so it
-	 * won't go away when dsl_dataset_destroy_sync() closes the
-	 * dataset.
-	 */
-	dsda.ds = ds;
-	dsl_dataset_destroy_sync(&dsda, rbsa->tag, cr, tx);
-	ASSERT3P(dsda.rm_origin, ==, NULL);
-
-	dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx);
-
-	rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
-	    rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
+	spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC,
+	    dd->dd_pool->dp_spa, tx, cr, "dataset = %lld", dsobj);
 }
 
 /* ARGSUSED */
 static int
-recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
+recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	struct recvbeginsyncarg *rbsa = arg2;
@@ -459,13 +386,17 @@
 	if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
 		return (ETXTBSY);
 
-	/* must already be a snapshot of this fs */
-	if (ds->ds_phys->ds_prev_snap_obj == 0)
-		return (ENODEV);
-
-	/* most recent snapshot must match fromguid */
-	if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid)
-		return (ENODEV);
+	if (rbsa->fromguid) {
+		/* if incremental, most recent snapshot must match fromguid */
+		if (ds->ds_prev == NULL)
+			return (ENODEV);
+		if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid)
+			return (ENODEV);
+	} else {
+		/* if full, most recent snapshot must be $ORIGIN */
+		if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
+			return (ENODEV);
+	}
 
 	/* temporary clone name must not exist */
 	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
@@ -488,26 +419,30 @@
 
 /* ARGSUSED */
 static void
-recv_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ohds = arg1;
 	struct recvbeginsyncarg *rbsa = arg2;
 	dsl_pool_t *dp = ohds->ds_dir->dd_pool;
-	dsl_dataset_t *ods, *cds;
+	dsl_dataset_t *cds;
 	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
 	uint64_t dsobj;
 
-	/* create the temporary clone */
-	VERIFY(0 == dsl_dataset_hold_obj(dp, ohds->ds_phys->ds_prev_snap_obj,
-	    FTAG, &ods));
-	dsobj = dsl_dataset_create_sync(ohds->ds_dir,
-	    rbsa->clonelastname, ods, flags, cr, tx);
-	dsl_dataset_rele(ods, FTAG);
-
-	/* open the temporary clone */
+	/* create and open the temporary clone */
+	dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
+	    ohds->ds_prev, flags, cr, tx);
 	VERIFY(0 == dsl_dataset_own_obj(dp, dsobj,
 	    DS_MODE_INCONSISTENT, dmu_recv_tag, &cds));
 
+	/*
+	 * If we actually created a non-clone, we need to create the
+	 * objset in our new dataset.
+	 */
+	if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
+		(void) dmu_objset_create_impl(dp->dp_spa,
+		    cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
+	}
+
 	/* copy the refquota from the target fs to the clone */
 	if (ohds->ds_quota > 0)
 		dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx);
@@ -528,7 +463,7 @@
 {
 	int err = 0;
 	boolean_t byteswap;
-	struct recvbeginsyncarg rbsa;
+	struct recvbeginsyncarg rbsa = { 0 };
 	uint64_t version;
 	int flags;
 	dsl_dataset_t *ds;
@@ -573,17 +508,17 @@
 	/*
 	 * Process the begin in syncing context.
 	 */
-	if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) {
-		/* incremental receive */
+
+	/* open the dataset we are logically receiving into */
+	err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
+	if (err == 0) {
+		/* target fs already exists; recv into temp clone */
 
-		/* tmp clone name is: tofs/%tosnap" */
-		(void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
-		    "%%%s", tosnap);
-
-		/* open the dataset we are logically receiving into */
-		err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
-		if (err)
-			return (err);
+		/* Can't recv a clone into an existing fs */
+		if (flags & DRR_FLAG_CLONE) {
+			dsl_dataset_rele(ds, dmu_recv_tag);
+			return (EINVAL);
+		}
 
 		/* must not have an incremental recv already in progress */
 		if (!mutex_tryenter(&ds->ds_recvlock)) {
@@ -591,10 +526,12 @@
 			return (EBUSY);
 		}
 
+		/* tmp clone name is: tofs/%tosnap" */
+		(void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
+		    "%%%s", tosnap);
 		rbsa.force = force;
 		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-		    recv_incremental_check,
-		    recv_incremental_sync, ds, &rbsa, 5);
+		    recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
 		if (err) {
 			mutex_exit(&ds->ds_recvlock);
 			dsl_dataset_rele(ds, dmu_recv_tag);
@@ -602,47 +539,36 @@
 		}
 		drc->drc_logical_ds = ds;
 		drc->drc_real_ds = rbsa.ds;
-	} else {
-		/* create new fs -- full backup or clone */
-		dsl_dir_t *dd = NULL;
-		const char *tail;
+	} else if (err == ENOENT) {
+		/* target fs does not exist; must be a full backup or clone */
+		dsl_dataset_t *parent;
+		char *cp;
 
-		err = dsl_dir_open(tofs, FTAG, &dd, &tail);
+		/*
+		 * If it's a non-clone incremental, we are missing the
+		 * target fs, so fail the recv.
+		 */
+		if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
+			return (ENOENT);
+
+		/* Open the parent of tofs */
+		cp = strrchr(tofs, '/');
+		*cp = '\0';
+		err = dsl_dataset_hold(tofs, FTAG, &parent);
+		*cp = '/';
 		if (err)
 			return (err);
-		if (tail == NULL) {
-			if (!force) {
-				dsl_dir_close(dd, FTAG);
-				return (EEXIST);
-			}
 
-			rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
-			err = dsl_dataset_own_obj(dd->dd_pool,
-			    dd->dd_phys->dd_head_dataset_obj,
-			    DS_MODE_INCONSISTENT, FTAG, &ds);
-			rw_exit(&dd->dd_pool->dp_config_rwlock);
-			if (err) {
-				dsl_dir_close(dd, FTAG);
-				return (err);
-			}
-
-			dsl_dataset_make_exclusive(ds, FTAG);
-			err = dsl_sync_task_do(dd->dd_pool,
-			    recv_full_existing_check,
-			    recv_full_existing_sync, ds, &rbsa, 5);
-			dsl_dataset_disown(ds, FTAG);
-		} else {
-			err = dsl_sync_task_do(dd->dd_pool, recv_full_check,
-			    recv_full_sync, dd, &rbsa, 5);
-		}
-		dsl_dir_close(dd, FTAG);
+		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+		    recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
+		dsl_dataset_rele(parent, FTAG);
 		if (err)
 			return (err);
 		drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
 		drc->drc_newfs = B_TRUE;
 	}
 
-	return (0);
+	return (err);
 }
 
 struct restorearg {
@@ -1079,37 +1005,31 @@
 	ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
 }
 
-int
-dmu_recv_end(dmu_recv_cookie_t *drc)
+static int
+dmu_recv_existing_end(dmu_recv_cookie_t *drc)
 {
 	struct recvendsyncarg resa;
 	dsl_dataset_t *ds = drc->drc_logical_ds;
 	int err;
 
 	/*
-	 * XXX hack; seems the ds is still dirty and
-	 * dsl_pool_zil_clean() expects it to have a ds_user_ptr
-	 * (and zil), but clone_swap() can close it.
+	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
+	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
+	 * can close it.
 	 */
 	txg_wait_synced(ds->ds_dir->dd_pool, 0);
 
-	if (ds != drc->drc_real_ds) {
-		/* we are doing an online recv */
-		if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
-			err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
-			    drc->drc_force);
-			if (err)
-				dsl_dataset_disown(ds, dmu_recv_tag);
-		} else {
-			err = EBUSY;
-			dsl_dataset_rele(ds, dmu_recv_tag);
-		}
-		/* dsl_dataset_destroy() will disown the ds */
+	if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
+		err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
+		    drc->drc_force);
+		if (err)
+			goto out;
+	} else {
+		mutex_exit(&ds->ds_recvlock);
+		dsl_dataset_rele(ds, dmu_recv_tag);
 		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
 		    B_FALSE);
-		mutex_exit(&drc->drc_logical_ds->ds_recvlock);
-		if (err)
-			return (err);
+		return (EBUSY);
 	}
 
 	resa.creation_time = drc->drc_drrb->drr_creation_time;
@@ -1119,17 +1039,52 @@
 	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
 	    recv_end_check, recv_end_sync, ds, &resa, 3);
 	if (err) {
-		if (drc->drc_newfs) {
-			ASSERT(ds == drc->drc_real_ds);
-			(void) dsl_dataset_destroy(ds, dmu_recv_tag,
-			    B_FALSE);
-			return (err);
-		} else {
-			(void) dsl_dataset_rollback(ds, DMU_OST_NONE);
-		}
+		/* swap back */
+		(void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
 	}
 
-	/* release the hold from dmu_recv_begin */
+out:
+	mutex_exit(&ds->ds_recvlock);
 	dsl_dataset_disown(ds, dmu_recv_tag);
+	(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
 	return (err);
 }
+
+static int
+dmu_recv_new_end(dmu_recv_cookie_t *drc)
+{
+	struct recvendsyncarg resa;
+	dsl_dataset_t *ds = drc->drc_logical_ds;
+	int err;
+
+	/*
+	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
+	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
+	 * can close it.
+	 */
+	txg_wait_synced(ds->ds_dir->dd_pool, 0);
+
+	resa.creation_time = drc->drc_drrb->drr_creation_time;
+	resa.toguid = drc->drc_drrb->drr_toguid;
+	resa.tosnap = drc->drc_tosnap;
+
+	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+	    recv_end_check, recv_end_sync, ds, &resa, 3);
+	if (err) {
+		/* clean up the fs we just recv'd into */
+		(void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
+	} else {
+		/* release the hold from dmu_recv_begin */
+		dsl_dataset_disown(ds, dmu_recv_tag);
+	}
+	return (err);
+}
+
+int
+dmu_recv_end(dmu_recv_cookie_t *drc)
+{
+	if (drc->drc_logical_ds != drc->drc_real_ds)
+		return (dmu_recv_existing_end(drc));
+	else
+		return (dmu_recv_new_end(drc));
+}
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c	Thu Aug 06 17:39:39 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c	Thu Aug 06 22:16:07 2009 -0700
@@ -45,8 +45,6 @@
 
 static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
-static dsl_checkfunc_t dsl_dataset_rollback_check;
-static dsl_syncfunc_t dsl_dataset_rollback_sync;
 static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
 
 #define	DS_REF_MAX	(1ULL << 62)
@@ -244,8 +242,6 @@
 
 	ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
 
-	dprintf_ds(ds, "evicting %s\n", "");
-
 	unique_remove(ds->ds_fsid_guid);
 
 	if (ds->ds_user_ptr != NULL)
@@ -867,15 +863,11 @@
 	dsl_dataset_t *ds;
 	int err;
 	char *dsname;
-	size_t buflen;
-
-	/* alloc a buffer to hold name@snapname, plus the terminating NULL */
-	buflen = strlen(name) + strlen(da->snapname) + 2;
-	dsname = kmem_alloc(buflen, KM_SLEEP);
-	(void) snprintf(dsname, buflen, "%s@%s", name, da->snapname);
+
+	dsname = kmem_asprintf("%s@%s", name, da->snapname);
 	err = dsl_dataset_own(dsname, DS_MODE_READONLY | DS_MODE_INCONSISTENT,
 	    da->dstg, &ds);
-	kmem_free(dsname, buflen);
+	strfree(dsname);
 	if (err == 0) {
 		struct dsl_ds_destroyarg *dsda;
 
@@ -1181,25 +1173,6 @@
 	return (err);
 }
 
-int
-dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost)
-{
-	int err;
-
-	ASSERT(ds->ds_owner);
-
-	dsl_dataset_make_exclusive(ds, ds->ds_owner);
-	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-	    dsl_dataset_rollback_check, dsl_dataset_rollback_sync,
-	    ds, &ost, 0);
-	/* drop exclusive access */
-	mutex_enter(&ds->ds_lock);
-	rw_exit(&ds->ds_rwlock);
-	cv_broadcast(&ds->ds_exclusive_cv);
-	mutex_exit(&ds->ds_lock);
-	return (err);
-}
-
 void *
 dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
     void *p, dsl_dataset_evict_func_t func)
@@ -1345,145 +1318,6 @@
 
 /* ARGSUSED */
 static int
-dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	dmu_objset_type_t *ost = arg2;
-
-	/*
-	 * We can only roll back to emptyness if it is a ZPL objset.
-	 */
-	if (*ost != DMU_OST_ZFS &&
-	    ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL)
-		return (EINVAL);
-
-	/*
-	 * This must not be a snapshot.
-	 */
-	if (ds->ds_phys->ds_next_snap_obj != 0)
-		return (EINVAL);
-
-	/*
-	 * If we made changes this txg, traverse_dataset won't find
-	 * them.  Try again.
-	 */
-	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
-		return (EAGAIN);
-
-	return (0);
-}
-
-/* ARGSUSED */
-static void
-dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	dmu_objset_type_t *ost = arg2;
-	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-
-	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-
-	if (ds->ds_user_ptr != NULL) {
-		/*
-		 * We need to make sure that the objset_impl_t is reopened after
-		 * we do the rollback, otherwise it will have the wrong
-		 * objset_phys_t.  Normally this would happen when this
-		 * dataset-open is closed, thus causing the
-		 * dataset to be immediately evicted.  But when doing "zfs recv
-		 * -F", we reopen the objset before that, so that there is no
-		 * window where the dataset is closed and inconsistent.
-		 */
-		ds->ds_user_evict_func(ds, ds->ds_user_ptr);
-		ds->ds_user_ptr = NULL;
-	}
-
-	/* Transfer space that was freed since last snap back to the head. */
-	{
-		uint64_t used;
-
-		VERIFY(0 == bplist_space_birthrange(&ds->ds_deadlist,
-		    ds->ds_origin_txg, UINT64_MAX, &used));
-		dsl_dir_transfer_space(ds->ds_dir, used,
-		    DD_USED_SNAP, DD_USED_HEAD, tx);
-	}
-
-	/* Zero out the deadlist. */
-	bplist_close(&ds->ds_deadlist);
-	bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
-	ds->ds_phys->ds_deadlist_obj =
-	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
-	VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
-	    ds->ds_phys->ds_deadlist_obj));
-
-	{
-		/*
-		 * Free blkptrs that we gave birth to - this covers
-		 * claimed but not played log blocks too.
-		 */
-		zio_t *zio;
-		struct killarg ka;
-
-		zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
-		    ZIO_FLAG_MUSTSUCCEED);
-		ka.ds = ds;
-		ka.zio = zio;
-		ka.tx = tx;
-		(void) traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
-		    TRAVERSE_POST, kill_blkptr, &ka);
-		(void) zio_wait(zio);
-	}
-
-	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
-
-	if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) {
-		/* Change our contents to that of the prev snapshot */
-
-		ASSERT3U(ds->ds_prev->ds_object, ==,
-		    ds->ds_phys->ds_prev_snap_obj);
-		ASSERT3U(ds->ds_phys->ds_used_bytes, <=,
-		    ds->ds_prev->ds_phys->ds_used_bytes);
-
-		ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp;
-		ds->ds_phys->ds_used_bytes =
-		    ds->ds_prev->ds_phys->ds_used_bytes;
-		ds->ds_phys->ds_compressed_bytes =
-		    ds->ds_prev->ds_phys->ds_compressed_bytes;
-		ds->ds_phys->ds_uncompressed_bytes =
-		    ds->ds_prev->ds_phys->ds_uncompressed_bytes;
-		ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags;
-
-		if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
-			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
-			ds->ds_prev->ds_phys->ds_unique_bytes = 0;
-		}
-	} else {
-		objset_impl_t *osi;
-
-		ASSERT(*ost != DMU_OST_ZVOL);
-		ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0);
-		ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0);
-		ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0);
-
-		bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t));
-		ds->ds_phys->ds_flags = 0;
-		ds->ds_phys->ds_unique_bytes = 0;
-		if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
-		    SPA_VERSION_UNIQUE_ACCURATE)
-			ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
-
-		osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds,
-		    &ds->ds_phys->ds_bp, *ost, tx);
-#ifdef _KERNEL
-		zfs_create_fs(&osi->os, kcred, NULL, tx);
-#endif
-	}
-
-	spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa,
-	    tx, cr, "dataset = %llu", ds->ds_object);
-}
-
-/* ARGSUSED */
-static int
 dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
@@ -1991,7 +1825,7 @@
 			origin->ds_user_ptr = NULL;
 		}
 
-		dsl_dataset_rele(origin, tag);
+		dsl_dataset_rele(origin, ds);
 		ds->ds_prev = NULL;
 
 		ndsda.ds = origin;
@@ -3002,9 +2836,11 @@
 	if (csa->cds->ds_prev != csa->ohds->ds_prev)
 		return (EINVAL);
 
-	/* cds should be the clone */
-	if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj !=
-	    csa->ohds->ds_object)
+	/* cds should be the clone (unless they are unrelated) */
+	if (csa->cds->ds_prev != NULL &&
+	    csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap &&
+	    csa->ohds->ds_object !=
+	    csa->cds->ds_prev->ds_phys->ds_next_snap_obj)
 		return (EINVAL);
 
 	/* the clone should be a child of the origin */
@@ -3042,7 +2878,6 @@
 
 	dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
 	dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
-	dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx);
 
 	if (csa->cds->ds_user_ptr != NULL) {
 		csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr);
@@ -3055,10 +2890,16 @@
 		csa->ohds->ds_user_ptr = NULL;
 	}
 
-	/* reset origin's unique bytes */
-	VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
-	    csa->cds->ds_prev->ds_phys->ds_prev_snap_txg, UINT64_MAX,
-	    &csa->cds->ds_prev->ds_phys->ds_unique_bytes));
+	/*
+	 * Reset origin's unique bytes, if it exists.
+	 */
+	if (csa->cds->ds_prev) {
+		dsl_dataset_t *origin = csa->cds->ds_prev;
+		dmu_buf_will_dirty(origin->ds_dbuf, tx);
+		VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
+		    origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+		    &origin->ds_phys->ds_unique_bytes));
+	}
 
 	/* swap blkptrs */
 	{
@@ -3144,8 +2985,10 @@
 }
 
 /*
- * Swap 'clone' with its origin head file system.  Used at the end
- * of "online recv" to swizzle the file system to the new version.
+ * Swap 'clone' with its origin head datasets.  Used at the end of "zfs
+ * recv" into an existing fs to swizzle the file system to the new
+ * version, and by "zfs rollback".  Can also be used to swap two
+ * independent head datasets if neither has any snapshots.
  */
 int
 dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
@@ -3481,14 +3324,11 @@
 	dsl_dataset_t *ds;
 	int error;
 	char *name;
-	size_t buflen;
 
 	/* alloc a buffer to hold dsname@snapname plus terminating NULL */
-	buflen = strlen(dsname) + strlen(ha->snapname) + 2;
-	name = kmem_alloc(buflen, KM_SLEEP);
-	(void) snprintf(name, buflen, "%s@%s", dsname, ha->snapname);
+	name = kmem_asprintf("%s@%s", dsname, ha->snapname);
 	error = dsl_dataset_hold(name, ha->dstg, &ds);
-	kmem_free(name, buflen);
+	strfree(name);
 	if (error == 0) {
 		ha->gotone = B_TRUE;
 		dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
@@ -3678,7 +3518,6 @@
 	int error;
 	void *dtag = ha->dstg;
 	char *name;
-	size_t buflen;
 	boolean_t own = B_FALSE;
 	boolean_t might_destroy;
 
@@ -3686,11 +3525,9 @@
 		return (ENAMETOOLONG);
 
 	/* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
-	buflen = strlen(dsname) + strlen(ha->snapname) + 2;
-	name = kmem_alloc(buflen, KM_SLEEP);
-	(void) snprintf(name, buflen, "%s@%s", dsname, ha->snapname);
+	name = kmem_asprintf("%s@%s", dsname, ha->snapname);
 	error = dsl_dataset_hold(name, dtag, &ds);
-	kmem_free(name, buflen);
+	strfree(name);
 	if (error == ENOENT && ha->recursive)
 		return (0);
 	(void) strcpy(ha->failed, dsname);
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Thu Aug 06 17:39:39 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Thu Aug 06 22:16:07 2009 -0700
@@ -172,12 +172,12 @@
     objset_t **osp);
 void dmu_objset_close(objset_t *os);
 int dmu_objset_evict_dbufs(objset_t *os);
-int dmu_objset_create(const char *name, dmu_objset_type_t type,
-    objset_t *clone_parent, uint64_t flags,
+int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
+int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
+    uint64_t flags);
 int dmu_objset_destroy(const char *name, boolean_t defer);
 int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer);
-int dmu_objset_rollback(objset_t *os);
 int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props,
     boolean_t recursive);
 int dmu_objset_rename(const char *name, const char *newname,
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h	Thu Aug 06 17:39:39 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h	Thu Aug 06 22:16:07 2009 -0700
@@ -114,11 +114,11 @@
 int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
     objset_t **osp);
 void dmu_objset_close(objset_t *os);
-int dmu_objset_create(const char *name, dmu_objset_type_t type,
-    objset_t *clone_parent, uint64_t flags,
+int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
+int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
+    uint64_t flags);
 int dmu_objset_destroy(const char *name, boolean_t defer);
-int dmu_objset_rollback(objset_t *os);
 int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props,
     boolean_t recursive);
 void dmu_objset_stats(objset_t *os, nvlist_t *nv);
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Thu Aug 06 17:39:39 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Thu Aug 06 22:16:07 2009 -0700
@@ -195,7 +195,6 @@
 dsl_syncfunc_t dsl_dataset_destroy_sync;
 dsl_checkfunc_t dsl_dataset_snapshot_check;
 dsl_syncfunc_t dsl_dataset_snapshot_sync;
-int dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost);
 int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
 int dsl_dataset_promote(const char *name);
 int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Thu Aug 06 17:39:39 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Thu Aug 06 22:16:07 2009 -0700
@@ -2406,14 +2406,12 @@
 			return (error);
 		}
 
-		error = dmu_objset_create(zc->zc_name, type, clone, 0,
-		    NULL, NULL);
+		error = dmu_objset_clone(zc->zc_name, dmu_objset_ds(clone), 0);
+		dmu_objset_close(clone);
 		if (error) {
-			dmu_objset_close(clone);
 			nvlist_free(nvprops);
 			return (error);
 		}
-		dmu_objset_close(clone);
 	} else {
 		boolean_t is_insensitive = B_FALSE;
 
@@ -2470,7 +2468,7 @@
 				return (error);
 			}
 		}
-		error = dmu_objset_create(zc->zc_name, type, NULL,
+		error = dmu_objset_create(zc->zc_name, type,
 		    is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct);
 		nvlist_free(zct.zct_zplprops);
 	}
@@ -2617,19 +2615,42 @@
 static int
 zfs_ioc_rollback(zfs_cmd_t *zc)
 {
-	objset_t *os;
+	dsl_dataset_t *ds, *clone;
 	int error;
-	zfsvfs_t *zfsvfs = NULL;
+	zfsvfs_t *zfsvfs;
+	char *clone_name;
+
+	error = dsl_dataset_hold(zc->zc_name, FTAG, &ds);
+	if (error)
+		return (error);
+
+	/* must not be a snapshot */
+	if (dsl_dataset_is_snapshot(ds)) {
+		dsl_dataset_rele(ds, FTAG);
+		return (EINVAL);
+	}
+
+	/* must have a most recent snapshot */
+	if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) {
+		dsl_dataset_rele(ds, FTAG);
+		return (EINVAL);
+	}
 
 	/*
-	 * Get the zfsvfs for the receiving objset. There
-	 * won't be one if we're operating on a zvol, if the
-	 * objset doesn't exist yet, or is not mounted.
+	 * Create clone of most recent snapshot.
 	 */
-	error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, DS_MODE_USER, &os);
+	clone_name = kmem_asprintf("%s/%%rollback", zc->zc_name);
+	error = dmu_objset_clone(clone_name, ds->ds_prev, DS_FLAG_INCONSISTENT);
 	if (error)
-		return (error);
-
+		goto out;
+
+	error = dsl_dataset_own(clone_name, DS_MODE_INCONSISTENT, FTAG, &clone);
+	if (error)
+		goto out;
+
+	/*
+	 * Do clone swap.
+	 */
 	if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
 		int mode;
 
@@ -2637,18 +2658,37 @@
 		if (error == 0) {
 			int resume_err;
 
-			error = dmu_objset_rollback(os);
+			if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) {
+				error = dsl_dataset_clone_swap(clone, ds,
+				    B_TRUE);
+				dsl_dataset_disown(ds, FTAG);
+				ds = NULL;
+			} else {
+				error = EBUSY;
+			}
 			resume_err = zfs_resume_fs(zfsvfs, zc->zc_name, mode);
 			error = error ? error : resume_err;
-		} else {
-			dmu_objset_close(os);
 		}
 		VFS_RELE(zfsvfs->z_vfs);
 	} else {
-		error = dmu_objset_rollback(os);
+		if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) {
+			error = dsl_dataset_clone_swap(clone, ds, B_TRUE);
+			dsl_dataset_disown(ds, FTAG);
+			ds = NULL;
+		} else {
+			error = EBUSY;
+		}
 	}
-	/* Note, the dmu_objset_rollback() releases the objset for us. */
-
+
+	/*
+	 * Destroy clone (which also closes it).
+	 */
+	(void) dsl_dataset_destroy(clone, FTAG, B_FALSE);
+
+out:
+	strfree(clone_name);
+	if (ds)
+		dsl_dataset_rele(ds, FTAG);
 	return (error);
 }
 
--- a/usr/src/uts/common/fs/zfs/zvol.c	Thu Aug 06 17:39:39 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/zvol.c	Thu Aug 06 22:16:07 2009 -0700
@@ -435,7 +435,6 @@
 	int ds_mode = DS_MODE_OWNER;
 	vnode_t *vp = NULL;
 	char *devpath;
-	size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(name) + 1;
 	char chrbuf[30], blkbuf[30];
 	int error;
 
@@ -468,13 +467,9 @@
 	 * If there's an existing /dev/zvol symlink, try to use the
 	 * same minor number we used last time.
 	 */
-	devpath = kmem_alloc(devpathlen, KM_SLEEP);
-
-	(void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, name);
-
+	devpath = kmem_asprintf("%s%s", ZVOL_FULL_DEV_DIR, name);
 	error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp);
-
-	kmem_free(devpath, devpathlen);
+	strfree(devpath);
 
 	if (error == 0 && vp->v_type != VLNK)
 		error = EINVAL;
@@ -1612,14 +1607,11 @@
 	vnode_t *vp;
 	boolean_t ret = B_FALSE;
 	char *devpath;
-	size_t devpathlen;
 	int error;
 
-	devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(zv->zv_name) + 1;
-	devpath = kmem_alloc(devpathlen, KM_SLEEP);
-	(void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, zv->zv_name);
+	devpath = kmem_asprintf("%s%s", ZVOL_FULL_DEV_DIR, zv->zv_name);
 	error = lookupname(devpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
-	kmem_free(devpath, devpathlen);
+	strfree(devpath);
 
 	ret = !error && IS_SWAPVP(common_specvp(vp));
 
--- a/usr/src/uts/common/os/strext.c	Thu Aug 06 17:39:39 2009 -0700
+++ b/usr/src/uts/common/os/strext.c	Thu Aug 06 22:16:07 2009 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 1998-2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/cmn_err.h>
 #include <sys/systm.h>
@@ -147,3 +144,27 @@
 	(void) vsnprintf(buf, INT_MAX, fmt, args);
 	return (buf);
 }
+
+/*
+ * Do not change the length of the returned string; it must be freed
+ * with strfree().
+ */
+char *
+kmem_asprintf(const char *fmt, ...)
+{
+	int size;
+	va_list adx;
+	char *buf;
+
+	va_start(adx, fmt);
+	size = vsnprintf(NULL, 0, fmt, adx) + 1;
+	va_end(adx);
+
+	buf = kmem_alloc(size, KM_SLEEP);
+
+	va_start(adx, fmt);
+	size = vsnprintf(buf, size, fmt, adx);
+	va_end(adx);
+
+	return (buf);
+}
--- a/usr/src/uts/common/sys/systm.h	Thu Aug 06 17:39:39 2009 -0700
+++ b/usr/src/uts/common/sys/systm.h	Thu Aug 06 22:16:07 2009 -0700
@@ -183,6 +183,7 @@
 int bcmp(const void *, const void *, size_t) __PURE;
 int stoi(char **);
 void numtos(ulong_t, char *);
+char *kmem_asprintf(const char *fmt, ...);
 int strident_valid(const char *);
 void strident_canon(char *, size_t);
 int getsubopt(char **optionsp, char * const *tokens, char **valuep);