6425096 want online 'zfs recv' (read only and read/write)
author ek110237
Wed, 24 Oct 2007 16:54:46 -0700
changeset 5326 6752aa2bd5bc
parent 5325 101705820ea0
child 5327 7341f36188f5
6425096 want online 'zfs recv' (read only and read/write)
6597182 .zfs/snapshot code could use a little more comments
usr/src/common/zfs/zfs_namecheck.c
usr/src/lib/libzfs/common/libzfs_dataset.c
usr/src/uts/common/Makefile.files
usr/src/uts/common/fs/zfs/dmu_objset.c
usr/src/uts/common/fs/zfs/dmu_send.c
usr/src/uts/common/fs/zfs/dsl_dataset.c
usr/src/uts/common/fs/zfs/rrwlock.c
usr/src/uts/common/fs/zfs/sys/dmu.h
usr/src/uts/common/fs/zfs/sys/dmu_objset.h
usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
usr/src/uts/common/fs/zfs/sys/rrwlock.h
usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h
usr/src/uts/common/fs/zfs/sys/zfs_znode.h
usr/src/uts/common/fs/zfs/zfs_ctldir.c
usr/src/uts/common/fs/zfs/zfs_ioctl.c
usr/src/uts/common/fs/zfs/zfs_vfsops.c
usr/src/uts/common/fs/zfs/zfs_vnops.c
usr/src/uts/common/fs/zfs/zfs_znode.c
--- a/usr/src/common/zfs/zfs_namecheck.c	Wed Oct 24 16:50:08 2007 -0700
+++ b/usr/src/common/zfs/zfs_namecheck.c	Wed Oct 24 16:54:46 2007 -0700
@@ -54,14 +54,14 @@
 	return ((c >= 'a' && c <= 'z') ||
 	    (c >= 'A' && c <= 'Z') ||
 	    (c >= '0' && c <= '9') ||
-	    c == '-' || c == '_' || c == '.' || c == ':');
+	    c == '-' || c == '_' || c == '.' || c == ':' || c == '%');
 }
 
 /*
  * Snapshot names must be made up of alphanumeric characters plus the following
  * characters:
  *
- * 	[-_.:]
+ * 	[-_.:%]
  */
 int
 snapshot_namecheck(const char *path, namecheck_err_t *why, char *what)
@@ -126,7 +126,7 @@
  * Where each component is made up of alphanumeric characters plus the following
  * characters:
  *
- * 	[-_.:]
+ * 	[-_.:%]
  */
 int
 dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
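
A minimal userland sketch of the widened character class: valid_char() below is copied from the hunk above, while the main() driver and the sample name are illustrative only and not part of the patch. Names such as "%snap1" now pass the character check, which the online 'zfs recv' code relies on for its temporary clones.

#include <stdio.h>

/* copy of the patched valid_char(), with '%' accepted */
static int
valid_char(char c)
{
	return ((c >= 'a' && c <= 'z') ||
	    (c >= 'A' && c <= 'Z') ||
	    (c >= '0' && c <= '9') ||
	    c == '-' || c == '_' || c == '.' || c == ':' || c == '%');
}

int
main(void)
{
	const char *name = "%snap1";	/* style of name used for tmp clones */
	const char *p;

	for (p = name; *p != '\0'; p++) {
		if (!valid_char(*p)) {
			printf("invalid character '%c'\n", *p);
			return (1);
		}
	}
	printf("'%s' passes the character check\n", name);
	return (0);
}
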
--- a/usr/src/lib/libzfs/common/libzfs_dataset.c	Wed Oct 24 16:50:08 2007 -0700
+++ b/usr/src/lib/libzfs/common/libzfs_dataset.c	Wed Oct 24 16:54:46 2007 -0700
@@ -130,7 +130,8 @@
  * 'buf' detailing exactly why the name was not valid.
  */
 static int
-zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type)
+zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type,
+    boolean_t modifying)
 {
 	namecheck_err_t why;
 	char what;
@@ -203,13 +204,20 @@
 		return (0);
 	}
 
+	if (modifying && strchr(path, '%') != NULL) {
+		if (hdl != NULL)
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "invalid character %c in name"), '%');
+		return (0);
+	}
+
 	return (-1);
 }
 
 int
 zfs_name_valid(const char *name, zfs_type_t type)
 {
-	return (zfs_validate_name(NULL, name, type));
+	return (zfs_validate_name(NULL, name, type, B_FALSE));
 }
 
 /*
@@ -420,7 +428,7 @@
 	/*
 	 * Validate the name before we even try to open it.
 	 */
-	if (!zfs_validate_name(hdl, path, ZFS_TYPE_DATASET)) {
+	if (!zfs_validate_name(hdl, path, ZFS_TYPE_DATASET, B_FALSE)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "invalid dataset name"));
 		(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
@@ -2428,7 +2436,7 @@
 {
 	zfs_handle_t *zhp;
 
-	if (!zfs_validate_name(hdl, path, types))
+	if (!zfs_validate_name(hdl, path, types, B_FALSE))
 		return (B_FALSE);
 
 	/*
@@ -2486,7 +2494,7 @@
 	    "cannot create '%s'"), path);
 
 	/* validate the path, taking care to note the extended error message */
-	if (!zfs_validate_name(hdl, path, type))
+	if (!zfs_validate_name(hdl, path, type, B_TRUE))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	/* validate parents exist */
@@ -2777,7 +2785,7 @@
 	    "cannot create '%s'"), target);
 
 	/* validate the target name */
-	if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM))
+	if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	/* validate parents exist */
@@ -3042,7 +3050,7 @@
 	    "cannot snapshot '%s'"), path);
 
 	/* validate the target name */
-	if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT))
+	if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	/* make sure the parent exists and is of the appropriate type */
@@ -3246,7 +3254,6 @@
 	dmu_replay_record_t drr;
 	struct drr_begin *drrb = &zc.zc_begin_record;
 	char errbuf[1024];
-	prop_changelist_t *clp;
 	char chopprefix[ZFS_MAXNAMELEN];
 
 	begin_time = time(NULL);
@@ -3331,7 +3338,7 @@
 	(void) strcpy(zc.zc_value, tosnap);
 	(void) strncat(zc.zc_value, drr.drr_u.drr_begin.drr_toname+choplen,
 	    sizeof (zc.zc_value));
-	if (!zfs_validate_name(hdl, zc.zc_value, ZFS_TYPE_SNAPSHOT))
+	if (!zfs_validate_name(hdl, zc.zc_value, ZFS_TYPE_SNAPSHOT, B_TRUE))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	(void) strcpy(zc.zc_name, zc.zc_value);
@@ -3347,26 +3354,10 @@
 		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 		if (h == NULL)
 			return (-1);
-		if (!dryrun) {
-			/*
-			 * We need to unmount all the dependents of the dataset
-			 * and the dataset itself. If it's a volume
-			 * then remove device link.
-			 */
-			if (h->zfs_type == ZFS_TYPE_FILESYSTEM) {
-				clp = changelist_gather(h, ZFS_PROP_NAME, 0);
-				if (clp == NULL)
-					return (-1);
-				if (changelist_prefix(clp) != 0) {
-					changelist_free(clp);
-					return (-1);
-				}
-			} else {
-				if (zvol_remove_link(hdl, h->zfs_name) != 0) {
-					zfs_close(h);
-					return (-1);
-				}
-
+		if (!dryrun && h->zfs_type == ZFS_TYPE_VOLUME) {
+			if (zvol_remove_link(hdl, h->zfs_name) != 0) {
+				zfs_close(h);
+				return (-1);
 			}
 		}
 		zfs_close(h);
@@ -3474,13 +3465,8 @@
 				if (err == 0 && ioctl_err == 0)
 					err = zvol_create_link(hdl,
 					    zc.zc_value);
-			} else {
-				if (drrb->drr_fromguid) {
-					err = changelist_postfix(clp);
-					changelist_free(clp);
-				} else {
-					err = zfs_mount(h, NULL, 0);
-				}
+			} else if (!drrb->drr_fromguid) {
+				err = zfs_mount(h, NULL, 0);
 			}
 		zfs_close(h);
 		}
@@ -3750,7 +3736,7 @@
 				    errbuf));
 			}
 		}
-		if (!zfs_validate_name(hdl, target, zhp->zfs_type))
+		if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE))
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 	} else {
 		if (recursive) {
@@ -3759,7 +3745,7 @@
 			return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 		}
 
-		if (!zfs_validate_name(hdl, target, zhp->zfs_type))
+		if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE))
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 		uint64_t unused;
 
--- a/usr/src/uts/common/Makefile.files	Wed Oct 24 16:50:08 2007 -0700
+++ b/usr/src/uts/common/Makefile.files	Wed Oct 24 16:54:46 2007 -0700
@@ -1010,6 +1010,7 @@
 	zfs_log.o		\
 	zfs_replay.o		\
 	zfs_rlock.o		\
+	rrwlock.o		\
 	zfs_vfsops.o		\
 	zfs_vnops.o		\
 	zvol.o
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Wed Oct 24 16:50:08 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Wed Oct 24 16:54:46 2007 -0700
@@ -44,7 +44,6 @@
 #include <sys/dmu_impl.h>
 #include <sys/zfs_ioctl.h>
 
-
 spa_t *
 dmu_objset_spa(objset_t *os)
 {
@@ -244,6 +243,7 @@
 
 	mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&osi->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	osi->os_meta_dnode = dnode_special_open(osi,
 	    &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
@@ -266,10 +266,10 @@
 dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
     objset_t **osp)
 {
+	objset_t *os;
 	dsl_dataset_t *ds;
+	objset_impl_t *osi;
 	int err;
-	objset_t *os;
-	objset_impl_t *osi;
 
 	os = kmem_alloc(sizeof (objset_t), KM_SLEEP);
 	err = dsl_dataset_open(name, mode, os, &ds);
@@ -387,6 +387,7 @@
 	VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1);
 	mutex_destroy(&osi->os_lock);
 	mutex_destroy(&osi->os_obj_lock);
+	mutex_destroy(&osi->os_user_ptr_lock);
 	kmem_free(osi, sizeof (objset_impl_t));
 }
 
@@ -1049,3 +1050,17 @@
 		err = func(name, arg);
 	return (err);
 }
+
+void
+dmu_objset_set_user(objset_t *os, void *user_ptr)
+{
+	ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock));
+	os->os->os_user_ptr = user_ptr;
+}
+
+void *
+dmu_objset_get_user(objset_t *os)
+{
+	ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock));
+	return (os->os->os_user_ptr);
+}
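
The new os_user_ptr hook is what lets zfs_ioc_recvbackup() find the mounted zfsvfs_t for an objset. A userland sketch of the attach/lookup/detach discipline, with a pthread mutex standing in for os_user_ptr_lock (the struct and names here are illustrative):

#include <pthread.h>
#include <stdio.h>

/* stand-in for the objset with its new user-pointer fields */
struct objset_like {
	pthread_mutex_t	user_ptr_lock;
	void		*user_ptr;
};

int
main(void)
{
	struct objset_like os = { PTHREAD_MUTEX_INITIALIZER, NULL };
	int fake_zfsvfs;	/* stand-in for a mounted zfsvfs_t */

	/* attach, as zfsvfs_setup() does after mount/resume */
	pthread_mutex_lock(&os.user_ptr_lock);
	os.user_ptr = &fake_zfsvfs;
	pthread_mutex_unlock(&os.user_ptr_lock);

	/* look up, as zfs_ioc_recvbackup() does before an online recv */
	pthread_mutex_lock(&os.user_ptr_lock);
	printf("mounted consumer: %p\n", os.user_ptr);
	pthread_mutex_unlock(&os.user_ptr_lock);

	/* detach, as zfs_umount() does */
	pthread_mutex_lock(&os.user_ptr_lock);
	os.user_ptr = NULL;
	pthread_mutex_unlock(&os.user_ptr_lock);
	return (0);
}
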
--- a/usr/src/uts/common/fs/zfs/dmu_send.c	Wed Oct 24 16:50:08 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c	Wed Oct 24 16:54:46 2007 -0700
@@ -295,12 +295,9 @@
 	zio_cksum_t zc;
 };
 
-/* ARGSUSED */
 static int
-replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
+replay_incremental_check(dsl_dataset_t *ds, struct drr_begin *drrb)
 {
-	dsl_dataset_t *ds = arg1;
-	struct drr_begin *drrb = arg2;
 	const char *snapname;
 	int err;
 	uint64_t val;
@@ -312,10 +309,6 @@
 	/* most recent snapshot must match fromguid */
 	if (ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid)
 		return (ENODEV);
-	/* must not have any changes since most recent snapshot */
-	if (ds->ds_phys->ds_bp.blk_birth >
-	    ds->ds_prev->ds_phys->ds_creation_txg)
-		return (ETXTBSY);
 
 	/* new snapshot name must not exist */
 	snapname = strrchr(drrb->drr_toname, '@');
@@ -326,16 +319,31 @@
 	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
 	    ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val);
 	if (err == 0)
 		return (EEXIST);
 	if (err != ENOENT)
 		return (err);
 
 	return (0);
 }
 
 /* ARGSUSED */
+static int
+replay_offline_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = arg1;
+	struct drr_begin *drrb = arg2;
+
+	/* must not have any changes since most recent snapshot */
+	if (dsl_dataset_modified_since_lastsnap(ds))
+		return (ETXTBSY);
+
+	return (replay_incremental_check(ds, drrb));
+}
+
+/* ARGSUSED */
 static void
-replay_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+replay_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr,
+    dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
@@ -402,6 +410,57 @@
 	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
 }
 
+struct onlineincarg {
+	dsl_dir_t *dd;
+	dsl_dataset_t *ohds;
+	boolean_t force;
+	const char *cosname;
+};
+
+/* ARGSUSED */
+static int
+replay_online_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	struct onlineincarg *oia = arg1;
+
+	if (dsl_dataset_modified_since_lastsnap(oia->ohds) && !oia->force)
+		return (ETXTBSY);
+
+	return (replay_incremental_check(oia->ohds, arg2));
+}
+
+/* ARGSUSED */
+static void
+replay_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+	struct onlineincarg *oia = arg1;
+	dsl_dataset_t *ohds = oia->ohds;
+	dsl_dir_t *dd = oia->dd;
+	dsl_dataset_t *ods, *ds;
+	uint64_t dsobj;
+
+	VERIFY(0 == dsl_dataset_open_obj(ohds->ds_dir->dd_pool,
+	    ohds->ds_phys->ds_prev_snap_obj, NULL,
+	    DS_MODE_STANDARD, FTAG, &ods));
+
+	dsobj = dsl_dataset_create_sync(dd, strrchr(oia->cosname, '/') + 1,
+	    ods, tx);
+
+	/* open the temporary clone */
+	VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
+	    DS_MODE_EXCLUSIVE, FTAG, &ds));
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+
+	spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
+	    ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld",
+	    ds->ds_phys->ds_dir_obj);
+
+	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+	dsl_dataset_close(ods, DS_MODE_STANDARD, FTAG);
+}
+
 static int
 replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
@@ -729,13 +788,16 @@
 
 int
 dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
-    boolean_t force, vnode_t *vp, uint64_t voffset)
+    boolean_t force, boolean_t online, vnode_t *vp, uint64_t voffset,
+    char *cosname)
 {
 	struct restorearg ra;
 	dmu_replay_record_t *drr;
 	char *cp;
 	objset_t *os = NULL;
 	zio_cksum_t pzc;
+	char *clonebuf = NULL;
+	size_t len;
 
 	bzero(&ra, sizeof (ra));
 	ra.vp = vp;
@@ -790,8 +852,9 @@
 	/*
 	 * Process the begin in syncing context.
 	 */
-	if (drrb->drr_fromguid) {
-		/* incremental backup */
+	if (drrb->drr_fromguid && !online) {
+		/* offline incremental receive */
+
 		dsl_dataset_t *ds = NULL;
 
 		cp = strchr(tosnap, '@');
@@ -816,11 +879,52 @@
 			(void) dsl_dataset_rollback(ds);
 		}
 		ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-		    replay_incremental_check, replay_incremental_sync,
-		    ds, drrb, 1);
+		    replay_offline_incremental_check,
+		    replay_offline_incremental_sync, ds, drrb, 1);
 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+	} else if (drrb->drr_fromguid && online) {
+		/* online incremental receive */
+
+		const char *tail;
+		struct onlineincarg oia = { 0 };
+
+		/*
+		 * Get the dsl_dir for the parent of the
+		 * temporary clone.
+		 */
+		cp = strchr(tosnap, '@');
+		*cp = '\0';
+
+	/* tmp clone name is: tosnap + '/' + '%' + "snapX" */
+		len = strlen(tosnap) + 2 + strlen(cp + 1) + 1;
+		clonebuf = kmem_alloc(len, KM_SLEEP);
+		(void) snprintf(clonebuf, len, "%s%c%c%s%c",
+		    tosnap, '/', '%', cp + 1, '\0');
+		ra.err = dsl_dir_open(tosnap, FTAG, &oia.dd, &tail);
+		*cp = '@';
+		if (ra.err)
+			goto out;
+
+		/* open the dataset we are logically receiving into */
+		*cp = '\0';
+		ra.err = dsl_dataset_open(tosnap, DS_MODE_STANDARD,
+		    FTAG, &oia.ohds);
+		*cp = '@';
+		if (ra.err) {
+			dsl_dir_close(oia.dd, FTAG);
+			goto out;
+		}
+
+		oia.force = force;
+		oia.cosname = clonebuf;
+		ra.err = dsl_sync_task_do(oia.dd->dd_pool,
+		    replay_online_incremental_check,
+		    replay_online_incremental_sync, &oia, drrb, 5);
+		dsl_dataset_close(oia.ohds, DS_MODE_STANDARD, FTAG);
+		dsl_dir_close(oia.dd, FTAG);
 	} else {
 		/* full backup */
+
 		dsl_dir_t *dd = NULL;
 		const char *tail;
 
@@ -854,8 +958,8 @@
 
 	cp = strchr(tosnap, '@');
 	*cp = '\0';
-	ra.err = dmu_objset_open(tosnap, DMU_OST_ANY,
-	    DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os);
+	ra.err = dmu_objset_open(clonebuf == NULL ? tosnap : clonebuf,
+	    DMU_OST_ANY, DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os);
 	*cp = '@';
 	ASSERT3U(ra.err, ==, 0);
 
@@ -918,9 +1022,11 @@
 				goto out;
 			}
 
-			ra.err = dsl_sync_task_do(dmu_objset_ds(os)->
-			    ds_dir->dd_pool, replay_end_check, replay_end_sync,
-			    os, drrb, 3);
+			if (clonebuf == NULL) {
+				ra.err = dsl_sync_task_do(dmu_objset_ds(os)->
+				    ds_dir->dd_pool, replay_end_check,
+				    replay_end_sync, os, drrb, 3);
+			}
 			goto out;
 		}
 		default:
@@ -931,8 +1037,11 @@
 	}
 
 out:
-	if (os)
+	if (os) {
+		if (drrb->drr_fromguid && online && !ra.err)
+			dmu_objset_name(os, cosname);
 		dmu_objset_close(os);
+	}
 
 	/*
 	 * Make sure we don't rollback/destroy unless we actually
@@ -949,15 +1058,29 @@
 
 		cp = strchr(tosnap, '@');
 		*cp = '\0';
-		err = dsl_dataset_open(tosnap,
+		err = dsl_dataset_open(clonebuf == NULL ? tosnap : clonebuf,
 		    DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT,
 		    FTAG, &ds);
 		if (err == 0) {
 			txg_wait_synced(ds->ds_dir->dd_pool, 0);
 			if (drrb->drr_fromguid) {
-				/* incremental: rollback to most recent snap */
-				(void) dsl_dataset_rollback(ds);
-				dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+				if (clonebuf != NULL) {
+					/*
+					 * online incremental: destroy
+					 * the temporarily created clone.
+					 */
+					dsl_dataset_close(ds, DS_MODE_EXCLUSIVE,
+					    FTAG);
+					(void) dmu_objset_destroy(clonebuf);
+				} else {
+					/*
+					 * offline incremental: rollback to
+					 * most recent snapshot.
+					 */
+					(void) dsl_dataset_rollback(ds);
+					dsl_dataset_close(ds, DS_MODE_EXCLUSIVE,
+					    FTAG);
+				}
 			} else {
 				/* full: destroy whole fs */
 				dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
@@ -967,8 +1090,26 @@
 		*cp = '@';
 	}
 
+	if (clonebuf != NULL)
+		kmem_free(clonebuf, len);
 	kmem_free(ra.buf, ra.bufsize);
 	if (sizep)
 		*sizep = ra.voff;
 	return (ra.err);
 }
+
+int
+dmu_replay_end_snapshot(char *name, struct drr_begin *drrb)
+{
+	objset_t *os;
+	int err;
+
+	err = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_STANDARD, &os);
+	if (err)
+		return (err);
+
+	err = dsl_sync_task_do(dmu_objset_ds(os)->ds_dir->dd_pool,
+	    replay_end_check, replay_end_sync, os, drrb, 3);
+	dmu_objset_close(os);
+	return (err);
+}
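
The temporary clone name built in the online-receive path follows directly from the snprintf above (the original spells the separator and '%' out as '%c' arguments). A runnable sketch of just the name construction, with userland malloc standing in for kmem_alloc: for a target of "pool/fs@snap1" the receive lands in the hidden clone "pool/fs/%snap1", which is why '%' had to become a legal internal name character.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	char tosnap[] = "pool/fs@snap1";
	char *cp = strchr(tosnap, '@');
	size_t len;
	char *clonebuf;

	/* tmp clone name is: tosnap + '/' + '%' + "snapX" */
	*cp = '\0';
	len = strlen(tosnap) + 2 + strlen(cp + 1) + 1;
	clonebuf = malloc(len);
	(void) snprintf(clonebuf, len, "%s/%%%s", tosnap, cp + 1);
	*cp = '@';

	printf("receive target:  %s\n", tosnap);
	printf("temporary clone: %s\n", clonebuf);
	free(clonebuf);
	return (0);
}
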
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c	Wed Oct 24 16:50:08 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c	Wed Oct 24 16:54:46 2007 -0700
@@ -1535,6 +1535,21 @@
 	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
 }
 
+boolean_t
+dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
+	    dsl_pool_sync_context(dp));
+	if (ds->ds_prev == NULL)
+		return (B_FALSE);
+	if (ds->ds_phys->ds_bp.blk_birth >
+	    ds->ds_prev->ds_phys->ds_creation_txg)
+		return (B_TRUE);
+	return (B_FALSE);
+}
+
 /* ARGSUSED */
 static int
 dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
@@ -1601,7 +1616,7 @@
 	dsl_dataset_close(hds, DS_MODE_NONE, FTAG);
 }
 
-struct renamearg {
+struct renamesnaparg {
 	dsl_sync_task_group_t *dstg;
 	char failed[MAXPATHLEN];
 	char *oldsnap;
@@ -1611,7 +1626,7 @@
 static int
 dsl_snapshot_rename_one(char *name, void *arg)
 {
-	struct renamearg *ra = arg;
+	struct renamesnaparg *ra = arg;
 	dsl_dataset_t *ds = NULL;
 	char *cp;
 	int err;
@@ -1659,7 +1674,7 @@
 dsl_recursive_rename(char *oldname, const char *newname)
 {
 	int err;
-	struct renamearg *ra;
+	struct renamesnaparg *ra;
 	dsl_sync_task_t *dst;
 	spa_t *spa;
 	char *cp, *fsname = spa_strdup(oldname);
@@ -1674,7 +1689,7 @@
 		kmem_free(fsname, len + 1);
 		return (err);
 	}
-	ra = kmem_alloc(sizeof (struct renamearg), KM_SLEEP);
+	ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
 	ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
 
 	ra->oldsnap = strchr(oldname, '@') + 1;
@@ -1704,7 +1719,7 @@
 		(void) strcpy(oldname, ra->failed);
 
 	dsl_sync_task_group_destroy(ra->dstg);
-	kmem_free(ra, sizeof (struct renamearg));
+	kmem_free(ra, sizeof (struct renamesnaparg));
 	spa_close(spa, FTAG);
 	return (err);
 }
@@ -2051,6 +2066,186 @@
 	return (err);
 }
 
+#define	SWITCH64(x, y) \
+	{ \
+		uint64_t __tmp = (x); \
+		(x) = (y); \
+		(y) = __tmp; \
+	}
+
+/* ARGSUSED */
+static int
+dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_dataset_t *cds = arg1;	/* clone to become new head */
+	boolean_t *forcep = arg2;
+	dsl_dir_t *cdd = cds->ds_dir;
+	dsl_pool_t *dp = cds->ds_dir->dd_pool;
+	dsl_dataset_t *ods;	/* the snapshot cds is cloned off of */
+	dsl_dataset_t *ohds = NULL;
+	dsl_dir_t *odd;
+	int err;
+
+	/* check that it is a clone */
+	if (cdd->dd_phys->dd_clone_parent_obj == 0)
+		return (EINVAL);
+
+	/* check that cds is not a snapshot */
+	if (dsl_dataset_is_snapshot(cds))
+		return (EINVAL);
+
+	/* open the origin */
+	if (err = dsl_dataset_open_obj(dp, cdd->dd_phys->dd_clone_parent_obj,
+	    NULL, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ods))
+		return (err);
+	odd = ods->ds_dir;
+
+	/* make sure the clone is a descendant of the origin */
+	if (cdd->dd_parent != odd) {
+		err = EINVAL;
+		goto out;
+	}
+
+	/* check that there are no snapshots after the origin */
+	if (cds->ds_phys->ds_prev_snap_obj != ods->ds_object ||
+	    ods->ds_phys->ds_next_snap_obj !=
+	    odd->dd_phys->dd_head_dataset_obj) {
+		err = EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Verify origin head dataset hasn't been modified or
+	 * 'force' has been passed down.
+	 */
+	if (!(*forcep) &&
+	    (err = dsl_dataset_open_obj(cdd->dd_pool,
+	    odd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_EXCLUSIVE,
+	    FTAG, &ohds)) == 0) {
+		if (dsl_dataset_modified_since_lastsnap(ohds))
+			err = ETXTBSY;
+		dsl_dataset_close(ohds, DS_MODE_EXCLUSIVE, FTAG);
+	}
+out:
+	dsl_dataset_close(ods, DS_MODE_STANDARD, FTAG);
+	return (err);
+}
+
+/* ARGSUSED */
+static void
+dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_dataset_t *cds = arg1;	/* clone to become new head */
+	dsl_dir_t *cdd = cds->ds_dir;
+	dsl_pool_t *dp = cds->ds_dir->dd_pool;
+	dsl_dataset_t *ods, *ohds;
+	dsl_dir_t *odd;
+	uint64_t itor = 0;
+	blkptr_t bp;
+	uint64_t unique = 0;
+	int err;
+
+	ASSERT(cdd->dd_phys->dd_clone_parent_obj != 0);
+	ASSERT(dsl_dataset_is_snapshot(cds) == 0);
+
+	/* open the origin */
+	VERIFY(0 == dsl_dataset_open_obj(dp, cdd->dd_phys->dd_clone_parent_obj,
+	    NULL, DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ods));
+	odd = ods->ds_dir;
+	ASSERT(cds->ds_phys->ds_prev_snap_obj == ods->ds_object);
+	ASSERT(ods->ds_phys->ds_next_snap_obj ==
+	    odd->dd_phys->dd_head_dataset_obj);
+
+	/* open the origin head */
+	VERIFY(0 == dsl_dataset_open_obj(cdd->dd_pool,
+	    odd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_EXCLUSIVE,
+	    FTAG, &ohds));
+	ASSERT(odd == ohds->ds_dir);
+
+	dmu_buf_will_dirty(cds->ds_dbuf, tx);
+	dmu_buf_will_dirty(ohds->ds_dbuf, tx);
+	dmu_buf_will_dirty(ods->ds_dbuf, tx);
+
+	/* compute unique space */
+	while ((err = bplist_iterate(&cds->ds_deadlist, &itor, &bp)) == 0) {
+		if (bp.blk_birth > ods->ds_phys->ds_prev_snap_txg)
+			unique += bp_get_dasize(cdd->dd_pool->dp_spa, &bp);
+	}
+	VERIFY(err == ENOENT);
+
+	/* reset origin's unique bytes */
+	ods->ds_phys->ds_unique_bytes = unique;
+
+	/* swap blkptrs */
+	{
+		blkptr_t tmp;
+		tmp = ohds->ds_phys->ds_bp;
+		ohds->ds_phys->ds_bp = cds->ds_phys->ds_bp;
+		cds->ds_phys->ds_bp = tmp;
+	}
+
+	/* set dd_*_bytes */
+	{
+		int64_t dused, dcomp, duncomp;
+		uint64_t cdl_used, cdl_comp, cdl_uncomp;
+		uint64_t odl_used, odl_comp, odl_uncomp;
+
+		VERIFY(0 == bplist_space(&cds->ds_deadlist, &cdl_used,
+		    &cdl_comp, &cdl_uncomp));
+		VERIFY(0 == bplist_space(&ohds->ds_deadlist, &odl_used,
+		    &odl_comp, &odl_uncomp));
+		dused = cds->ds_phys->ds_used_bytes + cdl_used -
+		    (ohds->ds_phys->ds_used_bytes + odl_used);
+		dcomp = cds->ds_phys->ds_compressed_bytes + cdl_comp -
+		    (ohds->ds_phys->ds_compressed_bytes + odl_comp);
+		duncomp = cds->ds_phys->ds_uncompressed_bytes + cdl_uncomp -
+		    (ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
+
+		dsl_dir_diduse_space(odd, dused, dcomp, duncomp, tx);
+		dsl_dir_diduse_space(cdd, -dused, -dcomp, -duncomp, tx);
+	}
+
+	/* swap ds_*_bytes */
+	SWITCH64(ohds->ds_phys->ds_used_bytes, cds->ds_phys->ds_used_bytes);
+	SWITCH64(ohds->ds_phys->ds_compressed_bytes,
+	    cds->ds_phys->ds_compressed_bytes);
+	SWITCH64(ohds->ds_phys->ds_uncompressed_bytes,
+	    cds->ds_phys->ds_uncompressed_bytes);
+
+	/* swap deadlists */
+	bplist_close(&cds->ds_deadlist);
+	bplist_close(&ohds->ds_deadlist);
+	SWITCH64(ohds->ds_phys->ds_deadlist_obj, cds->ds_phys->ds_deadlist_obj);
+	VERIFY(0 == bplist_open(&cds->ds_deadlist, dp->dp_meta_objset,
+	    cds->ds_phys->ds_deadlist_obj));
+	VERIFY(0 == bplist_open(&ohds->ds_deadlist, dp->dp_meta_objset,
+	    ohds->ds_phys->ds_deadlist_obj));
+
+	dsl_dataset_close(ohds, DS_MODE_EXCLUSIVE, FTAG);
+	dsl_dataset_close(ods, DS_MODE_STANDARD, FTAG);
+}
+
+/*
+ * Swap the clone "cosname" with its origin head file system.
+ */
+int
+dsl_dataset_clone_swap(const char *cosname, boolean_t force)
+{
+	dsl_dataset_t *ds;
+	int err;
+
+	err = dsl_dataset_open(cosname,
+	    DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, FTAG, &ds);
+	if (err)
+		return (err);
+
+	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+	    dsl_dataset_clone_swap_check,
+	    dsl_dataset_clone_swap_sync, ds, &force, 9);
+	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
+	return (err);
+}
+
 /*
  * Given a pool name and a dataset object number in that pool,
  * return the name of that dataset.
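
SWITCH64 is the workhorse of the clone-swap sync above: once the blkptrs are exchanged, the accounting fields (ds_used_bytes, ds_compressed_bytes, ds_uncompressed_bytes, and the deadlist objects) must follow. A standalone demonstration of the macro, with made-up values standing in for the head and clone datasets:

#include <stdio.h>
#include <inttypes.h>

/* the SWITCH64 macro from the hunk above, reproduced verbatim */
#define	SWITCH64(x, y) \
	{ \
		uint64_t __tmp = (x); \
		(x) = (y); \
		(y) = __tmp; \
	}

int
main(void)
{
	/* stand-ins for ohds->...->ds_used_bytes and the clone's */
	uint64_t head_used = 1000, clone_used = 250;

	SWITCH64(head_used, clone_used);
	printf("head=%" PRIu64 " clone=%" PRIu64 "\n",
	    head_used, clone_used);
	return (0);
}
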
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/rrwlock.c	Wed Oct 24 16:54:46 2007 -0700
@@ -0,0 +1,249 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/refcount.h>
+#include <sys/rrwlock.h>
+
+/*
+ * This file contains the implementation of a re-entrant read
+ * reader/writer lock (aka "rrwlock").
+ *
+ * This is a normal reader/writer lock with the additional feature
+ * of allowing threads who have already obtained a read lock to
+ * re-enter another read lock (re-entrant read) - even if there are
+ * waiting writers.
+ *
+ * Callers who have not obtained a read lock give waiting writers priority.
+ *
+ * The rrwlock_t lock does not allow re-entrant writers, nor does it
+ * allow a re-entrant mix of reads and writes (that is, it does not
+ * allow a caller who has already obtained a read lock to be able to
+ * then grab a write lock without first dropping all read locks, and
+ * vice versa).
+ *
+ * The rrwlock_t uses tsd (thread specific data) to keep a list of
+ * nodes (rrw_node_t), where each node keeps track of which specific
+ * lock (rrw_node_t::rn_rrl) the thread has grabbed.  Since re-entering
+ * should be rare, a thread that grabs multiple reads on the same rrwlock_t
+ * will store multiple rrw_node_ts of the same 'rn_rrl'. Nodes on the
+ * tsd list can represent a different rrwlock_t.  This allows a thread
+ * to enter multiple and unique rrwlock_ts for read locks at the same time.
+ *
+ * Since using tsd exposes some overhead, the rrwlock_t only needs to
+ * keep tsd data when writers are waiting.  If no writers are waiting, then
+ * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
+ * is needed.  Once a writer attempts to grab the lock, readers then
+ * keep tsd data and bump the linked readers count (rr_linked_rcount).
+ *
+ * If there are waiting writers and there are anonymous readers, then a
+ * reader doesn't know if it is a re-entrant lock. But since it may be one,
+ * we allow the read to proceed (otherwise it could deadlock).  Since once
+ * waiting writers are active, readers no longer bump the anonymous count,
+ * the anonymous readers will eventually flush themselves out.  At this point,
+ * readers will be able to tell if they are a re-entrant lock (have a
+ * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then
+ * we must let them proceed.  If they are not, then the reader blocks for the
+ * waiting writers.  Hence, we do not starve writers.
+ */
+
+/* global key for TSD */
+uint_t rrw_tsd_key;
+
+typedef struct rrw_node {
+	struct rrw_node	*rn_next;
+	rrwlock_t	*rn_rrl;
+} rrw_node_t;
+
+static rrw_node_t *
+rrn_find(rrwlock_t *rrl)
+{
+	rrw_node_t *rn;
+
+	if (refcount_count(&rrl->rr_linked_rcount) == 0)
+		return (NULL);
+
+	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
+		if (rn->rn_rrl == rrl)
+			return (rn);
+	}
+	return (NULL);
+}
+
+/*
+ * Add a node to the head of the singly linked list.
+ */
+static void
+rrn_add(rrwlock_t *rrl)
+{
+	rrw_node_t *rn;
+
+	rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
+	rn->rn_rrl = rrl;
+	rn->rn_next = tsd_get(rrw_tsd_key);
+	VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
+}
+
+/*
+ * If a node is found for 'rrl', then remove the node from this
+ * thread's list and return TRUE; otherwise return FALSE.
+ */
+static boolean_t
+rrn_find_and_remove(rrwlock_t *rrl)
+{
+	rrw_node_t *rn;
+	rrw_node_t *prev = NULL;
+
+	if (refcount_count(&rrl->rr_linked_rcount) == 0)
+		return (B_FALSE);
+
+	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
+		if (rn->rn_rrl == rrl) {
+			if (prev)
+				prev->rn_next = rn->rn_next;
+			else
+				VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
+			kmem_free(rn, sizeof (*rn));
+			return (B_TRUE);
+		}
+		prev = rn;
+	}
+	return (B_FALSE);
+}
+
+void
+rrw_init(rrwlock_t *rrl)
+{
+	mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
+	rrl->rr_writer = NULL;
+	refcount_create(&rrl->rr_anon_rcount);
+	refcount_create(&rrl->rr_linked_rcount);
+	rrl->rr_writer_wanted = B_FALSE;
+}
+
+void
+rrw_destroy(rrwlock_t *rrl)
+{
+	mutex_destroy(&rrl->rr_lock);
+	cv_destroy(&rrl->rr_cv);
+	ASSERT(rrl->rr_writer == NULL);
+	refcount_destroy(&rrl->rr_anon_rcount);
+	refcount_destroy(&rrl->rr_linked_rcount);
+}
+
+static void
+rrw_enter_read(rrwlock_t *rrl, void *tag)
+{
+	mutex_enter(&rrl->rr_lock);
+	ASSERT(rrl->rr_writer != curthread);
+	ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
+
+	while (rrl->rr_writer || (rrl->rr_writer_wanted &&
+	    refcount_is_zero(&rrl->rr_anon_rcount) &&
+	    rrn_find(rrl) == NULL))
+		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
+
+	if (rrl->rr_writer_wanted) {
+		/* may or may not be a re-entrant enter */
+		rrn_add(rrl);
+		(void) refcount_add(&rrl->rr_linked_rcount, tag);
+	} else {
+		(void) refcount_add(&rrl->rr_anon_rcount, tag);
+	}
+	ASSERT(rrl->rr_writer == NULL);
+	mutex_exit(&rrl->rr_lock);
+}
+
+static void
+rrw_enter_write(rrwlock_t *rrl)
+{
+	mutex_enter(&rrl->rr_lock);
+	ASSERT(rrl->rr_writer != curthread);
+
+	while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
+	    refcount_count(&rrl->rr_linked_rcount) > 0 ||
+	    rrl->rr_writer != NULL) {
+		rrl->rr_writer_wanted = B_TRUE;
+		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
+	}
+	rrl->rr_writer_wanted = B_FALSE;
+	rrl->rr_writer = curthread;
+	mutex_exit(&rrl->rr_lock);
+}
+
+void
+rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
+{
+	if (rw == RW_READER)
+		rrw_enter_read(rrl, tag);
+	else
+		rrw_enter_write(rrl);
+}
+
+void
+rrw_exit(rrwlock_t *rrl, void *tag)
+{
+	mutex_enter(&rrl->rr_lock);
+	ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
+	    !refcount_is_zero(&rrl->rr_linked_rcount) ||
+	    rrl->rr_writer != NULL);
+
+	if (rrl->rr_writer == NULL) {
+		if (rrn_find_and_remove(rrl)) {
+			if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0)
+				cv_broadcast(&rrl->rr_cv);
+
+		} else {
+			if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0)
+				cv_broadcast(&rrl->rr_cv);
+		}
+	} else {
+		ASSERT(rrl->rr_writer == curthread);
+		ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
+		    refcount_is_zero(&rrl->rr_linked_rcount));
+		rrl->rr_writer = NULL;
+		cv_broadcast(&rrl->rr_cv);
+	}
+	mutex_exit(&rrl->rr_lock);
+}
+
+boolean_t
+rrw_held(rrwlock_t *rrl, krw_t rw)
+{
+	boolean_t held;
+
+	mutex_enter(&rrl->rr_lock);
+	if (rw == RW_WRITER) {
+		held = (rrl->rr_writer == curthread);
+	} else {
+		held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
+		    !refcount_is_zero(&rrl->rr_linked_rcount));
+	}
+	mutex_exit(&rrl->rr_lock);
+
+	return (held);
+}
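
The tsd bookkeeping above reduces to a singly linked list of per-lock nodes. A userland sketch of the rrn_add()/rrn_find_and_remove() logic, with a single global head standing in for tsd_get()/tsd_set() — so it models one thread only, and the types are illustrative:

#include <stdio.h>
#include <stdlib.h>

typedef enum { B_FALSE, B_TRUE } boolean_t;

typedef struct rrw_node {
	struct rrw_node	*rn_next;
	void		*rn_rrl;	/* the lock this node tracks */
} rrw_node_t;

static rrw_node_t *head;	/* single-threaded stand-in for TSD */

/* add a node to the head of the singly linked list */
static void
rrn_add(void *rrl)
{
	rrw_node_t *rn = malloc(sizeof (*rn));

	rn->rn_rrl = rrl;
	rn->rn_next = head;
	head = rn;
}

/* remove one node for 'rrl' if present; TRUE if found */
static boolean_t
rrn_find_and_remove(void *rrl)
{
	rrw_node_t *rn, *prev = NULL;

	for (rn = head; rn != NULL; rn = rn->rn_next) {
		if (rn->rn_rrl == rrl) {
			if (prev)
				prev->rn_next = rn->rn_next;
			else
				head = rn->rn_next;
			free(rn);
			return (B_TRUE);
		}
		prev = rn;
	}
	return (B_FALSE);
}

int
main(void)
{
	int lock_a, lock_b;	/* stand-ins for two rrwlock_ts */

	rrn_add(&lock_a);	/* read of lock_a */
	rrn_add(&lock_b);	/* read of a different lock, same list */
	printf("remove a:       %d\n", rrn_find_and_remove(&lock_a));
	printf("remove a again: %d\n", rrn_find_and_remove(&lock_a));
	printf("remove b:       %d\n", rrn_find_and_remove(&lock_b));
	return (0);
}
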
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Wed Oct 24 16:50:08 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Wed Oct 24 16:54:46 2007 -0700
@@ -533,6 +533,8 @@
     uint64_t *id, uint64_t *offp);
 extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp);
+extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
+extern void *dmu_objset_get_user(objset_t *os);
 
 /*
  * Return the txg number for the given assigned transaction.
@@ -573,7 +575,9 @@
 
 int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp);
 int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
-    boolean_t force, struct vnode *vp, uint64_t voffset);
+    boolean_t force, boolean_t online, struct vnode *vp, uint64_t voffset,
+    char *cosname);
+int dmu_replay_end_snapshot(char *name, struct drr_begin *drrb);
 
 /* CRC64 table */
 #define	ZFS_CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h	Wed Oct 24 16:50:08 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h	Wed Oct 24 16:54:46 2007 -0700
@@ -86,6 +86,10 @@
 	list_t os_free_dnodes[TXG_SIZE];
 	list_t os_dnodes;
 	list_t os_downgraded_dbufs;
+
+	/* stuff we store for the user */
+	kmutex_t os_user_ptr_lock;
+	void *os_user_ptr;
 } objset_impl_t;
 
 #define	DMU_META_DNODE_OBJECT	0
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Wed Oct 24 16:50:08 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Wed Oct 24 16:54:46 2007 -0700
@@ -138,6 +138,7 @@
 int dsl_dataset_rollback(dsl_dataset_t *ds);
 int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
 int dsl_dataset_promote(const char *name);
+int dsl_dataset_clone_swap(const char *name, boolean_t force);
 
 void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
     void *p, dsl_dataset_evict_func_t func);
@@ -148,6 +149,8 @@
 
 spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
 
+boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds);
+
 void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
 
 void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/rrwlock.h	Wed Oct 24 16:54:46 2007 -0700
@@ -0,0 +1,80 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_RR_RW_LOCK_H
+#define	_SYS_RR_RW_LOCK_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include <sys/inttypes.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+/*
+ * A reader-writer lock implementation that allows re-entrant reads, but
+ * still gives writers priority on "new" reads.
+ *
+ * See rrwlock.c for more details about the implementation.
+ *
+ * Fields of the rrwlock_t structure:
+ * - rr_lock: protects modification and reading of rrwlock_t fields
+ * - rr_cv: cv for waking up readers or waiting writers
+ * - rr_writer: thread id of the current writer
+ * - rr_anon_rcount: number of active anonymous readers
+ * - rr_linked_rcount: total number of non-anonymous active readers
+ * - rr_writer_wanted: a writer wants the lock
+ */
+typedef struct rrwlock {
+	kmutex_t	rr_lock;
+	kcondvar_t	rr_cv;
+	kthread_t	*rr_writer;
+	refcount_t	rr_anon_rcount;
+	refcount_t	rr_linked_rcount;
+	boolean_t	rr_writer_wanted;
+} rrwlock_t;
+
+/*
+ * 'tag' is used in reference counting tracking.  The
+ * 'tag' must be the same in a rrw_enter() as in its
+ * corresponding rrw_exit().
+ */
+void rrw_init(rrwlock_t *rrl);
+void rrw_destroy(rrwlock_t *rrl);
+void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag);
+void rrw_exit(rrwlock_t *rrl, void *tag);
+boolean_t rrw_held(rrwlock_t *rrl, krw_t rw);
+
+#define	RRW_READ_HELD(x)	rrw_held(x, RW_READER)
+#define	RRW_WRITE_HELD(x)	rrw_held(x, RW_WRITER)
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_RR_RW_LOCK_H */
--- a/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h	Wed Oct 24 16:50:08 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h	Wed Oct 24 16:54:46 2007 -0700
@@ -33,6 +33,7 @@
 #include <sys/list.h>
 #include <sys/vfs.h>
 #include <sys/zil.h>
+#include <sys/rrwlock.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -53,8 +54,8 @@
 	uint_t		z_acl_inherit;	/* acl inheritance behavior */
 	boolean_t	z_atime;	/* enable atimes mount option */
 	boolean_t	z_unmounted;	/* unmounted */
-	krwlock_t	z_unmount_lock;
-	krwlock_t	z_unmount_inactive_lock;
+	rrwlock_t	z_teardown_lock;
+	krwlock_t	z_teardown_inactive_lock;
 	list_t		z_all_znodes;	/* all vnodes in the fs */
 	kmutex_t	z_znodes_lock;	/* lock for z_all_znodes */
 	vnode_t		*z_ctldir;	/* .zfs directory pointer */
@@ -115,6 +116,9 @@
 
 extern uint_t zfs_fsyncer_key;
 
+extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode);
+extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode);
+
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h	Wed Oct 24 16:50:08 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h	Wed Oct 24 16:54:46 2007 -0700
@@ -34,6 +34,7 @@
 #include <sys/list.h>
 #include <sys/dmu.h>
 #include <sys/zfs_vfsops.h>
+#include <sys/rrwlock.h>
 #endif
 #include <sys/zfs_acl.h>
 #include <sys/zil.h>
@@ -153,6 +154,7 @@
 	uint_t		z_seq;		/* modification sequence number */
 	uint64_t	z_mapcnt;	/* number of pages mapped to file */
 	uint64_t	z_last_itx;	/* last ZIL itx on this znode */
+	uint64_t	z_gen;		/* generation (same as zp_gen) */
 	uint32_t	z_sync_cnt;	/* synchronous open count */
 	kmutex_t	z_acl_lock;	/* acl data lock */
 	list_node_t	z_link_node;	/* all znodes in fs link */
@@ -189,18 +191,27 @@
 /*
  * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation.
  * ZFS_EXIT() must be called before exitting the vop.
+ * ZFS_ENTER_VERIFY_ZP() does ZFS_ENTER plus verifies the znode is valid.
  */
 #define	ZFS_ENTER(zfsvfs) \
 	{ \
-		if (rw_tryenter(&(zfsvfs)->z_unmount_lock, RW_READER) == 0) \
-			return (EIO); \
+		rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG); \
 		if ((zfsvfs)->z_unmounted) { \
 			ZFS_EXIT(zfsvfs); \
 			return (EIO); \
 		} \
 	}
 
-#define	ZFS_EXIT(zfsvfs) rw_exit(&(zfsvfs)->z_unmount_lock)
+#define	ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG)
+
+#define	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp) \
+	{ \
+		ZFS_ENTER((zfsvfs)); \
+		if (!(zp)->z_dbuf_held) { \
+			ZFS_EXIT(zfsvfs); \
+			return (EIO); \
+		} \
+	}
 
 /*
  * Macros for dealing with dmu_buf_hold
@@ -250,6 +261,7 @@
 extern void	zfs_znode_init(void);
 extern void	zfs_znode_fini(void);
 extern int	zfs_zget(zfsvfs_t *, uint64_t, znode_t **);
+extern int	zfs_rezget(znode_t *);
 extern void	zfs_zinactive(znode_t *);
 extern void	zfs_znode_delete(znode_t *, dmu_tx_t *);
 extern void	zfs_znode_free(znode_t *);
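
The semantic shift in ZFS_ENTER() is that a vop now blocks on the teardown lock instead of failing with EIO whenever a writer holds it, and only returns EIO if the file system turns out to be unmounted once the lock is obtained (or, with ZFS_ENTER_VERIFY_ZP, if the znode lost its dbuf across a suspend/resume). A toy model of that bracketing using a plain pthread rwlock — note the real lock is the re-entrant rrwlock_t, which a pthread rwlock is not:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t teardown_lock = PTHREAD_RWLOCK_INITIALIZER;
static int unmounted;	/* stand-in for zfsvfs->z_unmounted */

static int
some_vop(void)
{
	pthread_rwlock_rdlock(&teardown_lock);	/* ZFS_ENTER: waits, no EIO */
	if (unmounted) {
		pthread_rwlock_unlock(&teardown_lock);	/* ZFS_EXIT */
		return (5);	/* EIO stand-in */
	}
	/* ... the actual vop work would happen here ... */
	pthread_rwlock_unlock(&teardown_lock);	/* ZFS_EXIT */
	return (0);
}

int
main(void)
{
	printf("vop while mounted:  %d\n", some_vop());
	unmounted = 1;
	printf("vop after teardown: %d\n", some_vop());
	return (0);
}
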
--- a/usr/src/uts/common/fs/zfs/zfs_ctldir.c	Wed Oct 24 16:50:08 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_ctldir.c	Wed Oct 24 16:54:46 2007 -0700
@@ -53,6 +53,16 @@
  * reliable way to auto-unmount the filesystem when it's "no longer in use".
  * When the user unmounts a filesystem, we call zfsctl_unmount(), which
  * unmounts any snapshots within the snapshot directory.
+ *
+ * The '.zfs', '.zfs/snapshot', and all directories created under
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
+ * share the same vfs_t as the head filesystem (what '.zfs' lives under).
+ *
+ * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
+ * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
+ * However, vnodes within these mounted on file systems have their v_vfsp
+ * fields set to the head filesystem to make NFS happy (see
+ * zfsctl_snapdir_lookup()).
  */
 
 #include <fs/fs_subr.h>
@@ -578,6 +588,9 @@
 	return (err);
 }
 
+/*
+ * This creates a snapshot under '.zfs/snapshot'.
+ */
 /* ARGSUSED */
 static int
 zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t  **vpp,
@@ -711,6 +724,9 @@
 	if (err == 0) {
 		/*
 		 * Return the mounted root rather than the covered mount point.
+		 * Takes the GFS vnode at .zfs/snapshot/<snapname> and returns
+		 * the ZFS vnode mounted on top of the GFS node.  This ZFS
+		 * vnode is the root of the newly created vfsp.
 		 */
 		VFS_RELE(vfsp);
 		err = traverse(vpp);
@@ -718,11 +734,11 @@
 
 	if (err == 0) {
 		/*
-		 * Fix up the root vnode.
+		 * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
 		 *
 		 * This is where we lie about our v_vfsp in order to
-		 * make .zfs/snapshot/<snapdir> accessible over NFS
-		 * without requiring manual mounts of <snapdir>.
+		 * make .zfs/snapshot/<snapname> accessible over NFS
+		 * without requiring manual mounts of <snapname>.
 		 */
 		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
 		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
@@ -771,6 +787,13 @@
 	return (0);
 }
 
+/*
+ * pvp is the '.zfs' directory (zfsctl_node_t).
+ * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
+ *
+ * This function is the callback to create a GFS vnode for '.zfs/snapshot'
+ * when a lookup is performed on .zfs for "snapshot".
+ */
 vnode_t *
 zfsctl_mknode_snapdir(vnode_t *pvp)
 {
@@ -838,6 +861,13 @@
 	{ NULL }
 };
 
+/*
+ * pvp is the GFS vnode '.zfs/snapshot'.
+ *
+ * This creates a GFS node under '.zfs/snapshot' representing each
+ * snapshot.  This newly created GFS node is what we mount snapshot
+ * vfs_t's on top of.
+ */
 static vnode_t *
 zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
 {
@@ -937,6 +967,12 @@
 
 	if (sep != NULL) {
 		VN_HOLD(vp);
+		/*
+		 * Return the mounted root rather than the covered mount point.
+		 * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid>
+		 * and returns the ZFS vnode mounted on top of the GFS node.
+		 * This ZFS vnode is the root of the vfs for objset 'objsetid'.
+		 */
 		error = traverse(&vp);
 		if (error == 0) {
 			if (vp == sep->se_root)
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Wed Oct 24 16:50:08 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Wed Oct 24 16:54:46 2007 -0700
@@ -63,6 +63,8 @@
 #include <sys/zvol.h>
 #include <sharefs/share.h>
 #include <sys/zfs_znode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/dmu_objset.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
@@ -1671,7 +1673,8 @@
 	default:
 		cbfunc = NULL;
 	}
-	if (strchr(zc->zc_name, '@'))
+	if (strchr(zc->zc_name, '@') ||
+	    strchr(zc->zc_name, '%'))
 		return (EINVAL);
 
 	if (zc->zc_nvlist_src != NULL &&
@@ -1847,7 +1850,8 @@
 	boolean_t recursive = zc->zc_cookie & 1;
 
 	zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
-	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0)
+	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
+	    strchr(zc->zc_value, '%'))
 		return (EINVAL);
 
 	/*
@@ -1869,21 +1873,84 @@
 zfs_ioc_recvbackup(zfs_cmd_t *zc)
 {
 	file_t *fp;
+	offset_t new_off;
+	objset_t *os;
+	zfsvfs_t *zfsvfs = NULL;
+	char *cp;
+	char cosname[MAXNAMELEN];
+	boolean_t force = (boolean_t)zc->zc_guid;
 	int error, fd;
-	offset_t new_off;
 
 	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
-	    strchr(zc->zc_value, '@') == NULL)
+	    strchr(zc->zc_value, '@') == NULL ||
+	    strchr(zc->zc_value, '%'))
 		return (EINVAL);
 
 	fd = zc->zc_cookie;
 	fp = getf(fd);
 	if (fp == NULL)
 		return (EBADF);
+
+	/*
+	 * Get the zfsvfs for the receiving objset. There
+	 * won't be one if we're operating on a zvol, if the
+	 * objset doesn't exist yet, or is not mounted.
+	 */
+	cp = strchr(zc->zc_value, '@');
+	*cp = '\0';
+	error = dmu_objset_open(zc->zc_value, DMU_OST_ANY,
+	    DS_MODE_STANDARD | DS_MODE_READONLY, &os);
+	*cp = '@';
+	if (!error) {
+		if (dmu_objset_type(os) == DMU_OST_ZFS) {
+			mutex_enter(&os->os->os_user_ptr_lock);
+			zfsvfs = dmu_objset_get_user(os);
+			if (zfsvfs != NULL)
+				VFS_HOLD(zfsvfs->z_vfs);
+			mutex_exit(&os->os->os_user_ptr_lock);
+		}
+		dmu_objset_close(os);
+	}
+
 	error = dmu_recvbackup(zc->zc_value, &zc->zc_begin_record,
-	    &zc->zc_cookie, (boolean_t)zc->zc_guid, fp->f_vnode,
-	    fp->f_offset);
+	    &zc->zc_cookie, force, zfsvfs != NULL, fp->f_vnode,
+	    fp->f_offset, cosname);
+
+	/*
+	 * For incremental snapshots where we created a
+	 * temporary clone, we now swap zfsvfs::z_os with
+	 * the newly created and received "cosname".
+	 */
+	if (!error && zfsvfs != NULL) {
+		char osname[MAXNAMELEN];
+		int mode;
+
+		error = zfs_suspend_fs(zfsvfs, osname, &mode);
+		if (!error) {
+			int swap_err;
+			int snap_err = 0;
 
+			swap_err = dsl_dataset_clone_swap(cosname, force);
+			if (!swap_err) {
+				char *cp = strrchr(zc->zc_value, '@');
+
+				*cp = '\0';
+				snap_err = dmu_replay_end_snapshot(zc->zc_value,
+				    &zc->zc_begin_record);
+				*cp = '@';
+			}
+			error = zfs_resume_fs(zfsvfs, osname, mode);
+			if (!error)
+				error = swap_err;
+			if (!error)
+				error = snap_err;
+		}
+
+		/* destroy the clone we created */
+		(void) dmu_objset_destroy(cosname);
+	}
+	if (zfsvfs != NULL)
+		VFS_RELE(zfsvfs->z_vfs);
 	new_off = fp->f_offset + zc->zc_cookie;
 	if (VOP_SEEK(fp->f_vnode, fp->f_offset, &new_off) == 0)
 		fp->f_offset = new_off;
@@ -2327,6 +2394,7 @@
 
 
 uint_t zfs_fsyncer_key;
+extern uint_t rrw_tsd_key;
 
 int
 _init(void)
@@ -2345,6 +2413,7 @@
 	}
 
 	tsd_create(&zfs_fsyncer_key, NULL);
+	tsd_create(&rrw_tsd_key, NULL);
 
 	error = ldi_ident_from_mod(&modlinkage, &zfs_li);
 	ASSERT(error == 0);
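
After dmu_recvbackup() returns, the ioctl layer above settles three staged results: the resume error is reported first (the file system must come back regardless), then the clone-swap error, then the end-snapshot error. A compact sketch of that priority chain, with made-up errno values:

#include <stdio.h>

/* mirrors the error ordering in zfs_ioc_recvbackup() */
static int
settle(int resume_err, int swap_err, int snap_err)
{
	int error = resume_err;

	if (!error)
		error = swap_err;
	if (!error)
		error = snap_err;
	return (error);
}

int
main(void)
{
	printf("%d\n", settle(0, 0, 0));	/* all stages ok */
	printf("%d\n", settle(0, 16, 17));	/* swap error masks snap */
	printf("%d\n", settle(5, 16, 0));	/* resume error wins */
	return (0);
}
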
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c	Wed Oct 24 16:50:08 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c	Wed Oct 24 16:54:46 2007 -0700
@@ -59,6 +59,7 @@
 #include <sys/bootconf.h>
 #include <sys/sunddi.h>
 #include <sys/dnlc.h>
+#include <sys/dmu_objset.h>
 
 int zfsfstype;
 vfsops_t *zfs_vfsops = NULL;
@@ -498,6 +499,76 @@
 }
 
 static int
+zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
+{
+	uint_t readonly;
+	int error;
+
+	error = zfs_register_callbacks(zfsvfs->z_vfs);
+	if (error)
+		return (error);
+
+	/*
+	 * Set the objset user_ptr to track its zfsvfs.
+	 */
+	mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
+	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+	mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
+
+	/*
+	 * If we are not mounting (ie: online recv), then we don't
+	 * have to worry about replaying the log as we blocked all
+	 * operations out since we closed the ZIL.
+	 */
+	if (mounting) {
+		/*
+		 * During replay we remove the read only flag to
+		 * allow replays to succeed.
+		 */
+		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
+		if (readonly != 0)
+			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+		else
+			zfs_unlinked_drain(zfsvfs);
+
+		/*
+		 * Parse and replay the intent log.
+		 *
+		 * Because of ziltest, this must be done after
+		 * zfs_unlinked_drain().  (Further note: ziltest doesn't
+		 * use readonly mounts, where zfs_unlinked_drain() isn't
+		 * called.)  This is because ziltest causes spa_sync()
+		 * to think it's committed, but actually it is not, so
+		 * the intent log contains many txg's worth of changes.
+		 *
+		 * In particular, if object N is in the unlinked set in
+		 * the last txg to actually sync, then it could be
+		 * actually freed in a later txg and then reallocated in
+		 * a yet later txg.  This would write a "create object
+		 * N" record to the intent log.  Normally, this would be
+		 * fine because the spa_sync() would have written out
+		 * the fact that object N is free, before we could write
+		 * the "create object N" intent log record.
+		 *
+		 * But when we are in ziltest mode, we advance the "open
+		 * txg" without actually spa_sync()-ing the changes to
+		 * disk.  So we would see that object N is still
+		 * allocated and in the unlinked set, and there is an
+		 * intent log record saying to allocate it.
+		 */
+		zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
+		    zfs_replay_vector);
+
+		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
+	}
+
+	if (!zil_disable)
+		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+
+	return (0);
+}
+
+static int
 zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr)
 {
 	dev_t mount_dev;
@@ -525,8 +596,8 @@
 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
-	rw_init(&zfsvfs->z_unmount_lock, NULL, RW_DEFAULT, NULL);
-	rw_init(&zfsvfs->z_unmount_inactive_lock, NULL, RW_DEFAULT, NULL);
+	rrw_init(&zfsvfs->z_teardown_lock);
+	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
 
 	/* Initialize the generic filesystem structure. */
 	vfsp->vfs_bcount = 0;
@@ -583,54 +654,7 @@
 		xattr_changed_cb(zfsvfs, xattr);
 		zfsvfs->z_issnap = B_TRUE;
 	} else {
-		uint_t readonly;
-
-		error = zfs_register_callbacks(vfsp);
-		if (error)
-			goto out;
-
-		/*
-		 * During replay we remove the read only flag to
-		 * allow replays to succeed.
-		 */
-		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
-		if (readonly != 0)
-			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
-		else
-			zfs_unlinked_drain(zfsvfs);
-
-		/*
-		 * Parse and replay the intent log.
-		 *
-		 * Because of ziltest, this must be done after
-		 * zfs_unlinked_drain().  (Further note: ziltest doesn't
-		 * use readonly mounts, where zfs_unlinked_drain() isn't
-		 * called.)  This is because ziltest causes spa_sync()
-		 * to think it's committed, but actually it is not, so
-		 * the intent log contains many txg's worth of changes.
-		 *
-		 * In particular, if object N is in the unlinked set in
-		 * the last txg to actually sync, then it could be
-		 * actually freed in a later txg and then reallocated in
-		 * a yet later txg.  This would write a "create object
-		 * N" record to the intent log.  Normally, this would be
-		 * fine because the spa_sync() would have written out
-		 * the fact that object N is free, before we could write
-		 * the "create object N" intent log record.
-		 *
-		 * But when we are in ziltest mode, we advance the "open
-		 * txg" without actually spa_sync()-ing the changes to
-		 * disk.  So we would see that object N is still
-		 * allocated and in the unlinked set, and there is an
-		 * intent log record saying to allocate it.
-		 */
-		zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
-		    zfs_replay_vector);
-
-		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
-
-		if (!zil_disable)
-			zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+		error = zfsvfs_setup(zfsvfs, B_TRUE);
 	}
 
 	if (!zfsvfs->z_issnap)
@@ -641,8 +665,8 @@
 			dmu_objset_close(zfsvfs->z_os);
 		mutex_destroy(&zfsvfs->z_znodes_lock);
 		list_destroy(&zfsvfs->z_all_znodes);
-		rw_destroy(&zfsvfs->z_unmount_lock);
-		rw_destroy(&zfsvfs->z_unmount_inactive_lock);
+		rrw_destroy(&zfsvfs->z_teardown_lock);
+		rw_destroy(&zfsvfs->z_teardown_inactive_lock);
 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
 	} else {
 		atomic_add_32(&zfs_active_fs_count, 1);
@@ -1019,13 +1043,130 @@
 	return (error);
 }
 
+/*
+ * Teardown the zfsvfs::z_os.
+ *
+ * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
+ * and 'z_teardown_inactive_lock' held.
+ */
+static int
+zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
+{
+	objset_t *os = zfsvfs->z_os;
+	znode_t	*zp, *nextzp;
+	znode_t markerzp;
+
+	rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+
+	if (!unmounting) {
+		/*
+		 * We purge the parent filesystem's vfsp as the parent
+		 * filesystem and all of its snapshots have their vnode's
+		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
+		 * 'z_parent' is self referential for non-snapshots.
+		 */
+		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
+	}
+
+	/*
+	 * Close the zil. NB: Can't close the zil while zfs_inactive
+	 * threads are blocked as zil_close can call zfs_inactive.
+	 */
+	if (zfsvfs->z_log) {
+		zil_close(zfsvfs->z_log);
+		zfsvfs->z_log = NULL;
+	}
+
+	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
+
+	/*
+	 * If we are not unmounting (ie: online recv) and someone already
+	 * unmounted this file system while we were doing the switcheroo,
+	 * or a reopen of z_os failed then just bail out now.
+	 */
+	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+		return (EIO);
+	}
+
+	/*
+	 * At this point there are no vops active, and any new vops will
+	 * fail with EIO since we have z_teardown_lock for writer (only
+	 * relevant for forced unmount).
+	 *
+	 * Release all holds on dbufs.
+	 * Note, the dmu can still callback via znode_pageout_func()
+	 * which can zfs_znode_free() the znode.  So we lock
+	 * z_all_znodes; search the list for a held dbuf; drop the lock
+	 * (we know zp can't disappear if we hold a dbuf lock) then
+	 * regrab the lock and restart.
+	 *
+	 * Since we have to restart the search after finding each held dbuf,
+	 * we do two things to speed up searching: we insert a dummy znode
+	 * ('markerzp') to detect the original tail of the list, and move
+	 * non-held znodes to the end of the list.  Once we hit 'markerzp',
+	 * we know we've looked at each znode and can break out.
+	 */
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	list_insert_tail(&zfsvfs->z_all_znodes, &markerzp);
+	for (zp = list_head(&zfsvfs->z_all_znodes); zp != &markerzp;
+	    zp = nextzp) {
+		nextzp = list_next(&zfsvfs->z_all_znodes, zp);
+		if (zp->z_dbuf_held) {
+			/* dbufs should only be held when force unmounting */
+			zp->z_dbuf_held = 0;
+			mutex_exit(&zfsvfs->z_znodes_lock);
+			dmu_buf_rele(zp->z_dbuf, NULL);
+			/* Start again */
+			mutex_enter(&zfsvfs->z_znodes_lock);
+			nextzp = list_head(&zfsvfs->z_all_znodes);
+		} else {
+			list_remove(&zfsvfs->z_all_znodes, zp);
+			list_insert_tail(&zfsvfs->z_all_znodes, zp);
+		}
+	}
+	list_remove(&zfsvfs->z_all_znodes, &markerzp);
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	/*
+	 * If we are unmounting, set the unmounted flag and let new vops
+	 * unblock.  zfs_inactive will have the unmounted behavior, and all
+	 * other vops will fail with EIO.
+	 */
+	if (unmounting) {
+		zfsvfs->z_unmounted = B_TRUE;
+		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+	}
+
+	/*
+	 * z_os will be NULL if there was an error in attempting to reopen
+	 * zfsvfs, so just return as the properties had already been
+	 * unregistered and cached data had been evicted before.
+	 */
+	if (zfsvfs->z_os == NULL)
+		return (0);
+
+	/*
+	 * Unregister properties.
+	 */
+	zfs_unregister_callbacks(zfsvfs);
+
+	/*
+	 * Evict cached data
+	 */
+	(void) dmu_objset_evict_dbufs(os);
+
+	return (0);
+}
+
 /*ARGSUSED*/
 static int
 zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
-	objset_t *os = zfsvfs->z_os;
-	znode_t	*zp, *nextzp;
+	objset_t *os;
 	int ret;
 
 	ret = secpolicy_fs_unmount(cr, vfsp);
@@ -1069,79 +1210,35 @@
 				return (EBUSY);
 		} else {
 			if (vfsp->vfs_count > 2 ||
-			    zfsvfs->z_ctldir->v_count > 1) {
+			    zfsvfs->z_ctldir->v_count > 1)
 				return (EBUSY);
-			}
 		}
 	}
 
 	vfsp->vfs_flag |= VFS_UNMOUNTED;
 
-	rw_enter(&zfsvfs->z_unmount_lock, RW_WRITER);
-	rw_enter(&zfsvfs->z_unmount_inactive_lock, RW_WRITER);
-
-	/*
-	 * At this point there are no vops active, and any new vops will
-	 * fail with EIO since we have z_unmount_lock for writer (only
-	 * relavent for forced unmount).
-	 *
-	 * Release all holds on dbufs.
-	 * Note, the dmu can still callback via znode_pageout_func()
-	 * which can zfs_znode_free() the znode.  So we lock
-	 * z_all_znodes; search the list for a held dbuf; drop the lock
-	 * (we know zp can't disappear if we hold a dbuf lock) then
-	 * regrab the lock and restart.
-	 */
-	mutex_enter(&zfsvfs->z_znodes_lock);
-	for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) {
-		nextzp = list_next(&zfsvfs->z_all_znodes, zp);
-		if (zp->z_dbuf_held) {
-			/* dbufs should only be held when force unmounting */
-			zp->z_dbuf_held = 0;
-			mutex_exit(&zfsvfs->z_znodes_lock);
-			dmu_buf_rele(zp->z_dbuf, NULL);
-			/* Start again */
-			mutex_enter(&zfsvfs->z_znodes_lock);
-			nextzp = list_head(&zfsvfs->z_all_znodes);
-		}
-	}
-	mutex_exit(&zfsvfs->z_znodes_lock);
+	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
+	os = zfsvfs->z_os;
 
 	/*
-	 * Set the unmounted flag and let new vops unblock.
-	 * zfs_inactive will have the unmounted behavior, and all other
-	 * vops will fail with EIO.
+	 * z_os will be NULL if there was an error in
+	 * attempting to reopen zfsvfs.
 	 */
-	zfsvfs->z_unmounted = B_TRUE;
-	rw_exit(&zfsvfs->z_unmount_lock);
-	rw_exit(&zfsvfs->z_unmount_inactive_lock);
-
-	/*
-	 * Unregister properties.
-	 */
-	if (!dmu_objset_is_snapshot(os))
-		zfs_unregister_callbacks(zfsvfs);
+	if (os != NULL) {
+		/*
+		 * Unset the objset user_ptr.
+		 */
+		mutex_enter(&os->os->os_user_ptr_lock);
+		dmu_objset_set_user(os, NULL);
+		mutex_exit(&os->os->os_user_ptr_lock);
 
-	/*
-	 * Close the zil. NB: Can't close the zil while zfs_inactive
-	 * threads are blocked as zil_close can call zfs_inactive.
-	 */
-	if (zfsvfs->z_log) {
-		zil_close(zfsvfs->z_log);
-		zfsvfs->z_log = NULL;
+		/*
+		 * Finally close the objset
+		 */
+		dmu_objset_close(os);
 	}
 
 	/*
-	 * Evict cached data
-	 */
-	(void) dmu_objset_evict_dbufs(os);
-
-	/*
-	 * Finally close the objset
-	 */
-	dmu_objset_close(os);
-
-	/*
 	 * We can now safely destroy the '.zfs' directory node.
 	 */
 	if (zfsvfs->z_ctldir != NULL)
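
Clearing the objset user pointer above severs the mapping that lets the rest
of ZFS find a mounted zfsvfs_t from an open objset; the receive ioctl depends
on that mapping to decide whether a suspend/resume cycle is needed.  A hedged
sketch of the lookup side, assuming a dmu_objset_get_user() accessor paired
with the dmu_objset_set_user() call shown above:

	zfsvfs_t *zfsvfs;

	/*
	 * Sketch only: fetch the zfsvfs_t registered at mount time and
	 * hold the vfs before dropping the user-pointer lock, so the
	 * mapping cannot be torn down underneath us.
	 */
	mutex_enter(&os->os->os_user_ptr_lock);
	zfsvfs = dmu_objset_get_user(os);
	if (zfsvfs != NULL)
		VFS_HOLD(zfsvfs->z_vfs);
	mutex_exit(&os->os->os_user_ptr_lock);
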
@@ -1234,6 +1331,77 @@
 	return (0);
 }
 
+/*
+ * Block out VOPs and close zfsvfs_t::z_os
+ *
+ * Note that, if successful, we return with 'z_teardown_lock' and
+ * 'z_teardown_inactive_lock' held for writer.
+ */
+int
+zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode)
+{
+	int error;
+
+	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
+		return (error);
+
+	*mode = zfsvfs->z_os->os_mode;
+	dmu_objset_name(zfsvfs->z_os, name);
+	dmu_objset_close(zfsvfs->z_os);
+
+	return (0);
+}
+
+/*
+ * Reopen zfsvfs_t::z_os and release VOPs.
+ */
+int
+zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
+{
+	int err;
+
+	ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
+	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
+
+	err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
+	if (err) {
+		zfsvfs->z_os = NULL;
+	} else {
+		znode_t *zp;
+
+		VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+
+		/*
+		 * Attempt to re-establish all the active znodes with
+		 * their dbufs.  If a zfs_rezget() fails, then we'll let
+		 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
+		 * when they try to use their znode.
+		 */
+		mutex_enter(&zfsvfs->z_znodes_lock);
+		for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+			ASSERT(!zp->z_dbuf_held);
+			(void) zfs_rezget(zp);
+		}
+		mutex_exit(&zfsvfs->z_znodes_lock);
+	}
+
+	/* release the VOPs */
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);
+	rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+
+	if (err) {
+		/*
+		 * Since we couldn't reopen zfsvfs::z_os, force
+		 * unmount this file system.
+		 */
+		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
+			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
+	}
+	return (err);
+}
+
 static void
 zfs_freevfs(vfs_t *vfsp)
 {
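
Together, zfs_suspend_fs() and zfs_resume_fs() above are what make
'zfs recv' work on a mounted file system: the caller quiesces the mount,
replaces the dataset contents underneath it, then reopens and revalidates
everything.  A hedged sketch of how a caller (for example, the receive
ioctl) might bracket such an operation; modify_dataset() is a placeholder
for the rollback/receive step:

	char osname[MAXNAMELEN];
	int mode, error, resume_err;

	/* Block out VOPs, record the objset name and mode, close z_os. */
	if ((error = zfs_suspend_fs(zfsvfs, osname, &mode)) != 0)
		return (error);

	/*
	 * The dataset may now be modified out from under the mounted
	 * file system, e.g. rolled back to a just-received snapshot.
	 */
	error = modify_dataset(osname);		/* hypothetical step */

	/*
	 * Reopen z_os and re-validate every cached znode.  If the reopen
	 * fails, zfs_resume_fs() force-unmounts the file system for us.
	 */
	resume_err = zfs_resume_fs(zfsvfs, osname, mode);
	return (error != 0 ? error : resume_err);
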
@@ -1245,8 +1413,8 @@
 
 	mutex_destroy(&zfsvfs->z_znodes_lock);
 	list_destroy(&zfsvfs->z_all_znodes);
-	rw_destroy(&zfsvfs->z_unmount_lock);
-	rw_destroy(&zfsvfs->z_unmount_inactive_lock);
+	rrw_destroy(&zfsvfs->z_teardown_lock);
+	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 
 	atomic_add_32(&zfs_active_fs_count, -1);
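
Note the lock swap above: the plain z_unmount_lock rwlock becomes the
re-entrant z_teardown_lock (the new rrwlock.c in this changeset).  The
re-entrancy matters because a thread already inside a VOP, holding the lock
as reader, can recursively enter another VOP on the same file system; with
an ordinary krwlock_t, a queued writer (the teardown path) would block that
nested read and deadlock.  A usage sketch implied by the call sites in this
diff; the recursive-read guarantee is this author's reading of the rrwlock
design, so treat it as an assumption:

	/* Reader side: every VOP entry (what ZFS_ENTER boils down to). */
	rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
	/*
	 * A nested acquisition by the same thread succeeds even if a
	 * writer is queued, which a plain krwlock_t cannot promise.
	 */
	rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
	rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
	rrw_exit(&zfsvfs->z_teardown_lock, FTAG);

	/* Writer side: unmount and suspend (see zfsvfs_teardown() above). */
	rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
	ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
	rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
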
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c	Wed Oct 24 16:50:08 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c	Wed Oct 24 16:54:46 2007 -0700
@@ -83,8 +83,9 @@
  * to freed memory.  The example below illustrates the following Big Rules:
  *
  *  (1) A check must be made in each zfs thread for a mounted file system.
- *	This is done avoiding races using ZFS_ENTER(zfsvfs).
- *	A ZFS_EXIT(zfsvfs) is needed before all returns.
+ *	This is done to avoid races, using ZFS_ENTER(zfsvfs) or
+ *	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp).  A ZFS_EXIT(zfsvfs) is needed
+ *	before all returns.
  *
  *  (2)	VN_RELE() should always be the last thing except for zil_commit()
  *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
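
Most of the zfs_vnops.c hunks that follow are mechanical: each VOP entry
point that used to do a bare ZFS_ENTER() now also verifies its znode,
bailing out with EIO if the znode lost its dbuf during a teardown.  The
macros themselves live in sys/zfs_znode.h, which this excerpt does not show;
based on the open-coded checks in the zfs_rename() and zfs_link() hunks
below, they plausibly expand to something like this reconstructed sketch:

	#define	ZFS_ENTER(zfsvfs) \
		{ \
			rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG); \
			if ((zfsvfs)->z_unmounted) { \
				ZFS_EXIT(zfsvfs); \
				return (EIO); \
			} \
		}

	#define	ZFS_EXIT(zfsvfs)	rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG)

	/* Additionally verify the znode still has its dbuf attached. */
	#define	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp) \
		{ \
			ZFS_ENTER(zfsvfs); \
			if (!(zp)->z_dbuf_held) { \
				ZFS_EXIT(zfsvfs); \
				return (EIO); \
			} \
		}

Because these macros return from the calling function, a VOP that must
validate a second znode after it has already entered (zfs_rename()'s target
directory, zfs_link()'s source vnode) open-codes the z_dbuf_held check
instead, as the hunks below show.
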
@@ -239,6 +240,7 @@
 	offset_t off;
 	int error;
 	zfsvfs_t *zfsvfs;
+	znode_t *zp;
 
 	switch (com) {
 	case _FIOFFS:
@@ -257,8 +259,9 @@
 		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
 			return (EFAULT);
 
-		zfsvfs = VTOZ(vp)->z_zfsvfs;
-		ZFS_ENTER(zfsvfs);
+		zp = VTOZ(vp);
+		zfsvfs = zp->z_zfsvfs;
+		ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 
 		/* offset parameter is in/out */
 		error = zfs_holey(vp, com, &off);
@@ -398,12 +401,13 @@
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	objset_t	*os = zfsvfs->z_os;
+	objset_t	*os;
 	ssize_t		n, nbytes;
 	int		error;
 	rl_t		*rl;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
+	os = zfsvfs->z_os;
 
 	/*
 	 * Validate file offset
@@ -568,7 +572,7 @@
 	uint64_t	end_size;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
+	zilog_t		*zilog;
 	offset_t	woff;
 	ssize_t		n, nbytes;
 	rl_t		*rl;
@@ -585,7 +589,8 @@
 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 		limit = MAXOFFSET_T;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
+	zilog = zfsvfs->z_log;
 
 	/*
 	 * Pre-fault the pages to ensure slow (eg NFS) pages
@@ -906,7 +911,7 @@
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 	error = zfs_zaccess_rwx(zp, mode, cr);
 	ZFS_EXIT(zfsvfs);
 	return (error);
@@ -941,7 +946,7 @@
 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
 	int	error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zdp);
 
 	*vpp = NULL;
 
@@ -1044,14 +1049,16 @@
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
-	objset_t	*os = zfsvfs->z_os;
+	zilog_t		*zilog;
+	objset_t	*os;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 	uint64_t	zoid;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+	os = zfsvfs->z_os;
+	zilog = zfsvfs->z_log;
 
 top:
 	*vpp = NULL;
@@ -1221,7 +1228,7 @@
 	znode_t		*xzp = NULL;
 	vnode_t		*vp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
+	zilog_t		*zilog;
 	uint64_t	acl_obj, xattr_obj;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
@@ -1229,7 +1236,8 @@
 	boolean_t	unlinked;
 	int		error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+	zilog = zfsvfs->z_log;
 
 top:
 	/*
@@ -1386,7 +1394,7 @@
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
+	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	uint64_t	zoid = 0;
 	dmu_tx_t	*tx;
@@ -1394,7 +1402,8 @@
 
 	ASSERT(vap->va_type == VDIR);
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+	zilog = zfsvfs->z_log;
 
 	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
 		ZFS_EXIT(zfsvfs);
@@ -1483,12 +1492,13 @@
 	znode_t		*zp;
 	vnode_t		*vp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
+	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+	zilog = zfsvfs->z_log;
 
 top:
 	zp = NULL;
@@ -1613,7 +1623,7 @@
 	int		error;
 	uint8_t		prefetch;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 
 	/*
 	 * If we are not given an eof variable,
@@ -1812,7 +1822,7 @@
 
 	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
 	ZFS_EXIT(zfsvfs);
 	return (0);
@@ -1837,11 +1847,12 @@
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	znode_phys_t *pzp = zp->z_phys;
+	znode_phys_t *pzp;
 	int	error;
 	uint64_t links;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
+	pzp = zp->z_phys;
 
 	/*
 	 * Return all attributes.  It's cheaper to provide the answer
@@ -1917,10 +1928,10 @@
 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 	caller_context_t *ct)
 {
-	struct znode	*zp = VTOZ(vp);
-	znode_phys_t	*pzp = zp->z_phys;
+	znode_t		*zp = VTOZ(vp);
+	znode_phys_t	*pzp;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
+	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	vattr_t		oldva;
 	uint_t		mask = vap->va_mask;
@@ -1943,7 +1954,9 @@
 	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO)
 		return (EINVAL);
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
+	pzp = zp->z_phys;
+	zilog = zfsvfs->z_log;
 
 top:
 	attrzp = NULL;
@@ -2298,14 +2311,15 @@
 	znode_t		*tdzp, *szp, *tzp;
 	znode_t		*sdzp = VTOZ(sdvp);
 	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
+	zilog_t		*zilog;
 	vnode_t		*realvp;
 	zfs_dirlock_t	*sdl, *tdl;
 	dmu_tx_t	*tx;
 	zfs_zlock_t	*zl;
 	int		cmp, serr, terr, error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, sdzp);
+	zilog = zfsvfs->z_log;
 
 	/*
 	 * Make sure we have the real vp for the target directory.
@@ -2319,6 +2333,10 @@
 	}
 
 	tdzp = VTOZ(tdvp);
+	if (!tdzp->z_dbuf_held) {
+		ZFS_EXIT(zfsvfs);
+		return (EIO);
+	}
 top:
 	szp = NULL;
 	tzp = NULL;
@@ -2529,14 +2547,15 @@
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
+	zilog_t		*zilog;
 	uint64_t	zoid;
 	int		len = strlen(link);
 	int		error;
 
 	ASSERT(vap->va_type == VLNK);
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+	zilog = zfsvfs->z_log;
 top:
 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
 		ZFS_EXIT(zfsvfs);
@@ -2650,7 +2669,7 @@
 	size_t		bufsz;
 	int		error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 
 	bufsz = (size_t)zp->z_phys->zp_size;
 	if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
@@ -2695,7 +2714,7 @@
 	znode_t		*dzp = VTOZ(tdvp);
 	znode_t		*tzp, *szp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zilog_t		*zilog = zfsvfs->z_log;
+	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	vnode_t		*realvp;
@@ -2703,7 +2722,8 @@
 
 	ASSERT(tdvp->v_type == VDIR);
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, dzp);
+	zilog = zfsvfs->z_log;
 
 	if (VOP_REALVP(svp, &realvp) == 0)
 		svp = realvp;
@@ -2714,6 +2734,10 @@
 	}
 
 	szp = VTOZ(svp);
+	if (!szp->z_dbuf_held) {
+		ZFS_EXIT(zfsvfs);
+		return (EIO);
+	}
 top:
 	/*
 	 * We do not support links between attributes and non-attributes
@@ -2947,7 +2971,7 @@
 	uint64_t	filesz;
 	int		error = 0;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 
 	ASSERT(zp->z_dbuf_held && zp->z_phys);
 
@@ -3005,10 +3029,8 @@
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error;
 
-	rw_enter(&zfsvfs->z_unmount_inactive_lock, RW_READER);
-	if (zfsvfs->z_unmounted) {
-		ASSERT(zp->z_dbuf_held == 0);
-
+	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+	if (zp->z_dbuf_held == 0) {
 		if (vn_has_cached_data(vp)) {
 			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
 			    B_INVAL, cr);
@@ -3022,7 +3044,7 @@
 		} else {
 			mutex_exit(&zp->z_lock);
 		}
-		rw_exit(&zfsvfs->z_unmount_inactive_lock);
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 		VFS_RELE(zfsvfs->z_vfs);
 		return;
 	}
@@ -3053,7 +3075,7 @@
 	}
 
 	zfs_zinactive(zp);
-	rw_exit(&zfsvfs->z_unmount_inactive_lock);
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 }
 
 /*
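
The rework of zfs_inactive() above keys off z_dbuf_held instead of
z_unmounted because a znode can now lose its dbuf while the file system
stays mounted: if zfs_rezget() fails after a resume, the znode is left
dbuf-less and every later VOP on it returns EIO.  In that state the inactive
path must shed only in-core state and never touch the DMU.  Condensed from
the hunk above (middle elided):

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_dbuf_held == 0) {
		if (vn_has_cached_data(vp))
			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
			    B_INVAL, cr);
		/* ... free the in-core znode state only ... */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		VFS_RELE(zfsvfs->z_vfs);
		return;
	}
	/* normal inactive processing follows, ending in zfs_zinactive() */
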
@@ -3087,7 +3109,7 @@
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 
 	/*
 	 * We are following the UFS semantics with respect to mapcnt
@@ -3239,7 +3261,7 @@
 	int		need_unlock = 0, err = 0;
 	offset_t	orig_off;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 
 	if (protp)
 		*protp = PROT_ALL;
@@ -3371,7 +3393,7 @@
 	segvn_crargs_t	vn_a;
 	int		error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 
 	if (vp->v_flag & VNOMAP) {
 		ZFS_EXIT(zfsvfs);
@@ -3507,7 +3529,7 @@
 	uint64_t	off, len;
 	int		error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 
 top:
 	if (cmd != F_FREESP) {
@@ -3542,12 +3564,13 @@
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	uint32_t	gen = (uint32_t)zp->z_phys->zp_gen;
+	uint32_t	gen;
 	uint64_t	object = zp->z_id;
 	zfid_short_t	*zfid;
 	int		size, i;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
+	gen = (uint32_t)zp->z_gen;
 
 	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
 	if (fidp->fid_len < size) {
@@ -3607,7 +3630,7 @@
 	case _PC_XATTR_EXISTS:
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
-		ZFS_ENTER(zfsvfs);
+		ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 		*valp = 0;
 		error = zfs_dirent_lock(&dl, zp, "", &xzp,
 		    ZXATTR | ZEXISTS | ZSHARED);
@@ -3647,7 +3670,7 @@
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 	error = zfs_getacl(zp, vsecp, cr);
 	ZFS_EXIT(zfsvfs);
 
@@ -3662,7 +3685,7 @@
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error;
 
-	ZFS_ENTER(zfsvfs);
+	ZFS_ENTER_VERIFY_ZP(zfsvfs, zp);
 	error = zfs_setacl(zp, vsecp, cr);
 	ZFS_EXIT(zfsvfs);
 	return (error);
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c	Wed Oct 24 16:50:08 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c	Wed Oct 24 16:54:46 2007 -0700
@@ -41,7 +41,6 @@
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/kmem.h>
-#include <sys/cmn_err.h>
 #include <sys/errno.h>
 #include <sys/unistd.h>
 #include <sys/mode.h>
@@ -417,6 +416,7 @@
 	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
+	zp->z_gen = zp->z_phys->zp_gen;
 
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
@@ -706,6 +706,53 @@
 	return (0);
 }
 
+int
+zfs_rezget(znode_t *zp)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	dmu_object_info_t doi;
+	dmu_buf_t *db;
+	uint64_t obj_num = zp->z_id;
+	int err;
+
+	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
+	if (err) {
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (err);
+	}
+
+	dmu_object_info_from_db(db, &doi);
+	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
+	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
+		dmu_buf_rele(db, NULL);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (EINVAL);
+	}
+
+	ASSERT(db->db_object == obj_num);
+	ASSERT(db->db_offset == -1);
+	ASSERT(db->db_data != NULL);
+
+	if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
+		dmu_buf_rele(db, NULL);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (EIO);
+	}
+
+	zp->z_dbuf = db;
+	zp->z_phys = db->db_data;
+	zfs_znode_dmu_init(zp);
+	zp->z_unlinked = (zp->z_phys->zp_links == 0);
+
+	/* release the hold from zfs_znode_dmu_init() */
+	VFS_RELE(zfsvfs->z_vfs);
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+	return (0);
+}
+
 void
 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
 {
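
zfs_rezget() above is the heart of online receive: after the dataset is
rolled back, every cached znode re-fetches its bonus buffer, and the stored
generation number is the only thing that distinguishes "the same file" from
"a different file that reused this object number after the rollback".  A
small self-contained model of that check; every type and function here is
illustrative, not ZFS code:

	#include <errno.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Models the on-disk object table after a rollback. */
	struct disk_obj {
		int		exists;
		uint64_t	gen;	/* bumped when an object number is reused */
	};

	/* Models a cached znode that survived the suspend. */
	struct cached_znode {
		uint64_t	obj_num;
		uint64_t	gen;	/* saved from zp_phys->zp_gen at zget time */
	};

	static int
	rezget(const struct cached_znode *zp, const struct disk_obj *table,
	    size_t nobj)
	{
		if (zp->obj_num >= nobj || !table[zp->obj_num].exists)
			return (ENOENT);	/* object gone after rollback */
		if (table[zp->obj_num].gen != zp->gen)
			return (EIO);		/* object number reused: stale */
		return (0);			/* same file; reattach dbuf */
	}

	int
	main(void)
	{
		struct disk_obj table[3] = {
			{ 1, 100 },	/* untouched by the rollback */
			{ 1, 777 },	/* deleted and recreated: new gen */
			{ 0, 0 },	/* deleted outright */
		};
		struct cached_znode znodes[3] = {
			{ 0, 100 }, { 1, 101 }, { 2, 102 },
		};

		for (int i = 0; i < 3; i++)
			printf("obj %d -> %d\n", i, rezget(&znodes[i], table, 3));
		return (0);
	}

A znode that fails this check keeps z_dbuf_held clear, so subsequent VOPs
bounce off ZFS_ENTER_VERIFY_ZP() with EIO and the final VN_RELE() takes the
dbuf-less path through zfs_inactive() shown earlier.
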