PSARC 2007/555 zfs fs-only quotas and reservations onnv_77
author ck153898
date Mon, 29 Oct 2007 22:45:33 -0700
changeset 5378:111aa1baa84a
parent 5377:549efb40503e
child 5379:7a18baf3c731
PSARC 2007/555 zfs fs-only quotas and reservations
6431277 want filesystem-only quotas
6483677 need immediate reservation
usr/src/cmd/fs.d/df.c
usr/src/cmd/zfs/zfs_main.c
usr/src/cmd/zpool/zpool_main.c
usr/src/common/zfs/zfs_prop.c
usr/src/lib/libzfs/common/libzfs_dataset.c
usr/src/lib/libzfs/common/libzfs_util.c
usr/src/lib/libzfs_jni/common/libzfs_jni_property.c
usr/src/uts/common/fs/zfs/dmu_send.c
usr/src/uts/common/fs/zfs/dmu_tx.c
usr/src/uts/common/fs/zfs/dsl_dataset.c
usr/src/uts/common/fs/zfs/dsl_dir.c
usr/src/uts/common/fs/zfs/dsl_prop.c
usr/src/uts/common/fs/zfs/dsl_synctask.c
usr/src/uts/common/fs/zfs/sys/dmu_tx.h
usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
usr/src/uts/common/fs/zfs/sys/dsl_dir.h
usr/src/uts/common/fs/zfs/sys/dsl_prop.h
usr/src/uts/common/fs/zfs/zfs_ioctl.c
usr/src/uts/common/sys/fs/zfs.h
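
This changeset introduces two dataset-local properties, refquota and refreservation, that limit and guarantee only the space a filesystem itself references, excluding snapshots and descendants (see zfs_prop.c and dsl_dataset.c below). The following is a minimal, illustrative sketch of how a libzfs consumer might set the new properties; the dataset name "tank/home", the sizes, and the omitted error handling are hypothetical choices, not part of this changeset.

/* illustrative only -- compile with -lzfs; dataset name and sizes are made up */
#include <stdio.h>
#include <libzfs.h>

int
main(void)
{
	libzfs_handle_t *hdl = libzfs_init();
	zfs_handle_t *zhp;

	if (hdl == NULL)
		return (1);

	zhp = zfs_open(hdl, "tank/home", ZFS_TYPE_FILESYSTEM);
	if (zhp != NULL) {
		/* bound the space referenced by this fs only (no snapshots) */
		(void) zfs_prop_set(zhp, "refquota", "10G");
		/* guarantee space for this fs only, again excluding snapshots */
		(void) zfs_prop_set(zhp, "refreservation", "5G");
		zfs_close(zhp);
	}
	libzfs_fini(hdl);
	return (0);
}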
--- a/usr/src/cmd/fs.d/df.c	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/cmd/fs.d/df.c	Mon Oct 29 22:45:33 2007 -0700
@@ -1224,55 +1224,60 @@
 adjust_total_blocks(struct df_request *dfrp, fsblkcnt64_t *total,
     uint64_t blocksize)
 {
-	zfs_handle_t	*zhp;
 	char *dataset, *slash;
-	uint64_t quota;
+	boolean_t first = TRUE;
+	uint64_t quota = 0;
 
-	if (strcmp(DFR_FSTYPE(dfrp), MNTTYPE_ZFS) != 0 ||
-	    !load_libzfs())
+	if (strcmp(DFR_FSTYPE(dfrp), MNTTYPE_ZFS) != 0 || !load_libzfs())
 		return;
 
 	/*
 	 * We want to get the total size for this filesystem as bounded by any
 	 * quotas. In order to do this, we start at the current filesystem and
-	 * work upwards until we find a dataset with a quota.  If we reach the
-	 * pool itself, then the total space is the amount used plus the amount
+	 * work upwards looking for the smallest quota.  When we reach the
+	 * pool itself, the quota is the amount used plus the amount
 	 * available.
 	 */
 	if ((dataset = strdup(DFR_SPECIAL(dfrp))) == NULL)
 		return;
 
 	slash = dataset + strlen(dataset);
-	do {
+	while (slash != NULL) {
+		zfs_handle_t *zhp;
+		uint64_t this_quota;
+
 		*slash = '\0';
 
-		if ((zhp = _zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET))
-		    == NULL) {
-			free(dataset);
-			return;
+		zhp = _zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET);
+		if (zhp == NULL)
+			break;
+
+		/* true at first iteration of loop */
+		if (first) {
+			quota = _zfs_prop_get_int(zhp, ZFS_PROP_REFQUOTA);
+			if (quota == 0)
+				quota = UINT64_MAX;
+			first = FALSE;
 		}
 
-		if ((quota = _zfs_prop_get_int(zhp, ZFS_PROP_QUOTA)) != 0) {
-			*total = quota / blocksize;
-			_zfs_close(zhp);
-			free(dataset);
-			return;
+		this_quota = _zfs_prop_get_int(zhp, ZFS_PROP_QUOTA);
+		if (this_quota && this_quota < quota)
+			quota = this_quota;
+
+		/* true at last iteration of loop */
+		if ((slash = strrchr(dataset, '/')) == NULL) {
+			uint64_t size;
+
+			size = _zfs_prop_get_int(zhp, ZFS_PROP_USED) +
+			    _zfs_prop_get_int(zhp, ZFS_PROP_AVAILABLE);
+			if (size < quota)
+				quota = size;
 		}
 
 		_zfs_close(zhp);
-
-	} while ((slash = strrchr(dataset, '/')) != NULL);
-
-
-	if ((zhp = _zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL) {
-		free(dataset);
-		return;
 	}
 
-	*total = (_zfs_prop_get_int(zhp, ZFS_PROP_USED) +
-	    _zfs_prop_get_int(zhp, ZFS_PROP_AVAILABLE)) / blocksize;
-
-	_zfs_close(zhp);
+	*total = quota / blocksize;
 	free(dataset);
 }
 
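The rewritten adjust_total_blocks() above starts from the mounted dataset's refquota, walks up toward the pool taking the smallest ancestor quota, and finally caps the result at the pool's used plus available space. A stand-alone sketch of that arithmetic with made-up figures (20G refquota, one 50G ancestor quota, 100G pool) follows; none of these numbers come from the changeset.

/* illustrative only: the smallest-bound computation df now applies to ZFS */
#include <stdio.h>
#include <inttypes.h>

int
main(void)
{
	uint64_t refquota = 20ULL << 30;		/* leaf refquota: 20G */
	uint64_t quotas[] = { 0, 50ULL << 30 };		/* ancestors: none, 50G */
	uint64_t pool_size = 100ULL << 30;		/* pool used + available */
	uint64_t total = refquota != 0 ? refquota : UINT64_MAX;
	int i;

	for (i = 0; i < 2; i++)
		if (quotas[i] != 0 && quotas[i] < total)
			total = quotas[i];
	if (pool_size < total)
		total = pool_size;

	(void) printf("df total = %" PRIu64 " GiB\n", total >> 30);	/* 20 */
	return (0);
}
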
--- a/usr/src/cmd/zfs/zfs_main.c	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/cmd/zfs/zfs_main.c	Mon Oct 29 22:45:33 2007 -0700
@@ -281,7 +281,7 @@
 {
 	FILE *fp = cb;
 
-	(void) fprintf(fp, "\t%-13s  ", zfs_prop_to_name(prop));
+	(void) fprintf(fp, "\t%-14s ", zfs_prop_to_name(prop));
 
 	if (prop == ZFS_PROP_CASE)
 		(void) fprintf(fp, "NO    ");
@@ -348,7 +348,7 @@
 		(void) fprintf(fp,
 		    gettext("\nThe following properties are supported:\n"));
 
-		(void) fprintf(fp, "\n\t%-13s  %s  %s   %s\n\n",
+		(void) fprintf(fp, "\n\t%-14s %s  %s   %s\n\n",
 		    "PROPERTY", "EDIT", "INHERIT", "VALUES");
 
 		/* Iterate over all properties */
@@ -1270,7 +1270,9 @@
 			(void) fprintf(stderr, gettext("'%s' property cannot "
 			    "be inherited\n"), propname);
 			if (prop == ZFS_PROP_QUOTA ||
-			    prop == ZFS_PROP_RESERVATION)
+			    prop == ZFS_PROP_RESERVATION ||
+			    prop == ZFS_PROP_REFQUOTA ||
+			    prop == ZFS_PROP_REFRESERVATION)
 				(void) fprintf(stderr, gettext("use 'zfs set "
 				    "%s=none' to clear\n"), propname);
 			return (1);
--- a/usr/src/cmd/zpool/zpool_main.c	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/cmd/zpool/zpool_main.c	Mon Oct 29 22:45:33 2007 -0700
@@ -3301,8 +3301,8 @@
 		(void) printf(gettext(" 6   bootfs pool property\n"));
 		(void) printf(gettext(" 7   Separate intent log devices\n"));
 		(void) printf(gettext(" 8   Delegated administration\n"));
-		(void) printf(gettext(" 9   Case insensitive support and "
-		    "File system unique identifiers (FUID)\n"));
+		(void) printf(gettext(" 9  refquota and refreservation "
+		    "properties\n"));
 		(void) printf(gettext("For more information on a particular "
 		    "version, including supported releases, see:\n\n"));
 		(void) printf("http://www.opensolaris.org/os/community/zfs/"
@@ -3385,6 +3385,8 @@
 	"rollback",
 	"snapshot",
 	"filesystem version upgrade",
+	"refquota set",
+	"refreservation set",
 };
 
 /*
--- a/usr/src/common/zfs/zfs_prop.c	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/common/zfs/zfs_prop.c	Mon Oct 29 22:45:33 2007 -0700
@@ -250,6 +250,11 @@
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size> | none", "RESERV");
 	register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
 	    ZFS_TYPE_VOLUME, "<size>", "VOLSIZE");
+	register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
+	    ZFS_TYPE_FILESYSTEM, "<size> | none", "REFQUOTA");
+	register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
+	    PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+	    "<size> | none", "REFRESERV");
 
 	/* inherit number properties */
 	register_number(ZFS_PROP_RECORDSIZE, "recordsize", SPA_MAXBLOCKSIZE,
--- a/usr/src/lib/libzfs/common/libzfs_dataset.c	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/lib/libzfs/common/libzfs_dataset.c	Mon Oct 29 22:45:33 2007 -0700
@@ -772,6 +772,7 @@
 
 			switch (prop) {
 			case ZFS_PROP_RESERVATION:
+			case ZFS_PROP_REFRESERVATION:
 				if (intval > volsize) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' is greater than current "
@@ -1627,6 +1628,7 @@
 			 */
 			switch (prop) {
 			case ZFS_PROP_QUOTA:
+			case ZFS_PROP_REFQUOTA:
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "size is less than current used or "
 				    "reserved space"));
@@ -1634,6 +1636,7 @@
 				break;
 
 			case ZFS_PROP_RESERVATION:
+			case ZFS_PROP_REFRESERVATION:
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "size is greater than available space"));
 				(void) zfs_error(hdl, EZFS_PROPSPACE, errbuf);
@@ -1953,7 +1956,9 @@
 		break;
 
 	case ZFS_PROP_QUOTA:
+	case ZFS_PROP_REFQUOTA:
 	case ZFS_PROP_RESERVATION:
+	case ZFS_PROP_REFRESERVATION:
 		*val = getprop_uint64(zhp, prop, source);
 		if (*val == 0)
 			*source = "";	/* default */
@@ -2122,7 +2127,10 @@
 		break;
 
 	case ZFS_PROP_QUOTA:
+	case ZFS_PROP_REFQUOTA:
 	case ZFS_PROP_RESERVATION:
+	case ZFS_PROP_REFRESERVATION:
+
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 
--- a/usr/src/lib/libzfs/common/libzfs_util.c	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/lib/libzfs/common/libzfs_util.c	Mon Oct 29 22:45:33 2007 -0700
@@ -1065,7 +1065,6 @@
 	const char *propname;
 	char *value;
 	boolean_t isnone = B_FALSE;
-	boolean_t boolval;
 
 	if (type == ZFS_TYPE_POOL) {
 		proptype = zpool_prop_get_type(prop);
@@ -1116,34 +1115,23 @@
 		/*
 		 * Quota special: force 'none' and don't allow 0.
 		 */
-		if ((type & ZFS_TYPE_DATASET) && *ivalp == 0 &&
-		    !isnone && prop == ZFS_PROP_QUOTA) {
+		if ((type & ZFS_TYPE_DATASET) && *ivalp == 0 && !isnone &&
+		    (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_REFQUOTA)) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-			    "use 'none' to disable quota"));
+			    "use 'none' to disable quota/refquota"));
 			goto error;
 		}
 		break;
 
 	case PROP_TYPE_INDEX:
-		switch (datatype) {
-		case DATA_TYPE_STRING:
-			(void) nvpair_value_string(elem, &value);
-			break;
-
-		case DATA_TYPE_BOOLEAN_VALUE:
-			(void) nvpair_value_boolean_value(elem, &boolval);
-			if (boolval)
-				value = "on";
-			else
-				value = "off";
-			break;
-
-		default:
+		if (datatype != DATA_TYPE_STRING) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "'%s' must be a string"), nvpair_name(elem));
 			goto error;
 		}
 
+		(void) nvpair_value_string(elem, &value);
+
 		if (zprop_string_to_index(prop, value, ivalp, type) != 0) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "'%s' must be one of '%s'"), propname,
--- a/usr/src/lib/libzfs_jni/common/libzfs_jni_property.c	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_property.c	Mon Oct 29 22:45:33 2007 -0700
@@ -100,6 +100,8 @@
 	ZFS_PROP_RESERVATION,
 	ZFS_PROP_USED,
 	ZFS_PROP_VOLSIZE,
+	ZFS_PROP_REFQUOTA,
+	ZFS_PROP_REFRESERVATION,
 	ZPROP_INVAL
 };
 
--- a/usr/src/uts/common/fs/zfs/dmu_send.c	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c	Mon Oct 29 22:45:33 2007 -0700
@@ -498,6 +498,10 @@
 	VERIFY(0 == dsl_dataset_open_obj(dp, dsobj, NULL,
 	    DS_MODE_EXCLUSIVE, dmu_recv_tag, &cds));
 
+	/* copy the refquota from the target fs to the clone */
+	if (ohds->ds_quota > 0)
+		dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx);
+
 	dmu_buf_will_dirty(cds->ds_dbuf, tx);
 	cds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
 
@@ -513,6 +517,7 @@
 recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
+
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
 
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c	Mon Oct 29 22:45:33 2007 -0700
@@ -294,6 +294,8 @@
 		txh->txh_space_tooverwrite += space;
 	} else {
 		txh->txh_space_towrite += space;
+		if (dn && dn->dn_dbuf->db_blkptr)
+			txh->txh_space_tounref += space;
 	}
 }
 
@@ -319,7 +321,7 @@
 dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 {
 	uint64_t blkid, nblks;
-	uint64_t space = 0;
+	uint64_t space = 0, unref = 0;
 	dnode_t *dn = txh->txh_dnode;
 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 	spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
@@ -383,6 +385,7 @@
 				dprintf_bp(bp, "can free old%s", "");
 				space += bp_get_dasize(spa, bp);
 			}
+			unref += BP_GET_ASIZE(bp);
 		}
 		nblks = 0;
 	}
@@ -418,6 +421,7 @@
 					    "can free old%s", "");
 					space += bp_get_dasize(spa, &bp[i]);
 				}
+				unref += BP_GET_ASIZE(bp);
 			}
 			dbuf_rele(dbuf, FTAG);
 		}
@@ -432,6 +436,7 @@
 	rw_exit(&dn->dn_struct_rwlock);
 
 	txh->txh_space_tofree += space;
+	txh->txh_space_tounref += unref;
 }
 
 void
@@ -550,10 +555,13 @@
 		 * the size will change between now and the dbuf dirty call.
 		 */
 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
-		    dn->dn_phys->dn_blkptr[0].blk_birth))
+		    dn->dn_phys->dn_blkptr[0].blk_birth)) {
 			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
-		else
+		} else {
 			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+			txh->txh_space_tounref +=
+			    BP_GET_ASIZE(dn->dn_phys->dn_blkptr);
+		}
 		return;
 	}
 
@@ -733,8 +741,9 @@
 dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
 {
 	dmu_tx_hold_t *txh;
-	uint64_t lsize, asize, fsize, towrite, tofree, tooverwrite;
 	spa_t *spa = tx->tx_pool->dp_spa;
+	uint64_t lsize, asize, fsize, usize;
+	uint64_t towrite, tofree, tooverwrite, tounref;
 
 	ASSERT3U(tx->tx_txg, ==, 0);
 
@@ -767,7 +776,7 @@
 	 * dmu_tx_unassign() logic.
 	 */
 
-	towrite = tofree = tooverwrite = 0;
+	towrite = tofree = tooverwrite = tounref = 0;
 	for (txh = list_head(&tx->tx_holds); txh;
 	    txh = list_next(&tx->tx_holds, txh)) {
 		dnode_t *dn = txh->txh_dnode;
@@ -787,6 +796,7 @@
 		towrite += txh->txh_space_towrite;
 		tofree += txh->txh_space_tofree;
 		tooverwrite += txh->txh_space_tooverwrite;
+		tounref += txh->txh_space_tounref;
 	}
 
 	/*
@@ -813,16 +823,18 @@
 	fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
 	lsize = towrite + tooverwrite;
 	asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
+	usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
 
 #ifdef ZFS_DEBUG
 	tx->tx_space_towrite = asize;
 	tx->tx_space_tofree = tofree;
 	tx->tx_space_tooverwrite = tooverwrite;
+	tx->tx_space_tounref = tounref;
 #endif
 
 	if (tx->tx_dir && asize != 0) {
 		int err = dsl_dir_tempreserve_space(tx->tx_dir,
-		    lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx);
+		    lsize, asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
 		if (err)
 			return (err);
 	}
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c	Mon Oct 29 22:45:33 2007 -0700
@@ -45,6 +45,7 @@
 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
 static dsl_checkfunc_t dsl_dataset_rollback_check;
 static dsl_syncfunc_t dsl_dataset_rollback_sync;
+static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
 
 #define	DS_REF_MAX	(1ULL << 62)
 
@@ -67,6 +68,25 @@
 	DS_REF_MAX		/* DS_MODE_EXCLUSIVE - no other opens	*/
 };
 
+/*
+ * Figure out how much of this delta should be propagated to the dsl_dir
+ * layer.  If there's a refreservation, that space has already been
+ * partially accounted for in our ancestors.
+ */
+static int64_t
+parent_delta(dsl_dataset_t *ds, int64_t delta)
+{
+	uint64_t old_bytes, new_bytes;
+
+	if (ds->ds_reserved == 0)
+		return (delta);
+
+	old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
+	new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
+
+	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
+	return (new_bytes - old_bytes);
+}
 
 void
 dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
@@ -74,6 +94,7 @@
 	int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
 	int compressed = BP_GET_PSIZE(bp);
 	int uncompressed = BP_GET_UCSIZE(bp);
+	int64_t delta;
 
 	dprintf_bp(bp, "born, ds=%p\n", ds);
 
@@ -96,13 +117,13 @@
 	}
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	mutex_enter(&ds->ds_lock);
+	delta = parent_delta(ds, used);
 	ds->ds_phys->ds_used_bytes += used;
 	ds->ds_phys->ds_compressed_bytes += compressed;
 	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
 	ds->ds_phys->ds_unique_bytes += used;
 	mutex_exit(&ds->ds_lock);
-	dsl_dir_diduse_space(ds->ds_dir,
-	    used, compressed, uncompressed, tx);
+	dsl_dir_diduse_space(ds->ds_dir, delta, compressed, uncompressed, tx);
 }
 
 void
@@ -140,6 +161,7 @@
 
 	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
 		int err;
+		int64_t delta;
 
 		dprintf_bp(bp, "freeing: %s", "");
 		err = arc_free(pio, tx->tx_pool->dp_spa,
@@ -147,12 +169,13 @@
 		ASSERT(err == 0);
 
 		mutex_enter(&ds->ds_lock);
-		/* XXX unique_bytes is not accurate for head datasets */
-		/* ASSERT3U(ds->ds_phys->ds_unique_bytes, >=, used); */
+		ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
+		    !DS_UNIQUE_IS_ACCURATE(ds));
+		delta = parent_delta(ds, -used);
 		ds->ds_phys->ds_unique_bytes -= used;
 		mutex_exit(&ds->ds_lock);
 		dsl_dir_diduse_space(ds->ds_dir,
-		    -used, -compressed, -uncompressed, tx);
+		    delta, -compressed, -uncompressed, tx);
 	} else {
 		dprintf_bp(bp, "putting on dead list: %s", "");
 		VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
@@ -375,6 +398,24 @@
 			ds->ds_fsid_guid =
 			    unique_insert(ds->ds_phys->ds_fsid_guid);
 		}
+
+		if (!dsl_dataset_is_snapshot(ds)) {
+			boolean_t need_lock =
+			    !RW_LOCK_HELD(&dp->dp_config_rwlock);
+
+			if (need_lock)
+				rw_enter(&dp->dp_config_rwlock, RW_READER);
+			VERIFY(0 == dsl_prop_get_ds_locked(ds->ds_dir,
+			    "refreservation", sizeof (uint64_t), 1,
+			    &ds->ds_reserved, NULL));
+			VERIFY(0 == dsl_prop_get_ds_locked(ds->ds_dir,
+			    "refquota", sizeof (uint64_t), 1, &ds->ds_quota,
+			    NULL));
+			if (need_lock)
+				rw_exit(&dp->dp_config_rwlock);
+		} else {
+			ds->ds_reserved = ds->ds_quota = 0;
+		}
 	}
 	ASSERT3P(ds->ds_dbuf, ==, dbuf);
 	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
@@ -591,6 +632,8 @@
 	dsphys->ds_creation_txg = tx->tx_txg;
 	dsphys->ds_deadlist_obj =
 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
+		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 	dmu_buf_rele(dbuf, FTAG);
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
@@ -633,6 +676,9 @@
 	dsphys->ds_creation_txg = tx->tx_txg;
 	dsphys->ds_deadlist_obj =
 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
+		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+
 	if (origin) {
 		dsphys->ds_prev_snap_obj = origin->ds_object;
 		dsphys->ds_prev_snap_txg =
@@ -943,10 +989,53 @@
 	}
 }
 
+/*
+ * The unique space in the head dataset can be calculated by subtracting
+ * the space used in the most recent snapshot, that is still being used
+ * in this file system, from the space currently in use.  To figure out
+ * the space in the most recent snapshot still in use, we need to take
+ * the total space used in the snapshot and subtract out the space that
+ * has been freed up since the snapshot was taken.
+ */
+static void
+dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
+{
+	uint64_t mrs_used;
+	uint64_t dlused, dlcomp, dluncomp;
+
+	ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj);
+
+	if (ds->ds_phys->ds_prev_snap_obj != 0)
+		mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
+	else
+		mrs_used = 0;
+
+	VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp,
+	    &dluncomp));
+
+	ASSERT3U(dlused, <=, mrs_used);
+	ds->ds_phys->ds_unique_bytes =
+	    ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
+
+	if (!DS_UNIQUE_IS_ACCURATE(ds) &&
+	    spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+	    SPA_VERSION_UNIQUE_ACCURATE)
+		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+}
+
+static uint64_t
+dsl_dataset_unique(dsl_dataset_t *ds)
+{
+	if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds))
+		dsl_dataset_recalc_head_uniq(ds);
+
+	return (ds->ds_phys->ds_unique_bytes);
+}
+
 struct killarg {
-	uint64_t *usedp;
-	uint64_t *compressedp;
-	uint64_t *uncompressedp;
+	int64_t *usedp;
+	int64_t *compressedp;
+	int64_t *uncompressedp;
 	zio_t *zio;
 	dmu_tx_t *tx;
 };
@@ -1042,7 +1131,7 @@
 	{
 		/* Free blkptrs that we gave birth to */
 		zio_t *zio;
-		uint64_t used = 0, compressed = 0, uncompressed = 0;
+		int64_t used = 0, compressed = 0, uncompressed = 0;
 		struct killarg ka;
 
 		zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
@@ -1175,7 +1264,7 @@
 dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = arg1;
-	uint64_t used = 0, compressed = 0, uncompressed = 0;
+	int64_t used = 0, compressed = 0, uncompressed = 0;
 	zio_t *zio;
 	int err;
 	int after_branch_point = FALSE;
@@ -1190,6 +1279,13 @@
 	    ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
 	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
 
+	/* Remove our reservation */
+	if (ds->ds_reserved != 0) {
+		uint64_t val = 0;
+		dsl_dataset_set_reservation_sync(ds, &val, cr, tx);
+		ASSERT3U(ds->ds_reserved, ==, 0);
+	}
+
 	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
 
 	obj = ds->ds_object;
@@ -1223,6 +1319,7 @@
 		blkptr_t bp;
 		dsl_dataset_t *ds_next;
 		uint64_t itor = 0;
+		uint64_t old_unique;
 
 		spa_scrub_restart(dp->dp_spa, tx->tx_txg);
 
@@ -1231,6 +1328,8 @@
 		    DS_MODE_NONE, FTAG, &ds_next));
 		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
 
+		old_unique = dsl_dataset_unique(ds_next);
+
 		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
 		ds_next->ds_phys->ds_prev_snap_obj =
 		    ds->ds_phys->ds_prev_snap_obj;
@@ -1312,13 +1411,6 @@
 			dsl_dataset_close(ds_after_next, DS_MODE_NONE, FTAG);
 			ASSERT3P(ds_next->ds_prev, ==, NULL);
 		} else {
-			/*
-			 * It would be nice to update the head dataset's
-			 * unique.  To do so we would have to traverse
-			 * it for blocks born after ds_prev, which is
-			 * pretty expensive just to maintain something
-			 * for debugging purposes.
-			 */
 			ASSERT3P(ds_next->ds_prev, ==, ds);
 			dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
 			    ds_next);
@@ -1329,13 +1421,32 @@
 			} else {
 				ds_next->ds_prev = NULL;
 			}
+
+			dsl_dataset_recalc_head_uniq(ds_next);
+
+			/*
+			 * Reduce the amount of our unconsumed refreservation
+			 * being charged to our parent by the amount of
+			 * new unique data we have gained.
+			 */
+			if (old_unique < ds_next->ds_reserved) {
+				int64_t mrsdelta;
+				uint64_t new_unique =
+				    ds_next->ds_phys->ds_unique_bytes;
+
+				ASSERT(old_unique <= new_unique);
+				mrsdelta = MIN(new_unique - old_unique,
+				    ds_next->ds_reserved - old_unique);
+				dsl_dir_diduse_space(ds->ds_dir, -mrsdelta,
+				    0, 0, tx);
+			}
 		}
 		dsl_dataset_close(ds_next, DS_MODE_NONE, FTAG);
 
 		/*
-		 * NB: unique_bytes is not accurate for head objsets
-		 * because we don't update it when we delete the most
-		 * recent snapshot -- see above comment.
+		 * NB: unique_bytes might not be accurate for the head objset.
+		 * Before SPA_VERSION 9, we didn't update its value when we
+		 * deleted the most recent snapshot.
 		 */
 		ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
 	} else {
@@ -1366,6 +1477,9 @@
 		err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
 		    ADVANCE_POST, kill_blkptr, &ka);
 		ASSERT3U(err, ==, 0);
+		ASSERT(spa_version(dp->dp_spa) <
+		    SPA_VERSION_UNIQUE_ACCURATE ||
+		    used == ds->ds_phys->ds_unique_bytes);
 	}
 
 	err = zio_wait(zio);
@@ -1421,6 +1535,33 @@
 
 }
 
+static int
+dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	uint64_t asize;
+
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
+
+	/*
+	 * If there's an fs-only reservation, any blocks that might become
+	 * owned by the snapshot dataset must be accommodated by space
+	 * outside of the reservation.
+	 */
+	asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
+	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE))
+		return (ENOSPC);
+
+	/*
+	 * Propagate any reserved space for this snapshot to other
+	 * snapshot checks in this sync group.
+	 */
+	if (asize > 0)
+		dsl_dir_willuse_space(ds->ds_dir, asize, tx);
+
+	return (0);
+}
+
 /* ARGSUSED */
 int
 dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
@@ -1455,6 +1596,10 @@
 	if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
 		return (ENAMETOOLONG);
 
+	err = dsl_dataset_snapshot_reserve_space(ds, tx);
+	if (err)
+		return (err);
+
 	ds->ds_trysnap_txg = tx->tx_txg;
 	return (0);
 }
@@ -1510,12 +1655,24 @@
 		}
 	}
 
+	/*
+	 * If we have a reference-reservation on this dataset, we will
+	 * need to increase the amount of refreservation being charged
+	 * since our unique space is going to zero.
+	 */
+	if (ds->ds_reserved) {
+		int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
+		dsl_dir_diduse_space(ds->ds_dir, add, 0, 0, tx);
+	}
+
 	bplist_close(&ds->ds_deadlist);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, dsphys->ds_creation_txg);
 	ds->ds_phys->ds_prev_snap_obj = dsobj;
 	ds->ds_phys->ds_prev_snap_txg = dsphys->ds_creation_txg;
 	ds->ds_phys->ds_unique_bytes = 0;
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
+		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 	ds->ds_phys->ds_deadlist_obj =
 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
 	VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
@@ -1557,14 +1714,22 @@
 void
 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
 {
+	uint64_t refd, avail, uobjs, aobjs;
+
 	dsl_dir_stats(ds->ds_dir, nv);
 
+	dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
+
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
 	    ds->ds_phys->ds_creation_time);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
 	    ds->ds_phys->ds_creation_txg);
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
-	    ds->ds_phys->ds_used_bytes);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
+	    ds->ds_quota);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
+	    ds->ds_reserved);
 
 	if (ds->ds_phys->ds_next_snap_obj) {
 		/*
@@ -1618,6 +1783,18 @@
 {
 	*refdbytesp = ds->ds_phys->ds_used_bytes;
 	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
+	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
+		*availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
+	if (ds->ds_quota != 0) {
+		/*
+		 * Adjust available bytes according to refquota
+		 */
+		if (*refdbytesp < ds->ds_quota)
+			*availbytesp = MIN(*availbytesp,
+			    ds->ds_quota - *refdbytesp);
+		else
+			*availbytesp = 0;
+	}
 	*usedobjsp = ds->ds_phys->ds_bp.blk_fill;
 	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
 }
@@ -2198,6 +2375,9 @@
 	uint64_t unique = 0;
 	int err;
 
+	if (csa->ohds->ds_reserved)
+		panic("refreservation and clone swap are incompatible");
+
 	dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
 	dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
 	dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx);
@@ -2221,6 +2401,13 @@
 	}
 	VERIFY(err == ENOENT);
 
+	/* undo any accounting due to a refreservation */
+	if (csa->ohds->ds_reserved > csa->ohds->ds_phys->ds_unique_bytes) {
+		dsl_dir_diduse_space(csa->ohds->ds_dir,
+		    csa->ohds->ds_phys->ds_unique_bytes -
+		    csa->ohds->ds_reserved, 0, 0, tx);
+	}
+
 	/* reset origin's unique bytes */
 	csa->cds->ds_prev->ds_phys->ds_unique_bytes = unique;
 
@@ -2263,6 +2450,13 @@
 		(y) = __tmp; \
 	}
 
+	/* redo any accounting due to a refreservation */
+	if (csa->ohds->ds_reserved > csa->ohds->ds_phys->ds_unique_bytes) {
+		dsl_dir_diduse_space(csa->ohds->ds_dir,
+		    csa->ohds->ds_reserved -
+		    csa->ohds->ds_phys->ds_unique_bytes, 0, 0, tx);
+	}
+
 	/* swap ds_*_bytes */
 	SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
 	    csa->cds->ds_phys->ds_used_bytes);
@@ -2280,6 +2474,9 @@
 	    csa->cds->ds_phys->ds_deadlist_obj));
 	VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
 	    csa->ohds->ds_phys->ds_deadlist_obj));
+	/* fix up clone's unique */
+	dsl_dataset_recalc_head_uniq(csa->cds);
+
 }
 
 /*
@@ -2331,3 +2528,195 @@
 
 	return (0);
 }
+
+int
+dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
+    uint64_t asize, uint64_t inflight, uint64_t *used)
+{
+	int error = 0;
+
+	ASSERT3S(asize, >, 0);
+
+	mutex_enter(&ds->ds_lock);
+	/*
+	 * Make a space adjustment for reserved bytes.
+	 */
+	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
+		ASSERT3U(*used, >=,
+		    ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
+		*used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
+	}
+
+	if (!check_quota || ds->ds_quota == 0) {
+		mutex_exit(&ds->ds_lock);
+		return (0);
+	}
+	/*
+	 * If they are requesting more space, and our current estimate
+	 * is over quota, they get to try again unless the actual
+	 * on-disk is over quota and there are no pending changes (which
+	 * may free up space for us).
+	 */
+	if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) {
+		if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota)
+			error = ERESTART;
+		else
+			error = EDQUOT;
+	}
+	mutex_exit(&ds->ds_lock);
+
+	return (error);
+}
+
+/* ARGSUSED */
+static int
+dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = arg1;
+	uint64_t *quotap = arg2;
+	uint64_t new_quota = *quotap;
+
+	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
+		return (ENOTSUP);
+
+	if (new_quota == 0)
+		return (0);
+
+	if (new_quota < ds->ds_phys->ds_used_bytes ||
+	    new_quota < ds->ds_reserved)
+		return (ENOSPC);
+
+	return (0);
+}
+
+/* ARGSUSED */
+void
+dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = arg1;
+	uint64_t *quotap = arg2;
+	uint64_t new_quota = *quotap;
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+	mutex_enter(&ds->ds_lock);
+	ds->ds_quota = new_quota;
+	mutex_exit(&ds->ds_lock);
+
+	dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx);
+
+	spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa,
+	    tx, cr, "%lld dataset = %llu ",
+	    (longlong_t)new_quota, ds->ds_dir->dd_phys->dd_head_dataset_obj);
+}
+
+int
+dsl_dataset_set_quota(const char *dsname, uint64_t quota)
+{
+	dsl_dataset_t *ds;
+	int err;
+
+	err = dsl_dataset_open(dsname, DS_MODE_STANDARD, FTAG, &ds);
+	if (err)
+		return (err);
+
+	/*
+	 * If someone removes a file, then tries to set the quota, we
+	 * want to make sure the file freeing takes effect.
+	 */
+	txg_wait_open(ds->ds_dir->dd_pool, 0);
+
+	err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_set_quota_check,
+	    dsl_dataset_set_quota_sync, ds, &quota, 0);
+	dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+	return (err);
+}
+
+static int
+dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = arg1;
+	uint64_t *reservationp = arg2;
+	uint64_t new_reservation = *reservationp;
+	int64_t delta;
+	uint64_t unique;
+
+	if (new_reservation > INT64_MAX)
+		return (EOVERFLOW);
+
+	if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
+	    SPA_VERSION_REFRESERVATION)
+		return (ENOTSUP);
+
+	if (dsl_dataset_is_snapshot(ds))
+		return (EINVAL);
+
+	/*
+	 * If we are doing the preliminary check in open context, the
+	 * space estimates may be inaccurate.
+	 */
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
+
+	mutex_enter(&ds->ds_lock);
+	unique = dsl_dataset_unique(ds);
+	delta = MAX(unique, new_reservation) - MAX(unique, ds->ds_reserved);
+	mutex_exit(&ds->ds_lock);
+
+	if (delta > 0 &&
+	    delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
+		return (ENOSPC);
+	if (delta > 0 && ds->ds_quota > 0 &&
+	    new_reservation > ds->ds_quota)
+		return (ENOSPC);
+
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
+    dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = arg1;
+	uint64_t *reservationp = arg2;
+	uint64_t new_reservation = *reservationp;
+	uint64_t unique;
+	int64_t delta;
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+	mutex_enter(&ds->ds_lock);
+	unique = dsl_dataset_unique(ds);
+	delta = MAX(0, (int64_t)(new_reservation - unique)) -
+	    MAX(0, (int64_t)(ds->ds_reserved - unique));
+	ds->ds_reserved = new_reservation;
+	mutex_exit(&ds->ds_lock);
+
+	dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation",
+	    new_reservation, cr, tx);
+
+	dsl_dir_diduse_space(ds->ds_dir, delta, 0, 0, tx);
+
+	spa_history_internal_log(LOG_DS_REFRESERV,
+	    ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu",
+	    (longlong_t)new_reservation,
+	    ds->ds_dir->dd_phys->dd_head_dataset_obj);
+}
+
+int
+dsl_dataset_set_reservation(const char *dsname, uint64_t reservation)
+{
+	dsl_dataset_t *ds;
+	int err;
+
+	err = dsl_dataset_open(dsname, DS_MODE_STANDARD, FTAG, &ds);
+	if (err)
+		return (err);
+
+	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
+	    dsl_dataset_set_reservation_check,
+	    dsl_dataset_set_reservation_sync, ds, &reservation, 0);
+	dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
+	return (err);
+}
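
dsl_dataset_set_reservation_sync() above charges the parent dsl_dir only for the portion of the new refreservation not already covered by the dataset's unique bytes, mirroring parent_delta(). A worked example with assumed figures (3G unique, refreservation raised from none to 5G); the numbers are illustrative, not from the changeset.

/* illustrative only: the MAX()-based delta charged to the parent dsl_dir */
#include <stdio.h>
#include <inttypes.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	uint64_t unique = 3ULL << 30;		/* assumed unique bytes: 3G */
	uint64_t old_resv = 0;			/* no refreservation yet */
	uint64_t new_resv = 5ULL << 30;		/* setting refreservation=5G */

	/* same formula as dsl_dataset_set_reservation_sync() */
	int64_t delta = MAX(0, (int64_t)(new_resv - unique)) -
	    MAX(0, (int64_t)(old_resv - unique));

	/* only the 2G not already covered by unique bytes is charged now */
	(void) printf("charge to parent: %" PRId64 " bytes\n", delta);
	return (0);
}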
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c	Mon Oct 29 22:45:33 2007 -0700
@@ -26,6 +26,7 @@
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/dmu.h>
+#include <sys/dmu_objset.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
@@ -39,7 +40,7 @@
 #include <sys/sunddi.h>
 #include "zfs_namecheck.h"
 
-static uint64_t dsl_dir_estimated_space(dsl_dir_t *dd);
+static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
 static void dsl_dir_set_reservation_sync(void *arg1, void *arg2,
     cred_t *cr, dmu_tx_t *tx);
 
@@ -518,13 +519,9 @@
 void
 dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
 {
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,
-	    dsl_dir_space_available(dd, NULL, 0, TRUE));
-
 	mutex_enter(&dd->dd_lock);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dd->dd_used_bytes);
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
-	    dd->dd_phys->dd_quota);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
 	    dd->dd_phys->dd_reserved);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
@@ -590,15 +587,13 @@
 }
 
 static uint64_t
-dsl_dir_estimated_space(dsl_dir_t *dd)
+dsl_dir_space_towrite(dsl_dir_t *dd)
 {
-	int64_t space;
+	uint64_t space = 0;
 	int i;
 
 	ASSERT(MUTEX_HELD(&dd->dd_lock));
 
-	space = dd->dd_phys->dd_used_bytes;
-	ASSERT(space >= 0);
 	for (i = 0; i < TXG_SIZE; i++) {
 		space += dd->dd_space_towrite[i&TXG_MASK];
 		ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
@@ -632,11 +627,9 @@
 	mutex_enter(&dd->dd_lock);
 	if (dd->dd_phys->dd_quota != 0)
 		quota = dd->dd_phys->dd_quota;
-	if (ondiskonly) {
-		used = dd->dd_used_bytes;
-	} else {
-		used = dsl_dir_estimated_space(dd);
-	}
+	used = dd->dd_used_bytes;
+	if (!ondiskonly)
+		used += dsl_dir_space_towrite(dd);
 	if (dd == ancestor)
 		used += delta;
 
@@ -684,40 +677,50 @@
 	uint64_t tr_size;
 };
 
-/*
- * Reserve space in this dsl_dir, to be used in this tx's txg.
- * After the space has been dirtied (and thus
- * dsl_dir_willuse_space() has been called), the reservation should
- * be canceled, using dsl_dir_tempreserve_clear().
- */
 static int
-dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize,
-    boolean_t netfree, boolean_t noquota, list_t *tr_list, dmu_tx_t *tx)
+dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
+    boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
+    dmu_tx_t *tx)
 {
 	uint64_t txg = tx->tx_txg;
-	uint64_t est_used, quota, parent_rsrv;
-	int edquot = EDQUOT;
+	uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
+	struct tempreserve *tr;
+	int error = EDQUOT;
 	int txgidx = txg & TXG_MASK;
 	int i;
-	struct tempreserve *tr;
 
 	ASSERT3U(txg, !=, 0);
-	ASSERT3S(asize, >=, 0);
+	ASSERT3S(asize, >, 0);
 
 	mutex_enter(&dd->dd_lock);
+
 	/*
 	 * Check against the dsl_dir's quota.  We don't add in the delta
 	 * when checking for over-quota because they get one free hit.
 	 */
-	est_used = dsl_dir_estimated_space(dd);
+	est_inflight = dsl_dir_space_towrite(dd);
 	for (i = 0; i < TXG_SIZE; i++)
-		est_used += dd->dd_tempreserved[i];
+		est_inflight += dd->dd_tempreserved[i];
+	used_on_disk = dd->dd_used_bytes;
 
 	/*
-	 * If this transaction will result in a net free of space, we want
-	 * to let it through.
+	 * Check for dataset reference quota on first iteration.
 	 */
-	if (netfree || noquota || dd->dd_phys->dd_quota == 0)
+	if (list_head(tr_list) == NULL && tx->tx_objset) {
+		dsl_dataset_t *ds = tx->tx_objset->os->os_dsl_dataset;
+		error = dsl_dataset_check_quota(ds, checkrefquota,
+		    asize, est_inflight, &used_on_disk);
+		if (error) {
+			mutex_exit(&dd->dd_lock);
+			return (error);
+		}
+	}
+
+	/*
+	 * If this transaction will result in a net free of space,
+	 * we want to let it through.
+	 */
+	if (ignorequota || netfree || dd->dd_phys->dd_quota == 0)
 		quota = UINT64_MAX;
 	else
 		quota = dd->dd_phys->dd_quota;
@@ -735,34 +738,31 @@
 		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
 		if (poolsize < quota) {
 			quota = poolsize;
-			edquot = ENOSPC;
+			error = ENOSPC;
 		}
 	}
 
 	/*
 	 * If they are requesting more space, and our current estimate
-	 * is over quota.  They get to try again unless the actual
+	 * is over quota, they get to try again unless the actual
 	 * on-disk is over quota and there are no pending changes (which
 	 * may free up space for us).
 	 */
-	if (asize > 0 && est_used > quota) {
-		if (dd->dd_space_towrite[txg & TXG_MASK] != 0 ||
-		    dd->dd_space_towrite[(txg-1) & TXG_MASK] != 0 ||
-		    dd->dd_space_towrite[(txg-2) & TXG_MASK] != 0 ||
-		    dd->dd_used_bytes < quota)
-			edquot = ERESTART;
-		dprintf_dd(dd, "failing: used=%lluK est_used = %lluK "
+	if (used_on_disk + est_inflight > quota) {
+		if (est_inflight > 0 || used_on_disk < quota)
+			error = ERESTART;
+		dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
 		    "quota=%lluK tr=%lluK err=%d\n",
-		    dd->dd_used_bytes>>10, est_used>>10,
-		    quota>>10, asize>>10, edquot);
+		    used_on_disk>>10, est_inflight>>10,
+		    quota>>10, asize>>10, error);
 		mutex_exit(&dd->dd_lock);
-		return (edquot);
+		return (error);
 	}
 
 	/* We need to up our estimated delta before dropping dd_lock */
 	dd->dd_tempreserved[txgidx] += asize;
 
-	parent_rsrv = parent_delta(dd, est_used, asize);
+	parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, asize);
 	mutex_exit(&dd->dd_lock);
 
 	tr = kmem_alloc(sizeof (struct tempreserve), KM_SLEEP);
@@ -775,7 +775,7 @@
 		boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
 
 		return (dsl_dir_tempreserve_impl(dd->dd_parent,
-		    parent_rsrv, netfree, ismos, tr_list, tx));
+		    parent_rsrv, netfree, ismos, TRUE, tr_list, tx));
 	} else {
 		return (0);
 	}
@@ -783,25 +783,30 @@
 
 /*
  * Reserve space in this dsl_dir, to be used in this tx's txg.
- * After the space has been dirtied (and thus
- * dsl_dir_willuse_space() has been called), the reservation should
- * be canceled, using dsl_dir_tempreserve_clear().
+ * After the space has been dirtied (and dsl_dir_willuse_space()
+ * has been called), the reservation should be canceled, using
+ * dsl_dir_tempreserve_clear().
  */
 int
-dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize,
-    uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx)
+dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
+    uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
 {
 	int err = 0;
 	list_t *tr_list;
 
+	if (asize == 0) {
+		*tr_cookiep = NULL;
+		return (0);
+	}
+
 	tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
 	list_create(tr_list, sizeof (struct tempreserve),
 	    offsetof(struct tempreserve, tr_node));
-	ASSERT3S(asize, >=, 0);
+	ASSERT3S(asize, >, 0);
 	ASSERT3S(fsize, >=, 0);
 
 	err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, FALSE,
-	    tr_list, tx);
+	    asize > usize, tr_list, tx);
 
 	if (err == 0) {
 		struct tempreserve *tr;
@@ -835,6 +840,9 @@
 
 	ASSERT3U(tx->tx_txg, !=, 0);
 
+	if (tr_cookie == NULL)
+		return;
+
 	while (tr = list_head(tr_list)) {
 		if (tr->tr_ds == NULL) {
 			arc_tempreserve_clear(tr->tr_size);
@@ -867,7 +875,7 @@
 	if (space > 0)
 		dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
 
-	est_used = dsl_dir_estimated_space(dd);
+	est_used = dsl_dir_space_towrite(dd) + dd->dd_used_bytes;
 	parent_space = parent_delta(dd, est_used, space);
 	mutex_exit(&dd->dd_lock);
 
@@ -924,14 +932,13 @@
 	/*
 	 * If we are doing the preliminary check in open context, and
 	 * there are pending changes, then don't fail it, since the
-	 * pending changes could under-estimat the amount of space to be
+	 * pending changes could under-estimate the amount of space to be
 	 * freed up.
 	 */
-	towrite = dd->dd_space_towrite[0] + dd->dd_space_towrite[1] +
-	    dd->dd_space_towrite[2] + dd->dd_space_towrite[3];
+	towrite = dsl_dir_space_towrite(dd);
 	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
 	    (new_quota < dd->dd_phys->dd_reserved ||
-	    new_quota < dsl_dir_estimated_space(dd))) {
+	    new_quota < dd->dd_used_bytes + towrite)) {
 		err = ENOSPC;
 	}
 	mutex_exit(&dd->dd_lock);
@@ -978,7 +985,7 @@
 	return (err);
 }
 
-static int
+int
 dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
@@ -1028,15 +1035,15 @@
 	uint64_t used;
 	int64_t delta;
 
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
 	mutex_enter(&dd->dd_lock);
 	used = dd->dd_used_bytes;
 	delta = MAX(used, new_reservation) -
 	    MAX(used, dd->dd_phys->dd_reserved);
+	dd->dd_phys->dd_reserved = new_reservation;
 	mutex_exit(&dd->dd_lock);
 
-	dmu_buf_will_dirty(dd->dd_dbuf, tx);
-	dd->dd_phys->dd_reserved = new_reservation;
-
 	if (dd->dd_parent != NULL) {
 		/* Roll up this additional usage into our ancestors */
 		dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx);
--- a/usr/src/uts/common/fs/zfs/dsl_prop.c	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_prop.c	Mon Oct 29 22:45:33 2007 -0700
@@ -375,6 +375,24 @@
 	    dd->dd_phys->dd_head_dataset_obj);
 }
 
+void
+dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
+    cred_t *cr, dmu_tx_t *tx)
+{
+	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	VERIFY(0 == zap_update(mos, zapobj, name, sizeof (val), 1, &val, tx));
+
+	dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE);
+
+	spa_history_internal_log(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr,
+	    "%s=%llu dataset = %llu", name, (u_longlong_t)val,
+	    dd->dd_phys->dd_head_dataset_obj);
+}
+
 int
 dsl_prop_set_dd(dsl_dir_t *dd, const char *propname,
     int intsz, int numints, const void *buf)
--- a/usr/src/uts/common/fs/zfs/dsl_synctask.c	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_synctask.c	Mon Oct 29 22:45:33 2007 -0700
@@ -158,7 +158,7 @@
 	 * Check for sufficient space.
 	 */
 	dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir,
-	    dstg->dstg_space, dstg->dstg_space * 3, 0, &tr_cookie, tx);
+	    dstg->dstg_space, dstg->dstg_space * 3, 0, 0, &tr_cookie, tx);
 	/* don't bother trying again */
 	if (dstg->dstg_err == ERESTART)
 		dstg->dstg_err = EAGAIN;
--- a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h	Mon Oct 29 22:45:33 2007 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -65,6 +65,7 @@
 	uint64_t tx_space_towrite;
 	uint64_t tx_space_tofree;
 	uint64_t tx_space_tooverwrite;
+	uint64_t tx_space_tounref;
 	refcount_t tx_space_written;
 	refcount_t tx_space_freed;
 #endif
@@ -87,6 +88,7 @@
 	uint64_t txh_space_towrite;
 	uint64_t txh_space_tofree;
 	uint64_t txh_space_tooverwrite;
+	uint64_t txh_space_tounref;
 #ifdef ZFS_DEBUG
 	enum dmu_tx_hold_type txh_type;
 	uint64_t txh_arg1;
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Mon Oct 29 22:45:33 2007 -0700
@@ -55,6 +55,13 @@
  */
 #define	DS_FLAG_NOPROMOTE	(1ULL<<1)
 
+/*
+ * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly
+ * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE,
+ * refquota/refreservations).
+ */
+#define	DS_FLAG_UNIQUE_ACCURATE	(1ULL<<2)
+
 typedef struct dsl_dataset_phys {
 	uint64_t ds_dir_obj;
 	uint64_t ds_prev_snap_obj;
@@ -114,6 +121,9 @@
 	/* for objset_open() */
 	kmutex_t ds_opening_lock;
 
+	uint64_t ds_reserved;	/* cached refreservation */
+	uint64_t ds_quota;	/* cached refquota */
+
 	/* Protected by ds_lock; keep at end of struct for better locality */
 	char ds_snapname[MAXNAMELEN];
 } dsl_dataset_t;
@@ -121,6 +131,9 @@
 #define	dsl_dataset_is_snapshot(ds)	\
 	((ds)->ds_phys->ds_num_children != 0)
 
+#define	DS_UNIQUE_IS_ACCURATE(ds)	\
+	(((ds)->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
+
 int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
     void *tag, dsl_dataset_t **dsp);
 int dsl_dataset_open(const char *name, int mode, void *tag,
@@ -179,6 +192,13 @@
 
 int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
 
+int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
+    uint64_t asize, uint64_t inflight, uint64_t *used);
+int dsl_dataset_set_quota(const char *dsname, uint64_t quota);
+void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr,
+    dmu_tx_t *tx);
+int dsl_dataset_set_reservation(const char *dsname, uint64_t reservation);
+
 #ifdef ZFS_DEBUG
 #define	dprintf_ds(ds, fmt, ...) do { \
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h	Mon Oct 29 22:45:33 2007 -0700
@@ -110,7 +110,8 @@
 void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx);
 void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx);
 int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem,
-    uint64_t asize, uint64_t fsize, void **tr_cookiep, dmu_tx_t *tx);
+    uint64_t asize, uint64_t fsize, uint64_t usize, void **tr_cookiep,
+    dmu_tx_t *tx);
 void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx);
 void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx);
 void dsl_dir_diduse_space(dsl_dir_t *dd,
@@ -119,6 +120,7 @@
 int dsl_dir_set_reservation(const char *ddname, uint64_t reservation);
 int dsl_dir_rename(dsl_dir_t *dd, const char *newname);
 int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space);
+int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx);
 
 /* internal reserved dir name */
 #define	MOS_DIR_NAME "$MOS"
--- a/usr/src/uts/common/fs/zfs/sys/dsl_prop.h	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h	Mon Oct 29 22:45:33 2007 -0700
@@ -67,6 +67,8 @@
     int intsz, int numints, const void *buf);
 int dsl_prop_set_dd(dsl_dir_t *dd, const char *propname,
     int intsz, int numints, const void *buf);
+void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
+    cred_t *cr, dmu_tx_t *tx);
 
 void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value);
 void dsl_prop_nvlist_add_string(nvlist_t *nv,
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Mon Oct 29 22:45:33 2007 -0700
@@ -1411,6 +1411,12 @@
 				return (error);
 			break;
 
+		case ZFS_PROP_REFQUOTA:
+			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
+			    (error = dsl_dataset_set_quota(name, intval)) != 0)
+				return (error);
+			break;
+
 		case ZFS_PROP_RESERVATION:
 			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
 			    (error = dsl_dir_set_reservation(name,
@@ -1418,6 +1424,13 @@
 				return (error);
 			break;
 
+		case ZFS_PROP_REFRESERVATION:
+			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
+			    (error = dsl_dataset_set_reservation(name,
+			    intval)) != 0)
+				return (error);
+			break;
+
 		case ZFS_PROP_VOLSIZE:
 			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
 			    (error = zvol_set_volsize(name,
--- a/usr/src/uts/common/sys/fs/zfs.h	Mon Oct 29 22:26:03 2007 -0700
+++ b/usr/src/uts/common/sys/fs/zfs.h	Mon Oct 29 22:45:33 2007 -0700
@@ -98,6 +98,8 @@
 	ZFS_PROP_VSCAN,
 	ZFS_PROP_NBMAND,
 	ZFS_PROP_SHARESMB,
+	ZFS_PROP_REFQUOTA,
+	ZFS_PROP_REFRESERVATION,
 	ZFS_NUM_PROPS
 } zfs_prop_t;
 
@@ -251,6 +253,9 @@
 #define	SPA_VERSION_DELEGATED_PERMS	SPA_VERSION_8
 #define	SPA_VERSION_FUID		SPA_VERSION_9
 #define	SPA_VERSION_NORMALIZATION	SPA_VERSION_9
+#define	SPA_VERSION_REFRESERVATION	SPA_VERSION_9
+#define	SPA_VERSION_REFQUOTA		SPA_VERSION_9
+#define	SPA_VERSION_UNIQUE_ACCURATE	SPA_VERSION_9
 
 /*
  * ZPL version - rev'd whenever an incompatible on-disk format change
@@ -619,6 +624,8 @@
 	LOG_DS_ROLLBACK,
 	LOG_DS_SNAPSHOT,
 	LOG_DS_UPGRADE,
+	LOG_DS_REFQUOTA,
+	LOG_DS_REFRESERV,
 	LOG_END
 } history_internal_events_t;