6596237 Stop looking and start ganging
author George Wilson <George.Wilson@Sun.COM>
Mon, 27 Apr 2009 21:16:23 -0700
changeset 9480 fcff33da767f
parent 9479 ec0be6b2b1e3
child 9481 b25f8de94abf
6596237 Stop looking and start ganging
usr/src/cmd/mdb/common/modules/zfs/zfs.c
usr/src/cmd/zdb/Makefile.com
usr/src/cmd/zdb/zdb.c
usr/src/cmd/ztest/ztest.c
usr/src/lib/libzpool/common/llib-lzpool
usr/src/uts/common/fs/zfs/metaslab.c
usr/src/uts/common/fs/zfs/spa.c
usr/src/uts/common/fs/zfs/space_map.c
usr/src/uts/common/fs/zfs/sys/metaslab.h
usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
usr/src/uts/common/fs/zfs/sys/space_map.h
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Mon Apr 27 21:33:26 2009 -0700
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Mon Apr 27 21:16:23 2009 -0700
@@ -377,6 +377,8 @@
 		"zil_disable",
 		"zfs_nocacheflush",
 		"metaslab_gang_bang",
+		"metaslab_df_alloc_threshold",
+		"metaslab_df_free_pct",
 		"zio_injection_enabled",
 		"zvol_immediate_write_sz",
 	};
--- a/usr/src/cmd/zdb/Makefile.com	Mon Apr 27 21:33:26 2009 -0700
+++ b/usr/src/cmd/zdb/Makefile.com	Mon Apr 27 21:16:23 2009 -0700
@@ -20,7 +20,7 @@
 #
 
 #
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 
@@ -33,7 +33,7 @@
 INCS += -I../../../lib/libzpool/common 
 INCS +=	-I../../../uts/common/fs/zfs
 
-LDLIBS += -lzpool -lumem -lnvpair -lzfs
+LDLIBS += -lzpool -lumem -lnvpair -lzfs -lavl
 
 C99MODE=	-xc99=%all
 C99LMODE=	-Xc99=%all
--- a/usr/src/cmd/zdb/zdb.c	Mon Apr 27 21:33:26 2009 -0700
+++ b/usr/src/cmd/zdb/zdb.c	Mon Apr 27 21:16:23 2009 -0700
@@ -102,6 +102,7 @@
 	(void) fprintf(stderr, "        -C cached pool configuration\n");
 	(void) fprintf(stderr, "	-i intent logs\n");
 	(void) fprintf(stderr, "	-b block statistics\n");
+	(void) fprintf(stderr, "	-m metaslabs\n");
 	(void) fprintf(stderr, "	-c checksum all metadata (twice for "
 	    "all data) blocks\n");
 	(void) fprintf(stderr, "	-s report stats on zdb's I/O\n");
@@ -473,6 +474,21 @@
 }
 
 static void
+dump_metaslab_stats(metaslab_t *msp)
+{
+	char maxbuf[5];
+	space_map_t *sm = &msp->ms_map;
+	avl_tree_t *t = sm->sm_pp_root;
+	int free_pct = sm->sm_space * 100 / sm->sm_size;
+
+	nicenum(space_map_maxsize(sm), maxbuf);
+
+	(void) printf("\t %20s %10lu   %7s  %6s   %4s %4d%%\n",
+	    "segments", avl_numnodes(t), "maxsize", maxbuf,
+	    "freepct", free_pct);
+}
+
+static void
 dump_metaslab(metaslab_t *msp)
 {
 	char freebuf[5];
@@ -482,22 +498,28 @@
 
 	nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf);
 
-	if (dump_opt['d'] <= 5) {
-		(void) printf("\t%10llx   %10llu   %5s\n",
-		    (u_longlong_t)msp->ms_map.sm_start,
-		    (u_longlong_t)smo->smo_object,
-		    freebuf);
-		return;
-	}
-
 	(void) printf(
-	    "\tvdev %llu   offset %08llx   spacemap %4llu   free %5s\n",
+	    "\tvdev %5llu   offset %12llx   spacemap %6llu   free    %5s\n",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)msp->ms_map.sm_start,
 	    (u_longlong_t)smo->smo_object, freebuf);
 
-	ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
+	if (dump_opt['m'] > 1) {
+		mutex_enter(&msp->ms_lock);
+		VERIFY(space_map_load(&msp->ms_map, zfs_metaslab_ops,
+		    SM_FREE, &msp->ms_smo, spa->spa_meta_objset) == 0);
+		dump_metaslab_stats(msp);
+		space_map_unload(&msp->ms_map);
+		mutex_exit(&msp->ms_lock);
+	}
 
-	dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
+	if (dump_opt['d'] > 5 || dump_opt['m'] > 2) {
+		ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
+
+		mutex_enter(&msp->ms_lock);
+		dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
+		mutex_exit(&msp->ms_lock);
+	}
+
 }
 
 static void
@@ -512,14 +534,12 @@
 	for (c = 0; c < rvd->vdev_children; c++) {
 		vd = rvd->vdev_child[c];
 
-		(void) printf("\n    vdev %llu\n\n", (u_longlong_t)vd->vdev_id);
+		(void) printf("\t%-10s   %-19s   %-15s   %-10s\n",
+		    "vdev", "offset", "spacemap", "free");
+		(void) printf("\t%10s   %19s   %15s   %10s\n",
+		    "----------", "-------------------",
+		    "---------------", "-------------");
 
-		if (dump_opt['d'] <= 5) {
-			(void) printf("\t%10s   %10s   %5s\n",
-			    "offset", "spacemap", "free");
-			(void) printf("\t%10s   %10s   %5s\n",
-			    "------", "--------", "----");
-		}
 		for (m = 0; m < vd->vdev_ms_count; m++)
 			dump_metaslab(vd->vdev_ms[m]);
 		(void) printf("\n");
@@ -1419,7 +1439,8 @@
 	zdb_space_map_unload,
 	NULL,	/* alloc */
 	zdb_space_map_claim,
-	NULL	/* free */
+	NULL,	/* free */
+	NULL	/* maxsize */
 };
 
 static void
@@ -1809,14 +1830,17 @@
 	if (dump_opt['u'])
 		dump_uberblock(&spa->spa_uberblock);
 
-	if (dump_opt['d'] || dump_opt['i']) {
+	if (dump_opt['d'] || dump_opt['i'] || dump_opt['m']) {
 		dump_dir(dp->dp_meta_objset);
 		if (dump_opt['d'] >= 3) {
 			dump_bplist(dp->dp_meta_objset,
 			    spa->spa_sync_bplist_obj, "Deferred frees");
 			dump_dtl(spa->spa_root_vdev, 0);
+		}
+
+		if (dump_opt['d'] >= 3 || dump_opt['m'])
 			dump_metaslabs(spa);
-		}
+
 		(void) dmu_objset_find(spa_name(spa), dump_one_dir, NULL,
 		    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
 	}
@@ -2292,13 +2316,14 @@
 
 	dprintf_setup(&argc, argv);
 
-	while ((c = getopt(argc, argv, "udibcsvCLS:U:lRep:t:")) != -1) {
+	while ((c = getopt(argc, argv, "udibcmsvCLS:U:lRep:t:")) != -1) {
 		switch (c) {
 		case 'u':
 		case 'd':
 		case 'i':
 		case 'b':
 		case 'c':
+		case 'm':
 		case 's':
 		case 'C':
 		case 'l':
--- a/usr/src/cmd/ztest/ztest.c	Mon Apr 27 21:33:26 2009 -0700
+++ b/usr/src/cmd/ztest/ztest.c	Mon Apr 27 21:16:23 2009 -0700
@@ -248,9 +248,11 @@
 static int ztest_random_fd;
 static int ztest_dump_core = 1;
 
+static uint64_t metaslab_sz;
 static boolean_t ztest_exiting;
 
 extern uint64_t metaslab_gang_bang;
+extern uint64_t metaslab_df_alloc_threshold;
 
 #define	ZTEST_DIROBJ		1
 #define	ZTEST_MICROZAP_OBJ	2
@@ -3767,6 +3769,8 @@
 	if (error)
 		fatal(0, "spa_open() = %d", error);
 
+	metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
+
 	if (zopt_verbose >= 3)
 		show_pool_stats(spa);
 
@@ -3858,6 +3862,9 @@
 			zi->zi_call_time = 0;
 		}
 
+		/* Set the allocation switch size */
+		metaslab_df_alloc_threshold = ztest_random(metaslab_sz / 4) + 1;
+
 		pid = fork();
 
 		if (pid == -1)
--- a/usr/src/lib/libzpool/common/llib-lzpool	Mon Apr 27 21:33:26 2009 -0700
+++ b/usr/src/lib/libzpool/common/llib-lzpool	Mon Apr 27 21:16:23 2009 -0700
@@ -49,3 +49,4 @@
 #include <sys/arc.h>
 
 extern uint64_t metaslab_gang_bang;
+extern uint64_t metaslab_df_alloc_threshold;
--- a/usr/src/uts/common/fs/zfs/metaslab.c	Mon Apr 27 21:33:26 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/metaslab.c	Mon Apr 27 21:16:23 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -36,18 +36,35 @@
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
 
 /*
+ * Minimum size which forces the dynamic allocator to change
+ * its allocation strategy. Once the space map cannot satisfy
+ * an allocation of this size then it switches to using a more
+ * aggressive strategy (i.e. search by size rather than offset).
+ */
+uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
+
+/*
+ * The minimum free space, in percent, which must be available
+ * in a space map to continue allocations in a first-fit fashion.
+ * Once the space_map's free space drops below this level we dynamically
+ * switch to using best-fit allocations.
+ */
+int metaslab_df_free_pct = 30;
+
+/*
  * ==========================================================================
  * Metaslab classes
  * ==========================================================================
  */
 metaslab_class_t *
-metaslab_class_create(void)
+metaslab_class_create(space_map_ops_t *ops)
 {
 	metaslab_class_t *mc;
 
 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
 
 	mc->mc_rotor = NULL;
+	mc->mc_ops = ops;
 
 	return (mc);
 }
@@ -202,30 +219,14 @@
 }
 
 /*
- * ==========================================================================
- * The first-fit block allocator
- * ==========================================================================
+ * This is a helper function that can be used by the allocator to find
+ * a suitable block to allocate. This will search the specified AVL
+ * tree looking for a block that matches the specified criteria.
  */
-static void
-metaslab_ff_load(space_map_t *sm)
+static uint64_t
+metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
+    uint64_t align)
 {
-	ASSERT(sm->sm_ppd == NULL);
-	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
-}
-
-static void
-metaslab_ff_unload(space_map_t *sm)
-{
-	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
-	sm->sm_ppd = NULL;
-}
-
-static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
-{
-	avl_tree_t *t = &sm->sm_root;
-	uint64_t align = size & -size;
-	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
 	space_seg_t *ss, ssearch;
 	avl_index_t where;
 
@@ -254,7 +255,37 @@
 		return (-1ULL);
 
 	*cursor = 0;
-	return (metaslab_ff_alloc(sm, size));
+	return (metaslab_block_picker(t, cursor, size, align));
+}
+
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+static void
+metaslab_ff_load(space_map_t *sm)
+{
+	ASSERT(sm->sm_ppd == NULL);
+	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+	sm->sm_pp_root = NULL;
+}
+
+static void
+metaslab_ff_unload(space_map_t *sm)
+{
+	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
+	sm->sm_ppd = NULL;
+}
+
+static uint64_t
+metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+{
+	avl_tree_t *t = &sm->sm_root;
+	uint64_t align = size & -size;
+	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+
+	return (metaslab_block_picker(t, cursor, size, align));
 }
 
 /* ARGSUSED */
@@ -276,10 +307,137 @@
 	metaslab_ff_unload,
 	metaslab_ff_alloc,
 	metaslab_ff_claim,
-	metaslab_ff_free
+	metaslab_ff_free,
+	NULL	/* maxsize */
 };
 
 /*
+ * Dynamic block allocator -
+ * Uses the first fit allocation scheme until space gets low and then
+ * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
+ * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ */
+
+uint64_t
+metaslab_df_maxsize(space_map_t *sm)
+{
+	avl_tree_t *t = sm->sm_pp_root;
+	space_seg_t *ss;
+
+	if (t == NULL || (ss = avl_last(t)) == NULL)
+		return (0ULL);
+
+	return (ss->ss_end - ss->ss_start);
+}
+
+static int
+metaslab_df_seg_compare(const void *x1, const void *x2)
+{
+	const space_seg_t *s1 = x1;
+	const space_seg_t *s2 = x2;
+	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
+	uint64_t ss_size2 = s2->ss_end - s2->ss_start;
+
+	if (ss_size1 < ss_size2)
+		return (-1);
+	if (ss_size1 > ss_size2)
+		return (1);
+
+	if (s1->ss_start < s2->ss_start)
+		return (-1);
+	if (s1->ss_start > s2->ss_start)
+		return (1);
+
+	return (0);
+}
+
+static void
+metaslab_df_load(space_map_t *sm)
+{
+	space_seg_t *ss;
+
+	ASSERT(sm->sm_ppd == NULL);
+	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+
+	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+	avl_create(sm->sm_pp_root, metaslab_df_seg_compare,
+	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
+
+	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+		avl_add(sm->sm_pp_root, ss);
+}
+
+static void
+metaslab_df_unload(space_map_t *sm)
+{
+	void *cookie = NULL;
+
+	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
+	sm->sm_ppd = NULL;
+
+	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
+		/* tear down the tree */
+	}
+
+	avl_destroy(sm->sm_pp_root);
+	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
+	sm->sm_pp_root = NULL;
+}
+
+static uint64_t
+metaslab_df_alloc(space_map_t *sm, uint64_t size)
+{
+	avl_tree_t *t = &sm->sm_root;
+	uint64_t align = size & -size;
+	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+	uint64_t max_size = metaslab_df_maxsize(sm);
+	int free_pct = sm->sm_space * 100 / sm->sm_size;
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+
+	if (max_size < size)
+		return (-1ULL);
+
+	/*
+	 * If we're running low on space switch to using the size
+	 * sorted AVL tree (best-fit).
+	 */
+	if (max_size < metaslab_df_alloc_threshold ||
+	    free_pct < metaslab_df_free_pct) {
+		t = sm->sm_pp_root;
+		*cursor = 0;
+	}
+
+	return (metaslab_block_picker(t, cursor, size, 1ULL));
+}
+
+/* ARGSUSED */
+static void
+metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	/* No need to update cursor */
+}
+
+/* ARGSUSED */
+static void
+metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	/* No need to update cursor */
+}
+
+static space_map_ops_t metaslab_df_ops = {
+	metaslab_df_load,
+	metaslab_df_unload,
+	metaslab_df_alloc,
+	metaslab_df_claim,
+	metaslab_df_free,
+	metaslab_df_maxsize
+};
+
+space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+
+/*
  * ==========================================================================
  * Metaslabs
  * ==========================================================================
@@ -414,20 +572,28 @@
 }
 
 static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
 {
 	space_map_t *sm = &msp->ms_map;
+	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
-		int error = space_map_load(sm, &metaslab_ff_ops,
-		    SM_FREE, &msp->ms_smo,
+		int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo,
 		    msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
 		if (error) {
 			metaslab_group_sort(msp->ms_group, msp, 0);
 			return (error);
 		}
+
+		/*
+		 * If we were able to load the map then make sure
+		 * that this map is still able to satisfy our request.
+		 */
+		if (msp->ms_weight < size)
+			return (ENOSPC);
+
 		metaslab_group_sort(msp->ms_group, msp,
 		    msp->ms_weight | activation_weight);
 	}
@@ -636,11 +802,16 @@
 	int i;
 
 	activation_weight = METASLAB_WEIGHT_PRIMARY;
-	for (i = 0; i < d; i++)
-		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id)
+	for (i = 0; i < d; i++) {
+		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
 			activation_weight = METASLAB_WEIGHT_SECONDARY;
+			break;
+		}
+	}
 
 	for (;;) {
+		boolean_t was_active;
+
 		mutex_enter(&mg->mg_lock);
 		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
 			if (msp->ms_weight < size) {
@@ -648,6 +819,7 @@
 				return (-1ULL);
 			}
 
+			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
 				break;
 
@@ -673,7 +845,9 @@
 		 * another thread may have changed the weight while we
 		 * were blocked on the metaslab lock.
 		 */
-		if (msp->ms_weight < size) {
+		if (msp->ms_weight < size || (was_active &&
+		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
+		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
@@ -686,7 +860,7 @@
 			continue;
 		}
 
-		if (metaslab_activate(msp, activation_weight) != 0) {
+		if (metaslab_activate(msp, activation_weight, size) != 0) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
@@ -869,7 +1043,7 @@
 		goto top;
 	}
 
-	if (!zio_lock) {
+	if (!allocatable && !zio_lock) {
 		dshift = 3;
 		zio_lock = B_TRUE;
 		goto top;
@@ -955,7 +1129,7 @@
 
 	mutex_enter(&msp->ms_lock);
 
-	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
+	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
 		mutex_exit(&msp->ms_lock);
 		return (error);
--- a/usr/src/uts/common/fs/zfs/spa.c	Mon Apr 27 21:33:26 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/spa.c	Mon Apr 27 21:16:23 2009 -0700
@@ -541,8 +541,8 @@
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_mode = mode;
 
-	spa->spa_normal_class = metaslab_class_create();
-	spa->spa_log_class = metaslab_class_create();
+	spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
+	spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
 
 	for (int t = 0; t < ZIO_TYPES; t++) {
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
--- a/usr/src/uts/common/fs/zfs/space_map.c	Mon Apr 27 21:33:26 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/space_map.c	Mon Apr 27 21:16:23 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -116,12 +116,23 @@
 
 	if (merge_before && merge_after) {
 		avl_remove(&sm->sm_root, ss_before);
+		if (sm->sm_pp_root) {
+			avl_remove(sm->sm_pp_root, ss_before);
+			avl_remove(sm->sm_pp_root, ss_after);
+		}
 		ss_after->ss_start = ss_before->ss_start;
 		kmem_free(ss_before, sizeof (*ss_before));
+		ss = ss_after;
 	} else if (merge_before) {
 		ss_before->ss_end = end;
+		if (sm->sm_pp_root)
+			avl_remove(sm->sm_pp_root, ss_before);
+		ss = ss_before;
 	} else if (merge_after) {
 		ss_after->ss_start = start;
+		if (sm->sm_pp_root)
+			avl_remove(sm->sm_pp_root, ss_after);
+		ss = ss_after;
 	} else {
 		ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
 		ss->ss_start = start;
@@ -129,6 +140,9 @@
 		avl_insert(&sm->sm_root, ss, where);
 	}
 
+	if (sm->sm_pp_root)
+		avl_add(sm->sm_pp_root, ss);
+
 	sm->sm_space += size;
 }
 
@@ -163,12 +177,17 @@
 	left_over = (ss->ss_start != start);
 	right_over = (ss->ss_end != end);
 
+	if (sm->sm_pp_root)
+		avl_remove(sm->sm_pp_root, ss);
+
 	if (left_over && right_over) {
 		newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
 		newseg->ss_start = end;
 		newseg->ss_end = ss->ss_end;
 		ss->ss_end = start;
 		avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
+		if (sm->sm_pp_root)
+			avl_add(sm->sm_pp_root, newseg);
 	} else if (left_over) {
 		ss->ss_end = start;
 	} else if (right_over) {
@@ -176,8 +195,12 @@
 	} else {
 		avl_remove(&sm->sm_root, ss);
 		kmem_free(ss, sizeof (*ss));
+		ss = NULL;
 	}
 
+	if (sm->sm_pp_root && ss != NULL)
+		avl_add(sm->sm_pp_root, ss);
+
 	sm->sm_space -= size;
 }
 
@@ -315,6 +338,9 @@
 		if (ops != NULL)
 			ops->smop_load(sm);
 	} else {
+		if (ops != NULL)
+			ops->smop_unload(sm);
+		sm->sm_ops = NULL;
 		space_map_vacate(sm, NULL, NULL);
 	}
 
@@ -342,6 +368,15 @@
 }
 
 uint64_t
+space_map_maxsize(space_map_t *sm)
+{
+	if (sm->sm_loaded && sm->sm_ops != NULL)
+		return (sm->sm_ops->smop_max(sm));
+	else
+		return (-1ULL);
+}
+
+uint64_t
 space_map_alloc(space_map_t *sm, uint64_t size)
 {
 	uint64_t start;
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h	Mon Apr 27 21:33:26 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h	Mon Apr 27 21:16:23 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -39,6 +39,8 @@
 typedef struct metaslab_class metaslab_class_t;
 typedef struct metaslab_group metaslab_group_t;
 
+extern space_map_ops_t *zfs_metaslab_ops;
+
 extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
     uint64_t start, uint64_t size, uint64_t txg);
 extern void metaslab_fini(metaslab_t *msp);
@@ -55,7 +57,7 @@
     boolean_t now);
 extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
 
-extern metaslab_class_t *metaslab_class_create(void);
+extern metaslab_class_t *metaslab_class_create(space_map_ops_t *ops);
 extern void metaslab_class_destroy(metaslab_class_t *mc);
 extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
 extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h	Mon Apr 27 21:33:26 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h	Mon Apr 27 21:16:23 2009 -0700
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_METASLAB_IMPL_H
 #define	_SYS_METASLAB_IMPL_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/metaslab.h>
 #include <sys/space_map.h>
 #include <sys/vdev.h>
@@ -41,6 +39,7 @@
 struct metaslab_class {
 	metaslab_group_t	*mc_rotor;
 	uint64_t		mc_allocated;
+	space_map_ops_t		*mc_ops;
 };
 
 struct metaslab_group {
--- a/usr/src/uts/common/fs/zfs/sys/space_map.h	Mon Apr 27 21:33:26 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/space_map.h	Mon Apr 27 21:16:23 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -46,12 +46,14 @@
 	uint8_t		sm_loading;	/* map loading? */
 	kcondvar_t	sm_load_cv;	/* map load completion */
 	space_map_ops_t	*sm_ops;	/* space map block picker ops vector */
+	avl_tree_t	*sm_pp_root;	/* picker-private AVL tree */
 	void		*sm_ppd;	/* picker-private data */
 	kmutex_t	*sm_lock;	/* pointer to lock that protects map */
 } space_map_t;
 
 typedef struct space_seg {
 	avl_node_t	ss_node;	/* AVL node */
+	avl_node_t	ss_pp_node;	/* AVL picker-private node */
 	uint64_t	ss_start;	/* starting offset of this segment */
 	uint64_t	ss_end;		/* ending offset (non-inclusive) */
 } space_seg_t;
@@ -74,6 +76,7 @@
 	uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size);
 	void	(*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
 	void	(*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
+	uint64_t (*smop_max)(space_map_t *sm);
 };
 
 /*
@@ -152,6 +155,7 @@
 extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size);
 extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size);
 extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size);
+extern uint64_t space_map_maxsize(space_map_t *sm);
 
 extern void space_map_sync(space_map_t *sm, uint8_t maptype,
     space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx);