6826241 Sync write IOPS drops dramatically during TXG sync
6869229 zfs should switch to shiny new metaslabs more frequently
--- a/usr/src/cmd/zdb/zdb.c Sat Nov 21 01:05:40 2009 -0800
+++ b/usr/src/cmd/zdb/zdb.c Sat Nov 21 22:51:29 2009 -0800
@@ -453,33 +453,37 @@
static void
dump_metaslab(metaslab_t *msp)
{
- char freebuf[5];
- space_map_obj_t *smo = &msp->ms_smo;
vdev_t *vd = msp->ms_group->mg_vd;
spa_t *spa = vd->vdev_spa;
-
- nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf);
+ space_map_t *sm = &msp->ms_map;
+ space_map_obj_t *smo = &msp->ms_smo;
+ char freebuf[5];
+
+ nicenum(sm->sm_size - smo->smo_alloc, freebuf);
(void) printf(
"\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n",
- (u_longlong_t)(msp->ms_map.sm_start / msp->ms_map.sm_size),
- (u_longlong_t)msp->ms_map.sm_start, (u_longlong_t)smo->smo_object,
- freebuf);
+ (u_longlong_t)(sm->sm_start / sm->sm_size),
+ (u_longlong_t)sm->sm_start, (u_longlong_t)smo->smo_object, freebuf);
if (dump_opt['m'] > 1 && !dump_opt['L']) {
mutex_enter(&msp->ms_lock);
- VERIFY(space_map_load(&msp->ms_map, zfs_metaslab_ops,
- SM_FREE, &msp->ms_smo, spa->spa_meta_objset) == 0);
- dump_metaslab_stats(msp);
- space_map_unload(&msp->ms_map);
+ space_map_load_wait(sm);
+ if (!sm->sm_loaded &&
+ (smo->smo_object != 0 || dump_opt['m'] > 2)) {
+ VERIFY(space_map_load(sm, zfs_metaslab_ops,
+ SM_FREE, smo, spa->spa_meta_objset) == 0);
+ dump_metaslab_stats(msp);
+ space_map_unload(sm);
+ }
mutex_exit(&msp->ms_lock);
}
if (dump_opt['d'] > 5 || dump_opt['m'] > 2) {
- ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
+ ASSERT(sm->sm_size == (1ULL << vd->vdev_ms_shift));
mutex_enter(&msp->ms_lock);
- dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
+ dump_spacemap(spa->spa_meta_objset, smo, sm);
mutex_exit(&msp->ms_lock);
}
}
@@ -2843,6 +2847,8 @@
error = 0;
target = argv[0];
+ VERIFY(nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) == 0);
+
if (dump_opt['e']) {
nvlist_t *cfg = NULL;
char *name = find_zpool(&target, &cfg, nsearch, searchdirs);
@@ -2853,8 +2859,7 @@
(void) printf("\nConfiguration for import:\n");
dump_nvlist(cfg, 8);
}
- if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
- nvlist_add_uint64(policy,
+ if (nvlist_add_uint64(policy,
ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 ||
nvlist_add_nvlist(cfg,
ZPOOL_REWIND_POLICY, policy) != 0) {
@@ -2863,13 +2868,16 @@
}
if ((error = spa_import(name, cfg, NULL)) != 0)
error = spa_import_verbatim(name, cfg, NULL);
- nvlist_free(policy);
}
+ } else {
+ VERIFY(nvlist_add_uint64(policy, ZPOOL_REWIND_META_THRESH,
+ UINT64_MAX) == 0);
}
if (error == 0) {
if (strpbrk(target, "/@") == NULL || dump_opt['R']) {
- error = spa_open(target, &spa, FTAG);
+ error = spa_open_rewind(target, &spa, FTAG, policy,
+ NULL);
if (error) {
/*
* If we're missing the log device then
@@ -2884,14 +2892,18 @@
}
mutex_exit(&spa_namespace_lock);
- if (!error)
- error = spa_open(target, &spa, FTAG);
+ if (!error) {
+ error = spa_open_rewind(target, &spa,
+ FTAG, policy, NULL);
+ }
}
} else {
error = dmu_objset_own(target, DMU_OST_ANY,
B_TRUE, FTAG, &os);
}
}
+ nvlist_free(policy);
+
if (error)
fatal("can't open '%s': %s", target, strerror(error));
--- a/usr/src/uts/common/fs/zfs/metaslab.c Sat Nov 21 01:05:40 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/metaslab.c Sat Nov 21 22:51:29 2009 -0800
@@ -41,7 +41,7 @@
/*
* Minimum size which forces the dynamic allocator to change
- * it's allocation strategy. Once the space map cannot satisfy
+ * its allocation strategy. Once the space map cannot satisfy
* an allocation of this size then it switches to using more
* aggressive strategy (i.e search by size rather than offset).
*/
@@ -53,7 +53,23 @@
* Once the space_map's free space drops below this level we dynamically
* switch to using best-fit allocations.
*/
-int metaslab_df_free_pct = 30;
+int metaslab_df_free_pct = 4;
+
+/*
+ * A metaslab is considered "free" if it contains a contiguous
+ * segment which is greater than metaslab_min_alloc_size.
+ */
+uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
+
+/*
+ * Max number of space_maps to prefetch.
+ */
+int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
+
+/*
+ * Percentage bonus multiplier for metaslabs that are in the bonus area.
+ */
+int metaslab_smo_bonus_pct = 150;
/*
* ==========================================================================
@@ -310,6 +326,32 @@
}
/*
+ * ==========================================================================
+ * Common allocator routines
+ * ==========================================================================
+ */
+static int
+metaslab_segsize_compare(const void *x1, const void *x2)
+{
+ const space_seg_t *s1 = x1;
+ const space_seg_t *s2 = x2;
+ uint64_t ss_size1 = s1->ss_end - s1->ss_start;
+ uint64_t ss_size2 = s2->ss_end - s2->ss_start;
+
+ if (ss_size1 < ss_size2)
+ return (-1);
+ if (ss_size1 > ss_size2)
+ return (1);
+
+ if (s1->ss_start < s2->ss_start)
+ return (-1);
+ if (s1->ss_start > s2->ss_start)
+ return (1);
+
+ return (0);
+}
+
+/*
* This is a helper function that can be used by the allocator to find
* a suitable block to allocate. This will search the specified AVL
* tree looking for a block that matches the specified criteria.
@@ -349,101 +391,8 @@
return (metaslab_block_picker(t, cursor, size, align));
}
-/*
- * ==========================================================================
- * The first-fit block allocator
- * ==========================================================================
- */
static void
-metaslab_ff_load(space_map_t *sm)
-{
- ASSERT(sm->sm_ppd == NULL);
- sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
- sm->sm_pp_root = NULL;
-}
-
-static void
-metaslab_ff_unload(space_map_t *sm)
-{
- kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
- sm->sm_ppd = NULL;
-}
-
-static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
-{
- avl_tree_t *t = &sm->sm_root;
- uint64_t align = size & -size;
- uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
-
- return (metaslab_block_picker(t, cursor, size, align));
-}
-
-/* ARGSUSED */
-static void
-metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
-{
- /* No need to update cursor */
-}
-
-/* ARGSUSED */
-static void
-metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
-{
- /* No need to update cursor */
-}
-
-static space_map_ops_t metaslab_ff_ops = {
- metaslab_ff_load,
- metaslab_ff_unload,
- metaslab_ff_alloc,
- metaslab_ff_claim,
- metaslab_ff_free,
- NULL /* maxsize */
-};
-
-/*
- * Dynamic block allocator -
- * Uses the first fit allocation scheme until space get low and then
- * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
- * and metaslab_df_free_pct to determine when to switch the allocation scheme.
- */
-
-uint64_t
-metaslab_df_maxsize(space_map_t *sm)
-{
- avl_tree_t *t = sm->sm_pp_root;
- space_seg_t *ss;
-
- if (t == NULL || (ss = avl_last(t)) == NULL)
- return (0ULL);
-
- return (ss->ss_end - ss->ss_start);
-}
-
-static int
-metaslab_df_seg_compare(const void *x1, const void *x2)
-{
- const space_seg_t *s1 = x1;
- const space_seg_t *s2 = x2;
- uint64_t ss_size1 = s1->ss_end - s1->ss_start;
- uint64_t ss_size2 = s2->ss_end - s2->ss_start;
-
- if (ss_size1 < ss_size2)
- return (-1);
- if (ss_size1 > ss_size2)
- return (1);
-
- if (s1->ss_start < s2->ss_start)
- return (-1);
- if (s1->ss_start > s2->ss_start)
- return (1);
-
- return (0);
-}
-
-static void
-metaslab_df_load(space_map_t *sm)
+metaslab_pp_load(space_map_t *sm)
{
space_seg_t *ss;
@@ -451,7 +400,7 @@
sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
- avl_create(sm->sm_pp_root, metaslab_df_seg_compare,
+ avl_create(sm->sm_pp_root, metaslab_segsize_compare,
sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
@@ -459,7 +408,7 @@
}
static void
-metaslab_df_unload(space_map_t *sm)
+metaslab_pp_unload(space_map_t *sm)
{
void *cookie = NULL;
@@ -475,13 +424,82 @@
sm->sm_pp_root = NULL;
}
+/* ARGSUSED */
+static void
+metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ /* No need to update cursor */
+}
+
+/* ARGSUSED */
+static void
+metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ /* No need to update cursor */
+}
+
+/*
+ * Return the size of the maximum contiguous segment within the metaslab.
+ */
+uint64_t
+metaslab_pp_maxsize(space_map_t *sm)
+{
+ avl_tree_t *t = sm->sm_pp_root;
+ space_seg_t *ss;
+
+ if (t == NULL || (ss = avl_last(t)) == NULL)
+ return (0ULL);
+
+ return (ss->ss_end - ss->ss_start);
+}
+
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ uint64_t align = size & -size;
+ uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+
+ return (metaslab_block_picker(t, cursor, size, align));
+}
+
+/* ARGSUSED */
+boolean_t
+metaslab_ff_fragmented(space_map_t *sm)
+{
+ return (B_TRUE);
+}
+
+static space_map_ops_t metaslab_ff_ops = {
+ metaslab_pp_load,
+ metaslab_pp_unload,
+ metaslab_ff_alloc,
+ metaslab_pp_claim,
+ metaslab_pp_free,
+ metaslab_pp_maxsize,
+ metaslab_ff_fragmented
+};
+
+/*
+ * ==========================================================================
+ * Dynamic block allocator -
+ * Uses the first fit allocation scheme until space gets low and then
+ * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
+ * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ * ==========================================================================
+ */
static uint64_t
metaslab_df_alloc(space_map_t *sm, uint64_t size)
{
avl_tree_t *t = &sm->sm_root;
uint64_t align = size & -size;
uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
- uint64_t max_size = metaslab_df_maxsize(sm);
+ uint64_t max_size = metaslab_pp_maxsize(sm);
int free_pct = sm->sm_space * 100 / sm->sm_size;
ASSERT(MUTEX_HELD(sm->sm_lock));
@@ -503,27 +521,154 @@
return (metaslab_block_picker(t, cursor, size, 1ULL));
}
-/* ARGSUSED */
-static void
-metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size)
+static boolean_t
+metaslab_df_fragmented(space_map_t *sm)
{
- /* No need to update cursor */
-}
+ uint64_t max_size = metaslab_pp_maxsize(sm);
+ int free_pct = sm->sm_space * 100 / sm->sm_size;
-/* ARGSUSED */
-static void
-metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size)
-{
- /* No need to update cursor */
+ if (max_size >= metaslab_df_alloc_threshold &&
+ free_pct >= metaslab_df_free_pct)
+ return (B_FALSE);
+
+ return (B_TRUE);
}
static space_map_ops_t metaslab_df_ops = {
- metaslab_df_load,
- metaslab_df_unload,
+ metaslab_pp_load,
+ metaslab_pp_unload,
metaslab_df_alloc,
- metaslab_df_claim,
- metaslab_df_free,
- metaslab_df_maxsize
+ metaslab_pp_claim,
+ metaslab_pp_free,
+ metaslab_pp_maxsize,
+ metaslab_df_fragmented
+};
+
+/*
+ * ==========================================================================
+ * Other experimental allocators
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ uint64_t *cursor = (uint64_t *)sm->sm_ppd;
+ uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
+ uint64_t max_size = metaslab_pp_maxsize(sm);
+ uint64_t rsize = size;
+ uint64_t offset = 0;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+
+ if (max_size < size)
+ return (-1ULL);
+
+ ASSERT3U(*extent_end, >=, *cursor);
+
+ /*
+ * If we're running low on space switch to using the size
+ * sorted AVL tree (best-fit).
+ */
+ if ((*cursor + size) > *extent_end) {
+
+ t = sm->sm_pp_root;
+ *cursor = *extent_end = 0;
+
+ if (max_size > 2 * SPA_MAXBLOCKSIZE)
+ rsize = MIN(metaslab_min_alloc_size, max_size);
+ offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
+ if (offset != -1)
+ *cursor = offset + size;
+ } else {
+ offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
+ }
+ ASSERT3U(*cursor, <=, *extent_end);
+ return (offset);
+}
+
+static boolean_t
+metaslab_cdf_fragmented(space_map_t *sm)
+{
+ uint64_t max_size = metaslab_pp_maxsize(sm);
+
+ if (max_size > (metaslab_min_alloc_size * 10))
+ return (B_FALSE);
+ return (B_TRUE);
+}
+
+static space_map_ops_t metaslab_cdf_ops = {
+ metaslab_pp_load,
+ metaslab_pp_unload,
+ metaslab_cdf_alloc,
+ metaslab_pp_claim,
+ metaslab_pp_free,
+ metaslab_pp_maxsize,
+ metaslab_cdf_fragmented
+};
+
+static uint64_t
+metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ avl_index_t where;
+ space_seg_t *ss, ssearch;
+ uint64_t *cursor = (uint64_t *)sm->sm_ppd;
+ uint64_t max_size = metaslab_pp_maxsize(sm);
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+
+ if (max_size < size)
+ return (-1ULL);
+
+ ssearch.ss_start = *cursor;
+ ssearch.ss_end = *cursor + size;
+
+ ss = avl_find(t, &ssearch, &where);
+ if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
+ t = sm->sm_pp_root;
+
+ if (max_size > 2 * SPA_MAXBLOCKSIZE)
+ size = MIN(metaslab_min_alloc_size, max_size);
+
+ ssearch.ss_start = 0;
+ ssearch.ss_end = size;
+ ss = avl_find(t, &ssearch, &where);
+ if (ss == NULL)
+ ss = avl_nearest(t, where, AVL_AFTER);
+ ASSERT(ss != NULL);
+ }
+
+ if (ss != NULL) {
+ if (ss->ss_start + size <= ss->ss_end) {
+ *cursor = ss->ss_start + size;
+ return (ss->ss_start);
+ }
+ }
+ return (-1ULL);
+}
+
+static boolean_t
+metaslab_ndf_fragmented(space_map_t *sm)
+{
+ uint64_t max_size = metaslab_pp_maxsize(sm);
+
+ if (max_size > (metaslab_min_alloc_size * 10))
+ return (B_FALSE);
+ return (B_TRUE);
+}
+
+
+static space_map_ops_t metaslab_ndf_ops = {
+ metaslab_pp_load,
+ metaslab_pp_unload,
+ metaslab_ndf_alloc,
+ metaslab_pp_claim,
+ metaslab_pp_free,
+ metaslab_pp_maxsize,
+ metaslab_ndf_fragmented
};
space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
@@ -616,7 +761,6 @@
#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
#define METASLAB_ACTIVE_MASK \
(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
-#define METASLAB_SMO_BONUS_MULTIPLIER 2
static uint64_t
metaslab_weight(metaslab_t *msp)
@@ -649,25 +793,60 @@
ASSERT(weight >= space && weight <= 2 * space);
/*
- * For locality, assign higher weight to metaslabs we've used before.
+ * For locality, assign higher weight to metaslabs which have
+ * a lower offset than what we've already activated.
*/
- if (smo->smo_object != 0)
- weight *= METASLAB_SMO_BONUS_MULTIPLIER;
+ if (sm->sm_start <= mg->mg_bonus_area)
+ weight *= (metaslab_smo_bonus_pct / 100);
ASSERT(weight >= space &&
- weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);
+ weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
+
+ if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
+ /*
+ * If this metaslab is one we're actively using, adjust its
+ * weight to make it preferable to any inactive metaslab so
+ * we'll polish it off.
+ */
+ weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+ }
+ return (weight);
+}
+
+static void
+metaslab_prefetch(metaslab_group_t *mg)
+{
+ spa_t *spa = mg->mg_vd->vdev_spa;
+ metaslab_t *msp;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ int m;
+
+ mutex_enter(&mg->mg_lock);
/*
- * If this metaslab is one we're actively using, adjust its weight to
- * make it preferable to any inactive metaslab so we'll polish it off.
+ * Prefetch the next potential metaslabs
*/
- weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+ for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
+ space_map_t *sm = &msp->ms_map;
+ space_map_obj_t *smo = &msp->ms_smo;
+
+ /* If we have reached our prefetch limit then we're done */
+ if (m >= metaslab_prefetch_limit)
+ break;
- return (weight);
+ if (!sm->sm_loaded && smo->smo_object != 0) {
+ mutex_exit(&mg->mg_lock);
+ dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
+ 0ULL, smo->smo_objsize);
+ mutex_enter(&mg->mg_lock);
+ }
+ }
+ mutex_exit(&mg->mg_lock);
}
static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
{
+ metaslab_group_t *mg = msp->ms_group;
space_map_t *sm = &msp->ms_map;
space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
@@ -679,13 +858,23 @@
int error = space_map_load(sm, sm_ops, SM_FREE,
&msp->ms_smo,
spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
- if (error) {
+ if (error) {
metaslab_group_sort(msp->ms_group, msp, 0);
return (error);
}
for (int t = 0; t < TXG_DEFER_SIZE; t++)
space_map_walk(&msp->ms_defermap[t],
space_map_claim, sm);
+
+ }
+
+ /*
+ * Track the bonus area as we activate new metaslabs.
+ */
+ if (sm->sm_start > mg->mg_bonus_area) {
+ mutex_enter(&mg->mg_lock);
+ mg->mg_bonus_area = sm->sm_start;
+ mutex_exit(&mg->mg_lock);
}
/*
@@ -712,9 +901,7 @@
* this metaslab again. In that case, it had better be empty,
* or we would be leaving space on the table.
*/
-#if 0
ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
-#endif
metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}
@@ -908,6 +1095,32 @@
mutex_exit(&msp->ms_lock);
}
+void
+metaslab_sync_reassess(metaslab_group_t *mg)
+{
+ vdev_t *vd = mg->mg_vd;
+
+ /*
+ * Re-evaluate all metaslabs which have lower offsets than the
+ * bonus area.
+ */
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ if (msp->ms_map.sm_start > mg->mg_bonus_area)
+ break;
+
+ mutex_enter(&msp->ms_lock);
+ metaslab_group_sort(mg, msp, metaslab_weight(msp));
+ mutex_exit(&msp->ms_lock);
+ }
+
+ /*
+ * Prefetch the next potential metaslabs
+ */
+ metaslab_prefetch(mg);
+}
+
static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)
{
@@ -1003,7 +1216,7 @@
if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
break;
- metaslab_passivate(msp, size - 1);
+ metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
mutex_exit(&msp->ms_lock);
}
--- a/usr/src/uts/common/fs/zfs/spa.c Sat Nov 21 01:05:40 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/spa.c Sat Nov 21 22:51:29 2009 -0800
@@ -73,35 +73,38 @@
zti_mode_fixed, /* value is # of threads (min 1) */
zti_mode_online_percent, /* value is % of online CPUs */
zti_mode_tune, /* fill from zio_taskq_tune_* */
+ zti_mode_null, /* don't create a taskq */
zti_nmodes
};
-#define ZTI_THREAD_FIX(n) { zti_mode_fixed, (n) }
-#define ZTI_THREAD_PCT(n) { zti_mode_online_percent, (n) }
-#define ZTI_THREAD_TUNE { zti_mode_tune, 0 }
-
-#define ZTI_THREAD_ONE ZTI_THREAD_FIX(1)
+#define ZTI_FIX(n) { zti_mode_fixed, (n) }
+#define ZTI_PCT(n) { zti_mode_online_percent, (n) }
+#define ZTI_TUNE { zti_mode_tune, 0 }
+#define ZTI_NULL { zti_mode_null, 0 }
+
+#define ZTI_ONE ZTI_FIX(1)
typedef struct zio_taskq_info {
- const char *zti_name;
- struct {
- enum zti_modes zti_mode;
- uint_t zti_value;
- } zti_nthreads[ZIO_TASKQ_TYPES];
+ enum zti_modes zti_mode;
+ uint_t zti_value;
} zio_taskq_info_t;
static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
- "issue", "intr"
+ "issue", "issue_high", "intr", "intr_high"
};
-const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = {
- /* ISSUE INTR */
- { "spa_zio_null", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } },
- { "spa_zio_read", { ZTI_THREAD_FIX(8), ZTI_THREAD_TUNE } },
- { "spa_zio_write", { ZTI_THREAD_TUNE, ZTI_THREAD_FIX(8) } },
- { "spa_zio_free", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } },
- { "spa_zio_claim", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } },
- { "spa_zio_ioctl", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } },
+/*
+ * Define the taskq threads for the following I/O types:
+ * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
+ */
+const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
+ /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
+ { ZTI_FIX(8), ZTI_NULL, ZTI_TUNE, ZTI_NULL },
+ { ZTI_TUNE, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) },
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
};
enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
@@ -596,14 +599,14 @@
spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
for (int t = 0; t < ZIO_TYPES; t++) {
- const zio_taskq_info_t *ztip = &zio_taskqs[t];
for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
- enum zti_modes mode = ztip->zti_nthreads[q].zti_mode;
- uint_t value = ztip->zti_nthreads[q].zti_value;
+ const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
+ enum zti_modes mode = ztip->zti_mode;
+ uint_t value = ztip->zti_value;
char name[32];
(void) snprintf(name, sizeof (name),
- "%s_%s", ztip->zti_name, zio_taskq_types[q]);
+ "%s_%s", zio_type_name[t], zio_taskq_types[q]);
if (mode == zti_mode_tune) {
mode = zio_taskq_tune_mode;
@@ -628,6 +631,10 @@
TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
break;
+ case zti_mode_null:
+ spa->spa_zio_taskq[t][q] = NULL;
+ break;
+
case zti_mode_tune:
default:
panic("unrecognized mode for "
@@ -674,7 +681,8 @@
for (int t = 0; t < ZIO_TYPES; t++) {
for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
- taskq_destroy(spa->spa_zio_taskq[t][q]);
+ if (spa->spa_zio_taskq[t][q] != NULL)
+ taskq_destroy(spa->spa_zio_taskq[t][q]);
spa->spa_zio_taskq[t][q] = NULL;
}
}
--- a/usr/src/uts/common/fs/zfs/space_map.c Sat Nov 21 01:05:40 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/space_map.c Sat Nov 21 22:51:29 2009 -0800
@@ -367,10 +367,8 @@
uint64_t
space_map_maxsize(space_map_t *sm)
{
- if (sm->sm_loaded && sm->sm_ops != NULL)
- return (sm->sm_ops->smop_max(sm));
- else
- return (-1ULL);
+ ASSERT(sm->sm_ops != NULL);
+ return (sm->sm_ops->smop_max(sm));
}
uint64_t
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h Sat Nov 21 01:05:40 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h Sat Nov 21 22:51:29 2009 -0800
@@ -43,6 +43,7 @@
extern void metaslab_fini(metaslab_t *msp);
extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
+extern void metaslab_sync_reassess(metaslab_group_t *mg);
#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
--- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h Sat Nov 21 01:05:40 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h Sat Nov 21 22:51:29 2009 -0800
@@ -51,6 +51,7 @@
kmutex_t mg_lock;
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
+ uint64_t mg_bonus_area;
int64_t mg_bias;
int64_t mg_activation_count;
metaslab_class_t *mg_class;
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h Sat Nov 21 01:05:40 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h Sat Nov 21 22:51:29 2009 -0800
@@ -80,7 +80,9 @@
enum zio_taskq_type {
ZIO_TASKQ_ISSUE = 0,
+ ZIO_TASKQ_ISSUE_HIGH,
ZIO_TASKQ_INTERRUPT,
+ ZIO_TASKQ_INTERRUPT_HIGH,
ZIO_TASKQ_TYPES
};
--- a/usr/src/uts/common/fs/zfs/sys/space_map.h Sat Nov 21 01:05:40 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/space_map.h Sat Nov 21 22:51:29 2009 -0800
@@ -77,6 +77,7 @@
void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
uint64_t (*smop_max)(space_map_t *sm);
+ boolean_t (*smop_fragmented)(space_map_t *sm);
};
/*
--- a/usr/src/uts/common/fs/zfs/sys/zio.h Sat Nov 21 01:05:40 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h Sat Nov 21 22:51:29 2009 -0800
@@ -120,14 +120,15 @@
#define ZIO_PRIORITY_NOW (zio_priority_table[0])
#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1])
#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2])
-#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[3])
-#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4])
-#define ZIO_PRIORITY_FREE (zio_priority_table[5])
-#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6])
-#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7])
-#define ZIO_PRIORITY_RESILVER (zio_priority_table[8])
-#define ZIO_PRIORITY_SCRUB (zio_priority_table[9])
-#define ZIO_PRIORITY_TABLE_SIZE 10
+#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[3])
+#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[4])
+#define ZIO_PRIORITY_AGG (zio_priority_table[5])
+#define ZIO_PRIORITY_FREE (zio_priority_table[6])
+#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[7])
+#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[8])
+#define ZIO_PRIORITY_RESILVER (zio_priority_table[9])
+#define ZIO_PRIORITY_SCRUB (zio_priority_table[10])
+#define ZIO_PRIORITY_TABLE_SIZE 11
#define ZIO_PIPELINE_CONTINUE 0x100
#define ZIO_PIPELINE_STOP 0x101
--- a/usr/src/uts/common/fs/zfs/vdev.c Sat Nov 21 01:05:40 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev.c Sat Nov 21 22:51:29 2009 -0800
@@ -1939,11 +1939,15 @@
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
metaslab_t *msp;
+ boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
ASSERT(!vd->vdev_ishole);
while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
metaslab_sync_done(msp, txg);
+
+ if (reassess)
+ metaslab_sync_reassess(vd->vdev_mg);
}
void
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c Sat Nov 21 01:05:40 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c Sat Nov 21 22:51:29 2009 -0800
@@ -285,7 +285,7 @@
ASSERT(size <= zfs_vdev_aggregation_limit);
aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
- zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW,
+ zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
vdev_queue_agg_io_done, NULL);
--- a/usr/src/uts/common/fs/zfs/zio.c Sat Nov 21 01:05:40 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/zio.c Sat Nov 21 22:51:29 2009 -0800
@@ -45,11 +45,12 @@
0, /* ZIO_PRIORITY_NOW */
0, /* ZIO_PRIORITY_SYNC_READ */
0, /* ZIO_PRIORITY_SYNC_WRITE */
- 6, /* ZIO_PRIORITY_ASYNC_READ */
- 4, /* ZIO_PRIORITY_ASYNC_WRITE */
+ 0, /* ZIO_PRIORITY_LOG_WRITE */
+ 1, /* ZIO_PRIORITY_CACHE_FILL */
+ 1, /* ZIO_PRIORITY_AGG */
4, /* ZIO_PRIORITY_FREE */
- 0, /* ZIO_PRIORITY_CACHE_FILL */
- 0, /* ZIO_PRIORITY_LOG_WRITE */
+ 4, /* ZIO_PRIORITY_ASYNC_WRITE */
+ 6, /* ZIO_PRIORITY_ASYNC_READ */
10, /* ZIO_PRIORITY_RESILVER */
20, /* ZIO_PRIORITY_SCRUB */
};
@@ -60,7 +61,9 @@
* ==========================================================================
*/
char *zio_type_name[ZIO_TYPES] = {
- "null", "read", "write", "free", "claim", "ioctl" };
+ "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
+ "zio_ioctl"
+};
/*
* ==========================================================================
@@ -1023,6 +1026,7 @@
static void
zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
{
+ spa_t *spa = zio->io_spa;
zio_type_t t = zio->io_type;
/*
@@ -1039,7 +1043,15 @@
if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
t = ZIO_TYPE_NULL;
- (void) taskq_dispatch(zio->io_spa->spa_zio_taskq[t][q],
+ /*
+ * If this is a high priority I/O, then use the high priority taskq.
+ */
+ if (zio->io_priority == ZIO_PRIORITY_NOW &&
+ spa->spa_zio_taskq[t][q + 1] != NULL)
+ q++;
+
+ ASSERT3U(q, <, ZIO_TASKQ_TYPES);
+ (void) taskq_dispatch(spa->spa_zio_taskq[t][q],
(task_func_t *)zio_execute, zio, TQ_SLEEP);
}