6472021 vdev knobs can not be turned
author ahrens
Fri, 03 Nov 2006 11:39:28 -0800
changeset 3059 7d69dbccfcbb
parent 3058 61fc1011bc8e
child 3060 6907ecf5a438
6472021 vdev knobs can not be turned
usr/src/cmd/mdb/common/modules/zfs/zfs.c
usr/src/cmd/zdb/zdb.c
usr/src/uts/common/fs/zfs/sys/vdev.h
usr/src/uts/common/fs/zfs/sys/vdev_impl.h
usr/src/uts/common/fs/zfs/vdev.c
usr/src/uts/common/fs/zfs/vdev_cache.c
usr/src/uts/common/fs/zfs/vdev_file.c
usr/src/uts/common/fs/zfs/vdev_queue.c
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Fri Nov 03 10:20:05 2006 -0800
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Fri Nov 03 11:39:28 2006 -0800
@@ -329,6 +329,74 @@
 
 /* ARGSUSED */
 static int
+zfs_params(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	/*
+	 * This table can be approximately generated by running:
+	 * egrep "^[a-z0-9_]+ [a-z0-9_]+( =.*)?;" *.c | cut -d ' ' -f 2
+	 */
+	static const char *params[] = {
+		"arc_reduce_dnlc_percent",
+		"zfs_arc_max",
+		"zfs_arc_min",
+		"arc_kmem_reclaim_shift",
+		"zfs_mdcomp_disable",
+		"zfs_prefetch_disable",
+		"zfetch_max_streams",
+		"zfetch_min_sec_reap",
+		"zfetch_block_cap",
+		"zfetch_array_rd_sz",
+		"zfs_default_bs",
+		"zfs_default_ibs",
+		"metaslab_aliquot",
+		"reference_tracking_enable",
+		"reference_history",
+		"zio_taskq_threads",
+		"spa_max_replication_override",
+		"spa_mode",
+		"zfs_flags",
+		"txg_time",
+		"zfs_vdev_cache_max",
+		"zfs_vdev_cache_size",
+		"zfs_vdev_cache_bshift",
+		"vdev_mirror_shift",
+		"zfs_vdev_max_pending",
+		"zfs_vdev_min_pending",
+		"zfs_scrub_limit",
+		"zfs_vdev_time_shift",
+		"zfs_vdev_ramp_rate",
+		"zfs_vdev_aggregation_limit",
+		"fzap_default_block_shift",
+		"zfs_immediate_write_sz",
+		"zfs_read_chunk_size",
+		"zil_disable",
+		"zfs_nocacheflush",
+		"zio_gang_bang",
+		"zio_injection_enabled",
+		"zvol_immediate_write_sz",
+	};
+	int i;
+
+	for (i = 0; i < sizeof (params) / sizeof (params[0]); i++) {
+		int sz;
+		uint64_t val64;
+		uint32_t *val32p = (uint32_t *)&val64;
+
+		sz = mdb_readvar(&val64, params[i]);
+		if (sz == 4) {
+			mdb_printf("%s = 0x%x\n", params[i], *val32p);
+		} else if (sz == 8) {
+			mdb_printf("%s = 0x%llx\n", params[i], val64);
+		} else {
+			mdb_warn("variable %s not found", params[i]);
+		}
+	}
+
+	return (DCMD_OK);
+}
+
+/* ARGSUSED */
+static int
 blkptr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
 {
 	blkptr_t bp;
@@ -832,8 +900,8 @@
 void
 vdev_help(void)
 {
-	mdb_printf("[vdev_t*]::vdev [-qr]\n"
-		"\t-> -q display vdev_queue parameters\n"
+	mdb_printf("[vdev_t*]::vdev [-er]\n"
+		"\t-> -e display vdev stats\n"
 		"\t-> -r recursive (visit all children)\n");
 }
 
@@ -845,21 +913,12 @@
  * ADDR             STATE	AUX            DESC
  * fffffffbcde23df0 HEALTHY	-              /dev/dsk/c0t0d0
  *
- * or with "-q" to print out a vdev_t's vdev_queue parameters:
- *
- *  vdev_t: c26ae4c0
- *     c26ae73c min pending         0x2
- *     c26ae744 max pending         0x23
- *     c26ae74c agg limit           0x20000
- *     c26ae754 time shift          0x4
- *     c26ae75c ramp rate           0x2
- *
  * If '-r' is specified, recursively visit all children.
  *
  * With '-e', the statistics associated with the vdev are printed as well.
  */
 static int
-do_print_vdev(uintptr_t addr, int flags, int depth, int queue, int stats,
+do_print_vdev(uintptr_t addr, int flags, int depth, int stats,
     int recursive)
 {
 	vdev_t vdev;
@@ -954,32 +1013,6 @@
 
 		mdb_printf("%-9s %-12s %*s%s\n", state, aux, depth, "", desc);
 
-		if (queue) {
-			mdb_inc_indent(4);
-			mdb_printf("\n");
-			mdb_printf("%p min pending		0x%llx\n",
-			    (uintptr_t)(addr + offsetof(vdev_t,
-			    vdev_queue.vq_min_pending)),
-			    vdev.vdev_queue.vq_min_pending);
-			mdb_printf("%p max pending		0x%llx\n",
-			    (uintptr_t)(addr + offsetof(vdev_t,
-			    vdev_queue.vq_max_pending)),
-			    vdev.vdev_queue.vq_max_pending);
-			mdb_printf("%p agg limit		0x%llx\n",
-			    (uintptr_t)(addr + offsetof(vdev_t,
-			    vdev_queue.vq_agg_limit)),
-			    vdev.vdev_queue.vq_agg_limit);
-			mdb_printf("%p time shift		0x%llx\n",
-			    (uintptr_t)(addr + offsetof(vdev_t,
-			    vdev_queue.vq_time_shift)),
-			    vdev.vdev_queue.vq_time_shift);
-			mdb_printf("%p ramp rate 		0x%llx\n",
-			    (uintptr_t)(addr + offsetof(vdev_t,
-			    vdev_queue.vq_ramp_rate)),
-			    vdev.vdev_queue.vq_ramp_rate);
-			mdb_dec_indent(4);
-		}
-
 		if (stats) {
 			vdev_stat_t *vs = &vdev.vdev_stat;
 			int i;
@@ -1008,7 +1041,7 @@
 			mdb_dec_indent(4);
 		}
 
-		if (queue || stats)
+		if (stats)
 			mdb_printf("\n");
 	}
 
@@ -1025,7 +1058,7 @@
 	}
 
 	for (c = 0; c < children; c++) {
-		if (do_print_vdev(child[c], flags, depth + 2, queue, stats,
+		if (do_print_vdev(child[c], flags, depth + 2, stats,
 		    recursive))
 			return (DCMD_ERR);
 	}
@@ -1036,12 +1069,10 @@
 static int
 vdev_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
 {
-	int print_queue = FALSE;
 	int recursive = FALSE;
 	int stats = FALSE;
 
 	if (mdb_getopts(argc, argv,
-	    'q', MDB_OPT_SETBITS, TRUE, &print_queue,
 	    'r', MDB_OPT_SETBITS, TRUE, &recursive,
 	    'e', MDB_OPT_SETBITS, TRUE, &stats,
 	    NULL) != argc)
@@ -1052,7 +1083,7 @@
 		return (DCMD_ERR);
 	}
 
-	return (do_print_vdev(addr, flags, 0, print_queue, stats, recursive));
+	return (do_print_vdev(addr, flags, 0, stats, recursive));
 }
 
 typedef struct metaslab_walk_data {
@@ -1546,8 +1577,9 @@
 	{ "spa_verify", ":", "verify spa_t consistency", spa_verify },
 	{ "spa_space", ":[-b]", "print spa_t on-disk space usage", spa_space },
 	{ "spa_vdevs", ":", "given a spa_t, print vdev summary", spa_vdevs },
-	{ "vdev", ":[-qre]", "vdev_t summary", vdev_print },
+	{ "vdev", ":[-re]", "vdev_t summary", vdev_print },
 	{ "zio_pipeline", ":", "decode a zio pipeline", zio_pipeline },
+	{ "zfs_params", "", "print zfs tunable parameters", zfs_params },
 	{ NULL }
 };
 
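For reference, the new ::zfs_params dcmd registered above simply walks its
table with mdb_readvar(), so it reports whatever the globals currently hold.
A hypothetical session against a live kernel (the values shown are the
defaults introduced by this change; real output varies by system and tuning):

	# mdb -k
	> ::zfs_params
	...
	zfs_vdev_cache_max = 0x4000
	zfs_vdev_cache_size = 0xa00000
	zfs_vdev_cache_bshift = 0x10
	zfs_vdev_max_pending = 0x23
	zfs_vdev_min_pending = 0x4
	zfs_vdev_time_shift = 0x6
	zfs_vdev_ramp_rate = 0x2
	...
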
--- a/usr/src/cmd/zdb/zdb.c	Fri Nov 03 10:20:05 2006 -0800
+++ b/usr/src/cmd/zdb/zdb.c	Fri Nov 03 11:39:28 2006 -0800
@@ -2056,6 +2056,8 @@
 int
 main(int argc, char **argv)
 {
+	extern int zfs_vdev_cache_size;
+
 	int i, c;
 	struct rlimit rl = { 1024, 1024 };
 	spa_t *spa;
@@ -2065,7 +2067,6 @@
 	int verbose = 0;
 	int error;
 	int flag, set;
-	vdev_knob_t *vk;
 
 	(void) setrlimit(RLIMIT_NOFILE, &rl);
 	(void) enable_extended_FILE_stdio(-1, -1);
@@ -2147,10 +2148,7 @@
 	 * Disable vdev caching.  If we don't do this, live pool traversal
 	 * won't make progress because it will never see disk updates.
 	 */
-	for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
-		if (strcmp(vk->vk_name, "cache_size") == 0)
-			vk->vk_default = 0;
-	}
+	zfs_vdev_cache_size = 0;
 
 	for (c = 0; c < 256; c++) {
 		if (dump_all && c != 'L' && c != 'l' && c != 'R')
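
With the knobs now plain global variables, they can be turned the standard
ways: patched on a live system with mdb -kw, or set at boot via /etc/system.
A hypothetical tuning session (not part of this change; the old value shown
is the default from this changeset):

	# echo 'zfs_vdev_max_pending/W 0t10' | mdb -kw
	zfs_vdev_max_pending:   0x23    =       0xa

or persistently, in /etc/system:

	set zfs:zfs_vdev_max_pending = 10
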
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h	Fri Nov 03 10:20:05 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h	Fri Nov 03 11:39:28 2006 -0800
@@ -41,18 +41,6 @@
 extern boolean_t zfs_nocacheflush;
 
 /*
- * Vdev knobs.
- */
-typedef struct vdev_knob {
-	char		*vk_name;		/* knob name		*/
-	char		*vk_desc;		/* knob description	*/
-	uint64_t	vk_min;			/* minimum legal value	*/
-	uint64_t	vk_max;			/* maximum legal value	*/
-	uint64_t	vk_default;		/* default value	*/
-	size_t		vk_offset;		/* offset into vdev_t	*/
-} vdev_knob_t;
-
-/*
  * Fault injection modes.
  */
 #define	VDEV_FAULT_NONE		0
@@ -113,8 +101,6 @@
 extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
 
-extern vdev_knob_t *vdev_knob_next(vdev_knob_t *vk);
-
 extern void vdev_config_dirty(vdev_t *vd);
 extern void vdev_config_clean(vdev_t *vd);
 extern int vdev_config_sync(vdev_t *vd, uint64_t txg);
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Fri Nov 03 10:20:05 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Fri Nov 03 11:39:28 2006 -0800
@@ -91,22 +91,12 @@
 };
 
 struct vdev_cache {
-	uint64_t	vc_size;
-	uint64_t	vc_bshift;
-	uint64_t	vc_blocksize;
-	uint64_t	vc_max;
 	avl_tree_t	vc_offset_tree;
 	avl_tree_t	vc_lastused_tree;
 	kmutex_t	vc_lock;
 };
 
 struct vdev_queue {
-	uint64_t	vq_min_pending;
-	uint64_t	vq_max_pending;
-	uint64_t	vq_scrub_limit;
-	uint64_t	vq_agg_limit;
-	uint64_t	vq_time_shift;
-	uint64_t	vq_ramp_rate;
 	uint64_t	vq_scrub_count;
 	avl_tree_t	vq_deadline_tree;
 	avl_tree_t	vq_read_tree;
--- a/usr/src/uts/common/fs/zfs/vdev.c	Fri Nov 03 10:20:05 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Fri Nov 03 11:39:28 2006 -0800
@@ -45,15 +45,6 @@
  * Virtual device management.
  */
 
-/*
- * These tunables are for performance analysis, and override the
- * (not-easily-turnable) vdev "knobs".
- */
-int zfs_vdev_cache_max;
-int zfs_vdev_max_pending;
-int zfs_vdev_min_pending;
-int zfs_vdev_time_shift;
-
 static vdev_ops_t *vdev_ops_table[] = {
 	&vdev_root_ops,
 	&vdev_raidz_ops,
@@ -774,7 +765,6 @@
 vdev_open(vdev_t *vd)
 {
 	int error;
-	vdev_knob_t *vk;
 	int c;
 	uint64_t osize = 0;
 	uint64_t asize, psize;
@@ -791,23 +781,6 @@
 
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 
-	for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
-		uint64_t *valp = (uint64_t *)((char *)vd + vk->vk_offset);
-
-		*valp = vk->vk_default;
-		*valp = MAX(*valp, vk->vk_min);
-		*valp = MIN(*valp, vk->vk_max);
-	}
-
-	if (zfs_vdev_cache_max)
-		vd->vdev_cache.vc_max = zfs_vdev_cache_max;
-	if (zfs_vdev_max_pending)
-		vd->vdev_queue.vq_max_pending = zfs_vdev_max_pending;
-	if (zfs_vdev_min_pending)
-		vd->vdev_queue.vq_min_pending = zfs_vdev_min_pending;
-	if (zfs_vdev_time_shift)
-		vd->vdev_queue.vq_time_shift = zfs_vdev_time_shift;
-
 	if (vd->vdev_ops->vdev_op_leaf) {
 		vdev_cache_init(vd);
 		vdev_queue_init(vd);
@@ -1748,96 +1721,6 @@
 }
 
 /*
- * Various knobs to tune a vdev.
- */
-static vdev_knob_t vdev_knob[] = {
-	{
-		"cache_size",
-		"size of the read-ahead cache",
-		0,
-		1ULL << 30,
-		10ULL << 20,
-		offsetof(struct vdev, vdev_cache.vc_size)
-	},
-	{
-		"cache_bshift",
-		"log2 of cache blocksize",
-		SPA_MINBLOCKSHIFT,
-		SPA_MAXBLOCKSHIFT,
-		16,
-		offsetof(struct vdev, vdev_cache.vc_bshift)
-	},
-	{
-		"cache_max",
-		"largest block size to cache",
-		0,
-		SPA_MAXBLOCKSIZE,
-		1ULL << 14,
-		offsetof(struct vdev, vdev_cache.vc_max)
-	},
-	{
-		"min_pending",
-		"minimum pending I/Os to the disk",
-		1,
-		10000,
-		4,
-		offsetof(struct vdev, vdev_queue.vq_min_pending)
-	},
-	{
-		"max_pending",
-		"maximum pending I/Os to the disk",
-		1,
-		10000,
-		35,
-		offsetof(struct vdev, vdev_queue.vq_max_pending)
-	},
-	{
-		"scrub_limit",
-		"maximum scrub/resilver I/O queue",
-		0,
-		10000,
-		70,
-		offsetof(struct vdev, vdev_queue.vq_scrub_limit)
-	},
-	{
-		"agg_limit",
-		"maximum size of aggregated I/Os",
-		0,
-		SPA_MAXBLOCKSIZE,
-		SPA_MAXBLOCKSIZE,
-		offsetof(struct vdev, vdev_queue.vq_agg_limit)
-	},
-	{
-		"time_shift",
-		"deadline = pri + (lbolt >> time_shift)",
-		0,
-		63,
-		6,
-		offsetof(struct vdev, vdev_queue.vq_time_shift)
-	},
-	{
-		"ramp_rate",
-		"exponential I/O issue ramp-up rate",
-		1,
-		10000,
-		2,
-		offsetof(struct vdev, vdev_queue.vq_ramp_rate)
-	},
-};
-
-vdev_knob_t *
-vdev_knob_next(vdev_knob_t *vk)
-{
-	if (vk == NULL)
-		return (vdev_knob);
-
-	if (++vk == vdev_knob + sizeof (vdev_knob) / sizeof (vdev_knob_t))
-		return (NULL);
-
-	return (vk);
-}
-
-/*
  * Mark a top-level vdev's config as dirty, placing it on the dirty list
  * so that it will be written out next time the vdev configuration is synced.
  * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
--- a/usr/src/uts/common/fs/zfs/vdev_cache.c	Fri Nov 03 10:20:05 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_cache.c	Fri Nov 03 11:39:28 2006 -0800
@@ -60,8 +60,23 @@
  * (4) Write.  Update cache contents after write completion.
  *
  * (5) Evict.  When allocating a new entry, we evict the oldest (LRU) entry
- *     if the total cache size exceeds vc_size.
+ *     if the total cache size exceeds zfs_vdev_cache_size.
+ */
+
+/*
+ * These tunables are for performance analysis.
  */
+/*
+ * All i/os smaller than zfs_vdev_cache_max will be turned into
+ * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
+ * track buffer).  At most zfs_vdev_cache_size bytes will be kept in each
+ * vdev's vdev_cache.
+ */
+int zfs_vdev_cache_max = 1<<14;
+int zfs_vdev_cache_size = 10ULL << 20;
+int zfs_vdev_cache_bshift = 16;
+
+#define	VCBS (1 << zfs_vdev_cache_bshift)
 
 static int
 vdev_cache_offset_compare(const void *a1, const void *a2)
@@ -109,7 +124,7 @@
 
 	avl_remove(&vc->vc_lastused_tree, ve);
 	avl_remove(&vc->vc_offset_tree, ve);
-	zio_buf_free(ve->ve_data, vc->vc_blocksize);
+	zio_buf_free(ve->ve_data, VCBS);
 	kmem_free(ve, sizeof (vdev_cache_entry_t));
 }
 
@@ -122,20 +137,20 @@
 vdev_cache_allocate(zio_t *zio)
 {
 	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
-	uint64_t offset = P2ALIGN(zio->io_offset, vc->vc_blocksize);
+	uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
 	vdev_cache_entry_t *ve;
 
 	ASSERT(MUTEX_HELD(&vc->vc_lock));
 
-	if (vc->vc_size == 0)
+	if (zfs_vdev_cache_size == 0)
 		return (NULL);
 
 	/*
 	 * If adding a new entry would exceed the cache size,
 	 * evict the oldest entry (LRU).
 	 */
-	if ((avl_numnodes(&vc->vc_lastused_tree) << vc->vc_bshift) >
-	    vc->vc_size) {
+	if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
+	    zfs_vdev_cache_size) {
 		ve = avl_first(&vc->vc_lastused_tree);
 		if (ve->ve_fill_io != NULL) {
 			dprintf("can't evict in %p, still filling\n", vc);
@@ -148,7 +163,7 @@
 	ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
 	ve->ve_offset = offset;
 	ve->ve_lastused = lbolt;
-	ve->ve_data = zio_buf_alloc(vc->vc_blocksize);
+	ve->ve_data = zio_buf_alloc(VCBS);
 
 	avl_add(&vc->vc_offset_tree, ve);
 	avl_add(&vc->vc_lastused_tree, ve);
@@ -159,7 +174,7 @@
 static void
 vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
 {
-	uint64_t cache_phase = P2PHASE(zio->io_offset, vc->vc_blocksize);
+	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
 
 	ASSERT(MUTEX_HELD(&vc->vc_lock));
 	ASSERT(ve->ve_fill_io == NULL);
@@ -185,7 +200,7 @@
 	vdev_cache_entry_t *ve = zio->io_private;
 	zio_t *dio;
 
-	ASSERT(zio->io_size == vc->vc_blocksize);
+	ASSERT(zio->io_size == VCBS);
 
 	/*
 	 * Add data to the cache.
@@ -227,8 +242,8 @@
 {
 	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
 	vdev_cache_entry_t *ve, ve_search;
-	uint64_t cache_offset = P2ALIGN(zio->io_offset, vc->vc_blocksize);
-	uint64_t cache_phase = P2PHASE(zio->io_offset, vc->vc_blocksize);
+	uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
+	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
 	zio_t *fio;
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -236,17 +251,16 @@
 	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
 		return (EINVAL);
 
-	if (zio->io_size > vc->vc_max)
+	if (zio->io_size > zfs_vdev_cache_max)
 		return (EOVERFLOW);
 
 	/*
 	 * If the I/O straddles two or more cache blocks, don't cache it.
 	 */
-	if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1,
-	    vc->vc_blocksize))
+	if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS))
 		return (EXDEV);
 
-	ASSERT(cache_phase + zio->io_size <= vc->vc_blocksize);
+	ASSERT(cache_phase + zio->io_size <= VCBS);
 
 	mutex_enter(&vc->vc_lock);
 
@@ -283,8 +297,7 @@
 	}
 
 	fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
-	    ve->ve_data, vc->vc_blocksize, ZIO_TYPE_READ,
-	    ZIO_PRIORITY_CACHE_FILL,
+	    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
 	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
 	    ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK,
 	    vdev_cache_fill, ve);
@@ -309,8 +322,8 @@
 	vdev_cache_entry_t *ve, ve_search;
 	uint64_t io_start = zio->io_offset;
 	uint64_t io_end = io_start + zio->io_size;
-	uint64_t min_offset = P2ALIGN(io_start, vc->vc_blocksize);
-	uint64_t max_offset = P2ROUNDUP(io_end, vc->vc_blocksize);
+	uint64_t min_offset = P2ALIGN(io_start, VCBS);
+	uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
 	avl_index_t where;
 
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
@@ -325,7 +338,7 @@
 
 	while (ve != NULL && ve->ve_offset < max_offset) {
 		uint64_t start = MAX(ve->ve_offset, io_start);
-		uint64_t end = MIN(ve->ve_offset + vc->vc_blocksize, io_end);
+		uint64_t end = MIN(ve->ve_offset + VCBS, io_end);
 
 		if (ve->ve_fill_io != NULL) {
 			ve->ve_missed_update = 1;
@@ -352,8 +365,6 @@
 	avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
 	    sizeof (vdev_cache_entry_t),
 	    offsetof(struct vdev_cache_entry, ve_lastused_node));
-
-	vc->vc_blocksize = 1ULL << vc->vc_bshift;
 }
 
 void
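
Since the per-vdev vc_bshift/vc_blocksize fields are gone, the cache block
size is now derived everywhere from the global via the VCBS macro.  A small
standalone sketch (hypothetical, for illustration only) of the geometry the
new defaults imply:

	/* cache geometry implied by the defaults in vdev_cache.c */
	#include <stdio.h>

	int zfs_vdev_cache_size = 10 << 20;	/* 10MB kept per vdev */
	int zfs_vdev_cache_bshift = 16;		/* 64KB cache blocks */
	#define	VCBS (1 << zfs_vdev_cache_bshift)

	int
	main(void)
	{
		/* eviction triggers once numnodes << bshift > cache_size */
		(void) printf("block size %d, at most %d blocks per vdev\n",
		    VCBS, zfs_vdev_cache_size / VCBS);
		return (0);
	}

This prints a 65536-byte block size and a cap of 160 blocks (10MB / 64KB)
per vdev.
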
--- a/usr/src/uts/common/fs/zfs/vdev_file.c	Fri Nov 03 10:20:05 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c	Fri Nov 03 11:39:28 2006 -0800
@@ -54,14 +54,6 @@
 
 	vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
 
-#ifdef _KERNEL
-	/*
-	 * When using a file vdev in kernel context, the underlying filesystem
-	 * will already be caching the data.  Don't cache it again here.
-	 */
-	vd->vdev_cache.vc_size = 0;
-#endif
-
 	/*
 	 * We always open the files from the root of the global zone, even if
 	 * we're in a local zone.  If the user has gotten to this point, the
@@ -156,8 +148,14 @@
 		return;
 	}
 
+	/*
+	 * In the kernel, don't bother double-caching, but in userland,
+	 * we want to test the vdev_cache code.
+	 */
+#ifndef _KERNEL
 	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
 		return;
+#endif
 
 	if ((zio = vdev_queue_io(zio)) == NULL)
 		return;
@@ -186,8 +184,10 @@
 {
 	vdev_queue_io_done(zio);
 
+#ifndef _KERNEL
 	if (zio->io_type == ZIO_TYPE_WRITE)
 		vdev_cache_write(zio);
+#endif
 
 	if (zio_injection_enabled && zio->io_error == 0)
 		zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c	Fri Nov 03 10:20:05 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c	Fri Nov 03 11:39:28 2006 -0800
@@ -32,6 +32,33 @@
 #include <sys/avl.h>
 
 /*
+ * These tunables are for performance analysis.
+ */
+/*
+ * zfs_vdev_max_pending is the maximum number of i/os concurrently
+ * pending to each device.  zfs_vdev_min_pending is the initial number
+ * of i/os pending to each device (before it starts ramping up to
+ * max_pending).
+ */
+int zfs_vdev_max_pending = 35;
+int zfs_vdev_min_pending = 4;
+
+/* maximum scrub/resilver I/O queue */
+int zfs_scrub_limit = 70;
+
+/* deadline = pri + (lbolt >> time_shift) */
+int zfs_vdev_time_shift = 6;
+
+/* exponential I/O issue ramp-up rate */
+int zfs_vdev_ramp_rate = 2;
+
+/*
+ * i/os will be aggregated into a single large i/o up to
+ * zfs_vdev_aggregation_limit bytes long.
+ */
+int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
+
+/*
  * Virtual device vector for disk I/O scheduling.
  */
 int
@@ -119,7 +146,7 @@
 	avl_add(zio->io_vdev_tree, zio);
 
 	if ((zio->io_flags & ZIO_FLAG_SCRUB_THREAD) &&
-	    ++vq->vq_scrub_count >= vq->vq_scrub_limit)
+	    ++vq->vq_scrub_count >= zfs_scrub_limit)
 		spa_scrub_throttle(zio->io_spa, 1);
 }
 
@@ -127,7 +154,7 @@
 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 {
 	if ((zio->io_flags & ZIO_FLAG_SCRUB_THREAD) &&
-	    vq->vq_scrub_count-- >= vq->vq_scrub_limit)
+	    vq->vq_scrub_count-- >= zfs_scrub_limit)
 		spa_scrub_throttle(zio->io_spa, -1);
 
 	avl_remove(&vq->vq_deadline_tree, zio);
@@ -182,14 +209,14 @@
 	size = fio->io_size;
 
 	while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
-	    size + dio->io_size <= vq->vq_agg_limit) {
+	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
 		dio->io_delegate_next = fio;
 		fio = dio;
 		size += dio->io_size;
 	}
 
 	while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
-	    size + dio->io_size <= vq->vq_agg_limit) {
+	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
 		lio->io_delegate_next = dio;
 		lio = dio;
 		size += dio->io_size;
@@ -200,7 +227,7 @@
 		uint64_t offset = 0;
 		int nagg = 0;
 
-		ASSERT(size <= vq->vq_agg_limit);
+		ASSERT(size <= zfs_vdev_aggregation_limit);
 
 		aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
 		    fio->io_offset, buf, size, fio->io_type,
@@ -266,12 +293,12 @@
 
 	mutex_enter(&vq->vq_lock);
 
-	zio->io_deadline = (zio->io_timestamp >> vq->vq_time_shift) +
+	zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
 	    zio->io_priority;
 
 	vdev_queue_io_add(vq, zio);
 
-	nio = vdev_queue_io_to_issue(vq, vq->vq_min_pending, &func);
+	nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func);
 
 	mutex_exit(&vq->vq_lock);
 
@@ -294,8 +321,8 @@
 
 	avl_remove(&vq->vq_pending_tree, zio);
 
-	for (i = 0; i < vq->vq_ramp_rate; i++) {
-		nio = vdev_queue_io_to_issue(vq, vq->vq_max_pending, &func);
+	for (i = 0; i < zfs_vdev_ramp_rate; i++) {
+		nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func);
 		if (nio == NULL)
 			break;
 		mutex_exit(&vq->vq_lock);
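
A note on the deadline arithmetic above: with the default zfs_vdev_time_shift
of 6 and lbolt advancing at the usual 100Hz, the time component of the
deadline (zio->io_timestamp >> 6) only increments every 64 ticks, roughly
every 640ms.  A hypothetical worked example, assuming hz = 100 and priorities
4 and 0:

	deadline1 = (1000 >> 6) + 4;	/* = 15 + 4 = 19 */
	deadline2 = (1030 >> 6) + 0;	/* = 16 + 0 = 16 */

The second i/o, issued 300ms later but at higher priority (lower value),
sorts first in vq_deadline_tree; within a ~640ms window, ordering is
dominated by priority, and only across windows does age take over.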