4894692 caching data in heap inflates crash dump
author johansen
Tue, 19 Dec 2006 23:13:06 -0800
changeset 3290 256464cbb73c
parent 3289 95e8ec05aa83
child 3291 a0d6d28506cf
4894692 caching data in heap inflates crash dump 6499454 time to increase size of kmem default allocation caches 6499459 vm should stop checking kvp directly
usr/src/cmd/mdb/common/modules/genunix/memory.c
usr/src/uts/common/cpr/cpr_dump.c
usr/src/uts/common/fs/fsflush.c
usr/src/uts/common/fs/lofs/lofs_vnops.c
usr/src/uts/common/fs/zfs/arc.c
usr/src/uts/common/fs/zfs/dbuf.c
usr/src/uts/common/fs/zfs/dmu_objset.c
usr/src/uts/common/fs/zfs/spa.c
usr/src/uts/common/fs/zfs/sys/arc.h
usr/src/uts/common/fs/zfs/sys/dbuf.h
usr/src/uts/common/fs/zfs/sys/spa.h
usr/src/uts/common/fs/zfs/sys/zio.h
usr/src/uts/common/fs/zfs/zio.c
usr/src/uts/common/os/kmem.c
usr/src/uts/common/os/mem_cage.c
usr/src/uts/common/os/mem_config.c
usr/src/uts/common/os/vm_pageout.c
usr/src/uts/common/sys/vnode.h
usr/src/uts/common/vm/page.h
usr/src/uts/common/vm/page_lock.c
usr/src/uts/common/vm/page_retire.c
usr/src/uts/common/vm/seg_kmem.c
usr/src/uts/common/vm/seg_kmem.h
usr/src/uts/common/vm/seg_vn.c
usr/src/uts/common/vm/vm_page.c
usr/src/uts/common/vm/vm_pagelist.c
usr/src/uts/i86pc/os/startup.c
usr/src/uts/i86pc/sys/machparam.h
usr/src/uts/i86pc/vm/vm_dep.h
usr/src/uts/i86pc/vm/vm_machdep.c
usr/src/uts/sfmmu/vm/hat_sfmmu.c
usr/src/uts/sparc/v9/vm/seg_nf.c
usr/src/uts/sun4/os/startup.c
usr/src/uts/sun4/sys/vm_machparam.h
--- a/usr/src/cmd/mdb/common/modules/genunix/memory.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/cmd/mdb/common/modules/genunix/memory.c	Tue Dec 19 23:13:06 2006 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -218,6 +217,7 @@
 /* Summary statistics of pages */
 typedef struct memstat {
 	struct vnode    *ms_kvp;	/* Cached address of kernel vnode */
+	struct vnode    *ms_zvp;	/* Cached address of zio vnode    */
 	uint64_t	ms_kmem;	/* Pages of kernel memory	  */
 	uint64_t	ms_anon;	/* Pages of anonymous memory	  */
 	uint64_t	ms_vnode;	/* Pages of named (vnode) memory  */
@@ -226,6 +226,10 @@
 	uint64_t	ms_total;	/* Pages on page hash		  */
 } memstat_t;
 
+#define	MS_PP_ISKAS(pp, stats)				\
+	(((pp)->p_vnode == (stats)->ms_kvp) ||		\
+	    (((stats)->ms_zvp != NULL) && ((pp)->p_vnode == (stats)->ms_zvp)))
+
 /*
  * Summarize pages by type; called from page walker.
  */
@@ -252,7 +256,7 @@
 		stats->ms_cachelist++;
 	else if (vp && IS_SWAPFSVP(vp))
 		stats->ms_anon++;
-	else if (pp->p_vnode == stats->ms_kvp)
+	else if (MS_PP_ISKAS(pp, stats))
 		stats->ms_kmem++;
 	else if (vp && (((vp)->v_flag & VVMEXEC)) != 0)
 		stats->ms_exec++;
@@ -308,6 +312,17 @@
 
 	stats.ms_kvp = (struct vnode *)(uintptr_t)sym.st_value;
 
+	/*
+	 * Read the zio vnode pointer.  It may not exist on all kernels, so if
+	 * it isn't found, it's not a fatal error.
+	 */
+	if (mdb_lookup_by_obj(MDB_OBJ_EXEC, "zvp",
+		(GElf_Sym *)&sym) == -1) {
+		stats.ms_zvp = NULL;
+	} else {
+		stats.ms_zvp = (struct vnode *)(uintptr_t)sym.st_value;
+	}
+
 	/* Walk page structures, summarizing usage */
 	if (mdb_walk("page", (mdb_walk_cb_t)memstat_callback,
 		&stats) == -1) {
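
The ::memstat change above classifies a page as kernel memory when it is
owned by either the kernel vnode or the new zio vnode, and it degrades
gracefully on older kernels where the zvp symbol is absent.  A minimal
userland sketch of the MS_PP_ISKAS predicate (the struct layouts are
illustrative stand-ins for the real vnode and page structures):

#include <stdio.h>

struct vnode { int v_dummy; };
struct page { struct vnode *p_vnode; };

/* Mirrors MS_PP_ISKAS: ms_zvp may be NULL on kernels without a zio vnode. */
struct memstat { struct vnode *ms_kvp; struct vnode *ms_zvp; };

static int
ms_pp_iskas(const struct page *pp, const struct memstat *stats)
{
	return (pp->p_vnode == stats->ms_kvp ||
	    (stats->ms_zvp != NULL && pp->p_vnode == stats->ms_zvp));
}

int
main(void)
{
	struct vnode kvp, zvp;
	struct memstat oldk = { &kvp, NULL };	/* kernel without zvp */
	struct memstat newk = { &kvp, &zvp };
	struct page zpage = { &zvp };

	/* prints "old kernel: 0, new kernel: 1" */
	printf("old kernel: %d, new kernel: %d\n",
	    ms_pp_iskas(&zpage, &oldk), ms_pp_iskas(&zpage, &newk));
	return (0);
}

With ms_zvp left NULL, the predicate reduces to the old kvp-only test, which
is why the failed symbol lookup is deliberately non-fatal.
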
--- a/usr/src/uts/common/cpr/cpr_dump.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/cpr/cpr_dump.c	Tue Dec 19 23:13:06 2006 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -790,11 +789,11 @@
 	do {
 #if defined(__sparc)
 		extern struct vnode prom_ppages;
-		if (pp->p_vnode == NULL || pp->p_vnode == &kvp ||
+		if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
 		    pp->p_vnode == &prom_ppages ||
 			PP_ISFREE(pp) && PP_ISAGED(pp))
 #else
-		if (pp->p_vnode == NULL || pp->p_vnode == &kvp ||
+		if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
 		    PP_ISFREE(pp) && PP_ISAGED(pp))
 #endif /* __sparc */
 			continue;
--- a/usr/src/uts/common/fs/fsflush.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/fsflush.c	Tue Dec 19 23:13:06 2006 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -24,7 +23,7 @@
 
 
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -232,7 +231,7 @@
 			coal_page = NULL;
 		}
 
-		if (pp->p_vnode == &kvp ||
+		if (PP_ISKAS(pp) ||
 		    PAGE_LOCKED(pp) ||
 		    pp->p_lckcnt != 0 ||
 		    pp->p_cowcnt != 0)
@@ -255,7 +254,7 @@
 		if (PP_ISSWAP(pp) ||
 		    PP_ISFREE(pp) ||
 		    vp == NULL ||
-		    vp == &kvp ||
+		    PP_ISKAS(pp) ||
 		    pp->p_lckcnt != 0 ||
 		    pp->p_cowcnt != 0 ||
 		    (vp->v_flag & VISSWAP) != 0) {
--- a/usr/src/uts/common/fs/lofs/lofs_vnops.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/lofs/lofs_vnops.c	Tue Dec 19 23:13:06 2006 -0800
@@ -1048,7 +1048,7 @@
 lo_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr)
 {
 	vp = realvp(vp);
-	if (vp != NULL && vp != &kvp)
+	if (vp != NULL && !VN_ISKAS(vp))
 		VOP_DISPOSE(vp, pp, fl, dn, cr);
 }
 
--- a/usr/src/uts/common/fs/zfs/arc.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/arc.c	Tue Dec 19 23:13:06 2006 -0800
@@ -230,10 +230,6 @@
 };
 
 struct arc_buf_hdr {
-	/* immutable */
-	uint64_t		b_size;
-	spa_t			*b_spa;
-
 	/* protected by hash lock */
 	dva_t			b_dva;
 	uint64_t		b_birth;
@@ -247,8 +243,13 @@
 	uint32_t		b_flags;
 	uint32_t		b_datacnt;
 
+	arc_callback_t		*b_acb;
 	kcondvar_t		b_cv;
-	arc_callback_t		*b_acb;
+
+	/* immutable */
+	arc_buf_contents_t	b_type;
+	uint64_t		b_size;
+	spa_t			*b_spa;
 
 	/* protected by arc state mutex */
 	arc_state_t		*b_state;
@@ -746,7 +747,7 @@
 }
 
 arc_buf_t *
-arc_buf_alloc(spa_t *spa, int size, void *tag)
+arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
 {
 	arc_buf_hdr_t *hdr;
 	arc_buf_t *buf;
@@ -755,6 +756,7 @@
 	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
 	ASSERT(BUF_EMPTY(hdr));
 	hdr->b_size = size;
+	hdr->b_type = type;
 	hdr->b_spa = spa;
 	hdr->b_state = arc.anon;
 	hdr->b_arc_access = 0;
@@ -839,10 +841,16 @@
 	if (buf->b_data) {
 		arc_state_t *state = buf->b_hdr->b_state;
 		uint64_t size = buf->b_hdr->b_size;
+		arc_buf_contents_t type = buf->b_hdr->b_type;
 
 		arc_cksum_verify(buf);
 		if (!recycle) {
-			zio_buf_free(buf->b_data, size);
+			if (type == ARC_BUFC_METADATA) {
+				zio_buf_free(buf->b_data, size);
+			} else {
+				ASSERT(type == ARC_BUFC_DATA);
+				zio_data_buf_free(buf->b_data, size);
+			}
 			atomic_add_64(&arc.size, -size);
 		}
 		if (list_link_active(&buf->b_hdr->b_arc_node)) {
@@ -1003,7 +1011,8 @@
  * new buffer in a full arc cache.
  */
 static void *
-arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle)
+arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
+    arc_buf_contents_t type)
 {
 	arc_state_t *evicted_state;
 	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
@@ -1041,7 +1050,8 @@
 				arc_buf_t *buf = ab->b_buf;
 				if (buf->b_data) {
 					bytes_evicted += ab->b_size;
-					if (recycle && ab->b_size == bytes) {
+					if (recycle && ab->b_type == type &&
+					    ab->b_size == bytes) {
 						stolen = buf->b_data;
 						recycle = FALSE;
 					}
@@ -1147,7 +1157,7 @@
 
 	if (top_sz > arc.p && arc.mru->lsize > 0) {
 		int64_t toevict = MIN(arc.mru->lsize, top_sz-arc.p);
-		(void) arc_evict(arc.mru, toevict, FALSE);
+		(void) arc_evict(arc.mru, toevict, FALSE, ARC_BUFC_UNDEF);
 		top_sz = arc.anon->size + arc.mru->size;
 	}
 
@@ -1165,7 +1175,8 @@
 
 		if (arc.mfu->lsize > 0) {
 			int64_t toevict = MIN(arc.mfu->lsize, arc_over);
-			(void) arc_evict(arc.mfu, toevict, FALSE);
+			(void) arc_evict(arc.mfu, toevict, FALSE,
+			    ARC_BUFC_UNDEF);
 		}
 
 		tbl_over = arc.size + arc.mru_ghost->lsize +
@@ -1207,9 +1218,9 @@
 arc_flush(void)
 {
 	while (list_head(&arc.mru->list))
-		(void) arc_evict(arc.mru, -1, FALSE);
+		(void) arc_evict(arc.mru, -1, FALSE, ARC_BUFC_UNDEF);
 	while (list_head(&arc.mfu->list))
-		(void) arc_evict(arc.mfu, -1, FALSE);
+		(void) arc_evict(arc.mfu, -1, FALSE, ARC_BUFC_UNDEF);
 
 	arc_evict_ghost(arc.mru_ghost, -1);
 	arc_evict_ghost(arc.mfu_ghost, -1);
@@ -1315,7 +1326,9 @@
 {
 	size_t			i;
 	kmem_cache_t		*prev_cache = NULL;
+	kmem_cache_t		*prev_data_cache = NULL;
 	extern kmem_cache_t	*zio_buf_cache[];
+	extern kmem_cache_t	*zio_data_buf_cache[];
 
 #ifdef _KERNEL
 	/*
@@ -1344,6 +1357,10 @@
 			prev_cache = zio_buf_cache[i];
 			kmem_cache_reap_now(zio_buf_cache[i]);
 		}
+		if (zio_data_buf_cache[i] != prev_data_cache) {
+			prev_data_cache = zio_data_buf_cache[i];
+			kmem_cache_reap_now(zio_data_buf_cache[i]);
+		}
 	}
 	kmem_cache_reap_now(buf_cache);
 	kmem_cache_reap_now(hdr_cache);
@@ -1498,8 +1515,9 @@
 static void
 arc_get_data_buf(arc_buf_t *buf)
 {
-	arc_state_t	*state = buf->b_hdr->b_state;
-	uint64_t	size = buf->b_hdr->b_size;
+	arc_state_t		*state = buf->b_hdr->b_state;
+	uint64_t		size = buf->b_hdr->b_size;
+	arc_buf_contents_t	type = buf->b_hdr->b_type;
 
 	arc_adapt(size, state);
 
@@ -1508,7 +1526,12 @@
 	 * just allocate a new buffer.
 	 */
 	if (!arc_evict_needed()) {
-		buf->b_data = zio_buf_alloc(size);
+		if (type == ARC_BUFC_METADATA) {
+			buf->b_data = zio_buf_alloc(size);
+		} else {
+			ASSERT(type == ARC_BUFC_DATA);
+			buf->b_data = zio_data_buf_alloc(size);
+		}
 		atomic_add_64(&arc.size, size);
 		goto out;
 	}
@@ -1530,8 +1553,13 @@
 		uint64_t mfu_space = arc.c - arc.p;
 		state =  (mfu_space > arc.mfu->size) ? arc.mru : arc.mfu;
 	}
-	if ((buf->b_data = arc_evict(state, size, TRUE)) == NULL) {
-		buf->b_data = zio_buf_alloc(size);
+	if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) {
+		if (type == ARC_BUFC_METADATA) {
+			buf->b_data = zio_buf_alloc(size);
+		} else {
+			ASSERT(type == ARC_BUFC_DATA);
+			buf->b_data = zio_data_buf_alloc(size);
+		}
 		atomic_add_64(&arc.size, size);
 		atomic_add_64(&arc.recycle_miss, 1);
 	}
@@ -1916,8 +1944,8 @@
 		if (hdr == NULL) {
 			/* this block is not in the cache */
 			arc_buf_hdr_t	*exists;
-
-			buf = arc_buf_alloc(spa, size, private);
+			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
+			buf = arc_buf_alloc(spa, size, private, type);
 			hdr = buf->b_hdr;
 			hdr->b_dva = *BP_IDENTITY(bp);
 			hdr->b_birth = bp->blk_birth;
@@ -2177,6 +2205,7 @@
 		arc_buf_t **bufp;
 		uint64_t blksz = hdr->b_size;
 		spa_t *spa = hdr->b_spa;
+		arc_buf_contents_t type = hdr->b_type;
 
 		ASSERT(hdr->b_datacnt > 1);
 		/*
@@ -2202,6 +2231,7 @@
 		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
 		nhdr->b_size = blksz;
 		nhdr->b_spa = spa;
+		nhdr->b_type = type;
 		nhdr->b_buf = buf;
 		nhdr->b_state = arc.anon;
 		nhdr->b_arc_access = 0;
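
The recycle path in arc_evict() is why the header grows a b_type field: an
evicted buffer's memory may be handed directly to a new allocation only when
both the size and the contents type match, so a metadata allocation never
reuses memory drawn from the zio data caches.  A small sketch of that
predicate, reusing the enum added to arc.h:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef enum {
	ARC_BUFC_UNDEF,		/* buffer contents undefined */
	ARC_BUFC_DATA,		/* buffer contains data */
	ARC_BUFC_METADATA	/* buffer contains metadata */
} arc_buf_contents_t;

/* Mirrors the test in arc_evict(): recycle only on an exact type+size match. */
static bool
can_recycle(bool recycle, arc_buf_contents_t buf_type, uint64_t buf_size,
    arc_buf_contents_t want_type, uint64_t want_size)
{
	return (recycle && buf_type == want_type && buf_size == want_size);
}

int
main(void)
{
	/* type mismatch: 0 */
	printf("%d\n", can_recycle(true, ARC_BUFC_DATA, 131072,
	    ARC_BUFC_METADATA, 131072));
	/* exact match: 1 */
	printf("%d\n", can_recycle(true, ARC_BUFC_DATA, 131072,
	    ARC_BUFC_DATA, 131072));
	return (0);
}

arc_adjust() and arc_flush() pass ARC_BUFC_UNDEF alongside recycle == FALSE,
since those callers only evict and never recycle.
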
--- a/usr/src/uts/common/fs/zfs/dbuf.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dbuf.c	Tue Dec 19 23:13:06 2006 -0800
@@ -504,9 +504,11 @@
 		dprintf_dbuf_bp(db, bp, "%s", "blkptr:");
 
 	if (bp == NULL || BP_IS_HOLE(bp)) {
+		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
 		ASSERT(bp == NULL || BP_IS_HOLE(bp));
 		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
-		    db->db.db_size, db));
+		    db->db.db_size, db, type));
 		bzero(db->db.db_data, db->db.db_size);
 		db->db_state = DB_CACHED;
 		*flags |= DB_RF_CACHED;
@@ -615,10 +617,12 @@
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);
 	if (db->db_state == DB_UNCACHED) {
+		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
 		ASSERT(db->db_buf == NULL);
 		ASSERT(db->db.db_data == NULL);
 		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
-		    db->db.db_size, db));
+		    db->db.db_size, db, type));
 		db->db_state = DB_FILL;
 	} else {
 		ASSERT3U(db->db_state, ==, DB_CACHED);
@@ -643,6 +647,7 @@
 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 {
 	arc_buf_t **quiescing, **syncing;
+	arc_buf_contents_t type;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db.db_data != NULL);
@@ -665,8 +670,9 @@
 		ASSERT(*syncing != db->db_buf);
 		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
 			int size = db->db.db_size;
+			type = DBUF_GET_BUFC_TYPE(db);
 			*quiescing = arc_buf_alloc(
-			    db->db_dnode->dn_objset->os_spa, size, db);
+			    db->db_dnode->dn_objset->os_spa, size, db, type);
 			bcopy(db->db.db_data, (*quiescing)->b_data, size);
 		} else {
 			dbuf_set_data(db, NULL);
@@ -685,10 +691,11 @@
 		ASSERT3U(db->db_dirtycnt, ==, 1);
 		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
 			int size = db->db.db_size;
+			type = DBUF_GET_BUFC_TYPE(db);
 			/* we can't copy if we have already started a write */
 			ASSERT(*syncing != db->db_data_pending);
 			*syncing = arc_buf_alloc(
-			    db->db_dnode->dn_objset->os_spa, size, db);
+			    db->db_dnode->dn_objset->os_spa, size, db, type);
 			bcopy(db->db.db_data, (*syncing)->b_data, size);
 		} else {
 			dbuf_set_data(db, NULL);
@@ -860,6 +867,7 @@
 {
 	arc_buf_t *buf, *obuf;
 	int osize = db->db.db_size;
+	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
 
@@ -879,7 +887,7 @@
 	dbuf_will_dirty(db, tx);
 
 	/* create the data buffer for the new block */
-	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db);
+	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
 
 	/* copy old block data to the new block */
 	obuf = db->db_buf;
@@ -1588,9 +1596,10 @@
 	    db->db_data_pending == db->db_buf) {
 		int size = (db->db_blkid == DB_BONUS_BLKID) ?
 		    DN_MAX_BONUSLEN : db->db.db_size;
+		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
 		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
-		    size, db));
+		    size, db, type));
 		bcopy(db->db_data_pending->b_data, db->db.db_data,
 		    db->db.db_size);
 	}
@@ -1766,6 +1775,7 @@
 	int checksum, compress;
 	zbookmark_t zb;
 	int blksz;
+	arc_buf_contents_t type;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
@@ -1823,6 +1833,7 @@
 	}
 
 	if (db->db_level == 0) {
+		type = DBUF_GET_BUFC_TYPE(db);
 		data = &db->db_d.db_data_old[txg&TXG_MASK];
 		blksz = arc_buf_size(*data);
 
@@ -1849,7 +1860,8 @@
 		    db->db_d.db_overridden_by[txg&TXG_MASK] == NULL) {
 			if (refcount_count(&db->db_holds) > 1 &&
 			    *data == db->db_buf) {
-				*data = arc_buf_alloc(os->os_spa, blksz, db);
+				*data = arc_buf_alloc(os->os_spa, blksz, db,
+				    type);
 				bcopy(db->db.db_data, (*data)->b_data, blksz);
 			}
 			db->db_data_pending = *data;
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Tue Dec 19 23:13:06 2006 -0800
@@ -737,7 +737,8 @@
 	int err;
 	zbookmark_t zb;
 	arc_buf_t *abuf =
-	    arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG);
+	    arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG,
+		ARC_BUFC_METADATA);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(os->os_synctx == NULL);
--- a/usr/src/uts/common/fs/zfs/spa.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/spa.c	Tue Dec 19 23:13:06 2006 -0800
@@ -2133,7 +2133,7 @@
 {
 	spa_t *spa = zio->io_spa;
 
-	zio_buf_free(zio->io_data, zio->io_size);
+	zio_data_buf_free(zio->io_data, zio->io_size);
 
 	mutex_enter(&spa->spa_scrub_lock);
 	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
@@ -2155,7 +2155,7 @@
     zbookmark_t *zb)
 {
 	size_t size = BP_GET_LSIZE(bp);
-	void *data = zio_buf_alloc(size);
+	void *data = zio_data_buf_alloc(size);
 
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_scrub_inflight++;
--- a/usr/src/uts/common/fs/zfs/sys/arc.h	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h	Tue Dec 19 23:13:06 2006 -0800
@@ -54,6 +54,11 @@
 	void			*b_private;
 };
 
+typedef enum arc_buf_contents {
+	ARC_BUFC_UNDEF,				/* buffer contents undefined */
+	ARC_BUFC_DATA,				/* buffer contains data */
+	ARC_BUFC_METADATA			/* buffer contains metadata */
+} arc_buf_contents_t;
 /*
  * These are the flags we pass into calls to the arc
  */
@@ -62,7 +67,8 @@
 #define	ARC_PREFETCH	(1 << 3)	/* I/O is a prefetch */
 #define	ARC_CACHED	(1 << 4)	/* I/O was already in cache */
 
-arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag);
+arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
+    arc_buf_contents_t type);
 void arc_buf_add_ref(arc_buf_t *buf, void *tag);
 int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
 int arc_buf_size(arc_buf_t *buf);
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h	Tue Dec 19 23:13:06 2006 -0800
@@ -254,6 +254,11 @@
 void dbuf_init(void);
 void dbuf_fini(void);
 
+#define	DBUF_GET_BUFC_TYPE(db)					\
+	((((db)->db_level > 0) ||				\
+	    (dmu_ot[(db)->db_dnode->dn_type].ot_metadata)) ?	\
+	    ARC_BUFC_METADATA : ARC_BUFC_DATA)
+
 #ifdef ZFS_DEBUG
 
 /*
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Tue Dec 19 23:13:06 2006 -0800
@@ -306,6 +306,9 @@
 
 #include <sys/dmu.h>
 
+#define	BP_GET_BUFC_TYPE(bp)						\
+	(((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \
+	ARC_BUFC_METADATA : ARC_BUFC_DATA)
 /*
  * Routines found in spa.c
  */
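
DBUF_GET_BUFC_TYPE and BP_GET_BUFC_TYPE apply the same classification rule:
indirect blocks (level greater than zero) and blocks whose object type is
flagged ot_metadata in the dmu_ot table are ARC_BUFC_METADATA; everything
else is ARC_BUFC_DATA.  A standalone sketch of the rule (the two-entry table
below is illustrative, not the real dmu_ot):

#include <stdio.h>

typedef enum {
	ARC_BUFC_UNDEF,
	ARC_BUFC_DATA,
	ARC_BUFC_METADATA
} arc_buf_contents_t;

/* Illustrative stand-in for dmu_ot[]: a metadata flag per object type. */
static const struct {
	const char *ot_name;
	int ot_metadata;
} dmu_ot[] = {
	{ "object directory", 1 },
	{ "plain file contents", 0 },
};

static arc_buf_contents_t
bufc_type(int level, int type)
{
	return ((level > 0 || dmu_ot[type].ot_metadata) ?
	    ARC_BUFC_METADATA : ARC_BUFC_DATA);
}

int
main(void)
{
	/* An indirect (level-1) block of a plain file is still metadata. */
	printf("%d %d\n", bufc_type(1, 1), bufc_type(0, 1));	/* 2 1 */
	return (0);
}

Only level-0 blocks of data-bearing object types land in the separate zio
data caches; everything needed to interpret the pool stays dump-visible.
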
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Tue Dec 19 23:13:06 2006 -0800
@@ -295,6 +295,8 @@
 
 extern void *zio_buf_alloc(size_t size);
 extern void zio_buf_free(void *buf, size_t size);
+extern void *zio_data_buf_alloc(size_t size);
+extern void zio_data_buf_free(void *buf, size_t size);
 
 /*
  * Move an I/O to the next stage of the pipeline and execute that stage.
--- a/usr/src/uts/common/fs/zfs/zio.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/zio.c	Tue Dec 19 23:13:06 2006 -0800
@@ -82,11 +82,21 @@
  * ==========================================================================
  */
 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+
+#ifdef _KERNEL
+extern vmem_t *zio_alloc_arena;
+#endif
 
 void
 zio_init(void)
 {
 	size_t c;
+	vmem_t *data_alloc_arena = NULL;
+
+#ifdef _KERNEL
+	data_alloc_arena = zio_alloc_arena;
+#endif
 
 	/*
 	 * For small buffers, we want a cache for each multiple of
@@ -111,10 +121,16 @@
 		}
 
 		if (align != 0) {
-			char name[30];
+			char name[36];
 			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
 			zio_buf_cache[c] = kmem_cache_create(name, size,
 			    align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
+
+			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
+			zio_data_buf_cache[c] = kmem_cache_create(name, size,
+			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
+			    KMC_NODEBUG);
+
 			dprintf("creating cache for size %5lx align %5lx\n",
 			    size, align);
 		}
@@ -124,6 +140,10 @@
 		ASSERT(zio_buf_cache[c] != NULL);
 		if (zio_buf_cache[c - 1] == NULL)
 			zio_buf_cache[c - 1] = zio_buf_cache[c];
+
+		ASSERT(zio_data_buf_cache[c] != NULL);
+		if (zio_data_buf_cache[c - 1] == NULL)
+			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 	}
 
 	zio_inject_init();
@@ -134,6 +154,7 @@
 {
 	size_t c;
 	kmem_cache_t *last_cache = NULL;
+	kmem_cache_t *last_data_cache = NULL;
 
 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 		if (zio_buf_cache[c] != last_cache) {
@@ -141,6 +162,12 @@
 			kmem_cache_destroy(zio_buf_cache[c]);
 		}
 		zio_buf_cache[c] = NULL;
+
+		if (zio_data_buf_cache[c] != last_data_cache) {
+			last_data_cache = zio_data_buf_cache[c];
+			kmem_cache_destroy(zio_data_buf_cache[c]);
+		}
+		zio_data_buf_cache[c] = NULL;
 	}
 
 	zio_inject_fini();
@@ -151,6 +178,13 @@
  * Allocate and free I/O buffers
  * ==========================================================================
  */
+
+/*
+ * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
+ * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
+ * useful to inspect ZFS metadata, but if possible, we should avoid keeping
+ * excess / transient data in-core during a crashdump.
+ */
 void *
 zio_buf_alloc(size_t size)
 {
@@ -161,6 +195,22 @@
 	return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP));
 }
 
+/*
+ * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
+ * crashdump if the kernel panics.  This exists to limit the amount of ZFS
+ * data that shows up in a kernel crashdump, thus reducing the amount of
+ * kernel heap dumped to disk when the kernel panics.
+ */
+void *
+zio_data_buf_alloc(size_t size)
+{
+	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP));
+}
+
 void
 zio_buf_free(void *buf, size_t size)
 {
@@ -171,6 +221,15 @@
 	kmem_cache_free(zio_buf_cache[c], buf);
 }
 
+void
+zio_data_buf_free(void *buf, size_t size)
+{
+	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+	kmem_cache_free(zio_data_buf_cache[c], buf);
+}
 /*
  * ==========================================================================
  * Push and pop I/O transform buffers
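
zio_data_buf_alloc() and zio_data_buf_free() reuse the size-to-cache mapping
of the metadata path: the cache index is (size - 1) >> SPA_MINBLOCKSHIFT, one
slot per 512-byte multiple, and zio_init() points each unpopulated slot at
the next larger cache.  A sketch of the index arithmetic, assuming
SPA_MINBLOCKSHIFT is 9 (a 512-byte SPA_MINBLOCKSIZE):

#include <stddef.h>
#include <stdio.h>

#define	SPA_MINBLOCKSHIFT	9	/* assumed: 512-byte minimum block */

static size_t
zio_cache_index(size_t size)
{
	return ((size - 1) >> SPA_MINBLOCKSHIFT);
}

int
main(void)
{
	size_t sizes[] = { 512, 513, 4096, 131072 };
	size_t i;

	/* 512 -> 0, 513 -> 1, 4096 -> 7, 131072 -> 255 */
	for (i = 0; i < sizeof (sizes) / sizeof (sizes[0]); i++)
		printf("size %6zu -> cache index %zu\n",
		    sizes[i], zio_cache_index(sizes[i]));
	return (0);
}

The data caches differ from the metadata caches only in their backing arena:
data_alloc_arena is zio_alloc_arena in the kernel, so data buffers come from
the dump-excluded segment, while the NULL arena leaves metadata on the heap.
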
--- a/usr/src/uts/common/os/kmem.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/os/kmem.c	Tue Dec 19 23:13:06 2006 -0800
@@ -154,9 +154,11 @@
 	P2ALIGN(8192 / 1, 64),
 	4096 * 3,
 	8192 * 2,
+	8192 * 3,
+	8192 * 4,
 };
 
-#define	KMEM_MAXBUF	16384
+#define	KMEM_MAXBUF	32768
 
 static kmem_cache_t *kmem_alloc_table[KMEM_MAXBUF >> KMEM_ALIGN_SHIFT];
 
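
This extends the default allocation-cache size table with 24K and 32K entries
and raises KMEM_MAXBUF to match (bug 6499454 in the description above), so
kmem_alloc() requests up to 32K are now served from caches instead of falling
through to the oversize path.  A sketch of the resulting table lookup,
assuming the conventional 8-byte KMEM_ALIGN (a KMEM_ALIGN_SHIFT of 3):

#include <stddef.h>
#include <stdio.h>

#define	KMEM_ALIGN_SHIFT	3	/* assumed: 8-byte alignment */
#define	KMEM_MAXBUF		32768	/* raised from 16384 */

/* Requests up to KMEM_MAXBUF index a cache; larger ones leave the table. */
static long
kmem_table_index(size_t size)
{
	if (size > KMEM_MAXBUF)
		return (-1);	/* oversize: not served from a cache */
	return ((long)((size - 1) >> KMEM_ALIGN_SHIFT));
}

int
main(void)
{
	/* prints "24K -> 3071, 32K -> 4095, 40K -> -1" */
	printf("24K -> %ld, 32K -> %ld, 40K -> %ld\n",
	    kmem_table_index(24576), kmem_table_index(32768),
	    kmem_table_index(40960));
	return (0);
}
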
--- a/usr/src/uts/common/os/mem_cage.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/os/mem_cage.c	Tue Dec 19 23:13:06 2006 -0800
@@ -1308,7 +1308,7 @@
 		 * non-swapfs (i.e. anonymous memory) file system pages.
 		 */
 		ASSERT(rootpp->p_vnode != NULL &&
-		    rootpp->p_vnode != &kvp &&
+		    !PP_ISKAS(rootpp) &&
 		    !IS_SWAPFSVP(rootpp->p_vnode));
 		PP_SETNORELOC(rootpp);
 		return (1);
@@ -1783,7 +1783,7 @@
 				continue;
 			}
 
-			if ((pp->p_vnode == &kvp && pp->p_lckcnt > 0) ||
+			if ((PP_ISKAS(pp) && pp->p_lckcnt > 0) ||
 			    !page_trylock(pp, SE_EXCL)) {
 				KCAGE_STAT_INCR_SCAN(kt_cantlock);
 				continue;
@@ -1791,7 +1791,7 @@
 
 			/* P_NORELOC bit should not have gone away. */
 			ASSERT(PP_ISNORELOC(pp));
-			if (PP_ISFREE(pp) || (pp->p_vnode == &kvp &&
+			if (PP_ISFREE(pp) || (PP_ISKAS(pp) &&
 			    pp->p_lckcnt > 0)) {
 				page_unlock(pp);
 				continue;
--- a/usr/src/uts/common/os/mem_config.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/os/mem_config.c	Tue Dec 19 23:13:06 2006 -0800
@@ -1923,7 +1923,7 @@
 				 * Unload the mappings and check if mod bit
 				 * is set.
 				 */
-				ASSERT(pp->p_vnode != &kvp);
+				ASSERT(!PP_ISKAS(pp));
 				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
 				mod = hat_ismod(pp);
 
--- a/usr/src/uts/common/os/vm_pageout.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/os/vm_pageout.c	Tue Dec 19 23:13:06 2006 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -950,8 +949,7 @@
 	 * NOTE:  These optimizations assume that reads are atomic.
 	 */
 top:
-	if ((pp->p_vnode == &kvp) ||
-	    (PP_ISFREE(pp)) ||
+	if ((PP_ISKAS(pp)) || (PP_ISFREE(pp)) ||
 	    (hat_page_getshare(pp) > po_share) || PAGE_LOCKED(pp)) {
 		return (-1);
 	}
--- a/usr/src/uts/common/sys/vnode.h	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/sys/vnode.h	Tue Dec 19 23:13:06 2006 -0800
@@ -960,6 +960,11 @@
 	((VP1) && (VP2) && (vn_getops(VP1) == vn_getops(VP2)) ? \
 	VOP_CMP(VP1, VP2) : 0))
 
+extern struct vnode kvp;
+extern struct vnode zvp;
+
+#define	VN_ISKAS(vp)		((vp) == &kvp || (vp) == &zvp)
+
 #endif	/* _KERNEL */
 
 /*
@@ -1001,7 +1006,7 @@
  */
 #define	VN_DISPOSE(pp, flag, dn, cr)	{ \
 	extern struct vnode kvp; \
-	if ((pp)->p_vnode != NULL && (pp)->p_vnode != &kvp) \
+	if ((pp)->p_vnode != NULL && !VN_ISKAS((pp)->p_vnode)) \
 		VOP_DISPOSE((pp)->p_vnode, (pp), (flag), (dn), (cr)); \
 	else if ((flag) == B_FREE) \
 		page_free((pp), (dn)); \
--- a/usr/src/uts/common/vm/page.h	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/vm/page.h	Tue Dec 19 23:13:06 2006 -0800
@@ -877,8 +877,9 @@
 #define	PP_ISAGED(pp)		(((pp)->p_state & P_FREE) && \
 					((pp)->p_vnode == NULL))
 #define	PP_ISNORELOC(pp)	((pp)->p_state & P_NORELOC)
-#define	PP_ISKVP(pp)		((pp)->p_vnode == &kvp)
-#define	PP_ISNORELOCKERNEL(pp)	(PP_ISNORELOC(pp) && PP_ISKVP(pp))
+#define	PP_ISKAS(pp)		(((pp)->p_vnode == &kvp) || \
+					    ((pp)->p_vnode == &zvp))
+#define	PP_ISNORELOCKERNEL(pp)	(PP_ISNORELOC(pp) && PP_ISKAS(pp))
 #define	PP_ISMIGRATE(pp)	((pp)->p_state & P_MIGRATE)
 #define	PP_ISSWAP(pp)		((pp)->p_state & P_SWAP)
 
@@ -956,7 +957,7 @@
 #define	PP_PR_REQ(pp)	(((pp)->p_toxic & PR_REASONS) && !PP_RETIRED(pp))
 #define	PP_PR_NOSHARE(pp)						\
 	((((pp)->p_toxic & (PR_RETIRED | PR_FMA | PR_UE)) == PR_FMA) &&	\
-	!PP_ISKVP(pp))
+	!PP_ISKAS(pp))
 
 /*
  * Flags for page_unretire_pp
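
PP_ISKAS and VN_ISKAS replace the open-coded == &kvp tests scattered through
the VM code, which is why so many files appear in this changeset: the
question asked everywhere is "does this page or vnode belong to the kernel's
address space," and with zvp there are now two such vnodes.  A compact
sketch of the pair (PP_ISKAS is folded onto VN_ISKAS here for brevity; the
kernel's page.h defines it directly against kvp and zvp):

#include <stdio.h>

struct vnode { int v_dummy; };
struct page { struct vnode *p_vnode; };

static struct vnode kvp;	/* vnode for all segkmem pages */
static struct vnode zvp;	/* vnode for zfs pages */

#define	VN_ISKAS(vp)	((vp) == &kvp || (vp) == &zvp)
#define	PP_ISKAS(pp)	(VN_ISKAS((pp)->p_vnode))

int
main(void)
{
	struct page zpage = { &zvp };
	struct page upage = { NULL };

	/* prints "zio page: 1, user page: 0" */
	printf("zio page: %d, user page: %d\n",
	    PP_ISKAS(&zpage), PP_ISKAS(&upage));
	return (0);
}
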
--- a/usr/src/uts/common/vm/page_lock.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/vm/page_lock.c	Tue Dec 19 23:13:06 2006 -0800
@@ -142,6 +142,12 @@
 
 extern	struct vnode	kvp;
 
+/*
+ * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
+ * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
+ * VPH_TABLE_SIZE + 1.
+ */
+
 kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];
 
 /*
@@ -861,6 +867,9 @@
 {
 	if (vp == &kvp)
 		return (&vph_mutex[VPH_TABLE_SIZE + 0]);
+
+	if (vp == &zvp)
+		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
 #ifdef DEBUG
 	if (page_vnode_mutex_stress != 0)
 		return (&vph_mutex[0]);
@@ -913,7 +922,7 @@
 	ASSERT(!PP_ISFREE(pp));
 	ASSERT(pp->p_vnode != NULL);
 	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
-	ASSERT(pp->p_vnode != &kvp);
+	ASSERT(!PP_ISKAS(pp));
 
 again:
 	if (pszc == 0) {
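
page_vnode_mutex() reserves the two slots past the end of the hash table for
the kernel vnodes, VPH_TABLE_SIZE + 0 for kvp and VPH_TABLE_SIZE + 1 for
zvp, while ordinary vnodes hash into the table proper.  A sketch with an
illustrative table size and hash (the kernel uses its own hash function and
a much larger table):

#include <stdint.h>
#include <stdio.h>

#define	VPH_TABLE_SIZE	8	/* illustrative only */

struct vnode { int v_dummy; };
static struct vnode kvp, zvp;
static int vph_mutex[VPH_TABLE_SIZE + 2];	/* stand-in for kmutex_t */

static int *
page_vnode_mutex(struct vnode *vp)
{
	if (vp == &kvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 0]);
	if (vp == &zvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
	return (&vph_mutex[((uintptr_t)vp >> 3) % VPH_TABLE_SIZE]);
}

int
main(void)
{
	/* prints "kvp slot 8, zvp slot 9" */
	printf("kvp slot %td, zvp slot %td\n",
	    page_vnode_mutex(&kvp) - vph_mutex,
	    page_vnode_mutex(&zvp) - vph_mutex);
	return (0);
}
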
--- a/usr/src/uts/common/vm/page_retire.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/vm/page_retire.c	Tue Dec 19 23:13:06 2006 -0800
@@ -355,7 +355,7 @@
 	int whichtype = 0;			\
 	if (pp->p_vnode)			\
 		whichtype |= PRT_NAMED;		\
-	if (PP_ISKVP(pp))			\
+	if (PP_ISKAS(pp))			\
 		whichtype |= PRT_KERNEL;	\
 	if (PP_ISFREE(pp))			\
 		whichtype |= PRT_FREE;		\
@@ -882,7 +882,7 @@
 page_retire_thread_cb(page_t *pp)
 {
 	PR_DEBUG(prd_tctop);
-	if (!PP_ISKVP(pp) && page_trylock(pp, SE_EXCL)) {
+	if (!PP_ISKAS(pp) && page_trylock(pp, SE_EXCL)) {
 		PR_DEBUG(prd_tclocked);
 		page_unlock(pp);
 	}
@@ -901,7 +901,7 @@
 	 * Don't scrub the kernel, since we might still need it, unless
 	 * we have UEs on the page, in which case we have nothing to lose.
 	 */
-	if (!PP_ISKVP(pp) || PP_TOXIC(pp)) {
+	if (!PP_ISKAS(pp) || PP_TOXIC(pp)) {
 		pp->p_selock = -1;	/* pacify ASSERTs */
 		PP_CLRFREE(pp);
 		pagescrub(pp, 0, PAGESIZE);
--- a/usr/src/uts/common/vm/seg_kmem.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/vm/seg_kmem.c	Tue Dec 19 23:13:06 2006 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -103,6 +102,7 @@
 char *ekernelheap;		/* end of primary kernel heap */
 struct seg kvseg;		/* primary kernel heap segment */
 struct seg kvseg_core;		/* "core" kernel heap segment */
+struct seg kzioseg;		/* Segment for zio mappings */
 vmem_t *heap_arena;		/* primary kernel heap arena */
 vmem_t *heap_core_arena;	/* core kernel heap arena */
 char *heap_core_base;		/* start of core kernel heap arena */
@@ -114,9 +114,12 @@
 vmem_t *heaptext_arena;		/* heaptext arena */
 struct as kas;			/* kernel address space */
 struct vnode kvp;		/* vnode for all segkmem pages */
+struct vnode zvp;		/* vnode for zfs pages */
 int segkmem_reloc;		/* enable/disable relocatable segkmem pages */
 vmem_t *static_arena;		/* arena for caches to import static memory */
 vmem_t *static_alloc_arena;	/* arena for allocating static memory */
+vmem_t *zio_arena = NULL;	/* arena for zio segment VA */
+vmem_t *zio_alloc_arena = NULL;	/* arena for allocating zio memory */
 
 /*
  * seg_kmem driver can map part of the kernel heap with large pages.
@@ -427,6 +430,7 @@
 	pgcnt_t npages;
 	spgcnt_t pg;
 	page_t *pp;
+	struct vnode *vp = seg->s_data;
 
 	ASSERT(RW_READ_HELD(&seg->s_as->a_lock));
 
@@ -451,7 +455,7 @@
 	switch (type) {
 	case F_SOFTLOCK:	/* lock down already-loaded translations */
 		for (pg = 0; pg < npages; pg++) {
-			pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr,
+			pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr,
 			    SE_SHARED);
 			if (pp == NULL) {
 				/*
@@ -461,7 +465,7 @@
 				if (!hat_probe(kas.a_hat, addr)) {
 					addr -= PAGESIZE;
 					while (--pg >= 0) {
-						pp = page_find(&kvp,
+						pp = page_find(vp,
 						(u_offset_t)(uintptr_t)addr);
 						if (pp)
 							page_unlock(pp);
@@ -477,7 +481,7 @@
 		return (0);
 	case F_SOFTUNLOCK:
 		while (npages--) {
-			pp = page_find(&kvp, (u_offset_t)(uintptr_t)addr);
+			pp = page_find(vp, (u_offset_t)(uintptr_t)addr);
 			if (pp)
 				page_unlock(pp);
 			addr += PAGESIZE;
@@ -645,6 +649,13 @@
 		    segkmem_dump_range, seg->s_as);
 		vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT,
 		    segkmem_dump_range, seg->s_as);
+	} else if (seg == &kzioseg) {
+		/*
+		 * We don't want to dump pages attached to kzioseg since they
+		 * contain file data from ZFS.  If this page's segment is
+		 * kzioseg, return instead of writing it to the dump device.
+		 */
+		return;
 	} else {
 		segkmem_dump_range(seg->s_as, seg->s_base, seg->s_size);
 	}
@@ -666,6 +677,7 @@
 	pgcnt_t npages;
 	spgcnt_t pg;
 	size_t nb;
+	struct vnode *vp = seg->s_data;
 
 	ASSERT(ppp != NULL);
 
@@ -706,7 +718,7 @@
 	}
 
 	for (pg = 0; pg < npages; pg++) {
-		pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_SHARED);
+		pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr, SE_SHARED);
 		if (pp == NULL) {
 			while (--pg >= 0)
 				page_unlock(pplist[pg]);
@@ -791,11 +803,21 @@
 };
 
 int
+segkmem_zio_create(struct seg *seg)
+{
+	ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
+	seg->s_ops = &segkmem_ops;
+	seg->s_data = &zvp;
+	kas.a_size += seg->s_size;
+	return (0);
+}
+
+int
 segkmem_create(struct seg *seg)
 {
 	ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
 	seg->s_ops = &segkmem_ops;
-	seg->s_data = NULL;
+	seg->s_data = &kvp;
 	kas.a_size += seg->s_size;
 	return (0);
 }
@@ -806,6 +828,10 @@
 {
 	struct seg kseg;
 	int pgflags;
+	struct vnode *vp = arg;
+
+	if (vp == NULL)
+		vp = &kvp;
 
 	kseg.s_as = &kas;
 	pgflags = PG_EXCL;
@@ -819,7 +845,7 @@
 	if (vmflag & VM_PUSHPAGE)
 		pgflags |= PG_PUSHPAGE;
 
-	return (page_create_va(&kvp, (u_offset_t)(uintptr_t)addr, size,
+	return (page_create_va(vp, (u_offset_t)(uintptr_t)addr, size,
 	    pgflags, &kseg, addr));
 }
 
@@ -897,12 +923,14 @@
 	return (addr);
 }
 
-void *
-segkmem_alloc(vmem_t *vmp, size_t size, int vmflag)
+static void *
+segkmem_alloc_vn(vmem_t *vmp, size_t size, int vmflag, struct vnode *vp)
 {
 	void *addr;
 	segkmem_gc_list_t *gcp, **prev_gcpp;
 
+	ASSERT(vp != NULL);
+
 	if (kvseg.s_base == NULL) {
 #ifndef __sparc
 		if (bootops->bsys_alloc == NULL)
@@ -928,7 +956,19 @@
 		return (addr);
 	}
 	return (segkmem_xalloc(vmp, NULL, size, vmflag, 0,
-	    segkmem_page_create, NULL));
+	    segkmem_page_create, vp));
+}
+
+void *
+segkmem_alloc(vmem_t *vmp, size_t size, int vmflag)
+{
+	return (segkmem_alloc_vn(vmp, size, vmflag, &kvp));
+}
+
+void *
+segkmem_zio_alloc(vmem_t *vmp, size_t size, int vmflag)
+{
+	return (segkmem_alloc_vn(vmp, size, vmflag, &zvp));
 }
 
 /*
@@ -937,8 +977,8 @@
  * we currently don't have a special kernel segment for non-paged
  * kernel memory that is exported by drivers to user space.
  */
-void
-segkmem_free(vmem_t *vmp, void *inaddr, size_t size)
+static void
+segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp)
 {
 	page_t *pp;
 	caddr_t addr = inaddr;
@@ -946,6 +986,7 @@
 	pgcnt_t npages = btopr(size);
 
 	ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0);
+	ASSERT(vp != NULL);
 
 	if (kvseg.s_base == NULL) {
 		segkmem_gc_list_t *gc = inaddr;
@@ -960,7 +1001,7 @@
 
 	for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
 #if defined(__x86)
-		pp = page_find(&kvp, (u_offset_t)(uintptr_t)addr);
+		pp = page_find(vp, (u_offset_t)(uintptr_t)addr);
 		if (pp == NULL)
 			panic("segkmem_free: page not found");
 		if (!page_tryupgrade(pp)) {
@@ -969,11 +1010,11 @@
 			 * it to drop the lock so we can free this page.
 			 */
 			page_unlock(pp);
-			pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr,
+			pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr,
 			    SE_EXCL);
 		}
 #else
-		pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL);
+		pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr, SE_EXCL);
 #endif
 		if (pp == NULL)
 			panic("segkmem_free: page not found");
@@ -985,6 +1026,19 @@
 
 	if (vmp != NULL)
 		vmem_free(vmp, inaddr, size);
+
+}
+
+void
+segkmem_free(vmem_t *vmp, void *inaddr, size_t size)
+{
+	segkmem_free_vn(vmp, inaddr, size, &kvp);
+}
+
+void
+segkmem_zio_free(vmem_t *vmp, void *inaddr, size_t size)
+{
+	segkmem_free_vn(vmp, inaddr, size, &zvp);
 }
 
 void
@@ -1441,6 +1495,22 @@
 	return (use_large_pages);
 }
 
+void
+segkmem_zio_init(void *zio_mem_base, size_t zio_mem_size)
+{
+	ASSERT(zio_mem_base != NULL);
+	ASSERT(zio_mem_size != 0);
+
+	zio_arena = vmem_create("zio", zio_mem_base, zio_mem_size, PAGESIZE,
+	    NULL, NULL, NULL, 0, VM_SLEEP);
+
+	zio_alloc_arena = vmem_create("zio_buf", NULL, 0, PAGESIZE,
+	    segkmem_zio_alloc, segkmem_zio_free, zio_arena, 0, VM_SLEEP);
+
+	ASSERT(zio_arena != NULL);
+	ASSERT(zio_alloc_arena != NULL);
+}
+
 #ifdef __sparc
 
 
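
The seg_kmem refactoring threads the backing vnode through one parameterized
core (segkmem_alloc_vn/segkmem_free_vn, plus a vnode hung off seg->s_data),
so segkmem_zio_alloc and segkmem_zio_free become one-line wrappers and
segkmem_dump can exclude ZFS data just by checking which segment it was
handed.  A userland sketch of the wrapper pattern (names and bodies are
illustrative):

#include <stddef.h>
#include <stdio.h>

struct vnode { const char *v_name; };
static struct vnode kvp = { "kvp" };	/* vnode for all segkmem pages */
static struct vnode zvp = { "zvp" };	/* vnode for zfs pages */

/* The core takes the vnode explicitly, as segkmem_free_vn() does. */
static void
segkmem_free_vn(void *inaddr, size_t size, struct vnode *vp)
{
	printf("free %zu bytes of %s-backed pages at %p\n",
	    size, vp->v_name, inaddr);
}

static void
segkmem_free(void *inaddr, size_t size)
{
	segkmem_free_vn(inaddr, size, &kvp);
}

static void
segkmem_zio_free(void *inaddr, size_t size)
{
	segkmem_free_vn(inaddr, size, &zvp);
}

int
main(void)
{
	char buf[8192];

	segkmem_free(buf, sizeof (buf));
	segkmem_zio_free(buf, sizeof (buf));
	return (0);
}
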
--- a/usr/src/uts/common/vm/seg_kmem.h	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/vm/seg_kmem.h	Tue Dec 19 23:13:06 2006 -0800
@@ -51,6 +51,7 @@
 extern char *heap_lp_end;	/* end of kernel large page heap arena */
 extern struct seg kvseg;	/* primary kernel heap segment */
 extern struct seg kvseg_core;	/* "core" kernel heap segment */
+extern struct seg kzioseg;	/* Segment for zio mappings */
 extern vmem_t *heap_lp_arena;	/* kernel large page heap arena */
 extern vmem_t *heap_arena;	/* primary kernel heap arena */
 extern vmem_t *hat_memload_arena; /* HAT translation arena */
@@ -59,9 +60,12 @@
 extern vmem_t *heaptext_arena;	/* kernel text arena, from heap */
 extern struct as kas;		/* kernel address space */
 extern struct vnode kvp;	/* vnode for all segkmem pages */
+extern struct vnode zvp;	/* vnode for zfs pages */
 extern int segkmem_reloc;	/* enable/disable segkmem relocatable pages */
 extern vmem_t *static_arena;	/* arena for caches to import static memory */
 extern vmem_t *static_alloc_arena;	/* arena for allocating static memory */
+extern vmem_t *zio_arena;	/* arena for zio segment VA */
+extern vmem_t *zio_alloc_arena;	/* arena for allocating zio memory */
 
 extern int segkmem_create(struct seg *);
 extern page_t *segkmem_page_create(void *, size_t, int, void *);
@@ -77,6 +81,11 @@
 extern void kernelheap_extend(void *, void *);
 extern void segkmem_gc(void);
 
+extern void *segkmem_zio_alloc(vmem_t *, size_t, int);
+extern int segkmem_zio_create(struct seg *);
+extern void segkmem_zio_free(vmem_t *, void *, size_t);
+extern void segkmem_zio_init(void *, size_t);
+
 /*
  * Flags for segkmem_xalloc().
  *
--- a/usr/src/uts/common/vm/seg_vn.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/vm/seg_vn.c	Tue Dec 19 23:13:06 2006 -0800
@@ -408,7 +408,7 @@
 				a->szc = 0;
 			} else if (a->vp != NULL) {
 				extern struct vnode kvp;
-				if (IS_SWAPFSVP(a->vp) || a->vp == &kvp) {
+				if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) {
 					/*
 					 * paranoid check.
 					 * hat_page_demote() is not supported
@@ -5537,7 +5537,7 @@
 
 	/* paranoid check */
 	if (svd->vp != NULL &&
-	    (IS_SWAPFSVP(svd->vp) || svd->vp == &kvp)) {
+	    (IS_SWAPFSVP(svd->vp) || VN_ISKAS(svd->vp))) {
 		    return (EINVAL);
 	}
 
--- a/usr/src/uts/common/vm/vm_page.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/vm/vm_page.c	Tue Dec 19 23:13:06 2006 -0800
@@ -1035,7 +1035,7 @@
 	ASSERT(szc != 0);
 	ASSERT(vp != NULL);
 	ASSERT(!IS_SWAPFSVP(vp));
-	ASSERT(vp != &kvp);
+	ASSERT(!VN_ISKAS(vp));
 
 again:
 	if (++loopcnt > 3) {
@@ -2704,7 +2704,7 @@
 
 	if (pp->p_szc != 0) {
 		if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
-		    pp->p_vnode == &kvp) {
+		    PP_ISKAS(pp)) {
 			panic("page_free: anon or kernel "
 			    "or no vnode large page %p", (void *)pp);
 		}
@@ -3153,7 +3153,7 @@
 
 	if (pp->p_szc != 0) {
 		if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
-		    pp->p_vnode == &kvp) {
+		    PP_ISKAS(pp)) {
 			panic("page_destroy: anon or kernel or no vnode "
 			    "large page %p", (void *)pp);
 		}
@@ -3332,7 +3332,7 @@
 		vnode_t *ovp = opp->p_vnode;
 		ASSERT(ovp != NULL);
 		ASSERT(!IS_SWAPFSVP(ovp));
-		ASSERT(ovp != &kvp);
+		ASSERT(!VN_ISKAS(ovp));
 		page_demote_vp_pages(opp);
 		ASSERT(opp->p_szc == 0);
 	}
@@ -3399,14 +3399,14 @@
 			(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
 			if (pp->p_szc != 0) {
 				ASSERT(!IS_SWAPFSVP(vp));
-				ASSERT(vp != &kvp);
+				ASSERT(!VN_ISKAS(vp));
 				page_demote_vp_pages(pp);
 				ASSERT(pp->p_szc == 0);
 			}
 			mutex_enter(phm);
 		} else if (pp->p_szc != 0) {
 			ASSERT(!IS_SWAPFSVP(vp));
-			ASSERT(vp != &kvp);
+			ASSERT(!VN_ISKAS(vp));
 			mutex_exit(phm);
 			page_demote_vp_pages(pp);
 			ASSERT(pp->p_szc == 0);
@@ -4378,7 +4378,7 @@
 		 * (g)	Backed by a filesystem which doesn't have a
 		 *	stubbed-out sync operation
 		 */
-		if (!PP_ISFREE(pp) && vp != NULL && vp != &kvp &&
+		if (!PP_ISFREE(pp) && vp != NULL && !VN_ISKAS(vp) &&
 		    hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL &&
 		    vfs_can_sync(vp->v_vfsp)) {
 			nppbusy++;
@@ -4457,10 +4457,10 @@
 		 * with the kernel vnode or prom allocated kernel mem.
 		 */
 #if defined(__sparc)
-		if ((vp = pp->p_vnode) == NULL || vp == &kvp ||
+		if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp) ||
 		    vp == &prom_ppages)
 #else /* x86 doesn't have prom or prom_ppage */
-		if ((vp = pp->p_vnode) == NULL || vp == &kvp)
+		if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp))
 #endif /* __sparc */
 			continue;
 
@@ -4747,7 +4747,7 @@
 	}
 	if (pp->p_szc != pszc) {
 		ASSERT(pp->p_szc < pszc);
-		ASSERT(pp->p_vnode != NULL && pp->p_vnode != &kvp &&
+		ASSERT(pp->p_vnode != NULL && !PP_ISKAS(pp) &&
 		    !IS_SWAPFSVP(pp->p_vnode));
 		tpp = pp + 1;
 		for (i = 1; i < npgs; i++, tpp++) {
@@ -4879,7 +4879,7 @@
 		 * seg kmem pages require that the target and replacement
 		 * page be the same pagesize.
 		 */
-		flags = (targ->p_vnode == &kvp) ? PGR_SAMESZC : 0;
+		flags = (VN_ISKAS(targ->p_vnode)) ? PGR_SAMESZC : 0;
 		repl = page_get_replacement_page(targ, lgrp, flags);
 		if (repl == NULL) {
 			if (grouplock != 0) {
@@ -4900,7 +4900,7 @@
 	/*
 	 * Let hat_page_relocate() complete the relocation if it's kernel page
 	 */
-	if (targ->p_vnode == &kvp) {
+	if (VN_ISKAS(targ->p_vnode)) {
 		*replacement = repl;
 		if (hat_page_relocate(target, replacement, nrelocp) != 0) {
 			if (grouplock != 0) {
@@ -5244,7 +5244,7 @@
 		return (1);
 	}
 
-	if (vp != NULL && !IS_SWAPFSVP(vp) && vp != &kvp) {
+	if (vp != NULL && !IS_SWAPFSVP(vp) && !VN_ISKAS(vp)) {
 		VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
 		page_demote_vp_pages(pp);
 		ASSERT(pp->p_szc == 0);
@@ -5269,7 +5269,7 @@
 	 * We can't demote kernel pages since we can't hat_unload()
 	 * the mappings.
 	 */
-	if (rootpp->p_vnode == &kvp)
+	if (VN_ISKAS(rootpp->p_vnode))
 		return (0);
 
 	/*
@@ -5393,7 +5393,7 @@
 	ASSERT(!PP_ISFREE(pp));
 	ASSERT(pp->p_vnode != NULL);
 	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
-	ASSERT(pp->p_vnode != &kvp);
+	ASSERT(!PP_ISKAS(pp));
 
 	VM_STAT_ADD(pagecnt.pc_demote_pages[0]);
 
@@ -6850,7 +6850,7 @@
 		ret = EAGAIN;
 		goto cleanup;
 	}
-	if (PP_ISKVP(pp)) {
+	if (PP_ISKAS(pp)) {
 		ret = EAGAIN;
 		goto cleanup;
 	}
@@ -6932,7 +6932,7 @@
 		return (EPERM);
 	}
 #else
-	if (PP_ISKVP(pp)) {
+	if (PP_ISKAS(pp)) {
 		return (EPERM);
 	}
 #endif /* __sparc */
@@ -7344,7 +7344,7 @@
 			bp = page_capture_hash[i].lists[j].next;
 			while (bp != &page_capture_hash[i].lists[j]) {
 				pp = bp->pp;
-				if (!PP_ISKVP(pp) && PP_TOXIC(pp)) {
+				if (!PP_ISKAS(pp) && PP_TOXIC(pp)) {
 					pp->p_selock = -1;  /* pacify ASSERTs */
 					PP_CLRFREE(pp);
 					pagescrub(pp, 0, PAGESIZE);
--- a/usr/src/uts/common/vm/vm_pagelist.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/vm/vm_pagelist.c	Tue Dec 19 23:13:06 2006 -0800
@@ -3909,7 +3909,7 @@
 	 * pages, since we cannot properly handle demotion of kernel
 	 * pages.
 	 */
-	if (like_pp->p_vnode == &kvp)
+	if (PP_ISKAS(like_pp))
 		pgrflags |= PGR_SAMESZC;
 
 	/* LINTED */
--- a/usr/src/uts/i86pc/os/startup.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/i86pc/os/startup.c	Tue Dec 19 23:13:06 2006 -0800
@@ -209,6 +209,19 @@
 char kern_bootargs[OBP_MAXPATHLEN];
 
 /*
+ * ZFS zio segment.  This allows us to exclude from the crash dump large
+ * portions of ZFS data that get cached in kmem caches on the heap.  If this
+ * is set to zero, we allocate zio buffers from their own segment; otherwise
+ * they are allocated from the heap.  The optimization of allocating zio
+ * buffers from their own segment is only valid on 64-bit kernels.
+ */
+#if defined(__amd64)
+int segzio_fromheap = 0;
+#else
+int segzio_fromheap = 1;
+#endif
+
+/*
  * new memory fragmentations are possible in startup() due to BOP_ALLOCs. this
  * depends on number of BOP_ALLOC calls made and requested size, memory size
  * combination and whether boot.bin memory needs to be freed.
@@ -239,11 +252,13 @@
 #endif
 
 caddr_t segkp_base;		/* Base address of segkp */
+caddr_t segzio_base;		/* Base address of segzio */
 #if defined(__amd64)
 pgcnt_t segkpsize = btop(SEGKPDEFSIZE);	/* size of segkp segment in pages */
 #else
 pgcnt_t segkpsize = 0;
 #endif
+pgcnt_t segziosize = 0;		/* size of zio segment in pages */
 
 struct memseg *memseg_base;
 struct vnode unused_pages_vp;
@@ -362,6 +377,8 @@
  * 0xFFFFFXXX.XXX00000  |-----------------------|- segkmap_start (floating)
  *			|    device mappings	|
  * 0xFFFFFXXX.XXX00000  |-----------------------|- toxic_addr (floating)
+ *			|	  segzio	|
+ * 0xFFFFFXXX.XXX00000  |-----------------------|- segzio_base (floating)
  *			|	  segkp		|
  * ---                  |-----------------------|- segkp_base
  *			|	 segkpm		|
@@ -1566,6 +1583,29 @@
 		PRM_DEBUG(final_kernelheap);
 	}
 
+	if (!segzio_fromheap) {
+		size_t size;
+
+		/* size is in bytes, segziosize is in pages */
+		if (segziosize == 0) {
+			size = mmu_ptob(physmem * 2);
+		} else {
+			size = mmu_ptob(segziosize);
+		}
+
+		if (size < SEGZIOMINSIZE) {
+			size = SEGZIOMINSIZE;
+		} else if (size > mmu_ptob(physmem * 4)) {
+			size = mmu_ptob(physmem * 4);
+		}
+		segziosize = mmu_btop(ROUND_UP_LPAGE(size));
+		segzio_base = final_kernelheap;
+		PRM_DEBUG(segziosize);
+		PRM_DEBUG(segzio_base);
+		final_kernelheap = segzio_base + mmu_ptob(segziosize);
+		PRM_DEBUG(final_kernelheap);
+	}
+
 	/*
 	 * put the range of VA for device mappings next
 	 */
@@ -2377,6 +2417,16 @@
 #if defined(__amd64)
 	(void) seg_attach(&kas, (caddr_t)core_base, core_size, &kvseg_core);
 	(void) segkmem_create(&kvseg_core);
+
+	/* segzio optimization is only valid for 64-bit kernels */
+	if (!segzio_fromheap) {
+		(void) seg_attach(&kas, segzio_base, mmu_ptob(segziosize),
+		    &kzioseg);
+		(void) segkmem_zio_create(&kzioseg);
+
+		/* create zio area covering new segment */
+		segkmem_zio_init(segzio_base, mmu_ptob(segziosize));
+	}
 #endif
 
 	(void) seg_attach(&kas, (caddr_t)SEGDEBUGBASE, (size_t)SEGDEBUGSIZE,
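
The x86 and sun4 startup paths size the zio segment the same way: default to
twice physical memory when segziosize is unset, then clamp to no less than
SEGZIOMINSIZE and no more than four times physical memory; note that the
minimum wins even on machines where it exceeds the 4x cap.  A sketch of the
clamp (large-page rounding omitted), assuming 4K pages and the 400M x86
SEGZIOMINSIZE; sun4 uses 512M:

#include <stdint.h>
#include <stdio.h>

#define	PAGESIZE	4096ULL
#define	SEGZIOMINSIZE	(400ULL * 1024 * 1024)	/* x86 minimum, 400M */

/* physmem and segziosize are in pages; the result is in bytes. */
static uint64_t
segzio_bytes(uint64_t physmem, uint64_t segziosize)
{
	uint64_t size;

	size = (segziosize == 0 ? physmem * 2 : segziosize) * PAGESIZE;
	if (size < SEGZIOMINSIZE)
		size = SEGZIOMINSIZE;
	else if (size > physmem * 4 * PAGESIZE)
		size = physmem * 4 * PAGESIZE;
	return (size);
}

int
main(void)
{
	/* 512M of RAM: 2x physmem is 1G, within bounds -> 1024 MB */
	printf("%llu MB\n", (unsigned long long)
	    (segzio_bytes(131072, 0) / (1024 * 1024)));
	/* 32M of RAM: 2x physmem is 64M, raised to the 400M floor */
	printf("%llu MB\n", (unsigned long long)
	    (segzio_bytes(8192, 0) / (1024 * 1024)));
	return (0);
}
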
--- a/usr/src/uts/i86pc/sys/machparam.h	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/i86pc/sys/machparam.h	Tue Dec 19 23:13:06 2006 -0800
@@ -167,6 +167,11 @@
 #define	SEGKPMINSIZE	(200L * 1024 * 1024L)			/* 200M */
 
 /*
+ * minimum size for segzio
+ */
+#define	SEGZIOMINSIZE	(400L * 1024 * 1024L)			/* 400M */
+
+/*
  * Boot (or, more precisely, vmx) maps most pages twice - once in the
  * bottom 2GB of memory and once in the bottom 2GB of the topmost 4GB.
  * When boot is unmapped this range is available to the kernel, but until
--- a/usr/src/uts/i86pc/vm/vm_dep.h	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/i86pc/vm/vm_dep.h	Tue Dec 19 23:13:06 2006 -0800
@@ -406,7 +406,7 @@
  */
 
 #define	MTYPE_INIT(mtype, vp, vaddr, flags, pgsz) {			\
-	if (restricted_kmemalloc && (vp) == &kvp &&			\
+	if (restricted_kmemalloc && VN_ISKAS(vp) &&			\
 	    (caddr_t)(vaddr) >= kernelheap &&				\
 	    (caddr_t)(vaddr) < ekernelheap) {				\
 		ASSERT(physmax4g);					\
--- a/usr/src/uts/i86pc/vm/vm_machdep.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/i86pc/vm/vm_machdep.c	Tue Dec 19 23:13:06 2006 -0800
@@ -1920,8 +1920,8 @@
 			 *	 with kernel vnode 'kvp'.
 			 */
 			/* XX64 - to debug why this happens! */
-			ASSERT(vp != &kvp);
-			if (vp == &kvp)
+			ASSERT(!VN_ISKAS(vp));
+			if (VN_ISKAS(vp))
 				cmn_err(CE_NOTE,
 				    "page_create: page not expected "
 				    "in hash list for kernel vnode - pp 0x%p",
--- a/usr/src/uts/sfmmu/vm/hat_sfmmu.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.c	Tue Dec 19 23:13:06 2006 -0800
@@ -3672,14 +3672,21 @@
 			 * Somebody is holding SE_EXCL lock. Might
 			 * even be hat_page_relocate(). Drop all
 			 * our locks, lookup the page in &kvp, and
-			 * retry. If it doesn't exist in &kvp, then
-			 * we must be dealing with a kernel mapped
+			 * retry. If it doesn't exist in &kvp or &zvp,
+			 * then we must be dealing with a kernel mapped
 			 * page which doesn't actually belong to
 			 * segkmem so we punt.
 			 */
 			sfmmu_mlist_exit(pml);
 			SFMMU_HASH_UNLOCK(hmebp);
 			pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);
+
+			/* check zvp before giving up */
+			if (pp == NULL)
+				pp = page_lookup(&zvp, (u_offset_t)saddr,
+				    SE_SHARED);
+
+			/* Okay, we didn't find it, give up */
 			if (pp == NULL) {
 				kmem_cache_free(pa_hment_cache, pahmep);
 				*rpfn = pfn;
@@ -3710,7 +3717,7 @@
 		goto rehash;
 	}
 
-	if (vp != &kvp) {
+	if (!VN_ISKAS(vp)) {
 		/*
 		 * This is not a segkmem page but another page which
 		 * has been kernel mapped. It had better have at least
@@ -3841,14 +3848,19 @@
 			 * Somebody is holding SE_EXCL lock. Might
 			 * even be hat_page_relocate(). Drop all
 			 * our locks, lookup the page in &kvp, and
-			 * retry. If it doesn't exist in &kvp, then
-			 * we must be dealing with a kernel mapped
+			 * retry. If it doesn't exist in &kvp or &zvp,
+			 * then we must be dealing with a kernel mapped
 			 * page which doesn't actually belong to
 			 * segkmem so we punt.
 			 */
 			sfmmu_mlist_exit(pml);
 			SFMMU_HASH_UNLOCK(hmebp);
 			pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);
+			/* check zvp before giving up */
+			if (pp == NULL)
+				pp = page_lookup(&zvp, (u_offset_t)saddr,
+				    SE_SHARED);
+
 			if (pp == NULL) {
 				ASSERT(cookie == NULL);
 				return;
@@ -3875,7 +3887,7 @@
 		goto rehash;
 	}
 
-	if (vp != &kvp) {
+	if (!VN_ISKAS(vp)) {
 		/*
 		 * This is not a segkmem page but another page which
 		 * has been kernel mapped.
@@ -6522,7 +6534,7 @@
 
 	ASSERT(pp != NULL);
 	ASSERT(sfmmu_mlist_held(pp));
-	ASSERT(pp->p_vnode != &kvp);
+	ASSERT(!PP_ISKAS(pp));
 
 	CPUSET_ZERO(cpuset);
 
--- a/usr/src/uts/sparc/v9/vm/seg_nf.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/sparc/v9/vm/seg_nf.c	Tue Dec 19 23:13:06 2006 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -122,8 +121,8 @@
  * vnode and page for the page of zeros we use for the nf mappings.
  */
 static kmutex_t segnf_lock;
-static struct vnode zvp;
-static struct page **zpp;
+static struct vnode nfvp;
+static struct page **nfpp;
 
 #define	addr_to_vcolor(addr)                                            \
 	(shm_alignment) ?						\
@@ -195,7 +194,7 @@
 	 * Need a page per virtual color or just 1 if no vac.
 	 */
 	mutex_enter(&segnf_lock);
-	if (zpp == NULL) {
+	if (nfpp == NULL) {
 		struct seg kseg;
 
 		vacpgs = 1;
@@ -203,16 +202,16 @@
 			vacpgs = shm_alignment >> PAGESHIFT;
 		}
 
-		zpp = kmem_alloc(sizeof (*zpp) * vacpgs, KM_SLEEP);
+		nfpp = kmem_alloc(sizeof (*nfpp) * vacpgs, KM_SLEEP);
 
 		kseg.s_as = &kas;
 		for (i = 0; i < vacpgs; i++, off += PAGESIZE,
 		    vaddr += PAGESIZE) {
-			zpp[i] = page_create_va(&zvp, off, PAGESIZE,
+			nfpp[i] = page_create_va(&nfvp, off, PAGESIZE,
 			    PG_WAIT | PG_NORELOC, &kseg, vaddr);
-			page_io_unlock(zpp[i]);
-			page_downgrade(zpp[i]);
-			pagezero(zpp[i], 0, PAGESIZE);
+			page_io_unlock(nfpp[i]);
+			page_downgrade(nfpp[i]);
+			pagezero(nfpp[i], 0, PAGESIZE);
 		}
 	}
 	mutex_exit(&segnf_lock);
@@ -234,7 +233,7 @@
 	color = addr_to_vcolor(seg->s_base);
 	if (as != &kas)
 		prot |= PROT_USER;
-	hat_memload(as->a_hat, seg->s_base, zpp[color],
+	hat_memload(as->a_hat, seg->s_base, nfpp[color],
 	    prot | HAT_NOFAULT, HAT_LOAD);
 
 	/*
@@ -456,7 +455,7 @@
 {
 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
 
-	*vpp = &zvp;
+	*vpp = &nfvp;
 	return (0);
 }
 
--- a/usr/src/uts/sun4/os/startup.c	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/sun4/os/startup.c	Tue Dec 19 23:13:06 2006 -0800
@@ -192,6 +192,10 @@
 struct seg *segkmap = &kmapseg;	/* Kernel generic mapping segment */
 struct seg *segkpm = &kpmseg;	/* 64bit kernel physical mapping segment */
 
+int segzio_fromheap = 0;	/* if set, zio allocations occur from heap */
+caddr_t segzio_base;		/* Base address of segzio */
+pgcnt_t segziosize = 0;		/* size of zio segment in pages */
+
 /*
  * debugger pages (if allocated)
  */
@@ -373,6 +377,8 @@
  * 0xFFFFFFFC.00000000  -|-----------------------|-
  *                       :                       :
  *                       :                       :
+ *                      -|-----------------------|-
+ *                       |       segzio          | (base and size vary)
  * 0xFFFFFE00.00000000  -|-----------------------|-
  *                       |                       |  Ultrasparc I/II support
  *                       |    segkpm segment     |  up to 2TB of physical
@@ -2058,6 +2064,47 @@
 		mach_kpm_init();
 	}
 
+	if (!segzio_fromheap) {
+		size_t size;
+
+		/* size is in bytes, segziosize is in pages */
+		if (segziosize == 0) {
+			size = mmu_ptob(physmem * 2);
+		} else {
+			size = mmu_ptob(segziosize);
+		}
+
+		if (size < SEGZIOMINSIZE) {
+			size = SEGZIOMINSIZE;
+		} else if (size > mmu_ptob(physmem * 4)) {
+			size = mmu_ptob(physmem * 4);
+		}
+		segziosize = mmu_btop(roundup(size, MMU_PAGESIZE));
+		/* put the base of the ZIO segment after the kpm segment */
+		segzio_base = kpm_vbase + (kpm_size * vac_colors);
+		PRM_DEBUG(segziosize);
+		PRM_DEBUG(segzio_base);
+
+		/*
+		 * On some platforms, kvm_init is called after the kpm
+		 * sizes have been determined.  On SPARC, kvm_init is called
+		 * before, so we have to attach the kzioseg after kvm is
+		 * initialized, otherwise we'll try to allocate from the boot
+		 * area since the kernel heap hasn't yet been configured.
+		 */
+		rw_enter(&kas.a_lock, RW_WRITER);
+
+		(void) seg_attach(&kas, segzio_base, mmu_ptob(segziosize),
+		    &kzioseg);
+		(void) segkmem_zio_create(&kzioseg);
+
+		/* create zio area covering new segment */
+		segkmem_zio_init(segzio_base, mmu_ptob(segziosize));
+
+		rw_exit(&kas.a_lock);
+	}
+
+
 	/*
 	 * Now create generic mapping segment.  This mapping
 	 * goes SEGMAPSIZE beyond SEGMAPBASE.  But if the total
--- a/usr/src/uts/sun4/sys/vm_machparam.h	Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/sun4/sys/vm_machparam.h	Tue Dec 19 23:13:06 2006 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -22,10 +21,9 @@
 /*	Copyright (c) 1988 AT&T	*/
 /*	  All Rights Reserved  	*/
 
-
 /*
- * Copyright (c) 1989,1999 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #ifndef _SYS_VM_MACHPARAM_H
@@ -96,6 +94,11 @@
 #endif	/* _LP64 */
 
 /*
+ * Define minimum size for zio segment
+ */
+#define	SEGZIOMINSIZE	(512L * 1024 * 1024L)			/* 512M */
+
+/*
  * The time for a process to be blocked before being very swappable.
  * This is a number of seconds which the system takes as being a non-trivial
  * amount of real time. You probably shouldn't change this;