4894692 caching data in heap inflates crash dump
6499454 time to increase size of kmem default allocation caches
6499459 vm should stop checking kvp directly
--- a/usr/src/cmd/mdb/common/modules/genunix/memory.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/cmd/mdb/common/modules/genunix/memory.c Tue Dec 19 23:13:06 2006 -0800
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -218,6 +217,7 @@
/* Summary statistics of pages */
typedef struct memstat {
struct vnode *ms_kvp; /* Cached address of kernel vnode */
+ struct vnode *ms_zvp; /* Cached address of zio vnode */
uint64_t ms_kmem; /* Pages of kernel memory */
uint64_t ms_anon; /* Pages of anonymous memory */
uint64_t ms_vnode; /* Pages of named (vnode) memory */
@@ -226,6 +226,10 @@
uint64_t ms_total; /* Pages on page hash */
} memstat_t;
+#define MS_PP_ISKAS(pp, stats) \
+ (((pp)->p_vnode == (stats)->ms_kvp) || \
+ (((stats)->ms_zvp != NULL) && ((pp)->p_vnode == (stats)->ms_zvp)))
+
/*
* Summarize pages by type; called from page walker.
*/
@@ -252,7 +256,7 @@
stats->ms_cachelist++;
else if (vp && IS_SWAPFSVP(vp))
stats->ms_anon++;
- else if (pp->p_vnode == stats->ms_kvp)
+ else if (MS_PP_ISKAS(pp, stats))
stats->ms_kmem++;
else if (vp && (((vp)->v_flag & VVMEXEC)) != 0)
stats->ms_exec++;
@@ -308,6 +312,17 @@
stats.ms_kvp = (struct vnode *)(uintptr_t)sym.st_value;
+ /*
+	 * Read the zio vnode pointer. It may not exist on all kernels, so if
+	 * it isn't found, it's not a fatal error.
+ */
+ if (mdb_lookup_by_obj(MDB_OBJ_EXEC, "zvp",
+ (GElf_Sym *)&sym) == -1) {
+ stats.ms_zvp = NULL;
+ } else {
+ stats.ms_zvp = (struct vnode *)(uintptr_t)sym.st_value;
+ }
+
/* Walk page structures, summarizing usage */
if (mdb_walk("page", (mdb_walk_cb_t)memstat_callback,
&stats) == -1) {
--- a/usr/src/uts/common/cpr/cpr_dump.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/cpr/cpr_dump.c Tue Dec 19 23:13:06 2006 -0800
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -790,11 +789,11 @@
do {
#if defined(__sparc)
extern struct vnode prom_ppages;
- if (pp->p_vnode == NULL || pp->p_vnode == &kvp ||
+ if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
pp->p_vnode == &prom_ppages ||
PP_ISFREE(pp) && PP_ISAGED(pp))
#else
- if (pp->p_vnode == NULL || pp->p_vnode == &kvp ||
+ if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
PP_ISFREE(pp) && PP_ISAGED(pp))
#endif /* __sparc */
continue;
--- a/usr/src/uts/common/fs/fsflush.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/fsflush.c Tue Dec 19 23:13:06 2006 -0800
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -24,7 +23,7 @@
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -232,7 +231,7 @@
coal_page = NULL;
}
- if (pp->p_vnode == &kvp ||
+ if (PP_ISKAS(pp) ||
PAGE_LOCKED(pp) ||
pp->p_lckcnt != 0 ||
pp->p_cowcnt != 0)
@@ -255,7 +254,7 @@
if (PP_ISSWAP(pp) ||
PP_ISFREE(pp) ||
vp == NULL ||
- vp == &kvp ||
+ PP_ISKAS(pp) ||
pp->p_lckcnt != 0 ||
pp->p_cowcnt != 0 ||
(vp->v_flag & VISSWAP) != 0) {
--- a/usr/src/uts/common/fs/lofs/lofs_vnops.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/lofs/lofs_vnops.c Tue Dec 19 23:13:06 2006 -0800
@@ -1048,7 +1048,7 @@
lo_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr)
{
vp = realvp(vp);
- if (vp != NULL && vp != &kvp)
+ if (vp != NULL && !VN_ISKAS(vp))
VOP_DISPOSE(vp, pp, fl, dn, cr);
}
--- a/usr/src/uts/common/fs/zfs/arc.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/arc.c Tue Dec 19 23:13:06 2006 -0800
@@ -230,10 +230,6 @@
};
struct arc_buf_hdr {
- /* immutable */
- uint64_t b_size;
- spa_t *b_spa;
-
/* protected by hash lock */
dva_t b_dva;
uint64_t b_birth;
@@ -247,8 +243,13 @@
uint32_t b_flags;
uint32_t b_datacnt;
+ arc_callback_t *b_acb;
kcondvar_t b_cv;
- arc_callback_t *b_acb;
+
+ /* immutable */
+ arc_buf_contents_t b_type;
+ uint64_t b_size;
+ spa_t *b_spa;
/* protected by arc state mutex */
arc_state_t *b_state;
@@ -746,7 +747,7 @@
}
arc_buf_t *
-arc_buf_alloc(spa_t *spa, int size, void *tag)
+arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
{
arc_buf_hdr_t *hdr;
arc_buf_t *buf;
@@ -755,6 +756,7 @@
hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
ASSERT(BUF_EMPTY(hdr));
hdr->b_size = size;
+ hdr->b_type = type;
hdr->b_spa = spa;
hdr->b_state = arc.anon;
hdr->b_arc_access = 0;
@@ -839,10 +841,16 @@
if (buf->b_data) {
arc_state_t *state = buf->b_hdr->b_state;
uint64_t size = buf->b_hdr->b_size;
+ arc_buf_contents_t type = buf->b_hdr->b_type;
arc_cksum_verify(buf);
if (!recycle) {
- zio_buf_free(buf->b_data, size);
+ if (type == ARC_BUFC_METADATA) {
+ zio_buf_free(buf->b_data, size);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ zio_data_buf_free(buf->b_data, size);
+ }
atomic_add_64(&arc.size, -size);
}
if (list_link_active(&buf->b_hdr->b_arc_node)) {
@@ -1003,7 +1011,8 @@
* new buffer in a full arc cache.
*/
static void *
-arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle)
+arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
+ arc_buf_contents_t type)
{
arc_state_t *evicted_state;
uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
@@ -1041,7 +1050,8 @@
arc_buf_t *buf = ab->b_buf;
if (buf->b_data) {
bytes_evicted += ab->b_size;
- if (recycle && ab->b_size == bytes) {
+ if (recycle && ab->b_type == type &&
+ ab->b_size == bytes) {
stolen = buf->b_data;
recycle = FALSE;
}
@@ -1147,7 +1157,7 @@
if (top_sz > arc.p && arc.mru->lsize > 0) {
int64_t toevict = MIN(arc.mru->lsize, top_sz-arc.p);
- (void) arc_evict(arc.mru, toevict, FALSE);
+ (void) arc_evict(arc.mru, toevict, FALSE, ARC_BUFC_UNDEF);
top_sz = arc.anon->size + arc.mru->size;
}
@@ -1165,7 +1175,8 @@
if (arc.mfu->lsize > 0) {
int64_t toevict = MIN(arc.mfu->lsize, arc_over);
- (void) arc_evict(arc.mfu, toevict, FALSE);
+ (void) arc_evict(arc.mfu, toevict, FALSE,
+ ARC_BUFC_UNDEF);
}
tbl_over = arc.size + arc.mru_ghost->lsize +
@@ -1207,9 +1218,9 @@
arc_flush(void)
{
while (list_head(&arc.mru->list))
- (void) arc_evict(arc.mru, -1, FALSE);
+ (void) arc_evict(arc.mru, -1, FALSE, ARC_BUFC_UNDEF);
while (list_head(&arc.mfu->list))
- (void) arc_evict(arc.mfu, -1, FALSE);
+ (void) arc_evict(arc.mfu, -1, FALSE, ARC_BUFC_UNDEF);
arc_evict_ghost(arc.mru_ghost, -1);
arc_evict_ghost(arc.mfu_ghost, -1);
@@ -1315,7 +1326,9 @@
{
size_t i;
kmem_cache_t *prev_cache = NULL;
+ kmem_cache_t *prev_data_cache = NULL;
extern kmem_cache_t *zio_buf_cache[];
+ extern kmem_cache_t *zio_data_buf_cache[];
#ifdef _KERNEL
/*
@@ -1344,6 +1357,10 @@
prev_cache = zio_buf_cache[i];
kmem_cache_reap_now(zio_buf_cache[i]);
}
+ if (zio_data_buf_cache[i] != prev_data_cache) {
+ prev_data_cache = zio_data_buf_cache[i];
+ kmem_cache_reap_now(zio_data_buf_cache[i]);
+ }
}
kmem_cache_reap_now(buf_cache);
kmem_cache_reap_now(hdr_cache);
@@ -1498,8 +1515,9 @@
static void
arc_get_data_buf(arc_buf_t *buf)
{
- arc_state_t *state = buf->b_hdr->b_state;
- uint64_t size = buf->b_hdr->b_size;
+ arc_state_t *state = buf->b_hdr->b_state;
+ uint64_t size = buf->b_hdr->b_size;
+ arc_buf_contents_t type = buf->b_hdr->b_type;
arc_adapt(size, state);
@@ -1508,7 +1526,12 @@
* just allocate a new buffer.
*/
if (!arc_evict_needed()) {
- buf->b_data = zio_buf_alloc(size);
+ if (type == ARC_BUFC_METADATA) {
+ buf->b_data = zio_buf_alloc(size);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ buf->b_data = zio_data_buf_alloc(size);
+ }
atomic_add_64(&arc.size, size);
goto out;
}
@@ -1530,8 +1553,13 @@
uint64_t mfu_space = arc.c - arc.p;
state = (mfu_space > arc.mfu->size) ? arc.mru : arc.mfu;
}
- if ((buf->b_data = arc_evict(state, size, TRUE)) == NULL) {
- buf->b_data = zio_buf_alloc(size);
+ if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) {
+ if (type == ARC_BUFC_METADATA) {
+ buf->b_data = zio_buf_alloc(size);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ buf->b_data = zio_data_buf_alloc(size);
+ }
atomic_add_64(&arc.size, size);
atomic_add_64(&arc.recycle_miss, 1);
}
@@ -1916,8 +1944,8 @@
if (hdr == NULL) {
/* this block is not in the cache */
arc_buf_hdr_t *exists;
-
- buf = arc_buf_alloc(spa, size, private);
+ arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
+ buf = arc_buf_alloc(spa, size, private, type);
hdr = buf->b_hdr;
hdr->b_dva = *BP_IDENTITY(bp);
hdr->b_birth = bp->blk_birth;
@@ -2177,6 +2205,7 @@
arc_buf_t **bufp;
uint64_t blksz = hdr->b_size;
spa_t *spa = hdr->b_spa;
+ arc_buf_contents_t type = hdr->b_type;
ASSERT(hdr->b_datacnt > 1);
/*
@@ -2202,6 +2231,7 @@
nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
nhdr->b_size = blksz;
nhdr->b_spa = spa;
+ nhdr->b_type = type;
nhdr->b_buf = buf;
nhdr->b_state = arc.anon;
nhdr->b_arc_access = 0;
--- a/usr/src/uts/common/fs/zfs/dbuf.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dbuf.c Tue Dec 19 23:13:06 2006 -0800
@@ -504,9 +504,11 @@
dprintf_dbuf_bp(db, bp, "%s", "blkptr:");
if (bp == NULL || BP_IS_HOLE(bp)) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
ASSERT(bp == NULL || BP_IS_HOLE(bp));
dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- db->db.db_size, db));
+ db->db.db_size, db, type));
bzero(db->db.db_data, db->db.db_size);
db->db_state = DB_CACHED;
*flags |= DB_RF_CACHED;
@@ -615,10 +617,12 @@
while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
if (db->db_state == DB_UNCACHED) {
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+
ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- db->db.db_size, db));
+ db->db.db_size, db, type));
db->db_state = DB_FILL;
} else {
ASSERT3U(db->db_state, ==, DB_CACHED);
@@ -643,6 +647,7 @@
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
arc_buf_t **quiescing, **syncing;
+ arc_buf_contents_t type;
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(db->db.db_data != NULL);
@@ -665,8 +670,9 @@
ASSERT(*syncing != db->db_buf);
if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = db->db.db_size;
+ type = DBUF_GET_BUFC_TYPE(db);
*quiescing = arc_buf_alloc(
- db->db_dnode->dn_objset->os_spa, size, db);
+ db->db_dnode->dn_objset->os_spa, size, db, type);
bcopy(db->db.db_data, (*quiescing)->b_data, size);
} else {
dbuf_set_data(db, NULL);
@@ -685,10 +691,11 @@
ASSERT3U(db->db_dirtycnt, ==, 1);
if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = db->db.db_size;
+ type = DBUF_GET_BUFC_TYPE(db);
/* we can't copy if we have already started a write */
ASSERT(*syncing != db->db_data_pending);
*syncing = arc_buf_alloc(
- db->db_dnode->dn_objset->os_spa, size, db);
+ db->db_dnode->dn_objset->os_spa, size, db, type);
bcopy(db->db.db_data, (*syncing)->b_data, size);
} else {
dbuf_set_data(db, NULL);
@@ -860,6 +867,7 @@
{
arc_buf_t *buf, *obuf;
int osize = db->db.db_size;
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
ASSERT(db->db_blkid != DB_BONUS_BLKID);
@@ -879,7 +887,7 @@
dbuf_will_dirty(db, tx);
/* create the data buffer for the new block */
- buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db);
+ buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
/* copy old block data to the new block */
obuf = db->db_buf;
@@ -1588,9 +1596,10 @@
db->db_data_pending == db->db_buf) {
int size = (db->db_blkid == DB_BONUS_BLKID) ?
DN_MAX_BONUSLEN : db->db.db_size;
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- size, db));
+ size, db, type));
bcopy(db->db_data_pending->b_data, db->db.db_data,
db->db.db_size);
}
@@ -1766,6 +1775,7 @@
int checksum, compress;
zbookmark_t zb;
int blksz;
+ arc_buf_contents_t type;
ASSERT(dmu_tx_is_syncing(tx));
@@ -1823,6 +1833,7 @@
}
if (db->db_level == 0) {
+ type = DBUF_GET_BUFC_TYPE(db);
data = &db->db_d.db_data_old[txg&TXG_MASK];
blksz = arc_buf_size(*data);
@@ -1849,7 +1860,8 @@
db->db_d.db_overridden_by[txg&TXG_MASK] == NULL) {
if (refcount_count(&db->db_holds) > 1 &&
*data == db->db_buf) {
- *data = arc_buf_alloc(os->os_spa, blksz, db);
+ *data = arc_buf_alloc(os->os_spa, blksz, db,
+ type);
bcopy(db->db.db_data, (*data)->b_data, blksz);
}
db->db_data_pending = *data;
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c Tue Dec 19 23:13:06 2006 -0800
@@ -737,7 +737,8 @@
int err;
zbookmark_t zb;
arc_buf_t *abuf =
- arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG);
+ arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG,
+ ARC_BUFC_METADATA);
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(os->os_synctx == NULL);
--- a/usr/src/uts/common/fs/zfs/spa.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/spa.c Tue Dec 19 23:13:06 2006 -0800
@@ -2133,7 +2133,7 @@
{
spa_t *spa = zio->io_spa;
- zio_buf_free(zio->io_data, zio->io_size);
+ zio_data_buf_free(zio->io_data, zio->io_size);
mutex_enter(&spa->spa_scrub_lock);
if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
@@ -2155,7 +2155,7 @@
zbookmark_t *zb)
{
size_t size = BP_GET_LSIZE(bp);
- void *data = zio_buf_alloc(size);
+ void *data = zio_data_buf_alloc(size);
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight++;
--- a/usr/src/uts/common/fs/zfs/sys/arc.h Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h Tue Dec 19 23:13:06 2006 -0800
@@ -54,6 +54,11 @@
void *b_private;
};
+typedef enum arc_buf_contents {
+ ARC_BUFC_UNDEF, /* buffer contents undefined */
+ ARC_BUFC_DATA, /* buffer contains data */
+ ARC_BUFC_METADATA /* buffer contains metadata */
+} arc_buf_contents_t;
/*
* These are the flags we pass into calls to the arc
*/
@@ -62,7 +67,8 @@
#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */
#define ARC_CACHED (1 << 4) /* I/O was already in cache */
-arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag);
+arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
+ arc_buf_contents_t type);
void arc_buf_add_ref(arc_buf_t *buf, void *tag);
int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
int arc_buf_size(arc_buf_t *buf);
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h Tue Dec 19 23:13:06 2006 -0800
@@ -254,6 +254,11 @@
void dbuf_init(void);
void dbuf_fini(void);
+#define DBUF_GET_BUFC_TYPE(db) \
+ ((((db)->db_level > 0) || \
+ (dmu_ot[(db)->db_dnode->dn_type].ot_metadata)) ? \
+	    ARC_BUFC_METADATA : ARC_BUFC_DATA)
+
#ifdef ZFS_DEBUG
/*
--- a/usr/src/uts/common/fs/zfs/sys/spa.h Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h Tue Dec 19 23:13:06 2006 -0800
@@ -306,6 +306,9 @@
#include <sys/dmu.h>
+#define BP_GET_BUFC_TYPE(bp) \
+ (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \
+	ARC_BUFC_METADATA : ARC_BUFC_DATA)
/*
* Routines found in spa.c
*/
--- a/usr/src/uts/common/fs/zfs/sys/zio.h Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h Tue Dec 19 23:13:06 2006 -0800
@@ -295,6 +295,8 @@
extern void *zio_buf_alloc(size_t size);
extern void zio_buf_free(void *buf, size_t size);
+extern void *zio_data_buf_alloc(size_t size);
+extern void zio_data_buf_free(void *buf, size_t size);
/*
* Move an I/O to the next stage of the pipeline and execute that stage.
--- a/usr/src/uts/common/fs/zfs/zio.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/zio.c Tue Dec 19 23:13:06 2006 -0800
@@ -82,11 +82,21 @@
* ==========================================================================
*/
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+
+#ifdef _KERNEL
+extern vmem_t *zio_alloc_arena;
+#endif
void
zio_init(void)
{
size_t c;
+ vmem_t *data_alloc_arena = NULL;
+
+#ifdef _KERNEL
+ data_alloc_arena = zio_alloc_arena;
+#endif
/*
* For small buffers, we want a cache for each multiple of
@@ -111,10 +121,16 @@
}
if (align != 0) {
- char name[30];
+ char name[36];
(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
zio_buf_cache[c] = kmem_cache_create(name, size,
align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
+
+ (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
+ zio_data_buf_cache[c] = kmem_cache_create(name, size,
+ align, NULL, NULL, NULL, NULL, data_alloc_arena,
+ KMC_NODEBUG);
+
dprintf("creating cache for size %5lx align %5lx\n",
size, align);
}
@@ -124,6 +140,10 @@
ASSERT(zio_buf_cache[c] != NULL);
if (zio_buf_cache[c - 1] == NULL)
zio_buf_cache[c - 1] = zio_buf_cache[c];
+
+ ASSERT(zio_data_buf_cache[c] != NULL);
+ if (zio_data_buf_cache[c - 1] == NULL)
+ zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
}
zio_inject_init();
@@ -134,6 +154,7 @@
{
size_t c;
kmem_cache_t *last_cache = NULL;
+ kmem_cache_t *last_data_cache = NULL;
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
if (zio_buf_cache[c] != last_cache) {
@@ -141,6 +162,12 @@
kmem_cache_destroy(zio_buf_cache[c]);
}
zio_buf_cache[c] = NULL;
+
+ if (zio_data_buf_cache[c] != last_data_cache) {
+ last_data_cache = zio_data_buf_cache[c];
+ kmem_cache_destroy(zio_data_buf_cache[c]);
+ }
+ zio_data_buf_cache[c] = NULL;
}
zio_inject_fini();
@@ -151,6 +178,13 @@
* Allocate and free I/O buffers
* ==========================================================================
*/
+
+/*
+ * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
+ * crashdump if the kernel panics, so use it judiciously. Obviously, it's
+ * useful to inspect ZFS metadata, but if possible, we should avoid keeping
+ * excess / transient data in-core during a crashdump.
+ */
void *
zio_buf_alloc(size_t size)
{
@@ -161,6 +195,22 @@
return (kmem_cache_alloc(zio_buf_cache[c], KM_SLEEP));
}
+/*
+ * Use zio_data_buf_alloc to allocate data. The data will not appear in a
+ * crashdump if the kernel panics. This exists so that we will limit the amount
+ * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount
+ * of kernel heap dumped to disk when the kernel panics)
+ */
+void *
+zio_data_buf_alloc(size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ return (kmem_cache_alloc(zio_data_buf_cache[c], KM_SLEEP));
+}
+
void
zio_buf_free(void *buf, size_t size)
{
@@ -171,6 +221,15 @@
kmem_cache_free(zio_buf_cache[c], buf);
}
+void
+zio_data_buf_free(void *buf, size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ kmem_cache_free(zio_data_buf_cache[c], buf);
+}
/*
* ==========================================================================
* Push and pop I/O transform buffers
--- a/usr/src/uts/common/os/kmem.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/os/kmem.c Tue Dec 19 23:13:06 2006 -0800
@@ -154,9 +154,11 @@
P2ALIGN(8192 / 1, 64),
4096 * 3,
8192 * 2,
+ 8192 * 3,
+ 8192 * 4,
};
-#define KMEM_MAXBUF 16384
+#define KMEM_MAXBUF 32768
static kmem_cache_t *kmem_alloc_table[KMEM_MAXBUF >> KMEM_ALIGN_SHIFT];
--- a/usr/src/uts/common/os/mem_cage.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/os/mem_cage.c Tue Dec 19 23:13:06 2006 -0800
@@ -1308,7 +1308,7 @@
* non-swapfs (i.e. anonymous memory) file system pages.
*/
ASSERT(rootpp->p_vnode != NULL &&
- rootpp->p_vnode != &kvp &&
+ !PP_ISKAS(rootpp) &&
!IS_SWAPFSVP(rootpp->p_vnode));
PP_SETNORELOC(rootpp);
return (1);
@@ -1783,7 +1783,7 @@
continue;
}
- if ((pp->p_vnode == &kvp && pp->p_lckcnt > 0) ||
+ if ((PP_ISKAS(pp) && pp->p_lckcnt > 0) ||
!page_trylock(pp, SE_EXCL)) {
KCAGE_STAT_INCR_SCAN(kt_cantlock);
continue;
@@ -1791,7 +1791,7 @@
/* P_NORELOC bit should not have gone away. */
ASSERT(PP_ISNORELOC(pp));
- if (PP_ISFREE(pp) || (pp->p_vnode == &kvp &&
+ if (PP_ISFREE(pp) || (PP_ISKAS(pp) &&
pp->p_lckcnt > 0)) {
page_unlock(pp);
continue;
--- a/usr/src/uts/common/os/mem_config.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/os/mem_config.c Tue Dec 19 23:13:06 2006 -0800
@@ -1923,7 +1923,7 @@
* Unload the mappings and check if mod bit
* is set.
*/
- ASSERT(pp->p_vnode != &kvp);
+ ASSERT(!PP_ISKAS(pp));
(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
mod = hat_ismod(pp);
--- a/usr/src/uts/common/os/vm_pageout.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/os/vm_pageout.c Tue Dec 19 23:13:06 2006 -0800
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -950,8 +949,7 @@
* NOTE: These optimizations assume that reads are atomic.
*/
top:
- if ((pp->p_vnode == &kvp) ||
- (PP_ISFREE(pp)) ||
+ if ((PP_ISKAS(pp)) || (PP_ISFREE(pp)) ||
(hat_page_getshare(pp) > po_share) || PAGE_LOCKED(pp)) {
return (-1);
}
--- a/usr/src/uts/common/sys/vnode.h Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/sys/vnode.h Tue Dec 19 23:13:06 2006 -0800
@@ -960,6 +960,11 @@
((VP1) && (VP2) && (vn_getops(VP1) == vn_getops(VP2)) ? \
VOP_CMP(VP1, VP2) : 0))
+extern struct vnode kvp;
+extern struct vnode zvp;
+
+#define VN_ISKAS(vp) ((vp) == &kvp || (vp) == &zvp)
+
#endif /* _KERNEL */
/*
@@ -1001,7 +1006,7 @@
*/
#define VN_DISPOSE(pp, flag, dn, cr) { \
extern struct vnode kvp; \
- if ((pp)->p_vnode != NULL && (pp)->p_vnode != &kvp) \
+ if ((pp)->p_vnode != NULL && !VN_ISKAS((pp)->p_vnode)) \
VOP_DISPOSE((pp)->p_vnode, (pp), (flag), (dn), (cr)); \
else if ((flag) == B_FREE) \
page_free((pp), (dn)); \
--- a/usr/src/uts/common/vm/page.h Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/vm/page.h Tue Dec 19 23:13:06 2006 -0800
@@ -877,8 +877,9 @@
#define PP_ISAGED(pp) (((pp)->p_state & P_FREE) && \
((pp)->p_vnode == NULL))
#define PP_ISNORELOC(pp) ((pp)->p_state & P_NORELOC)
-#define PP_ISKVP(pp) ((pp)->p_vnode == &kvp)
-#define PP_ISNORELOCKERNEL(pp) (PP_ISNORELOC(pp) && PP_ISKVP(pp))
+#define PP_ISKAS(pp) (((pp)->p_vnode == &kvp) || \
+ ((pp)->p_vnode == &zvp))
+#define PP_ISNORELOCKERNEL(pp) (PP_ISNORELOC(pp) && PP_ISKAS(pp))
#define PP_ISMIGRATE(pp) ((pp)->p_state & P_MIGRATE)
#define PP_ISSWAP(pp) ((pp)->p_state & P_SWAP)
@@ -956,7 +957,7 @@
#define PP_PR_REQ(pp) (((pp)->p_toxic & PR_REASONS) && !PP_RETIRED(pp))
#define PP_PR_NOSHARE(pp) \
((((pp)->p_toxic & (PR_RETIRED | PR_FMA | PR_UE)) == PR_FMA) && \
- !PP_ISKVP(pp))
+ !PP_ISKAS(pp))
/*
* Flags for page_unretire_pp
--- a/usr/src/uts/common/vm/page_lock.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/vm/page_lock.c Tue Dec 19 23:13:06 2006 -0800
@@ -142,6 +142,12 @@
extern struct vnode kvp;
+/*
+ * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
+ * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
+ * VPH_TABLE_SIZE + 1.
+ */
+
kmutex_t vph_mutex[VPH_TABLE_SIZE + 2];
/*
@@ -861,6 +867,9 @@
{
if (vp == &kvp)
return (&vph_mutex[VPH_TABLE_SIZE + 0]);
+
+ if (vp == &zvp)
+ return (&vph_mutex[VPH_TABLE_SIZE + 1]);
#ifdef DEBUG
if (page_vnode_mutex_stress != 0)
return (&vph_mutex[0]);
@@ -913,7 +922,7 @@
ASSERT(!PP_ISFREE(pp));
ASSERT(pp->p_vnode != NULL);
ASSERT(!IS_SWAPFSVP(pp->p_vnode));
- ASSERT(pp->p_vnode != &kvp);
+ ASSERT(!PP_ISKAS(pp));
again:
if (pszc == 0) {
--- a/usr/src/uts/common/vm/page_retire.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/vm/page_retire.c Tue Dec 19 23:13:06 2006 -0800
@@ -355,7 +355,7 @@
int whichtype = 0; \
if (pp->p_vnode) \
whichtype |= PRT_NAMED; \
- if (PP_ISKVP(pp)) \
+ if (PP_ISKAS(pp)) \
whichtype |= PRT_KERNEL; \
if (PP_ISFREE(pp)) \
whichtype |= PRT_FREE; \
@@ -882,7 +882,7 @@
page_retire_thread_cb(page_t *pp)
{
PR_DEBUG(prd_tctop);
- if (!PP_ISKVP(pp) && page_trylock(pp, SE_EXCL)) {
+ if (!PP_ISKAS(pp) && page_trylock(pp, SE_EXCL)) {
PR_DEBUG(prd_tclocked);
page_unlock(pp);
}
@@ -901,7 +901,7 @@
* Don't scrub the kernel, since we might still need it, unless
* we have UEs on the page, in which case we have nothing to lose.
*/
- if (!PP_ISKVP(pp) || PP_TOXIC(pp)) {
+ if (!PP_ISKAS(pp) || PP_TOXIC(pp)) {
pp->p_selock = -1; /* pacify ASSERTs */
PP_CLRFREE(pp);
pagescrub(pp, 0, PAGESIZE);
--- a/usr/src/uts/common/vm/seg_kmem.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/vm/seg_kmem.c Tue Dec 19 23:13:06 2006 -0800
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -103,6 +102,7 @@
char *ekernelheap; /* end of primary kernel heap */
struct seg kvseg; /* primary kernel heap segment */
struct seg kvseg_core; /* "core" kernel heap segment */
+struct seg kzioseg; /* Segment for zio mappings */
vmem_t *heap_arena; /* primary kernel heap arena */
vmem_t *heap_core_arena; /* core kernel heap arena */
char *heap_core_base; /* start of core kernel heap arena */
@@ -114,9 +114,12 @@
vmem_t *heaptext_arena; /* heaptext arena */
struct as kas; /* kernel address space */
struct vnode kvp; /* vnode for all segkmem pages */
+struct vnode zvp; /* vnode for zfs pages */
int segkmem_reloc; /* enable/disable relocatable segkmem pages */
vmem_t *static_arena; /* arena for caches to import static memory */
vmem_t *static_alloc_arena; /* arena for allocating static memory */
+vmem_t *zio_arena = NULL; /* arena for allocating zio memory */
+vmem_t *zio_alloc_arena = NULL; /* arena for allocating zio memory */
/*
* seg_kmem driver can map part of the kernel heap with large pages.
@@ -427,6 +430,7 @@
pgcnt_t npages;
spgcnt_t pg;
page_t *pp;
+ struct vnode *vp = seg->s_data;
ASSERT(RW_READ_HELD(&seg->s_as->a_lock));
@@ -451,7 +455,7 @@
switch (type) {
case F_SOFTLOCK: /* lock down already-loaded translations */
for (pg = 0; pg < npages; pg++) {
- pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr,
+ pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr,
SE_SHARED);
if (pp == NULL) {
/*
@@ -461,7 +465,7 @@
if (!hat_probe(kas.a_hat, addr)) {
addr -= PAGESIZE;
while (--pg >= 0) {
- pp = page_find(&kvp,
+ pp = page_find(vp,
(u_offset_t)(uintptr_t)addr);
if (pp)
page_unlock(pp);
@@ -477,7 +481,7 @@
return (0);
case F_SOFTUNLOCK:
while (npages--) {
- pp = page_find(&kvp, (u_offset_t)(uintptr_t)addr);
+ pp = page_find(vp, (u_offset_t)(uintptr_t)addr);
if (pp)
page_unlock(pp);
addr += PAGESIZE;
@@ -645,6 +649,13 @@
segkmem_dump_range, seg->s_as);
vmem_walk(heaptext_arena, VMEM_ALLOC | VMEM_REENTRANT,
segkmem_dump_range, seg->s_as);
+ } else if (seg == &kzioseg) {
+ /*
+ * We don't want to dump pages attached to kzioseg since they
+ * contain file data from ZFS. If this page's segment is
+ * kzioseg return instead of writing it to the dump device.
+ */
+ return;
} else {
segkmem_dump_range(seg->s_as, seg->s_base, seg->s_size);
}
@@ -666,6 +677,7 @@
pgcnt_t npages;
spgcnt_t pg;
size_t nb;
+ struct vnode *vp = seg->s_data;
ASSERT(ppp != NULL);
@@ -706,7 +718,7 @@
}
for (pg = 0; pg < npages; pg++) {
- pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_SHARED);
+ pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr, SE_SHARED);
if (pp == NULL) {
while (--pg >= 0)
page_unlock(pplist[pg]);
@@ -791,11 +803,21 @@
};
int
+segkmem_zio_create(struct seg *seg)
+{
+ ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
+ seg->s_ops = &segkmem_ops;
+ seg->s_data = &zvp;
+ kas.a_size += seg->s_size;
+ return (0);
+}
+
+int
segkmem_create(struct seg *seg)
{
ASSERT(seg->s_as == &kas && RW_WRITE_HELD(&kas.a_lock));
seg->s_ops = &segkmem_ops;
- seg->s_data = NULL;
+ seg->s_data = &kvp;
kas.a_size += seg->s_size;
return (0);
}
@@ -806,6 +828,10 @@
{
struct seg kseg;
int pgflags;
+ struct vnode *vp = arg;
+
+ if (vp == NULL)
+ vp = &kvp;
kseg.s_as = &kas;
pgflags = PG_EXCL;
@@ -819,7 +845,7 @@
if (vmflag & VM_PUSHPAGE)
pgflags |= PG_PUSHPAGE;
- return (page_create_va(&kvp, (u_offset_t)(uintptr_t)addr, size,
+ return (page_create_va(vp, (u_offset_t)(uintptr_t)addr, size,
pgflags, &kseg, addr));
}
@@ -897,12 +923,14 @@
return (addr);
}
-void *
-segkmem_alloc(vmem_t *vmp, size_t size, int vmflag)
+static void *
+segkmem_alloc_vn(vmem_t *vmp, size_t size, int vmflag, struct vnode *vp)
{
void *addr;
segkmem_gc_list_t *gcp, **prev_gcpp;
+ ASSERT(vp != NULL);
+
if (kvseg.s_base == NULL) {
#ifndef __sparc
if (bootops->bsys_alloc == NULL)
@@ -928,7 +956,19 @@
return (addr);
}
return (segkmem_xalloc(vmp, NULL, size, vmflag, 0,
- segkmem_page_create, NULL));
+ segkmem_page_create, vp));
+}
+
+void *
+segkmem_alloc(vmem_t *vmp, size_t size, int vmflag)
+{
+ return (segkmem_alloc_vn(vmp, size, vmflag, &kvp));
+}
+
+void *
+segkmem_zio_alloc(vmem_t *vmp, size_t size, int vmflag)
+{
+ return (segkmem_alloc_vn(vmp, size, vmflag, &zvp));
}
/*
@@ -937,8 +977,8 @@
* we currently don't have a special kernel segment for non-paged
* kernel memory that is exported by drivers to user space.
*/
-void
-segkmem_free(vmem_t *vmp, void *inaddr, size_t size)
+static void
+segkmem_free_vn(vmem_t *vmp, void *inaddr, size_t size, struct vnode *vp)
{
page_t *pp;
caddr_t addr = inaddr;
@@ -946,6 +986,7 @@
pgcnt_t npages = btopr(size);
ASSERT(((uintptr_t)addr & PAGEOFFSET) == 0);
+ ASSERT(vp != NULL);
if (kvseg.s_base == NULL) {
segkmem_gc_list_t *gc = inaddr;
@@ -960,7 +1001,7 @@
for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
#if defined(__x86)
- pp = page_find(&kvp, (u_offset_t)(uintptr_t)addr);
+ pp = page_find(vp, (u_offset_t)(uintptr_t)addr);
if (pp == NULL)
panic("segkmem_free: page not found");
if (!page_tryupgrade(pp)) {
@@ -969,11 +1010,11 @@
* it to drop the lock so we can free this page.
*/
page_unlock(pp);
- pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr,
+ pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr,
SE_EXCL);
}
#else
- pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL);
+ pp = page_lookup(vp, (u_offset_t)(uintptr_t)addr, SE_EXCL);
#endif
if (pp == NULL)
panic("segkmem_free: page not found");
@@ -985,6 +1026,19 @@
if (vmp != NULL)
vmem_free(vmp, inaddr, size);
+
+}
+
+void
+segkmem_free(vmem_t *vmp, void *inaddr, size_t size)
+{
+ segkmem_free_vn(vmp, inaddr, size, &kvp);
+}
+
+void
+segkmem_zio_free(vmem_t *vmp, void *inaddr, size_t size)
+{
+ segkmem_free_vn(vmp, inaddr, size, &zvp);
}
void
@@ -1441,6 +1495,22 @@
return (use_large_pages);
}
+void
+segkmem_zio_init(void *zio_mem_base, size_t zio_mem_size)
+{
+ ASSERT(zio_mem_base != NULL);
+ ASSERT(zio_mem_size != 0);
+
+ zio_arena = vmem_create("zio", zio_mem_base, zio_mem_size, PAGESIZE,
+ NULL, NULL, NULL, 0, VM_SLEEP);
+
+ zio_alloc_arena = vmem_create("zio_buf", NULL, 0, PAGESIZE,
+ segkmem_zio_alloc, segkmem_zio_free, zio_arena, 0, VM_SLEEP);
+
+ ASSERT(zio_arena != NULL);
+ ASSERT(zio_alloc_arena != NULL);
+}
+
#ifdef __sparc
--- a/usr/src/uts/common/vm/seg_kmem.h Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/vm/seg_kmem.h Tue Dec 19 23:13:06 2006 -0800
@@ -51,6 +51,7 @@
extern char *heap_lp_end; /* end of kernel large page heap arena */
extern struct seg kvseg; /* primary kernel heap segment */
extern struct seg kvseg_core; /* "core" kernel heap segment */
+extern struct seg kzioseg; /* Segment for zio mappings */
extern vmem_t *heap_lp_arena; /* kernel large page heap arena */
extern vmem_t *heap_arena; /* primary kernel heap arena */
extern vmem_t *hat_memload_arena; /* HAT translation arena */
@@ -59,9 +60,12 @@
extern vmem_t *heaptext_arena; /* kernel text arena, from heap */
extern struct as kas; /* kernel address space */
extern struct vnode kvp; /* vnode for all segkmem pages */
+extern struct vnode zvp; /* vnode for all segkmem pages for zfs */
extern int segkmem_reloc; /* enable/disable segkmem relocatable pages */
extern vmem_t *static_arena; /* arena for caches to import static memory */
extern vmem_t *static_alloc_arena; /* arena for allocating static memory */
+extern vmem_t *zio_arena; /* arena for zio caches */
+extern vmem_t *zio_alloc_arena; /* arena for allocating zio memory */
extern int segkmem_create(struct seg *);
extern page_t *segkmem_page_create(void *, size_t, int, void *);
@@ -77,6 +81,11 @@
extern void kernelheap_extend(void *, void *);
extern void segkmem_gc(void);
+extern void *segkmem_zio_alloc(vmem_t *, size_t, int);
+extern int segkmem_zio_create(struct seg *);
+extern void segkmem_zio_free(vmem_t *, void *, size_t);
+extern void segkmem_zio_init(void *, size_t);
+
/*
* Flags for segkmem_xalloc().
*
--- a/usr/src/uts/common/vm/seg_vn.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/vm/seg_vn.c Tue Dec 19 23:13:06 2006 -0800
@@ -408,7 +408,7 @@
a->szc = 0;
} else if (a->vp != NULL) {
extern struct vnode kvp;
- if (IS_SWAPFSVP(a->vp) || a->vp == &kvp) {
+ if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) {
/*
* paranoid check.
* hat_page_demote() is not supported
@@ -5537,7 +5537,7 @@
/* paranoid check */
if (svd->vp != NULL &&
- (IS_SWAPFSVP(svd->vp) || svd->vp == &kvp)) {
+ (IS_SWAPFSVP(svd->vp) || VN_ISKAS(svd->vp))) {
return (EINVAL);
}
--- a/usr/src/uts/common/vm/vm_page.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/vm/vm_page.c Tue Dec 19 23:13:06 2006 -0800
@@ -1035,7 +1035,7 @@
ASSERT(szc != 0);
ASSERT(vp != NULL);
ASSERT(!IS_SWAPFSVP(vp));
- ASSERT(vp != &kvp);
+ ASSERT(!VN_ISKAS(vp));
again:
if (++loopcnt > 3) {
@@ -2704,7 +2704,7 @@
if (pp->p_szc != 0) {
if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
- pp->p_vnode == &kvp) {
+ PP_ISKAS(pp)) {
panic("page_free: anon or kernel "
"or no vnode large page %p", (void *)pp);
}
@@ -3153,7 +3153,7 @@
if (pp->p_szc != 0) {
if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
- pp->p_vnode == &kvp) {
+ PP_ISKAS(pp)) {
panic("page_destroy: anon or kernel or no vnode "
"large page %p", (void *)pp);
}
@@ -3332,7 +3332,7 @@
vnode_t *ovp = opp->p_vnode;
ASSERT(ovp != NULL);
ASSERT(!IS_SWAPFSVP(ovp));
- ASSERT(ovp != &kvp);
+ ASSERT(!VN_ISKAS(ovp));
page_demote_vp_pages(opp);
ASSERT(opp->p_szc == 0);
}
@@ -3399,14 +3399,14 @@
(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
if (pp->p_szc != 0) {
ASSERT(!IS_SWAPFSVP(vp));
- ASSERT(vp != &kvp);
+ ASSERT(!VN_ISKAS(vp));
page_demote_vp_pages(pp);
ASSERT(pp->p_szc == 0);
}
mutex_enter(phm);
} else if (pp->p_szc != 0) {
ASSERT(!IS_SWAPFSVP(vp));
- ASSERT(vp != &kvp);
+ ASSERT(!VN_ISKAS(vp));
mutex_exit(phm);
page_demote_vp_pages(pp);
ASSERT(pp->p_szc == 0);
@@ -4378,7 +4378,7 @@
* (g) Backed by a filesystem which doesn't have a
* stubbed-out sync operation
*/
- if (!PP_ISFREE(pp) && vp != NULL && vp != &kvp &&
+ if (!PP_ISFREE(pp) && vp != NULL && !VN_ISKAS(vp) &&
hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL &&
vfs_can_sync(vp->v_vfsp)) {
nppbusy++;
@@ -4457,10 +4457,10 @@
* with the kernel vnode or prom allocated kernel mem.
*/
#if defined(__sparc)
- if ((vp = pp->p_vnode) == NULL || vp == &kvp ||
+ if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp) ||
vp == &prom_ppages)
#else /* x86 doesn't have prom or prom_ppage */
- if ((vp = pp->p_vnode) == NULL || vp == &kvp)
+ if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp))
#endif /* __sparc */
continue;
@@ -4747,7 +4747,7 @@
}
if (pp->p_szc != pszc) {
ASSERT(pp->p_szc < pszc);
- ASSERT(pp->p_vnode != NULL && pp->p_vnode != &kvp &&
+ ASSERT(pp->p_vnode != NULL && !PP_ISKAS(pp) &&
!IS_SWAPFSVP(pp->p_vnode));
tpp = pp + 1;
for (i = 1; i < npgs; i++, tpp++) {
@@ -4879,7 +4879,7 @@
* seg kmem pages require that the target and replacement
* page be the same pagesize.
*/
- flags = (targ->p_vnode == &kvp) ? PGR_SAMESZC : 0;
+ flags = (VN_ISKAS(targ->p_vnode)) ? PGR_SAMESZC : 0;
repl = page_get_replacement_page(targ, lgrp, flags);
if (repl == NULL) {
if (grouplock != 0) {
@@ -4900,7 +4900,7 @@
/*
* Let hat_page_relocate() complete the relocation if it's kernel page
*/
- if (targ->p_vnode == &kvp) {
+ if (VN_ISKAS(targ->p_vnode)) {
*replacement = repl;
if (hat_page_relocate(target, replacement, nrelocp) != 0) {
if (grouplock != 0) {
@@ -5244,7 +5244,7 @@
return (1);
}
- if (vp != NULL && !IS_SWAPFSVP(vp) && vp != &kvp) {
+ if (vp != NULL && !IS_SWAPFSVP(vp) && !VN_ISKAS(vp)) {
VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
page_demote_vp_pages(pp);
ASSERT(pp->p_szc == 0);
@@ -5269,7 +5269,7 @@
* We can't demote kernel pages since we can't hat_unload()
* the mappings.
*/
- if (rootpp->p_vnode == &kvp)
+ if (VN_ISKAS(rootpp->p_vnode))
return (0);
/*
@@ -5393,7 +5393,7 @@
ASSERT(!PP_ISFREE(pp));
ASSERT(pp->p_vnode != NULL);
ASSERT(!IS_SWAPFSVP(pp->p_vnode));
- ASSERT(pp->p_vnode != &kvp);
+ ASSERT(!PP_ISKAS(pp));
VM_STAT_ADD(pagecnt.pc_demote_pages[0]);
@@ -6850,7 +6850,7 @@
ret = EAGAIN;
goto cleanup;
}
- if (PP_ISKVP(pp)) {
+ if (PP_ISKAS(pp)) {
ret = EAGAIN;
goto cleanup;
}
@@ -6932,7 +6932,7 @@
return (EPERM);
}
#else
- if (PP_ISKVP(pp)) {
+ if (PP_ISKAS(pp)) {
return (EPERM);
}
#endif /* __sparc */
@@ -7344,7 +7344,7 @@
bp = page_capture_hash[i].lists[j].next;
while (bp != &page_capture_hash[i].lists[j]) {
pp = bp->pp;
- if (!PP_ISKVP(pp) && PP_TOXIC(pp)) {
+ if (!PP_ISKAS(pp) && PP_TOXIC(pp)) {
pp->p_selock = -1; /* pacify ASSERTs */
PP_CLRFREE(pp);
pagescrub(pp, 0, PAGESIZE);
--- a/usr/src/uts/common/vm/vm_pagelist.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/common/vm/vm_pagelist.c Tue Dec 19 23:13:06 2006 -0800
@@ -3909,7 +3909,7 @@
* pages, since we cannot properly handle demotion of kernel
* pages.
*/
- if (like_pp->p_vnode == &kvp)
+ if (PP_ISKAS(like_pp))
pgrflags |= PGR_SAMESZC;
/* LINTED */
--- a/usr/src/uts/i86pc/os/startup.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/i86pc/os/startup.c Tue Dec 19 23:13:06 2006 -0800
@@ -209,6 +209,19 @@
char kern_bootargs[OBP_MAXPATHLEN];
/*
+ * ZFS zio segment. This allows us to exclude large portions of ZFS data that
+ * get cached in kmem caches on the heap. If this is set to zero, we allocate
+ * zio buffers from their own segment, otherwise they are allocated from the
+ * heap. The optimization of allocating zio buffers from their own segment is
+ * only valid on 64-bit kernels.
+ */
+#if defined(__amd64)
+int segzio_fromheap = 0;
+#else
+int segzio_fromheap = 1;
+#endif
+
+/*
* new memory fragmentations are possible in startup() due to BOP_ALLOCs. this
* depends on number of BOP_ALLOC calls made and requested size, memory size
* combination and whether boot.bin memory needs to be freed.
@@ -239,11 +252,13 @@
#endif
caddr_t segkp_base; /* Base address of segkp */
+caddr_t segzio_base; /* Base address of segzio */
#if defined(__amd64)
pgcnt_t segkpsize = btop(SEGKPDEFSIZE); /* size of segkp segment in pages */
#else
pgcnt_t segkpsize = 0;
#endif
+pgcnt_t segziosize = 0; /* size of zio segment in pages */
struct memseg *memseg_base;
struct vnode unused_pages_vp;
@@ -362,6 +377,8 @@
* 0xFFFFFXXX.XXX00000 |-----------------------|- segkmap_start (floating)
* | device mappings |
* 0xFFFFFXXX.XXX00000 |-----------------------|- toxic_addr (floating)
+ * | segzio |
+ * 0xFFFFFXXX.XXX00000 |-----------------------|- segzio_base (floating)
* | segkp |
* --- |-----------------------|- segkp_base
* | segkpm |
@@ -1566,6 +1583,29 @@
PRM_DEBUG(final_kernelheap);
}
+ if (!segzio_fromheap) {
+ size_t size;
+
+ /* size is in bytes, segziosize is in pages */
+ if (segziosize == 0) {
+ size = mmu_ptob(physmem * 2);
+ } else {
+ size = mmu_ptob(segziosize);
+ }
+
+ if (size < SEGZIOMINSIZE) {
+ size = SEGZIOMINSIZE;
+ } else if (size > mmu_ptob(physmem * 4)) {
+ size = mmu_ptob(physmem * 4);
+ }
+ segziosize = mmu_btop(ROUND_UP_LPAGE(size));
+ segzio_base = final_kernelheap;
+ PRM_DEBUG(segziosize);
+ PRM_DEBUG(segzio_base);
+ final_kernelheap = segzio_base + mmu_ptob(segziosize);
+ PRM_DEBUG(final_kernelheap);
+ }
+
/*
* put the range of VA for device mappings next
*/
@@ -2377,6 +2417,16 @@
#if defined(__amd64)
(void) seg_attach(&kas, (caddr_t)core_base, core_size, &kvseg_core);
(void) segkmem_create(&kvseg_core);
+
+ /* segzio optimization is only valid for 64-bit kernels */
+ if (!segzio_fromheap) {
+ (void) seg_attach(&kas, segzio_base, mmu_ptob(segziosize),
+ &kzioseg);
+ (void) segkmem_zio_create(&kzioseg);
+
+ /* create zio area covering new segment */
+ segkmem_zio_init(segzio_base, mmu_ptob(segziosize));
+ }
#endif
(void) seg_attach(&kas, (caddr_t)SEGDEBUGBASE, (size_t)SEGDEBUGSIZE,
--- a/usr/src/uts/i86pc/sys/machparam.h Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/i86pc/sys/machparam.h Tue Dec 19 23:13:06 2006 -0800
@@ -167,6 +167,11 @@
#define SEGKPMINSIZE (200L * 1024 * 1024L) /* 200M */
/*
+ * minimum size for segzio
+ */
+#define SEGZIOMINSIZE (400L * 1024 * 1024L) /* 400M */
+
+/*
* Boot (or, more precisely, vmx) maps most pages twice - once in the
* bottom 2GB of memory and once in the bottom 2GB of the topmost 4GB.
* When boot is unmapped this range is available to the kernel, but until
--- a/usr/src/uts/i86pc/vm/vm_dep.h Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/i86pc/vm/vm_dep.h Tue Dec 19 23:13:06 2006 -0800
@@ -406,7 +406,7 @@
*/
#define MTYPE_INIT(mtype, vp, vaddr, flags, pgsz) { \
- if (restricted_kmemalloc && (vp) == &kvp && \
+ if (restricted_kmemalloc && VN_ISKAS(vp) && \
(caddr_t)(vaddr) >= kernelheap && \
(caddr_t)(vaddr) < ekernelheap) { \
ASSERT(physmax4g); \
--- a/usr/src/uts/i86pc/vm/vm_machdep.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/i86pc/vm/vm_machdep.c Tue Dec 19 23:13:06 2006 -0800
@@ -1920,8 +1920,8 @@
* with kernel vnode 'kvp'.
*/
/* XX64 - to debug why this happens! */
- ASSERT(vp != &kvp);
- if (vp == &kvp)
+ ASSERT(!VN_ISKAS(vp));
+ if (VN_ISKAS(vp))
cmn_err(CE_NOTE,
"page_create: page not expected "
"in hash list for kernel vnode - pp 0x%p",
--- a/usr/src/uts/sfmmu/vm/hat_sfmmu.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.c Tue Dec 19 23:13:06 2006 -0800
@@ -3672,14 +3672,21 @@
* Somebody is holding SE_EXCL lock. Might
* even be hat_page_relocate(). Drop all
* our locks, lookup the page in &kvp, and
- * retry. If it doesn't exist in &kvp, then
- * we must be dealing with a kernel mapped
+ * retry. If it doesn't exist in either &kvp or &zvp,
+ * then we must be dealing with a kernel mapped
* page which doesn't actually belong to
* segkmem so we punt.
*/
sfmmu_mlist_exit(pml);
SFMMU_HASH_UNLOCK(hmebp);
pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);
+
+ /* check zvp before giving up */
+ if (pp == NULL)
+ pp = page_lookup(&zvp, (u_offset_t)saddr,
+ SE_SHARED);
+
+ /* Okay, we didn't find it, give up */
if (pp == NULL) {
kmem_cache_free(pa_hment_cache, pahmep);
*rpfn = pfn;
@@ -3710,7 +3717,7 @@
goto rehash;
}
- if (vp != &kvp) {
+ if (!VN_ISKAS(vp)) {
/*
* This is not a segkmem page but another page which
* has been kernel mapped. It had better have at least
@@ -3841,14 +3848,19 @@
* Somebody is holding SE_EXCL lock. Might
* even be hat_page_relocate(). Drop all
* our locks, lookup the page in &kvp, and
- * retry. If it doesn't exist in &kvp, then
- * we must be dealing with a kernel mapped
+ * retry. If it doesn't exist in either &kvp or &zvp,
+ * then we must be dealing with a kernel mapped
* page which doesn't actually belong to
* segkmem so we punt.
*/
sfmmu_mlist_exit(pml);
SFMMU_HASH_UNLOCK(hmebp);
pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);
+ /* check zvp before giving up */
+ if (pp == NULL)
+ pp = page_lookup(&zvp, (u_offset_t)saddr,
+ SE_SHARED);
+
if (pp == NULL) {
ASSERT(cookie == NULL);
return;
@@ -3875,7 +3887,7 @@
goto rehash;
}
- if (vp != &kvp) {
+ if (!VN_ISKAS(vp)) {
/*
* This is not a segkmem page but another page which
* has been kernel mapped.
@@ -6522,7 +6534,7 @@
ASSERT(pp != NULL);
ASSERT(sfmmu_mlist_held(pp));
- ASSERT(pp->p_vnode != &kvp);
+ ASSERT(!PP_ISKAS(pp));
CPUSET_ZERO(cpuset);
--- a/usr/src/uts/sparc/v9/vm/seg_nf.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/sparc/v9/vm/seg_nf.c Tue Dec 19 23:13:06 2006 -0800
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -122,8 +121,8 @@
* vnode and page for the page of zeros we use for the nf mappings.
*/
static kmutex_t segnf_lock;
-static struct vnode zvp;
-static struct page **zpp;
+static struct vnode nfvp;
+static struct page **nfpp;
#define addr_to_vcolor(addr) \
(shm_alignment) ? \
@@ -195,7 +194,7 @@
* Need a page per virtual color or just 1 if no vac.
*/
mutex_enter(&segnf_lock);
- if (zpp == NULL) {
+ if (nfpp == NULL) {
struct seg kseg;
vacpgs = 1;
@@ -203,16 +202,16 @@
vacpgs = shm_alignment >> PAGESHIFT;
}
- zpp = kmem_alloc(sizeof (*zpp) * vacpgs, KM_SLEEP);
+ nfpp = kmem_alloc(sizeof (*nfpp) * vacpgs, KM_SLEEP);
kseg.s_as = &kas;
for (i = 0; i < vacpgs; i++, off += PAGESIZE,
vaddr += PAGESIZE) {
- zpp[i] = page_create_va(&zvp, off, PAGESIZE,
+ nfpp[i] = page_create_va(&nfvp, off, PAGESIZE,
PG_WAIT | PG_NORELOC, &kseg, vaddr);
- page_io_unlock(zpp[i]);
- page_downgrade(zpp[i]);
- pagezero(zpp[i], 0, PAGESIZE);
+ page_io_unlock(nfpp[i]);
+ page_downgrade(nfpp[i]);
+ pagezero(nfpp[i], 0, PAGESIZE);
}
}
mutex_exit(&segnf_lock);
@@ -234,7 +233,7 @@
color = addr_to_vcolor(seg->s_base);
if (as != &kas)
prot |= PROT_USER;
- hat_memload(as->a_hat, seg->s_base, zpp[color],
+ hat_memload(as->a_hat, seg->s_base, nfpp[color],
prot | HAT_NOFAULT, HAT_LOAD);
/*
@@ -456,7 +455,7 @@
{
ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
- *vpp = &zvp;
+ *vpp = &nfvp;
return (0);
}
--- a/usr/src/uts/sun4/os/startup.c Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/sun4/os/startup.c Tue Dec 19 23:13:06 2006 -0800
@@ -192,6 +192,10 @@
struct seg *segkmap = &kmapseg; /* Kernel generic mapping segment */
struct seg *segkpm = &kpmseg; /* 64bit kernel physical mapping segment */
+int segzio_fromheap = 0; /* zio allocations occur from heap */
+caddr_t segzio_base; /* Base address of segzio */
+pgcnt_t segziosize = 0; /* size of zio segment in pages */
+
/*
* debugger pages (if allocated)
*/
@@ -373,6 +377,8 @@
* 0xFFFFFFFC.00000000 -|-----------------------|-
* : :
* : :
+ * -|-----------------------|-
+ * | segzio | (base and size vary)
* 0xFFFFFE00.00000000 -|-----------------------|-
* | | Ultrasparc I/II support
* | segkpm segment | up to 2TB of physical
@@ -2058,6 +2064,47 @@
mach_kpm_init();
}
+ if (!segzio_fromheap) {
+ size_t size;
+
+ /* size is in bytes, segziosize is in pages */
+ if (segziosize == 0) {
+ size = mmu_ptob(physmem * 2);
+ } else {
+ size = mmu_ptob(segziosize);
+ }
+
+ if (size < SEGZIOMINSIZE) {
+ size = SEGZIOMINSIZE;
+ } else if (size > mmu_ptob(physmem * 4)) {
+ size = mmu_ptob(physmem * 4);
+ }
+ segziosize = mmu_btop(roundup(size, MMU_PAGESIZE));
+ /* put the base of the ZIO segment after the kpm segment */
+ segzio_base = kpm_vbase + (kpm_size * vac_colors);
+ PRM_DEBUG(segziosize);
+ PRM_DEBUG(segzio_base);
+
+ /*
+ * On some platforms, kvm_init is called after the kpm
+ * sizes have been determined. On SPARC, kvm_init is called
+ * before, so we have to attach the kzioseg after kvm is
+ * initialized, otherwise we'll try to allocate from the boot
+ * area since the kernel heap hasn't yet been configured.
+ */
+ rw_enter(&kas.a_lock, RW_WRITER);
+
+ (void) seg_attach(&kas, segzio_base, mmu_ptob(segziosize),
+ &kzioseg);
+ (void) segkmem_zio_create(&kzioseg);
+
+ /* create zio area covering new segment */
+ segkmem_zio_init(segzio_base, mmu_ptob(segziosize));
+
+ rw_exit(&kas.a_lock);
+ }
+
+
/*
* Now create generic mapping segment. This mapping
* goes SEGMAPSIZE beyond SEGMAPBASE. But if the total
--- a/usr/src/uts/sun4/sys/vm_machparam.h Tue Dec 19 22:06:32 2006 -0800
+++ b/usr/src/uts/sun4/sys/vm_machparam.h Tue Dec 19 23:13:06 2006 -0800
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -22,10 +21,9 @@
/* Copyright (c) 1988 AT&T */
/* All Rights Reserved */
-
/*
- * Copyright (c) 1989,1999 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
*/
#ifndef _SYS_VM_MACHPARAM_H
@@ -96,6 +94,11 @@
#endif /* _LP64 */
/*
+ * Define minimum size for zio segment
+ */
+#define SEGZIOMINSIZE (512L * 1024 * 1024L) /* 512M */
+
+/*
* The time for a process to be blocked before being very swappable.
* This is a number of seconds which the system takes as being a non-trivial
* amount of real time. You probably shouldn't change this;