6708183 poor scalability of mdb memstat with increasing CPU count
author		Pavel Tatashin <Pavel.Tatashin@Sun.COM>
date		Wed, 17 Jun 2009 15:32:10 -0700
changeset	9894:42b0c48b08a4
parent		9893:589b92d8d72b
child		9895:adcf72c91c4f
6708183 poor scalability of mdb memstat with increasing CPU count
usr/src/cmd/mdb/common/modules/genunix/genunix.c
usr/src/cmd/mdb/common/modules/genunix/memory.c
usr/src/cmd/mdb/common/modules/genunix/memory.h
usr/src/uts/common/io/mem.c
usr/src/uts/common/sys/vnode.h
usr/src/uts/common/vm/hat.h
usr/src/uts/i86pc/vm/hat_i86.c
usr/src/uts/sun4u/vm/mach_kpm.c
usr/src/uts/sun4v/vm/mach_kpm.c
--- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c	Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c	Wed Jun 17 15:32:10 2009 -0700
@@ -4833,6 +4833,8 @@
 	/* from memory.c */
 	{ "page", "walk all pages, or those from the specified vnode",
 		page_walk_init, page_walk_step, page_walk_fini },
+	{ "allpages", "walk all pages, including free pages",
+		allpages_walk_init, allpages_walk_step, allpages_walk_fini },
 	{ "memlist", "walk specified memlist",
 		NULL, memlist_walk_step, NULL },
 	{ "swapinfo", "walk swapinfo structures",
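
Once registered above, the new walker can be driven straight from the debugger
or from another dcmd through the module API.  A minimal usage sketch (p_vnode
is just an example member to print; any page_t field works):

	> ::walk allpages | ::print page_t p_vnode

Programmatically, the equivalent is a call to mdb_walk(), which is exactly how
the reworked ::memstat drives it in memory.c below:
mdb_walk("allpages", (mdb_walk_cb_t)memstat_callback, &stats).
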
--- a/usr/src/cmd/mdb/common/modules/genunix/memory.c	Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/cmd/mdb/common/modules/genunix/memory.c	Wed Jun 17 15:32:10 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -214,9 +214,222 @@
 	mdb_free(wsp->walk_data, sizeof (page_walk_data_t));
 }
 
+/*
+ * allpages walks all the pages in the system in the order in which they
+ * appear in the memseg structures.
+ */
+
+#define	PAGE_BUFFER	128
+
+int
+allpages_walk_init(mdb_walk_state_t *wsp)
+{
+	if (wsp->walk_addr != 0) {
+		mdb_warn("allpages only supports global walks.\n");
+		return (WALK_ERR);
+	}
+
+	if (mdb_layered_walk("memseg", wsp) == -1) {
+		mdb_warn("couldn't walk 'memseg'");
+		return (WALK_ERR);
+	}
+
+	wsp->walk_data = mdb_alloc(sizeof (page_t) * PAGE_BUFFER, UM_SLEEP);
+	return (WALK_NEXT);
+}
+
+int
+allpages_walk_step(mdb_walk_state_t *wsp)
+{
+	const struct memseg *msp = wsp->walk_layer;
+	page_t *buf = wsp->walk_data;
+	size_t pg_read, i;
+	size_t pg_num = msp->pages_end - msp->pages_base;
+	const page_t *pg_addr = msp->pages;
+
+	while (pg_num > 0) {
+		pg_read = MIN(pg_num, PAGE_BUFFER);
+
+		if (mdb_vread(buf, pg_read * sizeof (page_t),
+		    (uintptr_t)pg_addr) == -1) {
+			mdb_warn("can't read page_t's at %#lx", pg_addr);
+			return (WALK_ERR);
+		}
+		for (i = 0; i < pg_read; i++) {
+			int ret = wsp->walk_callback((uintptr_t)&pg_addr[i],
+			    &buf[i], wsp->walk_cbdata);
+
+			if (ret != WALK_NEXT)
+				return (ret);
+		}
+		pg_num -= pg_read;
+		pg_addr += pg_read;
+	}
+
+	return (WALK_NEXT);
+}
+
+void
+allpages_walk_fini(mdb_walk_state_t *wsp)
+{
+	mdb_free(wsp->walk_data, sizeof (page_t) * PAGE_BUFFER);
+}
+
+/*
+ * Hash table + LRU queue.
+ * This table is used to cache recently read vnodes for the memstat
+ * command, to reduce the number of mdb_vread calls.  This greatly
+ * speeds the memstat command on live, large CPU count systems.
+ */
+
+#define	VN_SMALL	401
+#define	VN_LARGE	10007
+#define	VN_HTABLE_KEY(p, hp)	((p) % ((hp)->vn_htable_buckets))
+
+struct vn_htable_list {
+	uint_t vn_flag;				/* v_flag from vnode	*/
+	uintptr_t vn_ptr;			/* pointer to vnode	*/
+	struct vn_htable_list *vn_q_next;	/* queue next pointer	*/
+	struct vn_htable_list *vn_q_prev;	/* queue prev pointer	*/
+	struct vn_htable_list *vn_h_next;	/* hash table pointer	*/
+};
+
+/*
+ * vn_q_first        -> points to the head of the queue: the vnode that
+ *                      was most recently used
+ * vn_q_last         -> points to the least recently used vnode; its slot
+ *                      is recycled when a new vnode is read.
+ * vn_htable         -> hash table
+ * vn_htable_buf     -> contains htable objects
+ * vn_htable_size    -> total number of items in the hash table
+ * vn_htable_buckets -> number of buckets in the hash table
+ */
+typedef struct vn_htable {
+	struct vn_htable_list  *vn_q_first;
+	struct vn_htable_list  *vn_q_last;
+	struct vn_htable_list **vn_htable;
+	struct vn_htable_list  *vn_htable_buf;
+	int vn_htable_size;
+	int vn_htable_buckets;
+} vn_htable_t;
+
+
+/* allocate memory, initialize the hash table and LRU queue */
+static void
+vn_htable_init(vn_htable_t *hp, size_t vn_size)
+{
+	int i;
+	int htable_size = MAX(vn_size, VN_LARGE);
+
+	if ((hp->vn_htable_buf = mdb_zalloc(sizeof (struct vn_htable_list)
+	    * htable_size, UM_NOSLEEP|UM_GC)) == NULL) {
+		htable_size = VN_SMALL;
+		hp->vn_htable_buf = mdb_zalloc(sizeof (struct vn_htable_list)
+		    * htable_size, UM_SLEEP|UM_GC);
+	}
+
+	hp->vn_htable = mdb_zalloc(sizeof (struct vn_htable_list *)
+	    * htable_size, UM_SLEEP|UM_GC);
+
+	hp->vn_q_first  = &hp->vn_htable_buf[0];
+	hp->vn_q_last   = &hp->vn_htable_buf[htable_size - 1];
+	hp->vn_q_first->vn_q_next = &hp->vn_htable_buf[1];
+	hp->vn_q_last->vn_q_prev = &hp->vn_htable_buf[htable_size - 2];
+
+	for (i = 1; i < (htable_size-1); i++) {
+		hp->vn_htable_buf[i].vn_q_next = &hp->vn_htable_buf[i + 1];
+		hp->vn_htable_buf[i].vn_q_prev = &hp->vn_htable_buf[i - 1];
+	}
+
+	hp->vn_htable_size = htable_size;
+	hp->vn_htable_buckets = htable_size;
+}
+
+
+/*
+ * Find the vnode whose address is ptr, and return its v_flag in vp->v_flag.
+ * The function tries to find the needed information in the following order:
+ *
+ * 1. check if ptr is the first entry in the queue
+ * 2. check if ptr is in the hash table (if so, move it to the head of the
+ *    queue)
+ * 3. do mdb_vread, remove the last item from the queue and the hash table,
+ *    insert the new information into the freed slot, and move that slot
+ *    to the head of the queue.
+ */
+static int
+vn_get(vn_htable_t *hp, struct vnode *vp, uintptr_t ptr)
+{
+	int hkey;
+	struct vn_htable_list *hent, **htmp, *q_next, *q_prev;
+	struct vn_htable_list  *q_first = hp->vn_q_first;
+
+	/* 1. vnode ptr is the first in queue, just get v_flag and return */
+	if (q_first->vn_ptr == ptr) {
+		vp->v_flag = q_first->vn_flag;
+
+		return (0);
+	}
+
+	/* 2. search the hash table for this ptr */
+	hkey = VN_HTABLE_KEY(ptr, hp);
+	hent = hp->vn_htable[hkey];
+	while (hent && (hent->vn_ptr != ptr))
+		hent = hent->vn_h_next;
+
+	/* 3. if hent is NULL, ptr was not in the hash table; do mdb_vread */
+	if (hent == NULL) {
+		struct vnode vn;
+
+		if (mdb_vread(&vn, sizeof (vnode_t), ptr) == -1) {
+			mdb_warn("unable to read vnode_t at %#lx", ptr);
+			return (-1);
+		}
+
+		/* we will insert read data into the last element in queue */
+		hent = hp->vn_q_last;
+
+		/* remove the hp->vn_q_last object from the hash table */
+		if (hent->vn_ptr) {
+			htmp = &hp->vn_htable[VN_HTABLE_KEY(hent->vn_ptr, hp)];
+			while (*htmp != hent)
+				htmp = &(*htmp)->vn_h_next;
+			*htmp = hent->vn_h_next;
+		}
+
+		/* insert data into new free object */
+		hent->vn_ptr  = ptr;
+		hent->vn_flag = vn.v_flag;
+
+		/* insert new object into hash table */
+		hent->vn_h_next = hp->vn_htable[hkey];
+		hp->vn_htable[hkey] = hent;
+	}
+
+	/* Remove from queue. hent is not first, vn_q_prev is not NULL */
+	q_next = hent->vn_q_next;
+	q_prev = hent->vn_q_prev;
+	if (q_next == NULL)
+		hp->vn_q_last = q_prev;
+	else
+		q_next->vn_q_prev = q_prev;
+	q_prev->vn_q_next = q_next;
+
+	/* Add to the front of queue */
+	hent->vn_q_prev = NULL;
+	hent->vn_q_next = q_first;
+	q_first->vn_q_prev = hent;
+	hp->vn_q_first = hent;
+
+	/* Set v_flag in vnode pointer from hent */
+	vp->v_flag = hent->vn_flag;
+
+	return (0);
+}
+
 /* Summary statistics of pages */
 typedef struct memstat {
 	struct vnode    *ms_kvp;	/* Cached address of kernel vnode */
+	struct vnode    *ms_unused_vp;	/* Unused pages vnode pointer	  */
 	struct vnode    *ms_zvp;	/* Cached address of zio vnode    */
 	uint64_t	ms_kmem;	/* Pages of kernel memory	  */
 	uint64_t	ms_zfs_data;	/* Pages of zfs data		  */
@@ -225,6 +438,8 @@
 	uint64_t	ms_exec;	/* Pages of exec/library memory	  */
 	uint64_t	ms_cachelist;	/* Pages on the cachelist (free)  */
 	uint64_t	ms_total;	/* Pages on page hash		  */
+	vn_htable_t	*ms_vn_htable;	/* Pointer to hash table	  */
+	struct vnode	ms_vn;		/* vnode buffer			  */
 } memstat_t;
 
 #define	MS_PP_ISKAS(pp, stats)				\
@@ -234,36 +449,28 @@
 	(((stats)->ms_zvp != NULL) && ((pp)->p_vnode == (stats)->ms_zvp))
 
 /*
- * Summarize pages by type; called from page walker.
+ * Summarize pages by type and update stat information
  */
 
 /* ARGSUSED */
 static int
 memstat_callback(page_t *page, page_t *pp, memstat_t *stats)
 {
-	struct vnode vn, *vp;
-	uintptr_t ptr;
+	struct vnode *vp = &stats->ms_vn;
 
-	/* read page's vnode pointer */
-	if ((ptr = (uintptr_t)(pp->p_vnode)) != NULL) {
-		if (mdb_vread(&vn, sizeof (vnode_t), ptr) == -1) {
-			mdb_warn("unable to read vnode_t at %#lx",
-			    ptr);
-			return (WALK_ERR);
-		}
-		vp = &vn;
-	} else
-		vp = NULL;
-
-	if (PP_ISFREE(pp))
-		stats->ms_cachelist++;
-	else if (vp && IS_SWAPFSVP(vp))
-		stats->ms_anon++;
+	if (pp->p_vnode == NULL || pp->p_vnode == stats->ms_unused_vp)
+		return (WALK_NEXT);
+	else if (MS_PP_ISKAS(pp, stats))
+		stats->ms_kmem++;
 	else if (MS_PP_ISZFS_DATA(pp, stats))
 		stats->ms_zfs_data++;
-	else if (MS_PP_ISKAS(pp, stats))
-		stats->ms_kmem++;
-	else if (vp && (((vp)->v_flag & VVMEXEC)) != 0)
+	else if (PP_ISFREE(pp))
+		stats->ms_cachelist++;
+	else if (vn_get(stats->ms_vn_htable, vp, (uintptr_t)pp->p_vnode))
+		return (WALK_ERR);
+	else if (IS_SWAPFSVP(vp))
+		stats->ms_anon++;
+	else if ((vp->v_flag & VVMEXEC) != 0)
 		stats->ms_exec++;
 	else
 		stats->ms_vnode++;
@@ -281,19 +488,33 @@
 	pgcnt_t total_pages, physmem;
 	ulong_t freemem;
 	memstat_t stats;
-	memstat_t unused_stats;
 	GElf_Sym sym;
+	vn_htable_t ht;
+	uintptr_t vn_size = 0;
 #if defined(__i386) || defined(__amd64)
 	bln_stats_t bln_stats;
 	ssize_t bln_size;
 #endif
 
 	bzero(&stats, sizeof (memstat_t));
-	bzero(&unused_stats, sizeof (memstat_t));
 
-	if (argc != 0 || (flags & DCMD_ADDRSPEC))
+	/*
+	 * -s size is an internal option.  It specifies the size of vn_htable.
+	 * The hash table size is chosen as follows: if the user specified a
+	 * size larger than VN_LARGE, try it, and fall back to VN_SMALL if the
+	 * allocation fails.  Otherwise try VN_LARGE, again falling back to
+	 * VN_SMALL if that allocation fails.
+	 * For better hash table efficiency it is highly recommended to set
+	 * the size to a prime number.
+	 */
+	if ((flags & DCMD_ADDRSPEC) || mdb_getopts(argc, argv,
+	    's', MDB_OPT_UINTPTR, &vn_size, NULL) != argc)
 		return (DCMD_USAGE);
 
+	/* Initialize vnode hash list and queue */
+	vn_htable_init(&ht, vn_size);
+	stats.ms_vn_htable = &ht;
+
 	/* Grab base page size */
 	if (mdb_readvar(&pagesize, "_pagesize") == -1) {
 		mdb_warn("unable to read _pagesize");
@@ -332,37 +553,26 @@
 		stats.ms_zvp = (struct vnode *)(uintptr_t)sym.st_value;
 	}
 
-	/* Walk page structures, summarizing usage */
-	if (mdb_walk("page", (mdb_walk_cb_t)memstat_callback,
-	    &stats) == -1) {
-		mdb_warn("can't walk pages");
-		return (DCMD_ERR);
-	}
-
-	/* read unused pages vnode */
+	/*
+	 * If physmem != total_pages, then the administrator has limited the
+	 * number of pages available in the system.  Excluded pages are
+	 * associated with the unused pages vnode.  Read this vnode so the
+	 * pages can be excluded from the page accounting.
+	 */
 	if (mdb_lookup_by_obj(MDB_OBJ_EXEC, "unused_pages_vp",
 	    (GElf_Sym *)&sym) == -1) {
 		mdb_warn("unable to read unused_pages_vp");
 		return (DCMD_ERR);
 	}
-
-	unused_stats.ms_kvp = (struct vnode *)(uintptr_t)sym.st_value;
+	stats.ms_unused_vp = (struct vnode *)(uintptr_t)sym.st_value;
 
-	/* Find unused pages */
-	if (mdb_walk("page", (mdb_walk_cb_t)memstat_callback,
-	    &unused_stats) == -1) {
-		mdb_warn("can't walk pages");
+	/* walk all pages, collect statistics */
+	if (mdb_walk("allpages", (mdb_walk_cb_t)memstat_callback,
+	    &stats) == -1) {
+		mdb_warn("can't walk memseg");
 		return (DCMD_ERR);
 	}
 
-	/*
-	 * If physmem != total_pages, then the administrator has limited the
-	 * number of pages available in the system.  In order to account for
-	 * this, we reduce the amount normally attributed to the page cache.
-	 */
-	stats.ms_vnode -= unused_stats.ms_kmem;
-	stats.ms_total -= unused_stats.ms_kmem;
-
 #define	MS_PCT_TOTAL(x)	((ulong_t)((((5 * total_pages) + ((x) * 1000ull))) / \
 		((physmem) * 10)))
 
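
With the vnode cache in place, ::memstat needs no arguments for normal use;
the -s option documented above is only a tuning knob for the vnode hash
table.  A usage sketch (0t20011 is merely an illustrative prime larger than
VN_LARGE; the 0t prefix forces decimal radix in mdb):

	> ::memstat
	> ::memstat -s 0t20011

If the larger allocation fails, vn_htable_init() silently falls back to the
VN_SMALL bucket count, so an oversized -s value degrades to the small table
rather than failing the dcmd outright.
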
--- a/usr/src/cmd/mdb/common/modules/genunix/memory.h	Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/cmd/mdb/common/modules/genunix/memory.h	Wed Jun 17 15:32:10 2009 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 2000-2001 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #ifndef	_MEMORY_H
 #define	_MEMORY_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -37,6 +34,9 @@
 int page_walk_step(mdb_walk_state_t *);
 void page_walk_fini(mdb_walk_state_t *);
 int page(uintptr_t, uint_t, int, const mdb_arg_t *);
+int allpages_walk_init(mdb_walk_state_t *);
+int allpages_walk_step(mdb_walk_state_t *);
+void allpages_walk_fini(mdb_walk_state_t *);
 int memstat(uintptr_t, uint_t, int, const mdb_arg_t *);
 
 int swap_walk_init(mdb_walk_state_t *);
--- a/usr/src/uts/common/io/mem.c	Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/uts/common/io/mem.c	Wed Jun 17 15:32:10 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -234,18 +234,34 @@
 #pragma weak mach_sync_icache_pa
 
 static int
-mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio)
+mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
+    page_t *pp)
 {
 	int error = 0;
+	int devload = 0;
+	int is_memory = pf_is_memory(pfn);
 	size_t nbytes = MIN((size_t)(PAGESIZE - pageoff),
 	    (size_t)uio->uio_iov->iov_len);
+	caddr_t va = NULL;
 
 	mutex_enter(&mm_lock);
-	hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
-	    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ | PROT_WRITE),
-	    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
+
+	if (is_memory && kpm_enable) {
+		if (pp)
+			va = hat_kpm_mapin(pp, NULL);
+		else
+			va = hat_kpm_mapin_pfn(pfn);
+	}
 
-	if (!pf_is_memory(pfn)) {
+	if (va == NULL) {
+		hat_devload(kas.a_hat, mm_map, PAGESIZE, pfn,
+		    (uint_t)(rw == UIO_READ ? PROT_READ : PROT_READ|PROT_WRITE),
+		    HAT_LOAD_NOCONSIST|HAT_LOAD_LOCK);
+		va = mm_map;
+		devload = 1;
+	}
+
+	if (!is_memory) {
 		if (allowio) {
 			size_t c = uio->uio_iov->iov_len;
 
@@ -256,7 +272,7 @@
 		} else
 			error = EIO;
 	} else {
-		error = uiomove(&mm_map[pageoff], nbytes, rw, uio);
+		error = uiomove(va + pageoff, nbytes, rw, uio);
 
 		/*
 		 * In case this has changed executable code,
@@ -267,7 +283,13 @@
 		}
 	}
 
-	hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
+	if (devload)
+		hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
+	else if (pp)
+		hat_kpm_mapout(pp, NULL, va);
+	else
+		hat_kpm_mapout_pfn(pfn);
+
 	mutex_exit(&mm_lock);
 	return (error);
 }
@@ -330,13 +352,13 @@
 
 			v = BTOP((u_offset_t)uio->uio_loffset);
 			error = mmio(uio, rw, v,
-			    uio->uio_loffset & PAGEOFFSET, 0);
+			    uio->uio_loffset & PAGEOFFSET, 0, NULL);
 			break;
 
 		case M_KMEM:
 		case M_ALLKMEM:
 			{
-			page_t **ppp;
+			page_t **ppp = NULL;
 			caddr_t vaddr = (caddr_t)uio->uio_offset;
 			int try_lock = NEED_LOCK_KVADDR(vaddr);
 			int locked = 0;
@@ -369,7 +391,8 @@
 			}
 
 			error = mmio(uio, rw, v, uio->uio_loffset & PAGEOFFSET,
-			    minor == M_ALLKMEM || mm_kmem_io_access);
+			    minor == M_ALLKMEM || mm_kmem_io_access,
+			    (locked && ppp) ? *ppp : NULL);
 			if (locked)
 				as_pageunlock(&kas, ppp, vaddr, PAGESIZE,
 				    S_WRITE);
--- a/usr/src/uts/common/sys/vnode.h	Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/uts/common/sys/vnode.h	Wed Jun 17 15:32:10 2009 -0700
@@ -326,6 +326,12 @@
 	(pvn_vmodsort_supported != 0 && ((vp)->v_flag  & VMODSORT) != 0)
 
 #define	VISSWAPFS	0x20000	/* vnode is being used for swapfs */
+
+/*
+ * The mdb memstat command assumes that IS_SWAPFSVP only uses the
+ * vnode's v_flag field.  If this changes, cache the additional
+ * fields in mdb; see vn_get in mdb/common/modules/genunix/memory.c
+ */
 #define	IS_SWAPFSVP(vp)	(((vp)->v_flag & VISSWAPFS) != 0)
 
 #define	V_SYSATTR	0x40000	/* vnode is a GFS system attribute */
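
Concretely, vn_get() in memory.c caches only the v_flag word of each vnode it
has seen, and memstat_callback() then applies IS_SWAPFSVP() to a vnode buffer
whose sole valid field is that cached flag.  A sketch of the dependency, with
cached_flag standing in for the value saved by vn_get():

	stats->ms_vn.v_flag = cached_flag;	/* only v_flag is populated */
	if (IS_SWAPFSVP(&stats->ms_vn))		/* expands to a v_flag test */
		stats->ms_anon++;

This is why the macro must keep depending on v_flag alone.
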
--- a/usr/src/uts/common/vm/hat.h	Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/uts/common/vm/hat.h	Wed Jun 17 15:32:10 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -248,6 +248,8 @@
  */
 caddr_t	hat_kpm_mapin(struct page *, struct kpme *);
 void	hat_kpm_mapout(struct page *, struct kpme *, caddr_t);
+caddr_t hat_kpm_mapin_pfn(pfn_t);
+void    hat_kpm_mapout_pfn(pfn_t);
 caddr_t	hat_kpm_page2va(struct page *, int);
 struct page *hat_kpm_vaddr2page(caddr_t);
 int	hat_kpm_fault(struct hat *, caddr_t);
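
The expected calling pattern for the two new interfaces mirrors the mmio()
change in mem.c above.  The following is only an illustrative sketch, with
locking, error handling and the hat_devload() fallback elided, and with
access_page() as a hypothetical stand-in for whatever the caller does with
the mapping:

	caddr_t va = NULL;

	if (kpm_enable && pf_is_memory(pfn))
		va = hat_kpm_mapin_pfn(pfn);	/* may still return NULL */

	if (va != NULL) {
		access_page(va);		/* hypothetical accessor */
		hat_kpm_mapout_pfn(pfn);
	} else {
		/* fall back to hat_devload() on a private mapping window */
	}
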
--- a/usr/src/uts/i86pc/vm/hat_i86.c	Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/uts/i86pc/vm/hat_i86.c	Wed Jun 17 15:32:10 2009 -0700
@@ -138,7 +138,7 @@
 
 /*
  * AMD shanghai processors provide better management of 1gb ptes in its tlb.
- * By default, 1g page suppport will be disabled for pre-shanghai AMD
+ * By default, 1g page support will be disabled for pre-shanghai AMD
  * processors that don't have optimal tlb support for the 1g page size.
  * chk_optimal_1gtlb can be set to 0 to force 1g page support on sub-optimal
  * processors.
@@ -1299,7 +1299,7 @@
 	int		rv = 0;
 
 	/*
-	 * Is this a consistant (ie. need mapping list lock) mapping?
+	 * Is this a consistent (ie. need mapping list lock) mapping?
 	 */
 	is_consist = (pp != NULL && (flags & HAT_LOAD_NOCONSIST) == 0);
 
@@ -1991,15 +1991,22 @@
 
 /*
  * Service a delayed TLB flush if coming out of being idle.
- * It will be called from cpu idle notification with interrupt disabled.
  */
 void
 tlb_service(void)
 {
+	ulong_t flags = getflags();
 	ulong_t tlb_info;
 	ulong_t found;
 
 	/*
+	 * Be sure interrupts are off while doing this so that
+	 * higher level interrupts correctly wait for flushes to finish.
+	 */
+	if (flags & PS_IE)
+		flags = intr_clear();
+
+	/*
 	 * We only have to do something if coming out of being idle.
 	 */
 	tlb_info = CPU->cpu_m.mcpu_tlb_info;
@@ -2017,6 +2024,12 @@
 		if (tlb_info & TLB_INVAL_ALL)
 			flush_all_tlb_entries();
 	}
+
+	/*
+	 * Restore interrupt enable control bit.
+	 */
+	if (flags & PS_IE)
+		sti();
 }
 #endif /* !__xpv */
 
@@ -3165,7 +3178,7 @@
 
 /*
  * Called when all mappings to a page should have write permission removed.
- * Mostly stolem from hat_pagesync()
+ * Mostly stolen from hat_pagesync()
  */
 static void
 hati_page_clrwrt(struct page *pp)
@@ -3298,8 +3311,8 @@
 
 /*
  *	If flag is specified, returns 0 if attribute is disabled
- *	and non zero if enabled.  If flag specifes multiple attributs
- *	then returns 0 if ALL atriibutes are disabled.  This is an advisory
+ *	and non zero if enabled.  If flag specifies multiple attributes
+ *	then returns 0 if ALL attributes are disabled.  This is an advisory
  *	call.
  */
 uint_t
@@ -4227,6 +4240,38 @@
 }
 
 /*
+ * hat_kpm_mapin_pfn is used to obtain a kpm mapping for physical
+ * memory addresses that are not described by a page_t.  It can
+ * also be used for normal pages that are not locked, but beware
+ * this is dangerous - no locking is performed, so the identity of
+ * the page could change.  hat_kpm_mapin_pfn is not supported when
+ * vac_colors > 1, because the chosen va depends on the page identity,
+ * which could change.
+ * The caller must only pass pfn's for valid physical addresses; violation
+ * of this rule will cause panic.
+ */
+caddr_t
+hat_kpm_mapin_pfn(pfn_t pfn)
+{
+	caddr_t paddr, vaddr;
+
+	if (kpm_enable == 0)
+		return ((caddr_t)NULL);
+
+	paddr = (caddr_t)ptob(pfn);
+	vaddr = (uintptr_t)kpm_vbase + paddr;
+
+	return ((caddr_t)vaddr);
+}
+
+/*ARGSUSED*/
+void
+hat_kpm_mapout_pfn(pfn_t pfn)
+{
+	/* empty */
+}
+
+/*
  * Return the kpm virtual address for a specific pfn
  */
 caddr_t
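
On x86 (and on sun4v below) hat_kpm_mapin_pfn() is a pure address computation
into the pre-established kpm segment, so there is nothing for
hat_kpm_mapout_pfn() to undo.  As a worked example under the 4K base page
size, pfn 0x12345 maps to kpm_vbase + ptob(0x12345) = kpm_vbase + 0x12345000.
Only the sun4u implementation below has to load a TSB entry and enforce the
vac_colors and memseg restrictions.
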
--- a/usr/src/uts/sun4u/vm/mach_kpm.c	Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/uts/sun4u/vm/mach_kpm.c	Wed Jun 17 15:32:10 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -58,6 +58,8 @@
 void	sfmmu_kpm_kpmp_exit(kpm_hlk_t *kpmp);
 void	sfmmu_kpm_page_cache(page_t *, int, int);
 
+extern uint_t vac_colors;
+
 /*
  * Kernel Physical Mapping (kpm) facility
  */
@@ -168,6 +170,46 @@
 }
 
 /*
+ * hat_kpm_mapin_pfn is used to obtain a kpm mapping for physical
+ * memory addresses that are not described by a page_t.  It can
+ * only be supported if vac_colors=1, because there is no page_t
+ * and corresponding kpm_page_t to track VAC conflicts.  Currently,
+ * this may not be used on pfn's backed by page_t's, because the
+ * kpm state may not be consistent in hat_kpm_fault if the page is
+ * mapped using both this routine and hat_kpm_mapin.  KPM should be
+ * cleaned up on sun4u/vac_colors=1 to be as minimal as it is on sun4v.
+ * The caller must only pass pfn's for valid physical addresses; violation
+ * of this rule will cause panic.
+ */
+caddr_t
+hat_kpm_mapin_pfn(pfn_t pfn)
+{
+	caddr_t paddr, vaddr;
+	tte_t tte;
+	uint_t szc = kpm_smallpages ? TTE8K : TTE4M;
+	uint_t shift = kpm_smallpages ? MMU_PAGESHIFT : MMU_PAGESHIFT4M;
+
+	if (kpm_enable == 0 || vac_colors > 1 ||
+	    page_numtomemseg_nolock(pfn) != NULL)
+		return ((caddr_t)NULL);
+
+	paddr = (caddr_t)ptob(pfn);
+	vaddr = (uintptr_t)kpm_vbase + paddr;
+
+	KPM_TTE_VCACHED(tte.ll, pfn, szc);
+	sfmmu_kpm_load_tsb(vaddr, &tte, shift);
+
+	return (vaddr);
+}
+
+/*ARGSUSED*/
+void
+hat_kpm_mapout_pfn(pfn_t pfn)
+{
+	/* empty */
+}
+
+/*
  * Return the kpm virtual address for the page at pp.
  * If checkswap is non zero and the page is backed by a
  * swap vnode the physical address is used rather than
@@ -279,17 +321,28 @@
 
 	SFMMU_KPM_VTOP(vaddr, paddr);
 	pfn = (pfn_t)btop(paddr);
-	mseg = page_numtomemseg_nolock(pfn);
-	if (mseg == NULL)
-		return (EFAULT);
+	if ((mseg = page_numtomemseg_nolock(pfn)) != NULL) {
+		pp = &mseg->pages[(pgcnt_t)(pfn - mseg->pages_base)];
+		ASSERT((pfn_t)pp->p_pagenum == pfn);
+	}
 
-	pp = &mseg->pages[(pgcnt_t)(pfn - mseg->pages_base)];
-	ASSERT((pfn_t)pp->p_pagenum == pfn);
+	/*
+	 * hat_kpm_mapin_pfn may add a kpm translation for memory that falls
+	 * outside of memsegs.  Check for this case and provide the translation
+	 * here.
+	 */
+	if (vac_colors == 1 && mseg == NULL) {
+		tte_t tte;
+		uint_t szc = kpm_smallpages ? TTE8K : TTE4M;
+		uint_t shift = kpm_smallpages ? MMU_PAGESHIFT : MMU_PAGESHIFT4M;
 
-	if (!PAGE_LOCKED(pp))
-		return (EFAULT);
-
-	if (kpm_smallpages == 0)
+		ASSERT(address_in_memlist(phys_install, paddr, 1));
+		KPM_TTE_VCACHED(tte.ll, pfn, szc);
+		sfmmu_kpm_load_tsb(vaddr, &tte, shift);
+		error = 0;
+	} else if (mseg == NULL || !PAGE_LOCKED(pp))
+		error = EFAULT;
+	else if (kpm_smallpages == 0)
 		error = sfmmu_kpm_fault(vaddr, mseg, pp);
 	else
 		error = sfmmu_kpm_fault_small(vaddr, mseg, pp);
@@ -522,7 +575,6 @@
 	void	*base;
 	size_t	size;
 	struct memseg *msp;
-	extern uint_t vac_colors;
 
 	for (msp = memsegs; msp; msp = msp->next) {
 		pbase = msp->pages_base;
--- a/usr/src/uts/sun4v/vm/mach_kpm.c	Wed Jun 17 13:10:47 2009 -0700
+++ b/usr/src/uts/sun4v/vm/mach_kpm.c	Wed Jun 17 15:32:10 2009 -0700
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Kernel Physical Mapping (segkpm) hat interface routines for sun4v.
  */
@@ -123,6 +121,38 @@
 }
 
 /*
+ * hat_kpm_mapin_pfn is used to obtain a kpm mapping for physical
+ * memory addresses that are not described by a page_t.  It can
+ * also be used for normal pages that are not locked, but beware
+ * this is dangerous - no locking is performed, so the identity of
+ * the page could change.  hat_kpm_mapin_pfn is not supported when
+ * vac_colors > 1, because the chosen va depends on the page identity,
+ * which could change.
+ * The caller must only pass pfn's for valid physical addresses; violation
+ * of this rule will cause panic.
+ */
+caddr_t
+hat_kpm_mapin_pfn(pfn_t pfn)
+{
+	caddr_t paddr, vaddr;
+
+	if (kpm_enable == 0)
+		return ((caddr_t)NULL);
+
+	paddr = (caddr_t)ptob(pfn);
+	vaddr = (uintptr_t)kpm_vbase + paddr;
+
+	return ((caddr_t)vaddr);
+}
+
+/*ARGSUSED*/
+void
+hat_kpm_mapout_pfn(pfn_t pfn)
+{
+	/* empty */
+}
+
+/*
  * Return the kpm virtual address for the page at pp.
  */
 /*ARGSUSED*/