PSARC/2004/402 CPU Caps
author akolb
Fri, 09 Mar 2007 15:55:28 -0800
changeset 3792 57ba782523b7
parent 3791 4893821e03ad
child 3793 061efa79f440
PSARC/2004/402 CPU Caps
6327235 PSARC/2004/402 CPU caps
6464161 Dead KSLICE code should be removed
6514387 FX class contains dead code to keep list of member threads
6518395 kstat_zone_add performs KM_SLEEP allocation when it should not
usr/src/cmd/mdb/common/modules/genunix/genunix.c
usr/src/cmd/mdb/common/modules/genunix/thread.c
usr/src/cmd/prstat/prutil.c
usr/src/cmd/zonecfg/zonecfg.c
usr/src/cmd/zonecfg/zonecfg.h
usr/src/cmd/zonecfg/zonecfg_grammar.y
usr/src/cmd/zonecfg/zonecfg_lex.l
usr/src/head/libzonecfg.h
usr/src/lib/libdtrace/common/procfs.d.in
usr/src/lib/libdtrace/common/procfs.sed.in
usr/src/lib/libzonecfg/common/libzonecfg.c
usr/src/pkgdefs/SUNWhea/prototype_com
usr/src/uts/common/Makefile.files
usr/src/uts/common/cpr/cpr_uthread.c
usr/src/uts/common/disp/cpucaps.c
usr/src/uts/common/disp/fss.c
usr/src/uts/common/disp/fx.c
usr/src/uts/common/disp/sysclass.c
usr/src/uts/common/disp/thread.c
usr/src/uts/common/disp/ts.c
usr/src/uts/common/dtrace/sdt_subr.c
usr/src/uts/common/fs/proc/prcontrol.c
usr/src/uts/common/fs/proc/prsubr.c
usr/src/uts/common/os/clock.c
usr/src/uts/common/os/cpu.c
usr/src/uts/common/os/kstat_fr.c
usr/src/uts/common/os/lwp.c
usr/src/uts/common/os/msacct.c
usr/src/uts/common/os/project.c
usr/src/uts/common/os/sig.c
usr/src/uts/common/os/task.c
usr/src/uts/common/os/timers.c
usr/src/uts/common/os/waitq.c
usr/src/uts/common/os/zone.c
usr/src/uts/common/sys/Makefile
usr/src/uts/common/sys/cpucaps.h
usr/src/uts/common/sys/cpucaps_impl.h
usr/src/uts/common/sys/cpuvar.h
usr/src/uts/common/sys/fss.h
usr/src/uts/common/sys/fx.h
usr/src/uts/common/sys/proc.h
usr/src/uts/common/sys/project.h
usr/src/uts/common/sys/schedctl.h
usr/src/uts/common/sys/thread.h
usr/src/uts/common/sys/ts.h
usr/src/uts/common/sys/waitq.h
usr/src/uts/common/sys/zone.h
usr/src/uts/i86pc/os/trap.c
usr/src/uts/intel/ia32/os/syscall.c
usr/src/uts/sparc/os/syscall.c
usr/src/uts/sun4/os/trap.c
usr/src/uts/sun4u/ngdr/io/dr_quiesce.c
usr/src/uts/sun4u/serengeti/io/sbdp_quiesce.c
usr/src/uts/sun4u/sunfire/io/sysctrl_quiesce.c
--- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c	Fri Mar 09 15:55:28 2007 -0800
@@ -124,6 +124,7 @@
 		case SIDL: return ('I');
 		case SONPROC: return ('O');
 		case SSTOP: return ('T');
+		case SWAIT: return ('W');
 		default: return ('?');
 	}
 }
@@ -148,6 +149,7 @@
 		{ "TS_ONPROC",	TS_ONPROC,	TS_ONPROC	},
 		{ "TS_ZOMB",	TS_ZOMB,	TS_ZOMB		},
 		{ "TS_STOPPED",	TS_STOPPED,	TS_STOPPED	},
+		{ "TS_WAIT",	TS_WAIT,	TS_WAIT		},
 		{ NULL,		0,		0		}
 	};
 
--- a/usr/src/cmd/mdb/common/modules/genunix/thread.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/cmd/mdb/common/modules/genunix/thread.c	Fri Mar 09 15:55:28 2007 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -455,6 +454,9 @@
 		case TS_STOPPED:
 			state = "stopped";
 			break;
+		case TS_WAIT:
+			state = "wait";
+			break;
 		default:
 			(void) mdb_snprintf(stbuf, 11, "inval/%02x", t.t_state);
 			state = stbuf;
--- a/usr/src/cmd/prstat/prutil.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/cmd/prstat/prutil.c	Fri Mar 09 15:55:28 2007 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -197,8 +197,8 @@
 	case 'I':
 		(void) strncpy(str, "idle", length);
 		break;
-	case 'X':
-		(void) strncpy(str, "xbrk", length);
+	case 'W':
+		(void) strncpy(str, "wait", length);
 		break;
 	case 'O':
 		(void) snprintf(str, length, "cpu%-3d", (int)pr_id);
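
With this change prstat reports a thread parked on a CPU caps wait queue with
a process state of "wait"; the new 'W' state character takes over the slot of
the old 'X' ("xbrk") mapping it replaces. A hypothetical excerpt, all values
invented:

       PID USERNAME  SIZE   RSS STATE  PRI NICE      TIME  CPU PROCESS/NLWP
      1234 nobody   4520K 2344K wait    59    0   0:00:42 0.4% cpuhog/1
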
--- a/usr/src/cmd/zonecfg/zonecfg.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/cmd/zonecfg/zonecfg.c	Fri Mar 09 15:55:28 2007 -0800
@@ -179,6 +179,7 @@
 	ALIAS_SHARES,
 	"scheduling-class",
 	"ip-type",
+	"capped-cpu",
 	NULL
 };
 
@@ -265,6 +266,7 @@
 	"add attr",
 	"add dataset",
 	"add dedicated-cpu",
+	"add capped-cpu",
 	"add capped-memory",
 	NULL
 };
@@ -294,6 +296,7 @@
 	"remove attr ",
 	"remove dataset ",
 	"remove dedicated-cpu ",
+	"remove capped-cpu ",
 	"remove capped-memory ",
 	NULL
 };
@@ -307,6 +310,7 @@
 	"select attr ",
 	"select dataset ",
 	"select dedicated-cpu",
+	"select capped-cpu",
 	"select capped-memory",
 	NULL
 };
@@ -340,6 +344,7 @@
 	"info dataset ",
 	"info capped-memory",
 	"info dedicated-cpu",
+	"info capped-cpu",
 	"info zonename",
 	"info zonepath",
 	"info autoboot",
@@ -451,6 +456,16 @@
 	NULL
 };
 
+static const char *pcap_res_scope_cmds[] = {
+	"cancel",
+	"end",
+	"exit",
+	"help",
+	"info",
+	"set ncpus=",
+	NULL
+};
+
 static const char *mcap_res_scope_cmds[] = {
 	"cancel",
 	"end",
@@ -605,6 +620,8 @@
 		return (add_stuff(cpl, line, dataset_res_scope_cmds, word_end));
 	case RT_DCPU:
 		return (add_stuff(cpl, line, pset_res_scope_cmds, word_end));
+	case RT_PCAP:
+		return (add_stuff(cpl, line, pcap_res_scope_cmds, word_end));
 	case RT_MCAP:
 		return (add_stuff(cpl, line, mcap_res_scope_cmds, word_end));
 	}
@@ -1003,6 +1020,20 @@
 			    pt_to_str(PT_IMPORTANCE),
 			    gettext("<unsigned integer>"));
 			break;
+		case RT_PCAP:
+			(void) fprintf(fp, gettext("The '%s' resource scope is "
+			    "used to set an upper limit (a cap) on the\n"
+			    "percentage of CPU that can be used by this zone.  "
+			    "A '%s' value of 1\ncorresponds to one cpu.  The "
+			    "value can be set higher than 1, up to the total\n"
+			    "number of CPUs on the system.  The value can "
+			    "also be less than 1,\nrepresenting a fraction of "
+			    "a cpu.\n"),
+			    rt_to_str(resource_scope), pt_to_str(PT_NCPUS));
+			(void) fprintf(fp, gettext("Valid commands:\n"));
+			(void) fprintf(fp, "\t%s %s=%s\n", cmd_to_str(CMD_SET),
+			    pt_to_str(PT_NCPUS), gettext("<unsigned decimal>"));
+			break;
 		case RT_MCAP:
 			(void) fprintf(fp, gettext("The '%s' resource scope is "
 			    "used to set an upper limit (a cap) on the\n"
@@ -1078,12 +1109,12 @@
 	}
 	if (flags & HELP_RESOURCES) {
 		(void) fprintf(fp, "<%s> := %s | %s | %s | %s | %s | %s |\n\t"
-		    "%s | %s | %s\n\n",
+		    "%s | %s | %s | %s\n\n",
 		    gettext("resource type"), rt_to_str(RT_FS),
 		    rt_to_str(RT_IPD), rt_to_str(RT_NET), rt_to_str(RT_DEVICE),
 		    rt_to_str(RT_RCTL), rt_to_str(RT_ATTR),
 		    rt_to_str(RT_DATASET), rt_to_str(RT_DCPU),
-		    rt_to_str(RT_MCAP));
+		    rt_to_str(RT_PCAP), rt_to_str(RT_MCAP));
 	}
 	if (flags & HELP_PROPS) {
 		(void) fprintf(fp, gettext("For resource type ... there are "
@@ -1137,6 +1168,8 @@
 		    pt_to_str(PT_NAME));
 		(void) fprintf(fp, "\t%s\t%s, %s\n", rt_to_str(RT_DCPU),
 		    pt_to_str(PT_NCPUS), pt_to_str(PT_IMPORTANCE));
+		(void) fprintf(fp, "\t%s\t%s\n", rt_to_str(RT_PCAP),
+		    pt_to_str(PT_NCPUS));
 		(void) fprintf(fp, "\t%s\t%s, %s, %s\n", rt_to_str(RT_MCAP),
 		    pt_to_str(PT_PHYSICAL), pt_to_str(PT_SWAP),
 		    pt_to_str(PT_LOCKED));
@@ -1835,6 +1868,11 @@
 		(void) fprintf(of, "%s\n", cmd_to_str(CMD_END));
 	}
 
+	/*
+	 * There is nothing to export for pcap since this resource is just
+	 * a container for an rctl alias.
+	 */
+
 done:
 	if (need_to_close)
 		(void) fclose(of);
@@ -1908,6 +1946,7 @@
 	int type;
 	struct zone_psettab tmp_psettab;
 	struct zone_mcaptab tmp_mcaptab;
+	uint64_t tmp;
 	uint64_t tmp_mcap;
 	char pool[MAXNAMELEN];
 
@@ -1951,12 +1990,18 @@
 		bzero(&in_progress_dstab, sizeof (in_progress_dstab));
 		return;
 	case RT_DCPU:
-		/* Make sure there isn't already a cpu-set entry. */
+		/* Make sure there isn't already a cpu-set or cpu-cap entry. */
 		if (zonecfg_lookup_pset(handle, &tmp_psettab) == Z_OK) {
 			zerr(gettext("The %s resource already exists."),
 			    rt_to_str(RT_DCPU));
 			goto bad;
 		}
+		if (zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &tmp) !=
+		    Z_NO_ENTRY) {
+			zerr(gettext("The %s resource already exists."),
+			    rt_to_str(RT_PCAP));
+			goto bad;
+		}
 
 		/* Make sure the pool property isn't set. */
 		if (zonecfg_get_pool(handle, pool, sizeof (pool)) == Z_OK &&
@@ -1970,6 +2015,32 @@
 
 		bzero(&in_progress_psettab, sizeof (in_progress_psettab));
 		return;
+	case RT_PCAP:
+		/*
+		 * Make sure there isn't already a cpu-set entry or an
+		 * incompatible cpu-cap rctl.
+		 */
+		if (zonecfg_lookup_pset(handle, &tmp_psettab) == Z_OK) {
+			zerr(gettext("The %s resource already exists."),
+			    rt_to_str(RT_DCPU));
+			goto bad;
+		}
+
+		switch (zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &tmp)) {
+		case Z_ALIAS_DISALLOW:
+			zone_perror(rt_to_str(RT_PCAP), Z_ALIAS_DISALLOW,
+			    FALSE);
+			goto bad;
+
+		case Z_OK:
+			zerr(gettext("The %s resource already exists."),
+			    rt_to_str(RT_PCAP));
+			goto bad;
+
+		default:
+			break;
+		}
+		return;
 	case RT_MCAP:
 		/*
 		 * Make sure there isn't already a mem-cap entry or max-swap
@@ -2967,6 +3038,25 @@
 }
 
 static void
+remove_pcap()
+{
+	int err;
+	uint64_t tmp;
+
+	if (zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &tmp) != Z_OK) {
+		zerr("%s %s: %s", cmd_to_str(CMD_REMOVE), rt_to_str(RT_PCAP),
+		    zonecfg_strerror(Z_NO_RESOURCE_TYPE));
+		saw_error = TRUE;
+		return;
+	}
+
+	if ((err = zonecfg_rm_aliased_rctl(handle, ALIAS_CPUCAP)) != Z_OK)
+		z_cmd_rt_perror(CMD_REMOVE, RT_PCAP, err, TRUE);
+	else
+		need_to_commit = TRUE;
+}
+
+static void
 remove_mcap()
 {
 	int err, res1, res2, res3;
@@ -3074,6 +3164,9 @@
 	case RT_DCPU:
 		remove_pset();
 		return;
+	case RT_PCAP:
+		remove_pcap();
+		return;
 	case RT_MCAP:
 		remove_mcap();
 		return;
@@ -3396,6 +3489,7 @@
 {
 	int type, err, res;
 	uint64_t limit;
+	uint64_t tmp;
 
 	if (zone_is_read_only(CMD_SELECT))
 		return;
@@ -3493,6 +3587,13 @@
 		bcopy(&old_psettab, &in_progress_psettab,
 		    sizeof (struct zone_psettab));
 		return;
+	case RT_PCAP:
+		if ((err = zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &tmp))
+		    != Z_OK) {
+			z_cmd_rt_perror(CMD_SELECT, RT_PCAP, err, TRUE);
+			global_scope = TRUE;
+		}
+		return;
 	case RT_MCAP:
 		/* if none of these exist, there is no resource to select */
 		if ((res = zonecfg_lookup_mcap(handle, &old_mcaptab)) != Z_OK &&
@@ -3708,6 +3809,8 @@
 	boolean_t force_set = FALSE;
 	size_t physmem_size = sizeof (in_progress_mcaptab.zone_physmem_cap);
 	uint64_t mem_cap, mem_limit;
+	float cap;
+	char *unitp;
 	struct zone_psettab tmp_psettab;
 	bool arg_err = FALSE;
 
@@ -4200,6 +4303,34 @@
 		long_usage(CMD_SET, TRUE);
 		usage(FALSE, HELP_PROPS);
 		return;
+	case RT_PCAP:
+		if (prop_type != PT_NCPUS) {
+			zone_perror(pt_to_str(prop_type), Z_NO_PROPERTY_TYPE,
+			    TRUE);
+			long_usage(CMD_SET, TRUE);
+			usage(FALSE, HELP_PROPS);
+			return;
+		}
+
+		/*
+		 * We already checked that an rctl alias is allowed in
+		 * the add_resource() function.
+		 */
+
+		if ((cap = strtof(prop_id, &unitp)) <= 0 || *unitp != '\0' ||
+		    (int)(cap * 100) < 1) {
+			zerr(gettext("%s property is out of range."),
+			    pt_to_str(PT_NCPUS));
+			saw_error = TRUE;
+			return;
+		}
+
+		if ((err = zonecfg_set_aliased_rctl(handle, ALIAS_CPUCAP,
+		    (int)(cap * 100))) != Z_OK)
+			zone_perror(zone, err, TRUE);
+		else
+			need_to_commit = TRUE;
+		return;
 	case RT_MCAP:
 		switch (prop_type) {
 		case PT_PHYSICAL:
@@ -4790,6 +4921,26 @@
 }
 
 static void
+output_pcap(FILE *fp)
+{
+	uint64_t cap;
+
+	if (zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &cap) == Z_OK) {
+		float scaled = (float)cap / 100;
+		(void) fprintf(fp, "%s:\n", rt_to_str(RT_PCAP));
+		(void) fprintf(fp, "\t[%s: %.2f]\n", pt_to_str(PT_NCPUS),
+		    scaled);
+	}
+}
+
+static void
+info_pcap(FILE *fp)
+{
+	output_pcap(fp);
+}
+
+static void
 info_aliased_rctl(zone_dochandle_t handle, FILE *fp, char *alias)
 {
 	uint64_t limit;
@@ -4932,6 +5083,9 @@
 		case RT_DCPU:
 			output_pset(fp, &in_progress_psettab);
 			break;
+		case RT_PCAP:
+			output_pcap(fp);
+			break;
 		case RT_MCAP:
 			res1 = zonecfg_get_aliased_rctl(handle, ALIAS_MAXSWAP,
 			    &swap_limit);
@@ -4986,6 +5140,7 @@
 			info_dev(handle, fp, cmd);
 		}
 		info_pset(handle, fp);
+		info_pcap(fp);
 		info_mcap(handle, fp);
 		if (!global_zone) {
 			info_attr(handle, fp, cmd);
@@ -5062,6 +5217,9 @@
 	case RT_DCPU:
 		info_pset(handle, fp);
 		break;
+	case RT_PCAP:
+		info_pcap(fp);
+		break;
 	case RT_MCAP:
 		info_mcap(handle, fp);
 		break;
@@ -5203,10 +5361,12 @@
 	char sched[MAXNAMELEN];
 	char brand[MAXNAMELEN];
 	int err, ret_val = Z_OK, arg;
+	int pset_res;
 	bool save = FALSE;
 	bool arg_err = FALSE;
 	zone_iptype_t iptype;
 	boolean_t has_cpu_shares = B_FALSE;
+	boolean_t has_cpu_cap = B_FALSE;
 
 	optind = 0;
 	while ((arg = getopt(cmd->cmd_argc, cmd->cmd_argv, "?")) != EOF) {
@@ -5333,6 +5493,9 @@
 		if (strcmp(rctltab.zone_rctl_name, "zone.cpu-shares") == 0)
 			has_cpu_shares = B_TRUE;
 
+		if (strcmp(rctltab.zone_rctl_name, "zone.cpu-cap") == 0)
+			has_cpu_cap = B_TRUE;
+
 		if (rctltab.zone_rctl_valptr == NULL) {
 			zerr(gettext("%s: no %s specified"),
 			    rt_to_str(RT_RCTL), pt_to_str(PT_VALUE));
@@ -5345,7 +5508,8 @@
 	}
 	(void) zonecfg_endrctlent(handle);
 
-	if (zonecfg_lookup_pset(handle, &psettab) == Z_OK && has_cpu_shares) {
+	if ((pset_res = zonecfg_lookup_pset(handle, &psettab)) == Z_OK &&
+	    has_cpu_shares) {
 		zerr(gettext("%s zone.cpu-shares and %s are incompatible."),
 		    rt_to_str(RT_RCTL), rt_to_str(RT_DCPU));
 		saw_error = TRUE;
@@ -5364,6 +5528,14 @@
 			ret_val = Z_INCOMPATIBLE;
 	}
 
+	if (pset_res == Z_OK && has_cpu_cap) {
+		zerr(gettext("%s zone.cpu-cap and the %s are incompatible."),
+		    rt_to_str(RT_RCTL), rt_to_str(RT_DCPU));
+		saw_error = TRUE;
+		if (ret_val == Z_OK)
+			ret_val = Z_INCOMPATIBLE;
+	}
+
 	if ((err = zonecfg_setattrent(handle)) != Z_OK) {
 		zone_perror(zone, err, TRUE);
 		return;
@@ -5562,6 +5734,7 @@
 	int err, arg, res1, res2, res3;
 	uint64_t swap_limit;
 	uint64_t locked_limit;
+	uint64_t proc_cap;
 
 	assert(cmd != NULL);
 
@@ -5888,6 +6061,17 @@
 			err = zonecfg_modify_pset(handle, &in_progress_psettab);
 		}
 		break;
+	case RT_PCAP:
+		/* Make sure everything was filled in. */
+		if (zonecfg_get_aliased_rctl(handle, ALIAS_CPUCAP, &proc_cap)
+		    != Z_OK) {
+			zerr(gettext("%s not specified"), pt_to_str(PT_NCPUS));
+			saw_error = TRUE;
+			validation_failed = TRUE;
+			return;
+		}
+		err = Z_OK;
+		break;
 	case RT_MCAP:
 		/* Make sure everything was filled in. */
 		res1 = strlen(in_progress_mcaptab.zone_physmem_cap) == 0 ?
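
Taken together, the zonecfg changes make capped-cpu a thin front end for the
zone.cpu-cap rctl alias: set_func() multiplies the decimal ncpus value by 100
before storing it (so ncpus=2.5 becomes a zone.cpu-cap of 250), and
output_pcap() divides by 100 when printing. A hypothetical session (zone name
invented):

    # zonecfg -z myzone
    zonecfg:myzone> add capped-cpu
    zonecfg:myzone:capped-cpu> set ncpus=2.5
    zonecfg:myzone:capped-cpu> end
    zonecfg:myzone> info capped-cpu
    capped-cpu:
	[ncpus: 2.50]
    zonecfg:myzone> commit
    zonecfg:myzone> exit

The bracketed info output matches the output_pcap() format string above.
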
--- a/usr/src/cmd/zonecfg/zonecfg.h	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/cmd/zonecfg/zonecfg.h	Fri Mar 09 15:55:28 2007 -0800
@@ -94,9 +94,10 @@
 #define	RT_SHARES	22	/* really a rctl alias property, but for info */
 #define	RT_SCHED	23	/* really a property, but for info ... */
 #define	RT_IPTYPE	24	/* really a property, but for info ... */
+#define	RT_PCAP		25
 
 #define	RT_MIN		RT_UNKNOWN
-#define	RT_MAX		RT_IPTYPE
+#define	RT_MAX		RT_PCAP
 
 /* property types: increment PT_MAX when expanding this list */
 #define	PT_UNKNOWN	0
--- a/usr/src/cmd/zonecfg/zonecfg_grammar.y	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/cmd/zonecfg/zonecfg_grammar.y	Fri Mar 09 15:55:28 2007 -0800
@@ -61,14 +61,14 @@
 %token FS IPD ATTR DEVICE RCTL SPECIAL RAW DIR OPTIONS TYPE ADDRESS PHYSICAL
 %token IPTYPE
 %token NAME MATCH PRIV LIMIT ACTION VALUE EQUAL OPEN_SQ_BRACKET CLOSE_SQ_BRACKET
-%token OPEN_PAREN CLOSE_PAREN COMMA DATASET LIMITPRIV BOOTARGS BRAND PSET
+%token OPEN_PAREN CLOSE_PAREN COMMA DATASET LIMITPRIV BOOTARGS BRAND PSET PCAP
 %token MCAP NCPUS IMPORTANCE SHARES MAXLWPS MAXSHMMEM MAXSHMIDS MAXMSGIDS
 %token MAXSEMIDS LOCKED SWAP SCHED CLEAR
 
 %type <strval> TOKEN EQUAL OPEN_SQ_BRACKET CLOSE_SQ_BRACKET
     property_value OPEN_PAREN CLOSE_PAREN COMMA simple_prop_val
 %type <complex> complex_piece complex_prop_val
-%type <ival> resource_type NET FS IPD DEVICE RCTL ATTR DATASET PSET MCAP
+%type <ival> resource_type NET FS IPD DEVICE RCTL ATTR DATASET PSET PCAP MCAP
 %type <ival> property_name SPECIAL RAW DIR OPTIONS TYPE ADDRESS PHYSICAL NAME
     MATCH ZONENAME ZONEPATH AUTOBOOT POOL LIMITPRIV BOOTARGS VALUE PRIV LIMIT
     ACTION BRAND SCHED IPTYPE
@@ -700,6 +700,14 @@
 		$$->cmd_handler = &select_func;
 		$$->cmd_res_type = RT_DCPU;
 	}
+	| SELECT PCAP
+	{
+		if (($$ = alloc_cmd()) == NULL)
+			YYERROR;
+		cmd = $$;
+		$$->cmd_handler = &select_func;
+		$$->cmd_res_type = RT_PCAP;
+	}
 	| SELECT MCAP
 	{
 		if (($$ = alloc_cmd()) == NULL)
@@ -840,6 +848,7 @@
 	| ATTR		{ $$ = RT_ATTR; }
 	| DATASET	{ $$ = RT_DATASET; }
 	| PSET		{ $$ = RT_DCPU; }
+	| PCAP		{ $$ = RT_PCAP; }
 	| MCAP		{ $$ = RT_MCAP; }
 
 property_name: SPECIAL	{ $$ = PT_SPECIAL; }
--- a/usr/src/cmd/zonecfg/zonecfg_lex.l	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/cmd/zonecfg/zonecfg_lex.l	Fri Mar 09 15:55:28 2007 -0800
@@ -173,6 +173,8 @@
 
 <TSTATE>dedicated-cpu	{ return PSET; }
 
+<TSTATE>capped-cpu	{ return PCAP; }
+
 <TSTATE>capped-memory	{ return MCAP; }
 
 <TSTATE>zonepath	{ return ZONEPATH; }
--- a/usr/src/head/libzonecfg.h	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/head/libzonecfg.h	Fri Mar 09 15:55:28 2007 -0800
@@ -145,6 +145,7 @@
 #define	ALIAS_MAXLOCKEDMEM	"locked"
 #define	ALIAS_MAXSWAP		"swap"
 #define	ALIAS_SHARES		"cpu-shares"
+#define	ALIAS_CPUCAP		"cpu-cap"
 
 /*
  * Bit flag definitions for passing into libzonecfg functions.
--- a/usr/src/lib/libdtrace/common/procfs.d.in	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/lib/libdtrace/common/procfs.d.in	Fri Mar 09 15:55:28 2007 -0800
@@ -49,6 +49,8 @@
 #pragma D binding "1.0" SIDL
 inline char SONPROC = @SONPROC@;
 #pragma D binding "1.0" SONPROC
+inline char SWAIT = @SWAIT@;
+#pragma D binding "1.0" SWAIT
 
 inline int PR_STOPPED = @PR_STOPPED@;
 #pragma D binding "1.0" PR_STOPPED
@@ -322,14 +324,16 @@
 	    (T->t_state == @TS_RUN@) ? SRUN :
 	    (T->t_state == @TS_ONPROC@) ? SONPROC :
 	    (T->t_state == @TS_ZOMB@) ? SZOMB :
-	    (T->t_state == @TS_STOPPED@) ? SSTOP : 0;
+	    (T->t_state == @TS_STOPPED@) ? SSTOP :
+	    (T->t_state == @TS_WAIT@) ? SWAIT : 0;
 
 	pr_sname = (T->t_proc_flag & @TP_PRVSTOP@) ? 'T' :
 	    (T->t_state == @TS_SLEEP@) ? 'S' :
 	    (T->t_state == @TS_RUN@) ? 'R' :
 	    (T->t_state == @TS_ONPROC@) ? 'O' :
 	    (T->t_state == @TS_ZOMB@) ? 'Z' :
-	    (T->t_state == @TS_STOPPED@) ? 'T' : '?';
+	    (T->t_state == @TS_STOPPED@) ? 'T' :
+	    (T->t_state == @TS_WAIT@) ? 'W' : '?';
 
 	pr_syscall = T->t_sysnum;
 	pr_pri = T->t_pri;
--- a/usr/src/lib/libdtrace/common/procfs.sed.in	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/lib/libdtrace/common/procfs.sed.in	Fri Mar 09 15:55:28 2007 -0800
@@ -56,6 +56,7 @@
 SED_REPLACE(TS_ONPROC)
 SED_REPLACE(TS_ZOMB)
 SED_REPLACE(TS_STOPPED)
+SED_REPLACE(TS_WAIT)
 
 SED_REPLACE(P_PR_FORK)
 SED_REPLACE(P_PR_RUNLCL)
@@ -75,6 +76,7 @@
 SED_REPLACE(SSTOP)
 SED_REPLACE(SIDL)
 SED_REPLACE(SONPROC)
+SED_REPLACE(SWAIT)
 
 SED_REPLACE(CLDNOSIGCHLD)
 SED_REPLACE(CLDWAITPID)
--- a/usr/src/lib/libzonecfg/common/libzonecfg.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/lib/libzonecfg/common/libzonecfg.c	Fri Mar 09 15:55:28 2007 -0800
@@ -174,6 +174,7 @@
 	{ALIAS_MAXLOCKEDMEM, "zone.max-locked-memory", "privileged", "deny", 0},
 	{ALIAS_MAXSWAP, "zone.max-swap", "privileged", "deny", 0},
 	{ALIAS_SHARES, "zone.cpu-shares", "privileged", "none", 0},
+	{ALIAS_CPUCAP, "zone.cpu-cap", "privileged", "deny", 0},
 	{NULL, NULL, NULL, NULL, 0}
 };
 
--- a/usr/src/pkgdefs/SUNWhea/prototype_com	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/pkgdefs/SUNWhea/prototype_com	Fri Mar 09 15:55:28 2007 -0800
@@ -546,6 +546,8 @@
 f none usr/include/sys/byteorder.h 644 root bin
 f none usr/include/sys/callb.h 644 root bin
 f none usr/include/sys/callo.h 644 root bin
+f none usr/include/sys/cpucaps.h 644 root bin
+f none usr/include/sys/cpucaps_impl.h 644 root bin
 f none usr/include/sys/ccompile.h 644 root bin
 f none usr/include/sys/cdio.h 644 root bin
 f none usr/include/sys/cis.h 644 root bin
@@ -1205,6 +1207,7 @@
 f none usr/include/sys/vuid_state.h 644 root bin
 f none usr/include/sys/vuid_store.h 644 root bin
 f none usr/include/sys/wait.h 644 root bin
+f none usr/include/sys/waitq.h 644 root bin
 f none usr/include/sys/watchpoint.h 644 root bin
 f none usr/include/sys/xti_inet.h 644 root bin
 f none usr/include/sys/xti_osi.h 644 root bin
--- a/usr/src/uts/common/Makefile.files	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/Makefile.files	Fri Mar 09 15:55:28 2007 -0800
@@ -40,6 +40,7 @@
 		bitset.o	\
 		bp_map.o	\
 		brand.o		\
+		cpucaps.o	\
 		cmt.o		\
 		cpu.o		\
 		cpu_intr.o	\
@@ -341,6 +342,7 @@
 		vnode.o		\
 		vuid_queue.o	\
 		vuid_store.o	\
+		waitq.o		\
 		watchpoint.o	\
 		yield.o		\
 		scsi_confdata.o	\
--- a/usr/src/uts/common/cpr/cpr_uthread.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/cpr/cpr_uthread.c	Fri Mar 09 15:55:28 2007 -0800
@@ -148,8 +148,7 @@
 
 			aston(tp);
 
-			if (tp->t_state == TS_SLEEP &&
-			    (tp->t_flag & T_WAKEABLE)) {
+			if (ISWAKEABLE(tp) || ISWAITING(tp)) {
 				setrun_locked(tp);
 			}
 		}
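
The sys/thread.h diff is not included in this excerpt. Judging from the
condition being replaced here, the new predicates presumably expand along
these lines (a sketch, not the verbatim header):

    /*
     * Presumed definitions from <sys/thread.h>: ISWAKEABLE matches the old
     * open-coded test; ISWAITING covers the new TS_WAIT state.
     */
    #define	ISWAKEABLE(t)	(((t)->t_state == TS_SLEEP) && \
				((t)->t_flag & T_WAKEABLE))
    #define	ISWAITING(t)	((t)->t_state == TS_WAIT)
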
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/disp/cpucaps.c	Fri Mar 09 15:55:28 2007 -0800
@@ -0,0 +1,1133 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/disp.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/atomic.h>
+#include <sys/cpucaps_impl.h>
+#include <sys/dtrace.h>
+#include <sys/sdt.h>
+#include <sys/debug.h>
+#include <sys/rctl.h>
+#include <sys/errno.h>
+
+/*
+ * CPU Caps implementation
+ * =======================
+ *
+ * A CPU cap can be set on any project or any zone. Zone CPU cap limits the CPU
+ * usage for all projects running inside the zone. If the zone CPU cap is set
+ * below the project CPU cap, the latter will have no effect.
+ *
+ * When CPU usage of projects and/or zones reaches specified caps, threads in
+ * them do not get scheduled and instead are placed on wait queues associated
+ * with a cap. Such threads will start running again only when CPU usage drops
+ * below the cap level. Each zone and each project has its own wait queue.
+ *
+ * When a CPU cap is set, the kernel continuously keeps track of CPU time used by
+ * capped zones and/or projects over a short time interval and calculates their
+ * current CPU usage as a percentage. When the accumulated usage reaches the CPU
+ * cap, LWPs running in the user-land (when they are not holding any critical
+ * kernel locks) are placed on special wait queues until their project's or
+ * zone's CPU usage drops below the cap.
+ *
+ * The system maintains a list of all capped projects and all capped zones. On
+ * every clock tick every active thread belonging to a capped project adds its
+ * CPU usage to its project. Usage from all projects belonging to a capped zone
+ * is aggregated to get the zone usage.
+ *
+ * When the current CPU usage is above the cap, a project or zone is considered
+ * over-capped. Every user thread caught running in an over-capped project or
+ * zone is marked by setting the TS_PROJWAITQ or TS_ZONEWAITQ flag in the
+ * thread's t_schedflag field and is requested to surrender its CPU. This
+ * causes the scheduling-class-specific CL_PREEMPT() callback to be invoked.
+ * The callback function places threads so marked on a wait queue and calls
+ * swtch().
+ *
+ * Threads are only placed on wait queues after trapping from user-land
+ * (they could be holding some user locks, but no kernel locks) and while
+ * returning from the trap back to the user-land when no kernel locks are held.
+ * Putting threads on wait queues in random places while running in the
+ * kernel might lead to all kinds of locking problems.
+ *
+ * Accounting
+ * ==========
+ *
+ * Accounting of CPU usage is based on per-thread micro-state accounting data.
+ * On every clock tick clock() adds new on-CPU time for every thread found on
+ * CPU. Scheduling classes also add new on-CPU time for any thread leaving CPU.
+ * New time means time accumulated since the thread was last accounted for.
+ * On-CPU times greater than 1 tick are truncated to 1 tick.
+ *
+ * Project CPU usage is aggregated from all threads within the project.
+ * Zone CPU usage is the sum of usages for all projects within the zone. Zone
+ * CPU usage is calculated on every clock tick by walking list of projects and
+ * adding their usage together.
+ *
+ * Decay
+ * =====
+ *
+ * CPU usage is decayed by the caps_update() routine, which is called once per
+ * clock tick. It walks the list of project caps and decays their usages by one
+ * per cent. If CPU usage drops below cap levels, threads on the wait queue
+ * are made runnable again, one thread per clock tick.
+ *
+ * Interfaces
+ * ==========
+ *
+ * The CPU Caps facility provides the following interfaces to the rest of the
+ * system:
+ *
+ *   cpucaps_project_add(kproject_t *)
+ *
+ * Notifies the framework of a new project. It should be put on the
+ * capped_projects list if its zone has a cap.
+ *
+ *   cpucaps_project_remove(kproject_t *)
+ *
+ * Remove the association between the specified project and its cap.
+ * Called right before the project is destroyed.
+ *
+ *   cpucaps_project_set(kproject_t *, rctl_qty_t)
+ *
+ * Set project cap of the specified project to the specified value. Setting the
+ * value to NOCAP is equivalent to removing the cap.
+ *
+ *   cpucaps_zone_set(zone_t *, rctl_qty_t)
+ *
+ * Set zone cap of the specified zone to the specified value. Setting the value
+ * to NOCAP is equivalent to removing the cap.
+ *
+ *   cpucaps_zone_remove(zone_t *)
+ *
+ * Remove the association between the zone and its cap.
+ *
+ *   cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
+ *
+ * Charges the specified thread's project for the amount of on-CPU time it
+ * used. If the third argument is CPUCAPS_CHARGE_ONLY, it returns False.
+ * Otherwise it returns True if the thread should be penalized because its
+ * project or zone is exceeding its cap, and also sets the TS_PROJWAITQ or
+ * TS_ZONEWAITQ bits in t_schedflag in that case.
+ *
+ *   CPUCAPS_ENFORCE(kthread_id_t *)
+ *
+ * Enforces CPU caps for a specified thread. Places LWPs running in LWP_USER
+ * state on project or zone wait queues, as requested by TS_PROJWAITQ or
+ * TS_ZONEWAITQ bits in t_schedflag. Returns True if the thread was placed on a
+ * wait queue or False otherwise.
+ *
+ *   cpucaps_sc_init(caps_sc_t *)
+ *
+ * Initializes the scheduling-class specific CPU Caps data for a thread.
+ *
+ * LOCKS
+ * =====
+ *
+ * All the individual caps structures and their lists are protected by a global
+ * caps_lock mutex. The lock is grabbed either by clock() or by events modifying
+ * caps, so it is usually uncontended. We avoid all blocking memory allocations
+ * while holding caps_lock to prevent clock() from blocking.
+ *
+ * Thread state is protected by the thread lock. It protects the association
+ * between a thread and its project and, as a consequence, its zone. The
+ * association cannot break while the thread lock is held, so the project or
+ * zone cap cannot disappear either.
+ *
+ * The cap usage field is protected by the high-PIL spin lock cap_usagelock.
+ * It is grabbed by scheduling classes already holding the thread lock at high
+ * PIL and by the clock thread performing usage decay. We should do as little
+ * work as possible while holding the lock since it may be very hot. All
+ * threads in the project contend for the same cache line doing cap usage
+ * updates.
+ */
+
+/*
+ * caps_lock protects list of capped projects and zones, changes in the cap
+ * state and changes of the global cpucaps_enabled flag.
+ *
+ * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
+ * modified in parallel. This could be a per-zone flag, but we don't keep any
+ * cap state for now.
+ */
+static kmutex_t caps_lock;		/* lock to protect: */
+static list_t capped_zones;		/* - list of zones with caps */
+static list_t capped_projects;		/* - list of projects with caps */
+boolean_t cpucaps_enabled;		/* - are there any caps defined? */
+boolean_t cpucaps_busy;			/* - is framework busy? */
+
+/*
+ * The accounting is based on the number of nanoseconds threads spend running
+ * during a tick; the length of one tick in nanoseconds is kept in the
+ * cap_tick_cost variable.
+ */
+static hrtime_t cap_tick_cost;
+
+/*
+ * How much of the usage value is decayed every clock tick:
+ * one per cent of the value per tick.
+ */
+#define	CAP_DECAY_FACTOR 100
+
+/*
+ * Scale the value and round it to the closest integer value
+ */
+#define	ROUND_SCALE(x, y) (((x) + (y) / 2) / (y))
+
+static void caps_update();
+
+/*
+ * CAP kstats.
+ */
+struct cap_kstat {
+	kstat_named_t	cap_value;
+	kstat_named_t	cap_usage;
+	kstat_named_t	cap_nwait;
+	kstat_named_t	cap_below;
+	kstat_named_t	cap_above;
+	kstat_named_t	cap_maxusage;
+	kstat_named_t	cap_zonename;
+} cap_kstat = {
+	{ "value",	KSTAT_DATA_UINT64 },
+	{ "usage",	KSTAT_DATA_UINT64 },
+	{ "nwait",	KSTAT_DATA_UINT64 },
+	{ "below_sec",	KSTAT_DATA_UINT64 },
+	{ "above_sec",	KSTAT_DATA_UINT64 },
+	{ "maxusage",	KSTAT_DATA_UINT64 },
+	{ "zonename",	KSTAT_DATA_STRING },
+};
+
+
+static kmutex_t cap_kstat_lock;
+static int cap_kstat_update(kstat_t *, int);
+
+/*
+ * Initialize CPU caps infrastructure.
+ *   - Initialize lists of capped zones and capped projects
+ *   - Set cpucaps_clock_callout to NULL
+ */
+void
+cpucaps_init()
+{
+	/*
+	 * Initialize global variables
+	 */
+	cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);
+
+	list_create(&capped_zones, sizeof (cpucap_t),
+	    offsetof(cpucap_t, cap_link));
+	list_create(&capped_projects, sizeof (cpucap_t),
+	    offsetof(cpucap_t, cap_link));
+
+	cpucaps_enabled = B_FALSE;
+	cpucaps_busy = B_FALSE;
+	cpucaps_clock_callout = NULL;
+}
+
+/*
+ * Initialize scheduling-class specific CPU Caps data.
+ */
+void
+cpucaps_sc_init(caps_sc_t *csc)
+{
+	csc->csc_cputime = 0;
+}
+
+/*
+ * Allocate and initialize cpucap structure
+ */
+static cpucap_t *
+cap_alloc(void)
+{
+	cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);
+
+	DISP_LOCK_INIT(&cap->cap_usagelock);
+	waitq_init(&cap->cap_waitq);
+
+	return (cap);
+}
+
+/*
+ * Free cpucap structure
+ */
+static void
+cap_free(cpucap_t *cap)
+{
+	if (cap == NULL)
+		return;
+
+	/*
+	 * This cap should not be active
+	 */
+	ASSERT(!list_link_active(&cap->cap_link));
+	ASSERT(cap->cap_value == 0);
+	ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));
+
+	waitq_fini(&cap->cap_waitq);
+	DISP_LOCK_DESTROY(&cap->cap_usagelock);
+
+	kmem_free(cap, sizeof (cpucap_t));
+}
+
+/*
+ * Activate cap - insert into active list and unblock its
+ * wait queue. Should be called with caps_lock held.
+ * The cap_value field is set to the value supplied.
+ */
+static void
+cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
+{
+	ASSERT(MUTEX_HELD(&caps_lock));
+
+	/*
+	 * Cap can not be already enabled
+	 */
+	ASSERT(!CAP_ENABLED(cap));
+	ASSERT(!list_link_active(&cap->cap_link));
+
+	list_insert_tail(l, cap);
+	cap->cap_below = cap->cap_above = 0;
+	cap->cap_maxusage = 0;
+	cap->cap_usage = 0;
+	cap->cap_value = value;
+	waitq_unblock(&cap->cap_waitq);
+	if (CPUCAPS_OFF()) {
+		cpucaps_enabled = B_TRUE;
+		cpucaps_clock_callout = caps_update;
+	}
+}
+
+/*
+ * Deactivate cap
+ *   - Block its wait queue. This prevents any new threads from being
+ *	enqueued there and moves all enqueued threads to the run queue.
+ *   - Remove cap from list l.
+ *   - Disable CPU caps globally if there are no capped projects or zones
+ *
+ * Should be called with caps_lock held.
+ */
+static void
+cap_disable(list_t *l, cpucap_t *cap)
+{
+	ASSERT(MUTEX_HELD(&caps_lock));
+	/*
+	 * Cap should be currently active
+	 */
+	ASSERT(CPUCAPS_ON());
+	ASSERT(list_link_active(&cap->cap_link));
+	ASSERT(CAP_ENABLED(cap));
+
+	waitq_block(&cap->cap_waitq);
+	list_remove(l, cap);
+	if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
+		cpucaps_enabled = B_FALSE;
+		cpucaps_clock_callout = NULL;
+	}
+	cap->cap_value = 0;
+	cap->cap_project = NULL;
+	cap->cap_zone = NULL;
+	if (cap->cap_kstat != NULL) {
+		kstat_delete(cap->cap_kstat);
+		cap->cap_kstat = NULL;
+	}
+}
+
+/*
+ * Enable cap for a project kpj
+ * It is safe to enable already enabled project cap.
+ * Should be called with caps_lock held.
+ */
+static void
+cap_project_enable(kproject_t *kpj, hrtime_t value)
+{
+	cpucap_t *cap = kpj->kpj_cpucap;
+
+	ASSERT(MUTEX_HELD(&caps_lock));
+	ASSERT(cap != NULL);
+
+	if (CAP_DISABLED(cap)) {
+		ASSERT(cap->cap_kstat == NULL);
+		cap_enable(&capped_projects, cap, value);
+		cap->cap_project = kpj;
+		cap->cap_zone = kpj->kpj_zone;
+
+		/*
+		 * Create cap kstats
+		 */
+		if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps",
+		    KSTAT_TYPE_NAMED,
+		    sizeof (cap_kstat) / sizeof (kstat_named_t),
+		    KSTAT_FLAG_VIRTUAL)) != NULL) {
+		    cap->cap_kstat->ks_data_size +=
+			strlen(cap->cap_zone->zone_name) + 1;
+		    cap->cap_kstat->ks_lock = &cap_kstat_lock;
+		    cap->cap_kstat->ks_data = &cap_kstat;
+		    cap->cap_kstat->ks_update = cap_kstat_update;
+		    cap->cap_kstat->ks_private = cap;
+		    kstat_install(cap->cap_kstat);
+		}
+	}
+}
+
+/*
+ * Disable project cap.
+ * It is safe to disable already disabled project cap.
+ * Should be called with caps_lock held.
+ */
+static void
+cap_project_disable(kproject_t *kpj)
+{
+	cpucap_t *cap = kpj->kpj_cpucap;
+
+	ASSERT(MUTEX_HELD(&caps_lock));
+	ASSERT(cap != NULL);
+	ASSERT(cap->cap_project == kpj);
+
+	if (CAP_ENABLED(cap))
+		cap_disable(&capped_projects, cap);
+}
+
+/*
+ * Enable cap for a zone
+ * It is safe to enable already enabled zone cap.
+ * Should be called with caps_lock held.
+ */
+static void
+cap_zone_enable(zone_t *zone, hrtime_t value)
+{
+	cpucap_t *cap = zone->zone_cpucap;
+
+	ASSERT(MUTEX_HELD(&caps_lock));
+	ASSERT(cap != NULL);
+
+	if (CAP_DISABLED(cap)) {
+		ASSERT(cap->cap_kstat == NULL);
+		cap_enable(&capped_zones, cap, value);
+		cap->cap_zone = zone;
+
+		/*
+		 * Create cap kstats
+		 */
+		if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps",
+		    KSTAT_TYPE_NAMED,
+		    sizeof (cap_kstat) / sizeof (kstat_named_t),
+		    KSTAT_FLAG_VIRTUAL)) != NULL) {
+		    cap->cap_kstat->ks_data_size +=
+			strlen(cap->cap_zone->zone_name) + 1;
+		    cap->cap_kstat->ks_lock = &cap_kstat_lock;
+		    cap->cap_kstat->ks_data = &cap_kstat;
+		    cap->cap_kstat->ks_update = cap_kstat_update;
+		    cap->cap_kstat->ks_private = cap;
+		    kstat_install(cap->cap_kstat);
+		}
+	}
+}
+
+/*
+ * Disable zone cap.
+ * It is safe to disable already disabled zone cap.
+ * Should be called with caps_lock held.
+ */
+static void
+cap_zone_disable(zone_t *zone)
+{
+	cpucap_t *cap = zone->zone_cpucap;
+
+	ASSERT(MUTEX_HELD(&caps_lock));
+	ASSERT(cap != NULL);
+	ASSERT(cap->cap_zone == zone);
+
+	if (CAP_ENABLED(cap))
+		cap_disable(&capped_zones, cap);
+}
+
+/*
+ * Apply specified callback to all caps contained in the list `l'.
+ */
+static void
+cap_walk(list_t *l, void (*cb)(cpucap_t *))
+{
+	cpucap_t *cap;
+
+	ASSERT(MUTEX_HELD(&caps_lock));
+
+	for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) {
+		(*cb)(cap);
+	}
+}
+
+/*
+ * If the cap limit is not reached, make one thread from the wait queue
+ * runnable.
+ * The waitq_isempty check is performed without the waitq lock. If a new thread
+ * is placed on the waitq right after the check, it will be picked up during the
+ * next invocation of cap_poke_waitq().
+ */
+static void
+cap_poke_waitq(cpucap_t *cap)
+{
+	ASSERT(MUTEX_HELD(&caps_lock));
+
+	if (cap->cap_usage >= cap->cap_value) {
+		cap->cap_above++;
+	} else {
+		waitq_t *wq = &cap->cap_waitq;
+
+		cap->cap_below++;
+
+		if (!waitq_isempty(wq))
+			waitq_runone(wq);
+	}
+}
+
+/*
+ * The callback function called for every cap on the capped_projects list.
+ * It decays the cap usage by CAP_DECAY_FACTOR, adds this project's usage to
+ * its zone's usage, and kicks one thread off the cap waitq if the cap is not
+ * reached.
+ */
+static void
+cap_project_usage_walker(cpucap_t *cap)
+{
+	zone_t		*zone = cap->cap_zone;
+	hrtime_t	cap_usage = cap->cap_usage;
+
+	ASSERT(MUTEX_HELD(&caps_lock));
+	ASSERT(cap->cap_project->kpj_cpucap == cap);
+	ASSERT(zone == cap->cap_project->kpj_zone);
+	ASSERT(CAP_ENABLED(cap));
+
+	/*
+	 * Set or clear the CAP_REACHED flag based on the current usage.
+	 * Only projects having their own caps are ever marked as CAP_REACHED.
+	 */
+	cap_poke_waitq(cap);
+
+	/*
+	 * Add project's CPU usage to our zone's CPU usage.
+	 */
+	if (ZONE_IS_CAPPED(zone)) {
+		cpucap_t *zcap = zone->zone_cpucap;
+
+		ASSERT(zcap->cap_zone == zone);
+
+		/*
+		 * If we haven't reset this zone's usage during this clock tick
+		 * yet, then do it now. The cap_lbolt field is used to check
+		 * whether this is the first zone's project we see during this
+		 * tick or a subsequent one.
+		 */
+		if (zcap->cap_lbolt != lbolt64) {
+			zcap->cap_lbolt = lbolt64;
+			if (zcap->cap_usage > zcap->cap_maxusage)
+				zcap->cap_maxusage = zcap->cap_usage;
+			zcap->cap_usage = 0;
+		}
+		DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
+		    hrtime_t, cap_usage);
+		zcap->cap_usage += cap_usage;
+		/* Check for overflows */
+		if (zcap->cap_usage < 0)
+			zcap->cap_usage = MAX_USAGE - 1;
+	}
+
+	/*
+	 * Decay project usage.
+	 */
+	disp_lock_enter(&cap->cap_usagelock);
+	cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
+	disp_lock_exit(&cap->cap_usagelock);
+}
+
+/*
+ * On every clock tick walk the list of project caps and update the CPU usage.
+ * Also walk the list of zone caps checking whether any threads should
+ * transition from wait queue to run queue.
+ *
+ * This function gets called by the clock thread directly when there are any
+ * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
+ * caps_lock for long periods of time, so there should be almost no contention
+ * for it.
+ */
+static void
+caps_update()
+{
+	mutex_enter(&caps_lock);
+	cap_walk(&capped_projects, cap_project_usage_walker);
+	cap_walk(&capped_zones, cap_poke_waitq);
+	mutex_exit(&caps_lock);
+}
+
+/*
+ * The function is called for each project in a zone when the zone cap is
+ * modified. It enables project caps if zone cap is enabled and disables if the
+ * zone cap is disabled and project doesn't have its own cap.
+ *
+ * For each project that does not have a cpucap structure allocated, it
+ * allocates a new structure and assigns it to kpj->kpj_cpucap. The allocation
+ * is performed without holding caps_lock to avoid using KM_SLEEP allocation
+ * with caps_lock
+ * held.
+ */
+static int
+cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
+{
+	cpucap_t *project_cap = NULL;
+	cpucap_t *zone_cap = (cpucap_t *)arg;
+
+	ASSERT(zone_cap != NULL);
+
+	if (kpj->kpj_cpucap == NULL) {
+		/*
+		 * This is the first time any cap was established for this
+		 * project. Allocate a new cpucap structure for it.
+		 */
+		project_cap = cap_alloc();
+	}
+
+	mutex_enter(&caps_lock);
+
+	/*
+	 * Double-check that kpj_cpucap is still NULL - now with caps_lock held
+	 * and assign the newly allocated cpucap structure to it.
+	 */
+	if (kpj->kpj_cpucap == NULL) {
+		kpj->kpj_cpucap = project_cap;
+	} else if (project_cap != NULL) {
+		cap_free(project_cap);
+	}
+
+	project_cap = kpj->kpj_cpucap;
+
+	if (CAP_DISABLED(zone_cap)) {
+		/*
+		 * Remove all projects in this zone without caps
+		 * from the capped_projects list.
+		 */
+		if (project_cap->cap_value == MAX_USAGE) {
+			cap_project_disable(kpj);
+		}
+	} else if (CAP_DISABLED(project_cap)) {
+		/*
+		 * Add the project to capped_projects list.
+		 */
+		ASSERT(project_cap->cap_value == 0);
+		cap_project_enable(kpj, MAX_USAGE);
+	}
+	mutex_exit(&caps_lock);
+
+	return (0);
+}
+
+/*
+ * Set zone cap to cap_val
+ * If cap_val is equal to NOCAP, disable zone cap.
+ *
+ * If this is the first time a cap is set on a zone, allocate cpucap structure
+ * without holding caps_lock to avoid KM_SLEEP allocation with caps_lock held.
+ */
+int
+cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
+{
+	cpucap_t *cap = NULL;
+	hrtime_t value;
+
+	if (cap_val == 0)
+		return (EINVAL);
+
+	ASSERT(cap_val <= MAXCAP);
+	if (cap_val > MAXCAP)
+		cap_val = MAXCAP;
+
+	/*
+	 * Nothing to do if trying to disable a cap on a zone when caps are off
+	 * or on a zone which does not have a cap yet.
+	 */
+	if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
+		return (0);
+
+	if (zone->zone_cpucap == NULL)
+		cap = cap_alloc();
+
+	mutex_enter(&caps_lock);
+
+	if (cpucaps_busy) {
+		mutex_exit(&caps_lock);
+		return (EBUSY);
+	}
+
+	/*
+	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
+	 * held. If it is still NULL, assign a newly allocated cpucap to it.
+	 */
+	if (zone->zone_cpucap == NULL) {
+		zone->zone_cpucap = cap;
+	} else if (cap != NULL) {
+		cap_free(cap);
+	}
+
+	cap = zone->zone_cpucap;
+	value = cap_val * cap_tick_cost;
+	if (value < 0)
+		value = MAX_USAGE;
+
+	/* Nothing to do if the value is staying the same */
+	if (value == cap->cap_value) {
+		mutex_exit(&caps_lock);
+		return (0);
+	}
+
+	/*
+	 * Clear cap statistics since the cap value itself changes.
+	 */
+	cap->cap_above = cap->cap_below = 0;
+
+	if (cap_val == NOCAP) {
+		if (CAP_ENABLED(cap)) {
+			/*
+			 * Remove cap for the zone
+			 */
+			cap_zone_disable(zone);
+			cpucaps_busy = B_TRUE;
+			mutex_exit(&caps_lock);
+			/*
+			 * Disable caps for all projects belonging to this zone
+			 * unless they have their own cap.
+			 */
+			(void) project_walk_all(zone->zone_id,
+			    cap_project_zone_modify_walker, cap);
+
+			mutex_enter(&caps_lock);
+			cpucaps_busy = B_FALSE;
+		}
+	} else if (CAP_DISABLED(cap)) {
+		/*
+		 * Set a cap on a zone which previously was not capped.
+		 */
+		cap_zone_enable(zone, value);
+		cpucaps_busy = B_TRUE;
+		mutex_exit(&caps_lock);
+
+		/*
+		 * Enable cap for all projects belonging to this zone.
+		 */
+		(void) project_walk_all(zone->zone_id,
+		    cap_project_zone_modify_walker, cap);
+
+		mutex_enter(&caps_lock);
+		cpucaps_busy = B_FALSE;
+	} else {
+		/*
+		 * No state transitions, just change the value
+		 */
+		cap->cap_value = value;
+	}
+
+	ASSERT(MUTEX_HELD(&caps_lock));
+	ASSERT(!cpucaps_busy);
+	mutex_exit(&caps_lock);
+
+	return (0);
+}
+
+/*
+ * The project is going away so disable its cap.
+ */
+void
+cpucaps_project_remove(kproject_t *kpj)
+{
+	mutex_enter(&caps_lock);
+	if (PROJECT_IS_CAPPED(kpj))
+		cap_project_disable(kpj);
+	if (kpj->kpj_cpucap != NULL) {
+		cap_free(kpj->kpj_cpucap);
+		kpj->kpj_cpucap = NULL;
+	}
+	mutex_exit(&caps_lock);
+}
+
+/*
+ * The zone is going away, so disable its cap.
+ */
+void
+cpucaps_zone_remove(zone_t *zone)
+{
+		zerr(gettext("%s zone.cpu-cap and %s are incompatible."),
+	while (ZONE_IS_CAPPED(zone)) {
+		mutex_exit(&caps_lock);
+		(void) cpucaps_zone_set(zone, NOCAP);
+		mutex_enter(&caps_lock);
+	}
+	if (zone->zone_cpucap != NULL) {
+		cap_free(zone->zone_cpucap);
+		zone->zone_cpucap = NULL;
+	}
+	mutex_exit(&caps_lock);
+}
+
+/*
+ * New project was created. It should be put on the capped_projects list if
+ * its zone has a cap.
+ */
+void
+cpucaps_project_add(kproject_t *kpj)
+{
+	cpucap_t *cap = NULL;
+
+	if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone))
+		return;
+
+	/*
+	 * This project was never capped before, so allocate its cap structure.
+	 */
+	if (kpj->kpj_cpucap == NULL)
+		cap = cap_alloc();
+
+	mutex_enter(&caps_lock);
+	/*
+	 * Double-check with caps_lock held
+	 */
+	if (kpj->kpj_cpucap == NULL) {
+		kpj->kpj_cpucap = cap;
+	} else if (cap != NULL) {
+		cap_free(cap);
+	}
+
+	if (ZONE_IS_CAPPED(kpj->kpj_zone))
+		cap_project_enable(kpj, MAX_USAGE);
+
+	mutex_exit(&caps_lock);
+}
+
+/*
+ * Set project cap to cap_val
+ * If cap_val is equal to NOCAP, disable project cap.
+ *
+ * If this is the first time a cap is set on a project, allocate cpucap
+ * structure without holding caps_lock to avoid KM_SLEEP allocation with
+ * caps_lock held.
+ */
+int
+cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
+{
+	cpucap_t *cap = NULL;
+	hrtime_t value;
+
+	if (cap_val == 0)
+		return (EINVAL);
+
+	ASSERT(cap_val <= MAXCAP);
+	if (cap_val > MAXCAP)
+		cap_val = MAXCAP;
+
+	/*
+	 * Nothing to do if trying to disable a project cap when caps are not
+	 * enabled, or when the project does not have a cap.
+	 */
+	if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj)))
+		return (0);
+
+	if (kpj->kpj_cpucap == NULL) {
+		/*
+		 * This project was never capped before, so allocate its cap
+		 * structure.
+		 */
+		cap = cap_alloc();
+	}
+
+	mutex_enter(&caps_lock);
+
+	/*
+	 * Double-check with caps_lock held.
+	 */
+	if (kpj->kpj_cpucap == NULL) {
+		kpj->kpj_cpucap = cap;
+	} else if (cap != NULL) {
+		cap_free(cap);
+	}
+
+	/*
+	 * Get the actual pointer to the project cap.
+	 */
+	cap = kpj->kpj_cpucap;
+	value = cap_val * cap_tick_cost;
+	if (value < 0)
+		value = MAX_USAGE;
+
+	/*
+	 * Nothing to do if the value is not changing
+	 */
+	if (value == cap->cap_value) {
+		mutex_exit(&caps_lock);
+		return (0);
+	}
+
+	/*
+	 * Clear cap statistics since the cap value itself changes.
+	 */
+	cap->cap_above = cap->cap_below = 0;
+	cap->cap_maxusage = 0;
+
+	if (cap_val != NOCAP) {
+		/*
+		 * Enable this cap if it is not already enabled.
+		 */
+		if (CAP_DISABLED(cap))
+			cap_project_enable(kpj, value);
+		else
+			cap->cap_value = value;
+	} else if (CAP_ENABLED(cap)) {
+		/*
+		 * The user requested to drop the cap on the project. If it is
+		 * part of a capped zone, keep the cap and set the value to
+		 * MAX_USAGE; otherwise disable the cap.
+		 */
+		if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
+			cap->cap_value = MAX_USAGE;
+		} else {
+			cap_project_disable(kpj);
+		}
+	}
+	mutex_exit(&caps_lock);
+
+	return (0);
+}
+
+/*
+ * Get cap usage.
+ */
+static rctl_qty_t
+cap_get(cpucap_t *cap)
+{
+	return (cap != NULL ? (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0);
+}
+
+/*
+ * Get current project usage.
+ */
+rctl_qty_t
+cpucaps_project_get(kproject_t *kpj)
+{
+	return (cap_get(kpj->kpj_cpucap));
+}
+
+/*
+ * Get current zone usage.
+ */
+rctl_qty_t
+cpucaps_zone_get(zone_t *zone)
+{
+	return (cap_get(zone->zone_cpucap));
+}
+
+/*
+ * Charge the project of thread t for the time t spent on CPU since the
+ * previous adjustment.
+ *
+ * Record the current on-CPU time in the csc structure.
+ *
+ * Do not adjust for more than one tick's worth of time.
+ */
+static void
+caps_charge_adjust(kthread_id_t t, caps_sc_t *csc)
+{
+	kproject_t	*kpj = ttoproj(t);
+	hrtime_t	new_usage;
+	hrtime_t	usage_delta;
+
+	ASSERT(THREAD_LOCK_HELD(t));
+	ASSERT(PROJECT_IS_CAPPED(kpj));
+
+	/* Get on-CPU time since birth of a thread */
+	new_usage = mstate_thread_onproc_time(t);
+
+	/* Time spent on CPU since last checked */
+	usage_delta = new_usage - csc->csc_cputime;
+
+	/* Save the accumulated on-CPU time */
+	csc->csc_cputime = new_usage;
+
+	/* Charge at most one tick worth of on-CPU time */
+	if (usage_delta > cap_tick_cost)
+		usage_delta = cap_tick_cost;
+
+	/* Add usage_delta to the project usage value. */
+	if (usage_delta > 0) {
+		cpucap_t *cap = kpj->kpj_cpucap;
+
+		DTRACE_PROBE2(cpucaps__project__charge,
+		    kthread_id_t, t, hrtime_t, usage_delta);
+
+		disp_lock_enter_high(&cap->cap_usagelock);
+		cap->cap_usage += usage_delta;
+
+		/* Check for overflows */
+		if (cap->cap_usage < 0)
+			cap->cap_usage = MAX_USAGE - 1;
+
+		disp_lock_exit_high(&cap->cap_usagelock);
+
+		/*
+		 * cap_maxusage is only kept for observability. Move it outside
+		 * the lock to reduce the time spent while holding the lock.
+		 */
+		if (cap->cap_usage > cap->cap_maxusage)
+			cap->cap_maxusage = cap->cap_usage;
+	}
+}
+
+/*
+ * Charge thread's project and return True if project or zone should be
+ * penalized because its project or zone is exceeding its cap. Also sets
+ * TS_PROJWAITQ or TS_ZONEWAITQ in this case.
+ */
+boolean_t
+cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
+{
+	kproject_t	*kpj = ttoproj(t);
+	klwp_t		*lwp = t->t_lwp;
+	zone_t		*zone;
+	cpucap_t	*project_cap;
+	boolean_t	rc = B_FALSE;
+
+	ASSERT(THREAD_LOCK_HELD(t));
+
+	/* Nothing to do for projects that are not capped. */
+	if (lwp == NULL || !PROJECT_IS_CAPPED(kpj))
+		return (B_FALSE);
+
+	caps_charge_adjust(t, csc);
+
+	/*
+	 * The caller only requested to charge the project usage; skip the
+	 * enforcement part.
+	 */
+	if (charge_type == CPUCAPS_CHARGE_ONLY)
+		return (B_FALSE);
+
+	project_cap = kpj->kpj_cpucap;
+
+	if (project_cap->cap_usage >= project_cap->cap_value) {
+		t->t_schedflag |= TS_PROJWAITQ;
+		rc = B_TRUE;
+	} else if (t->t_schedflag & TS_PROJWAITQ) {
+		t->t_schedflag &= ~TS_PROJWAITQ;
+	}
+
+	zone = ttozone(t);
+	if (!ZONE_IS_CAPPED(zone)) {
+		if (t->t_schedflag & TS_ZONEWAITQ)
+			t->t_schedflag &= ~TS_ZONEWAITQ;
+	} else {
+		cpucap_t *zone_cap = zone->zone_cpucap;
+
+		if (zone_cap->cap_usage >= zone_cap->cap_value) {
+			t->t_schedflag |= TS_ZONEWAITQ;
+			rc = B_TRUE;
+		} else if (t->t_schedflag & TS_ZONEWAITQ) {
+			t->t_schedflag &= ~TS_ZONEWAITQ;
+		}
+	}
+
+	return (rc);
+}
+
+/*
+ * Enforce CPU caps. If the thread was preempted in user-land, we know that it
+ * does not hold any kernel locks, so enqueue it on the waitq if needed.
+ *
+ * CPU Caps are only enforced for user threads.
+ *
+ * Threads flagged with TS_PROJWAITQ are placed on their project wait queues and
+ * threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
+ *
+ * It is possible that by the time we enter cpucaps_enforce() the cap is already
+ * disabled. In this case waitq_enqueue() fails and doesn't enqueue anything. We
+ * still clear TS_PROJWAITQ/TS_ZONEWAITQ flags in this case since they no longer
+ * apply.
+ */
+boolean_t
+cpucaps_enforce(kthread_t *t)
+{
+	klwp_t *lwp = t->t_lwp;
+
+	ASSERT(THREAD_LOCK_HELD(t));
+
+	if (lwp != NULL && lwp->lwp_state == LWP_USER) {
+		if (t->t_schedflag & TS_PROJWAITQ) {
+			ASSERT(ttoproj(t)->kpj_cpucap != NULL);
+			t->t_schedflag &= ~TS_ANYWAITQ;
+			if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq),
+				t)) {
+				return (B_TRUE);
+			}
+		}
+		if (t->t_schedflag & TS_ZONEWAITQ) {
+			ASSERT(ttozone(t)->zone_cpucap != NULL);
+			t->t_schedflag &= ~TS_ZONEWAITQ;
+			if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq),
+				t)) {
+				return (B_TRUE);
+			}
+		}
+	}
+
+	/*
+	 * The thread is not enqueued on the wait queue.
+	 */
+	return (B_FALSE);
+}
+
+/*
+ * Convert internal cap statistics into values exported by cap kstat.
+ */
+static int
+cap_kstat_update(kstat_t *ksp, int rw)
+{
+	struct cap_kstat *capsp = &cap_kstat;
+	cpucap_t *cap = ksp->ks_private;
+	clock_t	tick_sec = SEC_TO_TICK(1);
+	char *zonename = cap->cap_zone->zone_name;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	capsp->cap_value.value.ui64 =
+	    ROUND_SCALE(cap->cap_value, cap_tick_cost);
+	capsp->cap_usage.value.ui64 =
+	    ROUND_SCALE(cap->cap_usage, cap_tick_cost);
+	capsp->cap_maxusage.value.ui64 =
+	    ROUND_SCALE(cap->cap_maxusage, cap_tick_cost);
+	capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
+	capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
+	capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
+	kstat_named_setstr(&capsp->cap_zonename, zonename);
+
+	return (0);
+}
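
The charge/decay arithmetic above can be sanity-checked with a small
user-land model (a sketch, not part of the changeset; hz = 100 and the
offered load are assumptions). With a one-per-cent decay per tick, usage
converges to roughly 100 times the per-tick charge, which is why a cap of N
per cent can be stored internally as N * cap_tick_cost:

    #include <stdio.h>

    #define	NANOSEC			1000000000LL
    #define	HZ			100	/* assumed clock rate */
    #define	CAP_DECAY_FACTOR	100	/* decay 1% per tick, as above */
    #define	ROUND_SCALE(x, y)	(((x) + (y) / 2) / (y))

    int
    main(void)
    {
	long long cap_tick_cost = NANOSEC / HZ;		/* ns per tick */
	long long cap_value = 150 * cap_tick_cost;	/* cap: 1.5 CPUs */
	long long usage = 0;
	double cpus_used = 2.0;		/* offered load: two full CPUs */
	int tick;

	for (tick = 1; tick <= 1000; tick++) {
		/* caps_charge_adjust(): at most 1 tick per running CPU */
		usage += (long long)(cpus_used * cap_tick_cost);
		/* cap_project_usage_walker(): decay usage by 1% */
		usage -= ROUND_SCALE(usage, CAP_DECAY_FACTOR);
		if (tick % 250 == 0)
			printf("tick %4d: usage %lld vs cap %lld: %s\n",
			    tick, usage, cap_value,
			    usage >= cap_value ? "capped" : "below");
	}
	return (0);
    }

At equilibrium the usage settles near 99 times the per-tick charge, i.e.
about 198 per cent of one CPU, comfortably above the 150 per cent cap, so
threads in such a zone would keep landing on the wait queue until decay
brings usage back under cap_value.
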
--- a/usr/src/uts/common/disp/fss.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/disp/fss.c	Fri Mar 09 15:55:28 2007 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -21,7 +20,7 @@
  */
 
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -55,6 +54,7 @@
 #include <sys/tnf_probe.h>
 #include <sys/policy.h>
 #include <sys/sdt.h>
+#include <sys/cpucaps.h>
 
 /*
  * FSS Data Structures:
@@ -1069,6 +1069,7 @@
 			goto next;
 		if ((fssproc->fss_flags & FSSKPRI) != 0)
 			goto next;
+
 		fssproj = FSSPROC2FSSPROJ(fssproc);
 		if (fssproj == NULL)
 			goto next;
@@ -1084,7 +1085,7 @@
 
 		if (t->t_schedctl && schedctl_get_nopreempt(t))
 			goto next;
-		if (t->t_state != TS_RUN) {
+		if (t->t_state != TS_RUN && t->t_state != TS_WAIT) {
 			/*
 			 * Make next syscall/trap call fss_trapret
 			 */
@@ -1373,6 +1374,7 @@
 
 	fssproc->fss_timeleft = fss_quantum;
 	fssproc->fss_tp = t;
+	cpucaps_sc_init(&fssproc->fss_caps);
 
 	/*
 	 * Put a lock on our fsspset structure.
@@ -1420,7 +1422,8 @@
 	t->t_cldata = (void *)fssproc;
 	t->t_schedflag |= TS_RUNQMATCH;
 	fss_change_priority(t, fssproc);
-	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC)
+	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
+	    t->t_state == TS_WAIT)
 		fss_active(t);
 	thread_unlock(t);
 
@@ -1568,6 +1571,8 @@
 	cfssproc->fss_upri = pfssproc->fss_upri;
 	cfssproc->fss_tp = ct;
 	cfssproc->fss_nice = pfssproc->fss_nice;
+	cpucaps_sc_init(&cfssproc->fss_caps);
+
 	cfssproc->fss_flags =
 	    pfssproc->fss_flags & ~(FSSKPRI | FSSBACKQ | FSSRESTORE);
 	ct->t_cldata = (void *)cfssproc;
@@ -1793,6 +1798,14 @@
 	}
 	mutex_exit(&fsspset->fssps_lock);
 	mutex_exit(&fsspsets_lock);
+
+	if (CPUCAPS_ON()) {
+		thread_lock(t);
+		fssproc = FSSPROC(t);
+		(void) cpucaps_charge(t, &fssproc->fss_caps,
+		    CPUCAPS_CHARGE_ONLY);
+		thread_unlock(t);
+	}
 }
 
 static void
@@ -1861,7 +1874,8 @@
 	if (INHERITED(t) ||
 	    (fssproc->fss_flags & FSSKPRI) ||
 	    (t->t_proc_flag & TP_LWPEXIT) ||
-	    (t->t_state & (TS_ZOMB | TS_FREE | TS_STOPPED | TS_ONPROC)) ||
+	    (t->t_state & (TS_ZOMB | TS_FREE | TS_STOPPED |
+		TS_ONPROC | TS_WAIT)) ||
 	    !(t->t_schedflag & TS_LOAD) ||
 	    !(SWAP_OK(t)))
 		return (-1);
@@ -1971,6 +1985,20 @@
 		t->t_trapret = 1;	/* so that fss_trapret will run */
 		aston(t);
 	}
+
+	/*
+	 * This thread may be placed on a wait queue by CPU caps. In this case
+	 * we do not need to do anything until it is removed from the wait
+	 * queue. Do not enforce CPU caps on threads running at a kernel
+	 * priority.
+	 */
+	if (CPUCAPS_ON()) {
+		(void) cpucaps_charge(t, &fssproc->fss_caps,
+		    CPUCAPS_CHARGE_ONLY);
+
+		if (!(fssproc->fss_flags & FSSKPRI) && CPUCAPS_ENFORCE(t))
+			return;
+	}
+
 	/*
 	 * If preempted in user-land mark the thread as swappable because it
 	 * cannot be holding any kernel locks.
@@ -2077,6 +2105,12 @@
 	ASSERT(THREAD_LOCK_HELD(t));
 
 	ASSERT(t->t_state == TS_ONPROC);
+
+	/*
+	 * Account for time spent on CPU before going to sleep.
+	 */
+	(void) CPUCAPS_CHARGE(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ONLY);
+
 	fss_inactive(t);
 
 	/*
@@ -2117,6 +2151,8 @@
 	fssproc_t *fssproc;
 	fssproj_t *fssproj;
 	klwp_t *lwp;
+	boolean_t call_cpu_surrender = B_FALSE;
+	boolean_t cpucaps_enforce = B_FALSE;
 
 	ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
 
@@ -2136,6 +2172,17 @@
 	}
 
 	/*
+	 * Keep track of thread's project CPU usage.  Note that projects
+	 * get charged even when threads are running in the kernel.
+	 * Do not surrender CPU if running in the SYS class.
+	 */
+	if (CPUCAPS_ON()) {
+		cpucaps_enforce = cpucaps_charge(t,
+		    &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE) &&
+		    !(fssproc->fss_flags & FSSKPRI);
+	}
+
+	/*
 	 * A thread's execution time for threads running in the SYS class
 	 * is not tracked.
 	 */
@@ -2180,8 +2227,7 @@
 					t->t_schedflag &= ~TS_DONT_SWAP;
 				fssproc->fss_timeleft = fss_quantum;
 			} else {
-				fssproc->fss_flags |= FSSBACKQ;
-				cpu_surrender(t);
+				call_cpu_surrender = B_TRUE;
 			}
 		} else if (t->t_state == TS_ONPROC &&
 			    t->t_pri < t->t_disp_queue->disp_maxrunpri) {
@@ -2190,10 +2236,38 @@
 			 * waiting for a processor, then thread surrenders
 			 * the processor.
 			 */
-			fssproc->fss_flags |= FSSBACKQ;
-			cpu_surrender(t);
+			call_cpu_surrender = B_TRUE;
 		}
 	}
+
+	if (cpucaps_enforce && 2 * fssproc->fss_timeleft > fss_quantum) {
+		/*
+		 * The thread used more than half of its quantum, so assume that
+		 * it used the whole quantum.
+		 *
+		 * Update thread's priority just before putting it on the wait
+		 * queue so that it gets charged for the CPU time from its
+		 * quantum even before that quantum expires.
+		 */
+		fss_newpri(fssproc);
+		if (t->t_pri != fssproc->fss_umdpri)
+			fss_change_priority(t, fssproc);
+
+		/*
+		 * We need to call cpu_surrender for this thread due to cpucaps
+		 * enforcement, but fss_change_priority may have already done
+		 * so. In this case FSSBACKQ is set and there is no need to call
+		 * cpu_surrender() again.
+		 */
+		if (!(fssproc->fss_flags & FSSBACKQ))
+			call_cpu_surrender = B_TRUE;
+	}
+
+	if (call_cpu_surrender) {
+		fssproc->fss_flags |= FSSBACKQ;
+		cpu_surrender(t);
+	}
+
 	thread_unlock_nopreempt(t);	/* clock thread can't be preempted */
 }
 
@@ -2336,6 +2410,11 @@
 	ASSERT(THREAD_LOCK_HELD(t));
 
 	/*
+	 * Account for time spent on CPU before yielding.
+	 */
+	(void) CPUCAPS_CHARGE(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ONLY);
+
+	/*
 	 * Clear the preemption control "yield" bit since the user is
 	 * doing a yield.
 	 */
@@ -2439,7 +2518,8 @@
 	ASSERT(fssproj_new != NULL);
 
 	thread_lock(t);
-	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC)
+	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
+	    t->t_state == TS_WAIT)
 		fss_inactive(t);
 	ASSERT(fssproj_old->fssp_threads > 0);
 	if (--fssproj_old->fssp_threads == 0) {
@@ -2449,7 +2529,8 @@
 	fssproc->fss_proj = fssproj_new;
 	fssproc->fss_fsspri = 0;
 	fssproj_new->fssp_threads++;
-	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC)
+	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
+	    t->t_state == TS_WAIT)
 		fss_active(t);
 	thread_unlock(t);
 	if (free) {
@@ -2528,12 +2609,14 @@
 
 	fssproj_new->fssp_threads++;
 	thread_lock(t);
-	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC)
-		fss_inactive(t);
+	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
+	    t->t_state == TS_WAIT)
+		fss_inactive(t);
 	fssproc->fss_proj = fssproj_new;
 	fssproc->fss_fsspri = 0;
-	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC)
-		fss_active(t);
+	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
+	    t->t_state == TS_WAIT)
+		fss_active(t);
 	thread_unlock(t);
 	mutex_exit(&fsspset_new->fssps_lock);
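
The fss_tick() changes above funnel every condition that can force the thread off the CPU through a single call_cpu_surrender flag, so the FSSBACKQ flag is set and cpu_surrender() runs in exactly one place. A minimal user-space sketch of that shape follows; fake_thread, cpu_surrender_stub and tick are hypothetical stand-ins, not the real dispatcher interfaces.

	#include <stdbool.h>
	#include <stdio.h>

	#define	FSSBACKQ	0x1

	struct fake_thread { int flags; };

	static void
	cpu_surrender_stub(struct fake_thread *t)
	{
		printf("thread %p surrenders the CPU\n", (void *)t);
	}

	static void
	tick(struct fake_thread *t, bool quantum_expired,
	    bool better_thread_waiting, bool cap_exceeded)
	{
		bool call_cpu_surrender = false;

		if (quantum_expired)
			call_cpu_surrender = true;	/* could not lower priority */
		if (better_thread_waiting)
			call_cpu_surrender = true;	/* higher-priority work exists */
		if (cap_exceeded && !(t->flags & FSSBACKQ))
			call_cpu_surrender = true;	/* cap enforcement, not queued yet */

		if (call_cpu_surrender) {
			t->flags |= FSSBACKQ;	/* go to the back of the run queue */
			cpu_surrender_stub(t);	/* surrender in exactly one place */
		}
	}

	int
	main(void)
	{
		struct fake_thread t = { 0 };

		tick(&t, false, false, true);
		return (0);
	}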
 
--- a/usr/src/uts/common/disp/fx.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/disp/fx.c	Fri Mar 09 15:55:28 2007 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -54,6 +53,7 @@
 #include <sys/policy.h>
 #include <sys/sdt.h>
 #include <sys/cpupart.h>
+#include <sys/cpucaps.h>
 
 static pri_t fx_init(id_t, int, classfuncs_t **);
 
@@ -85,40 +85,6 @@
 #define	FX_MAX_UNPRIV_PRI	0	/* maximum unprivileged priority */
 
 /*
- * The fxproc_t structures are kept in an array of circular doubly linked
- * lists. A hash on the thread pointer is used to determine which list
- * each fxproc structure should be placed. Each list has a dummy "head" which
- * is never removed, so the list is never empty.
- */
-
-#define	FX_LISTS 16		/* number of lists, must be power of 2 */
-#define	FX_LIST_HASH(tp)	(((uintptr_t)(tp) >> 9) & (FX_LISTS - 1))
-
-#define	FX_LIST_INSERT(fxpp)						\
-{									\
-	int index = FX_LIST_HASH(fxpp->fx_tp);				\
-	kmutex_t *lockp = &fx_list_lock[index];				\
-	fxproc_t *headp = &fx_plisthead[index];				\
-	mutex_enter(lockp);						\
-	fxpp->fx_next = headp->fx_next;					\
-	fxpp->fx_prev = headp;						\
-	headp->fx_next->fx_prev = fxpp;					\
-	headp->fx_next = fxpp;						\
-	mutex_exit(lockp);						\
-}
-
-#define	FX_LIST_DELETE(fxpp)						\
-{									\
-	int index = FX_LIST_HASH(fxpp->fx_tp);				\
-	kmutex_t *lockp = &fx_list_lock[index];				\
-	mutex_enter(lockp);						\
-	fxpp->fx_prev->fx_next = fxpp->fx_next;				\
-	fxpp->fx_next->fx_prev = fxpp->fx_prev;				\
-	mutex_exit(lockp);						\
-}
-
-
-/*
  * The fxproc_t structures that have a registered callback vector,
  * are also kept in an array of circular doubly linked lists. A hash on
  * the thread id (from ddi_get_kt_did()) is used to determine which list
@@ -192,10 +158,6 @@
 static kmutex_t	fx_dptblock;	/* protects fixed priority dispatch table */
 
 
-static kmutex_t	fx_list_lock[FX_LISTS];	/* protects fxproc lists */
-static fxproc_t	fx_plisthead[FX_LISTS];	/* dummy fxproc at head of lists */
-
-
 static kmutex_t	fx_cb_list_lock[FX_CB_LISTS];	/* protects list of fxprocs */
 						/* that have callbacks */
 static fxproc_t	fx_cb_plisthead[FX_CB_LISTS];	/* dummy fxproc at head of */
@@ -316,14 +278,6 @@
 	fx_cid = cid;		/* Record our class ID */
 
 	/*
-	 * Initialize the fxproc hash table
-	 */
-	for (i = 0; i < FX_LISTS; i++) {
-		fx_plisthead[i].fx_next = fx_plisthead[i].fx_prev =
-		    &fx_plisthead[i];
-	}
-
-	/*
 	 * Initialize the hash table for fxprocs with callbacks
 	 */
 	for (i = 0; i < FX_CB_LISTS; i++) {
@@ -477,7 +431,6 @@
 	return (0);
 }
 
-
 /*
  * Allocate a fixed priority class specific thread structure and
  * initialize it with the parameters supplied. Also move the thread
@@ -565,6 +518,7 @@
 	}
 
 	fxpp->fx_timeleft = fxpp->fx_pquantum;
+	cpucaps_sc_init(&fxpp->fx_caps);
 	fxpp->fx_tp = t;
 
 	thread_lock(t);			/* get dispatcher lock on thread */
@@ -575,8 +529,6 @@
 	fx_change_priority(t, fxpp);
 	thread_unlock(t);
 
-	FX_LIST_INSERT(fxpp);
-
 	return (0);
 }
 
@@ -591,6 +543,8 @@
 	thread_lock(t);
 	fxpp = (fxproc_t *)(t->t_cldata);
 
+	(void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ONLY);
+
 	if (FX_HAS_CB(fxpp)) {
 		FX_CB_EXIT(FX_CALLB(fxpp), fxpp->fx_cookie);
 		fxpp->fx_callback = NULL;
@@ -599,6 +553,7 @@
 		FX_CB_LIST_DELETE(fxpp);
 		return;
 	}
+
 	thread_unlock(t);
 }
 
@@ -621,7 +576,6 @@
 		FX_CB_LIST_DELETE(fxpp);
 	} else
 		thread_unlock(fxpp->fx_tp);
-	FX_LIST_DELETE(fxpp);
 
 	kmem_free(fxpp, sizeof (fxproc_t));
 }
@@ -662,6 +616,7 @@
 	cfxpp->fx_callback = NULL;
 	cfxpp->fx_cookie = NULL;
 	cfxpp->fx_flags = pfxpp->fx_flags & ~(FXBACKQ);
+	cpucaps_sc_init(&cfxpp->fx_caps);
 
 	cfxpp->fx_tp = ct;
 	ct->t_cldata = (void *)cfxpp;
@@ -670,7 +625,6 @@
 	/*
 	 * Link new structure into fxproc list.
 	 */
-	FX_LIST_INSERT(cfxpp);
 	return (0);
 }
 
@@ -1157,13 +1111,12 @@
 fx_preempt(kthread_t *t)
 {
 	fxproc_t	*fxpp = (fxproc_t *)(t->t_cldata);
-#ifdef KSLICE
-	extern int	kslice;
-#endif
 
 	ASSERT(t == curthread);
 	ASSERT(THREAD_LOCK_HELD(curthread));
 
+	(void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ONLY);
+
 	/*
 	 * Check to see if we're doing "preemption control" here.  If
 	 * we are, and if the user has requested that this thread not
@@ -1209,17 +1162,20 @@
 		THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
 	}
 
+	/*
+	 * This thread may be placed on the wait queue by CPU caps; if so, we
+	 * do not need to do anything until it is removed from the wait queue.
+	 */
+	if (CPUCAPS_ENFORCE(t)) {
+		return;
+	}
+
 	if ((fxpp->fx_flags & (FXBACKQ)) == FXBACKQ) {
 		fxpp->fx_timeleft = fxpp->fx_pquantum;
 		fxpp->fx_flags &= ~FXBACKQ;
 		setbackdq(t);
 	} else {
-#ifdef KSLICE
-		if (kslice)
-			setbackdq(t);
-		else
-#endif
-			setfrontdq(t);
+		setfrontdq(t);
 	}
 }
 
@@ -1250,6 +1206,11 @@
 	ASSERT(t == curthread);
 	ASSERT(THREAD_LOCK_HELD(t));
 
+	/*
+	 * Account for time spent on CPU before going to sleep.
+	 */
+	(void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ONLY);
+
 	if (FX_HAS_CB(fxpp)) {
 		FX_CB_SLEEP(FX_CALLB(fxpp), fxpp->fx_cookie);
 	}
@@ -1318,6 +1279,7 @@
 static void
 fx_tick(kthread_t *t)
 {
+	boolean_t call_cpu_surrender = B_FALSE;
 	fxproc_t *fxpp;
 
 	ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
@@ -1342,6 +1304,14 @@
 			fx_change_priority(t, fxpp);
 		}
 	}
+
+	/*
+	 * Keep track of the thread's project CPU usage.  Note that projects
+	 * get charged even when threads are running in the kernel.
+	 */
+	call_cpu_surrender = CPUCAPS_CHARGE(t, &fxpp->fx_caps,
+	    CPUCAPS_CHARGE_ENFORCE);
+
 	if ((fxpp->fx_pquantum != FX_TQINF) &&
 	    (--fxpp->fx_timeleft <= 0)) {
 		pri_t	new_pri;
@@ -1379,15 +1349,17 @@
 		if (thread_change_pri(t, new_pri, 0)) {
 			fxpp->fx_timeleft = fxpp->fx_pquantum;
 		} else {
-			fxpp->fx_flags |= FXBACKQ;
-			cpu_surrender(t);
+			call_cpu_surrender = B_TRUE;
 		}
 	} else if (t->t_state == TS_ONPROC &&
 		    t->t_pri < t->t_disp_queue->disp_maxrunpri) {
+		call_cpu_surrender = B_TRUE;
+	}
+
+	if (call_cpu_surrender) {
 		fxpp->fx_flags |= FXBACKQ;
 		cpu_surrender(t);
 	}
-
 	thread_unlock_nopreempt(t);	/* clock thread can't be preempted */
 }
 
@@ -1453,6 +1425,11 @@
 	ASSERT(t == curthread);
 	ASSERT(THREAD_LOCK_HELD(t));
 
+	/*
+	 * Account for time spent on CPU before yielding.
+	 */
+	(void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ONLY);
+
 	if (FX_HAS_CB(fxpp))  {
 		clock_t new_quantum =  (clock_t)fxpp->fx_pquantum;
 		pri_t	newpri = fxpp->fx_pri;
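
fx_preempt() above (and fss_preempt()/ts_preempt() elsewhere in this patch) shares one control-flow shape: charge the accumulated CPU time first, then let cap enforcement divert the thread to the wait queue, and only requeue it on a dispatch queue if it stays runnable. A compilable sketch of that flow, with charge, enforce and preempt as illustrative stubs rather than the kernel functions:

	#include <stdio.h>

	static int caps_on = 1;		/* stands in for cpucaps_enabled */

	static void
	charge(const char *who)
	{
		printf("charge %s\n", who);
	}

	/* Pretend enforcement: returns 1 when the thread was moved to a waitq. */
	static int
	enforce(const char *who)
	{
		printf("enforce %s\n", who);
		return (1);
	}

	static void
	preempt(const char *who)
	{
		if (caps_on) {
			charge(who);
			if (enforce(who))
				return;	/* thread now sits on the wait queue */
		}
		printf("requeue %s on a dispatch queue\n", who);
	}

	int
	main(void)
	{
		preempt("t1");
		return (0);
	}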
--- a/usr/src/uts/common/disp/sysclass.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/disp/sysclass.c	Fri Mar 09 15:55:28 2007 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,9 +18,10 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright (c) 1996-2001 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -98,11 +98,7 @@
 		sys_swappri,	/* swapin */
 		sys_swappri,	/* swapout */
 		sys_nullsys,	/* trapret */
-#ifdef KSLICE
-		sys_preempt,
-#else
-		setfrontdq,
-#endif
+		setfrontdq,	/* preempt */
 		setbackdq,	/* setrun */
 		sys_nullsys,	/* sleep */
 		sys_nullsys,	/* tick */
@@ -218,21 +214,6 @@
 {
 }
 
-#ifdef KSLICE
-static void
-sys_preempt(t)
-	kthread_id_t	t;
-{
-	extern int	kslice;
-
-	if (kslice)
-		setbackdq(t);
-	else
-		setfrontdq(t);
-}
-#endif
-
-
 /* ARGSUSED */
 static int
 sys_donice(t, cr, incr, retvalp)
--- a/usr/src/uts/common/disp/thread.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/disp/thread.c	Fri Mar 09 15:55:28 2007 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -73,6 +73,8 @@
 #include <sys/sdt.h>
 #include <sys/reboot.h>
 #include <sys/kdi.h>
+#include <sys/waitq.h>
+#include <sys/cpucaps.h>
 
 struct kmem_cache *thread_cache;	/* cache of free threads */
 struct kmem_cache *lwp_cache;		/* cache of free lwps */
@@ -185,10 +187,18 @@
 	label_init();
 	cred_init();
 
+	/*
+	 * Initialize various resource management facilities.
+	 */
 	rctl_init();
+	cpucaps_init();
+	/*
+	 * zone_init() should be called before project_init() so that the
+	 * project ID for the first project is initialized correctly.
+	 */
+	zone_init();
 	project_init();
 	brand_init();
-	zone_init();
 	task_init();
 	tcache_init();
 	pool_init();
@@ -1070,6 +1080,8 @@
 		 * Already on dispatcher queue.
 		 */
 		return;
+	} else if (t->t_state == TS_WAIT) {
+		waitq_setrun(t);
 	} else if (t->t_state == TS_STOPPED) {
 		/*
 		 * All of the sending of SIGCONT (TC_XSTART) and /proc
@@ -1111,8 +1123,6 @@
 		 */
 		CL_SETRUN(t);
 	}
-
-
 }
 
 void
@@ -1623,7 +1633,7 @@
 	 * If it's not on a queue, change the priority with
 	 * impunity.
 	 */
-	if ((state & (TS_SLEEP | TS_RUN)) == 0) {
+	if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
 		t->t_epri = disp_pri;
 
 		if (state == TS_ONPROC) {
@@ -1639,7 +1649,6 @@
 	 * It's either on a sleep queue or a run queue.
 	 */
 	if (state == TS_SLEEP) {
-
 		/*
 		 * Take the thread out of its sleep queue.
 		 * Change the inherited priority.
@@ -1648,6 +1657,13 @@
 		 * to do this in an appropriate manner.
 		 */
 		SOBJ_CHANGE_EPRI(t->t_sobj_ops, t, disp_pri);
+	} else if (state == TS_WAIT) {
+		/*
+		 * Re-enqueue a thread on the wait queue if its
+		 * effective priority needs to change.
+		 */
+		if (disp_pri != t->t_epri)
+			waitq_change_pri(t, disp_pri);
 	} else {
 		/*
 		 * The thread is on a run queue.
@@ -1682,7 +1698,7 @@
 	 * If it's not on a queue, change the priority with
 	 * impunity.
 	 */
-	if ((state & (TS_SLEEP | TS_RUN)) == 0) {
+	if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
 		t->t_pri = disp_pri;
 
 		if (state == TS_ONPROC) {
@@ -1707,6 +1723,13 @@
 		 */
 		if (disp_pri != t->t_pri)
 			SOBJ_CHANGE_PRI(t->t_sobj_ops, t, disp_pri);
+	} else if (state == TS_WAIT) {
+		/*
+		 * Re-enqueue a thread on the wait queue if its
+		 * priority needs to change.
+		 */
+		if (disp_pri != t->t_pri)
+			waitq_change_pri(t, disp_pri);
 	} else {
 		/*
 		 * The thread is on a run queue.
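
thread_change_pri() and thread_change_epri() above gain a TS_WAIT arm: the action taken for a priority change depends entirely on which queue, if any, currently holds the thread. A small compilable dispatch that paraphrases those cases (the strings and the TS_OTHER name are illustrative):

	#include <stdio.h>

	enum tstate { TS_SLEEP, TS_RUN, TS_ONPROC, TS_WAIT, TS_OTHER };

	static const char *
	pri_change_action(enum tstate s)
	{
		switch (s) {
		case TS_SLEEP:	return ("ask the sleep queue to re-sort the thread");
		case TS_RUN:	return ("dequeue and requeue on the run queue");
		case TS_WAIT:	return ("waitq_change_pri(): relink on the wait queue");
		default:	return ("not queued: assign the priority directly");
		}
	}

	int
	main(void)
	{
		printf("%s\n", pri_change_action(TS_WAIT));
		return (0);
	}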
--- a/usr/src/uts/common/disp/ts.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/disp/ts.c	Fri Mar 09 15:55:28 2007 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -59,10 +59,10 @@
 #include <sys/policy.h>
 #include <sys/sdt.h>
 #include <sys/cpupart.h>
-
 #include <vm/rm.h>
 #include <vm/seg_kmem.h>
 #include <sys/modctl.h>
+#include <sys/cpucaps.h>
 
 static pri_t ts_init(id_t, int, classfuncs_t **);
 
@@ -194,6 +194,7 @@
 static int	ts_vaparmsin(void *, pc_vaparms_t *);
 static int	ts_vaparmsout(void *, pc_vaparms_t *);
 static int	ts_parmsset(kthread_t *, void *, id_t, cred_t *);
+static void	ts_exit(kthread_t *);
 static int	ts_donice(kthread_t *, cred_t *, int, int *);
 static void	ts_exitclass(void *);
 static int	ts_canexit(kthread_t *, cred_t *);
@@ -258,7 +259,7 @@
 	ts_parmsget,
 	ts_parmsset,
 	ts_nullsys,	/* stop */
-	ts_nullsys,	/* exit */
+	ts_exit,
 	ts_nullsys,	/* active */
 	ts_nullsys,	/* inactive */
 	ts_swapin,
@@ -302,7 +303,7 @@
 	ia_parmsget,
 	ia_parmsset,
 	ts_nullsys,	/* stop */
-	ts_nullsys,	/* exit */
+	ts_exit,
 	ts_nullsys,	/* active */
 	ts_nullsys,	/* inactive */
 	ts_swapin,
@@ -622,6 +623,7 @@
 	tspp->ts_dispwait = 0;
 	tspp->ts_timeleft = ts_dptbl[tspp->ts_cpupri].ts_quantum;
 	tspp->ts_tp = t;
+	cpucaps_sc_init(&tspp->ts_caps);
 
 	/*
 	 * Reset priority. Process goes to a "user mode" priority
@@ -703,6 +705,7 @@
 	ctspp->ts_dispwait = 0;
 	ctspp->ts_flags = ptspp->ts_flags & ~(TSKPRI | TSBACKQ | TSRESTORE);
 	ctspp->ts_tp = ct;
+	cpucaps_sc_init(&ctspp->ts_caps);
 	thread_unlock(t);
 
 	/*
@@ -1307,6 +1310,24 @@
 	return (ts_parmsset(tx, parmsp, reqpcid, reqpcredp));
 }
 
+static void
+ts_exit(kthread_t *t)
+{
+	tsproc_t *tspp;
+
+	if (CPUCAPS_ON()) {
+		/*
+		 * A thread could be exiting in between clock ticks,
+		 * so we need to calculate how much CPU time it used
+		 * since it was charged last time.
+		 */
+		thread_lock(t);
+		tspp = (tsproc_t *)t->t_cldata;
+		(void) cpucaps_charge(t, &tspp->ts_caps, CPUCAPS_CHARGE_ONLY);
+		thread_unlock(t);
+	}
+}
+
 /*
  * Return the global scheduling priority that would be assigned
  * to a thread entering the time-sharing class with the ts_upri.
@@ -1337,10 +1358,7 @@
 ts_preempt(kthread_t *t)
 {
 	tsproc_t	*tspp = (tsproc_t *)(t->t_cldata);
-	klwp_t		*lwp;
-#ifdef KSLICE
-	extern int	kslice;
-#endif
+	klwp_t		*lwp = curthread->t_lwp;
 	pri_t		oldpri = t->t_pri;
 
 	ASSERT(t == curthread);
@@ -1350,7 +1368,6 @@
 	 * If preempted in the kernel, make sure the thread has
 	 * a kernel priority if needed.
 	 */
-	lwp = curthread->t_lwp;
 	if (!(tspp->ts_flags & TSKPRI) && lwp != NULL && t->t_kpri_req) {
 		tspp->ts_flags |= TSKPRI;
 		THREAD_CHANGE_PRI(t, ts_kmdpris[0]);
@@ -1358,9 +1375,21 @@
 		t->t_trapret = 1;		/* so ts_trapret will run */
 		aston(t);
 	}
+
 	/*
-	 * If preempted in user-land mark the thread
-	 * as swappable because I know it isn't holding any locks.
+	 * This thread may be placed on wait queue by CPU Caps. In this case we
+	 * do not need to do anything until it is removed from the wait queue.
+	 * Do not enforce CPU caps on threads running at a kernel priority
+	 */
+	if (CPUCAPS_ON()) {
+		(void) cpucaps_charge(t, &tspp->ts_caps, CPUCAPS_CHARGE_ONLY);
+		if (!(tspp->ts_flags & TSKPRI) && CPUCAPS_ENFORCE(t))
+			return;
+	}
+
+	/*
+	 * If the thread was preempted in userland, then we know
+	 * it isn't holding any locks.  Mark it as swappable.
 	 */
 	ASSERT(t->t_schedflag & TS_DONT_SWAP);
 	if (lwp != NULL && lwp->lwp_state == LWP_USER)
@@ -1420,12 +1449,7 @@
 		tspp->ts_flags &= ~TSBACKQ;
 		setbackdq(t);
 	} else {
-#ifdef KSLICE
-		if (kslice)
-			setbackdq(t);
-		else
-#endif
-			setfrontdq(t);
+		setfrontdq(t);
 	}
 
 done:
@@ -1482,6 +1506,11 @@
 	ASSERT(t == curthread);
 	ASSERT(THREAD_LOCK_HELD(t));
 
+	/*
+	 * Account for time spent on CPU before going to sleep.
+	 */
+	(void) CPUCAPS_CHARGE(t, &tspp->ts_caps, CPUCAPS_CHARGE_ONLY);
+
 	flags = tspp->ts_flags;
 	if (t->t_kpri_req) {
 		tspp->ts_flags = flags | TSKPRI;
@@ -1605,7 +1634,8 @@
 
 	if (INHERITED(t) || (tspp->ts_flags & (TSKPRI | TSIASET)) ||
 	    (t->t_proc_flag & TP_LWPEXIT) ||
-	    (t->t_state & (TS_ZOMB | TS_FREE | TS_STOPPED | TS_ONPROC)) ||
+	    (t->t_state & (TS_ZOMB | TS_FREE | TS_STOPPED |
+	    TS_ONPROC | TS_WAIT)) ||
 	    !(t->t_schedflag & TS_LOAD) || !SWAP_OK(t))
 		return (-1);
 
@@ -1653,17 +1683,27 @@
  * move thread to priority specified in tsdptbl for time slice expiration
  * and set runrun to cause preemption.
  */
-
 static void
 ts_tick(kthread_t *t)
 {
 	tsproc_t *tspp = (tsproc_t *)(t->t_cldata);
 	klwp_t *lwp;
+	boolean_t call_cpu_surrender = B_FALSE;
 	pri_t	oldpri = t->t_pri;
 
 	ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
 
 	thread_lock(t);
+
+	/*
+	 * Keep track of the thread's project CPU usage.  Note that projects
+	 * get charged even when threads are running in the kernel.
+	 */
+	if (CPUCAPS_ON()) {
+		call_cpu_surrender = cpucaps_charge(t, &tspp->ts_caps,
+		    CPUCAPS_CHARGE_ENFORCE) && !(tspp->ts_flags & TSKPRI);
+	}
+
 	if ((tspp->ts_flags & TSKPRI) == 0) {
 		if (--tspp->ts_timeleft <= 0) {
 			pri_t	new_pri;
@@ -1709,17 +1749,21 @@
 				tspp->ts_timeleft =
 				    ts_dptbl[tspp->ts_cpupri].ts_quantum;
 			} else {
-				tspp->ts_flags |= TSBACKQ;
-				cpu_surrender(t);
+				call_cpu_surrender = B_TRUE;
 			}
 			TRACE_2(TR_FAC_DISP, TR_TICK,
 			    "tick:tid %p old pri %d", t, oldpri);
 		} else if (t->t_state == TS_ONPROC &&
 			    t->t_pri < t->t_disp_queue->disp_maxrunpri) {
-			tspp->ts_flags |= TSBACKQ;
-			cpu_surrender(t);
+			call_cpu_surrender = B_TRUE;
 		}
 	}
+
+	if (call_cpu_surrender) {
+		tspp->ts_flags |= TSBACKQ;
+		cpu_surrender(t);
+	}
+
 	thread_unlock_nopreempt(t);	/* clock thread can't be preempted */
 }
 
@@ -1877,8 +1921,8 @@
 			goto next;
 		if (tx->t_schedctl && schedctl_get_nopreempt(tx))
 			goto next;
-		if (tx->t_state != TS_RUN && (tx->t_state != TS_SLEEP ||
-		    !ts_sleep_promote)) {
+		if (tx->t_state != TS_RUN && tx->t_state != TS_WAIT &&
+		    (tx->t_state != TS_SLEEP || !ts_sleep_promote)) {
 			/* make next syscall/trap do CL_TRAPRET */
 			tx->t_trapret = 1;
 			aston(tx);
@@ -1907,7 +1951,6 @@
 	return (updated);
 }
 
-
 /*
  * Processes waking up go to the back of their queue.  We don't
  * need to assign a time quantum here because thread is still
@@ -1981,6 +2024,11 @@
 	ASSERT(THREAD_LOCK_HELD(t));
 
 	/*
+	 * Account for time spent on CPU before yielding.
+	 */
+	(void) CPUCAPS_CHARGE(t, &tspp->ts_caps, CPUCAPS_CHARGE_ONLY);
+
+	/*
 	 * Clear the preemption control "yield" bit since the user is
 	 * doing a yield.
 	 */
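
ts_exit() above charges an exiting thread for CPU time used since the last clock tick. That implies delta-based bookkeeping: csc_cputime remembers the lifetime total observed at the previous charge, and each charge pays only the difference. A sketch under that assumption; my_hrtime_t, caps_sc_sketch_t and charge_delta are illustrative names, and the arithmetic is not the real cpucaps_charge():

	#include <stdio.h>

	typedef long long my_hrtime_t;

	typedef struct caps_sc_sketch {
		my_hrtime_t csc_cputime;	/* total on-CPU time at last charge */
	} caps_sc_sketch_t;

	/*
	 * Charge the difference between the current lifetime total and the
	 * total seen at the previous charge, so a thread exiting between
	 * clock ticks still pays for its final partial tick.
	 */
	static my_hrtime_t
	charge_delta(caps_sc_sketch_t *csc, my_hrtime_t total_now)
	{
		my_hrtime_t delta = total_now - csc->csc_cputime;

		csc->csc_cputime = total_now;
		return (delta > 0 ? delta : 0);
	}

	int
	main(void)
	{
		caps_sc_sketch_t sc = { 0 };

		(void) charge_delta(&sc, 900);		/* at the last clock tick */
		printf("exit charge: %lld\n", charge_delta(&sc, 950));	/* 50 */
		return (0);
	}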
--- a/usr/src/uts/common/dtrace/sdt_subr.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/dtrace/sdt_subr.c	Fri Mar 09 15:55:28 2007 -0800
@@ -115,6 +115,10 @@
 	{ "sched", "schedctl-yield", 0, 0, "int" },
 	{ "sched", "surrender", 0, 0, "kthread_t *", "lwpsinfo_t *" },
 	{ "sched", "surrender", 1, 0, "kthread_t *", "psinfo_t *" },
+	{ "sched", "cpucaps-sleep", 0, 0, "kthread_t *", "lwpsinfo_t *" },
+	{ "sched", "cpucaps-sleep", 1, 0, "kthread_t *", "psinfo_t *" },
+	{ "sched", "cpucaps-wakeup", 0, 0, "kthread_t *", "lwpsinfo_t *" },
+	{ "sched", "cpucaps-wakeup", 1, 0, "kthread_t *", "psinfo_t *" },
 	{ "proc", "create", 0, 0, "proc_t *", "psinfo_t *" },
 	{ "proc", "exec", 0, 0, "string" },
 	{ "proc", "exec-failure", 0, 0, "int" },
--- a/usr/src/uts/common/fs/proc/prcontrol.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/fs/proc/prcontrol.c	Fri Mar 09 15:55:28 2007 -0800
@@ -1034,8 +1034,12 @@
 			t->t_proc_flag |= TP_PRSTOP;
 			t->t_sig_check = 1;	/* do ISSIG */
 		}
-		if (t->t_state == TS_SLEEP &&
-		    (t->t_flag & T_WAKEABLE)) {
+
+		/* Move the thread from wait queue to run queue */
+		if (ISWAITING(t))
+			setrun_locked(t);
+
+		if (ISWAKEABLE(t)) {
 			if (t->t_wchan0 == NULL)
 				setrun_locked(t);
 			else if (!VSTOPPED(t)) {
@@ -1452,9 +1456,8 @@
 			}
 		}
 		thread_lock(t);
-		if (t->t_state == TS_SLEEP &&
-		    (t->t_flag & T_WAKEABLE)) {
-			/* Set signalled sleeping lwp running */
+		if (ISWAKEABLE(t) || ISWAITING(t)) {
+			/* Set signalled sleeping/waiting lwp running */
 			setrun_locked(t);
 		} else if (t->t_state == TS_STOPPED && sig == SIGKILL) {
 			/* If SIGKILL, set stopped lwp running */
@@ -1759,8 +1762,7 @@
 
 	schedctl_finish_sigblock(t);
 	sigutok(sp, &t->t_hold);
-	if (t->t_state == TS_SLEEP &&
-	    (t->t_flag & T_WAKEABLE) &&
+	if (ISWAKEABLE(t) &&
 	    (fsig(&p->p_sig, t) || fsig(&t->t_sig, t)))
 		setrun_locked(t);
 	t->t_sig_check = 1;	/* so thread will see new holdmask */
@@ -2363,10 +2365,9 @@
 			thread_lock(t);
 			t->t_proc_flag |= TP_PAUSE;
 			aston(t);
-			if (t->t_state == TS_SLEEP &&
-			    (t->t_flag & T_WAKEABLE)) {
-				if (t->t_wchan0 == NULL)
-					setrun_locked(t);
+			if ((ISWAKEABLE(t) && (t->t_wchan0 == NULL)) ||
+			    ISWAITING(t)) {
+				setrun_locked(t);
 			}
 			prpokethread(t);
 			thread_unlock(t);
--- a/usr/src/uts/common/fs/proc/prsubr.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/fs/proc/prsubr.c	Fri Mar 09 15:55:28 2007 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -166,6 +166,7 @@
 			}
 			break;
 		case TS_RUN:
+		case TS_WAIT:
 			if (t_run == NULL)
 				t_run = t;
 			break;
@@ -2507,6 +2508,7 @@
 	case TS_ONPROC:		state = SONPROC;	c = 'O';	break;
 	case TS_ZOMB:		state = SZOMB;		c = 'Z';	break;
 	case TS_STOPPED:	state = SSTOP;		c = 'T';	break;
+	case TS_WAIT:		state = SWAIT;		c = 'W';	break;
 	default:		state = 0;		c = '?';	break;
 	}
 	psp->pr_state = state;
@@ -2573,6 +2575,7 @@
 	case TS_ONPROC:		state = SONPROC;	c = 'O';	break;
 	case TS_ZOMB:		state = SZOMB;		c = 'Z';	break;
 	case TS_STOPPED:	state = SSTOP;		c = 'T';	break;
+	case TS_WAIT:		state = SWAIT;		c = 'W';	break;
 	default:		state = 0;		c = '?';	break;
 	}
 	psp->pr_state = state;
--- a/usr/src/uts/common/os/clock.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/os/clock.c	Fri Mar 09 15:55:28 2007 -0800
@@ -79,8 +79,7 @@
 #include <sys/inttypes.h>
 
 /*
- * clock is called straight from
- * the real time clock interrupt.
+ * clock() is called straight from the clock cyclic; see clock_init().
  *
  * Functions:
  *	reprime clock
@@ -314,10 +313,7 @@
 static void loadavg_update();
 
 void (*cmm_clock_callout)() = NULL;
-
-#ifdef	KSLICE
-int kslice = KSLICE;
-#endif
+void (*cpucaps_clock_callout)() = NULL;
 
 static void
 clock(void)
@@ -513,9 +509,10 @@
 
 	/*
 	 * Do tick processing for all the active threads running in
-	 * the system.
+	 * the system.  We're trying to be more fair by walking the
+	 * list of CPUs starting from a different CPU each time.
 	 */
-	cp = cpu_list;
+	cp = clock_cpu_list;
 	nrunning = 0;
 	do {
 		klwp_id_t lwp;
@@ -649,21 +646,11 @@
 			clock_tick(t);
 		}
 
-#ifdef KSLICE
-		/*
-		 * Ah what the heck, give this kid a taste of the real
-		 * world and yank the rug out from under it.
-		 * But, only if we are running UniProcessor.
-		 */
-		if ((kslice) && (ncpus == 1)) {
-			aston(t);
-			cp->cpu_runrun = 1;
-			cp->cpu_kprunrun = 1;
-		}
-#endif
 		if (!exiting)
 			mutex_exit(plockp);
-	} while ((cp = cp->cpu_next) != cpu_list);
+	} while ((cp = cp->cpu_next) != clock_cpu_list);
+
+	clock_cpu_list = clock_cpu_list->cpu_next;
 
 	/*
 	 * bump time in ticks
@@ -683,6 +670,9 @@
 	if ((funcp = cmm_clock_callout) != NULL)
 		(*funcp)();
 
+	if ((funcp = cpucaps_clock_callout) != NULL)
+		(*funcp)();
+
 	/*
 	 * Wakeup the cageout thread waiters once per second.
 	 */
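
The clock() change above walks the circular CPU list starting from clock_cpu_list and then advances that head by one, so a different CPU is processed first on each tick. A self-contained miniature of the rotating-start walk (struct cpu here is a two-field stand-in and clock_cpu_list_sketch mirrors the new global):

	#include <stdio.h>

	struct cpu {
		struct cpu *cpu_next;	/* circular list */
		int cpu_id;
	};

	static struct cpu *clock_cpu_list_sketch;

	static void
	tick_all_cpus(void (*tick)(struct cpu *))
	{
		struct cpu *cp = clock_cpu_list_sketch;

		do {
			tick(cp);
		} while ((cp = cp->cpu_next) != clock_cpu_list_sketch);

		/*
		 * Rotate the starting point so the same CPU is not always
		 * processed first on every tick.
		 */
		clock_cpu_list_sketch = clock_cpu_list_sketch->cpu_next;
	}

	static void
	print_cpu(struct cpu *cp)
	{
		printf("tick cpu %d\n", cp->cpu_id);
	}

	int
	main(void)
	{
		struct cpu c0 = { NULL, 0 }, c1 = { NULL, 1 };

		c0.cpu_next = &c1;
		c1.cpu_next = &c0;
		clock_cpu_list_sketch = &c0;

		tick_all_cpus(print_cpu);	/* starts at cpu 0 */
		tick_all_cpus(print_cpu);	/* starts at cpu 1 */
		return (0);
	}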
--- a/usr/src/uts/common/os/cpu.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/os/cpu.c	Fri Mar 09 15:55:28 2007 -0800
@@ -105,6 +105,7 @@
  */
 kmutex_t	cpu_lock;
 cpu_t		*cpu_list;		/* list of all CPUs */
+cpu_t		*clock_cpu_list;	/* used by clock to walk CPUs */
 cpu_t		*cpu_active;		/* list of active CPUs */
 static cpuset_t	cpu_available;		/* set of available CPUs */
 cpuset_t	cpu_seqid_inuse;	/* which cpu_seqids are in use */
@@ -1618,6 +1619,7 @@
 	cp->cpu_next = cp;
 	cp->cpu_prev = cp;
 	cpu_list = cp;
+	clock_cpu_list = cp;
 
 	cp->cpu_next_onln = cp;
 	cp->cpu_prev_onln = cp;
@@ -1763,7 +1765,10 @@
 	cp->cpu_prev->cpu_next = cp->cpu_next;
 	cp->cpu_next->cpu_prev = cp->cpu_prev;
 	if (cp == cpu_list)
-	    cpu_list = cpnext;
+		cpu_list = cpnext;
+	if (cp == clock_cpu_list)
+		clock_cpu_list = cpnext;
 
 	/*
 	 * Signals that the cpu has been deleted (see above).
--- a/usr/src/uts/common/os/kstat_fr.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/os/kstat_fr.c	Fri Mar 09 15:55:28 2007 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
@@ -289,7 +289,9 @@
 	ekstat_t *e = (ekstat_t *)k;
 	kstat_zone_t *kz;
 
-	kz = kmem_alloc(sizeof (*kz), KM_SLEEP);
+	kz = kmem_alloc(sizeof (*kz), KM_NOSLEEP);
+	if (kz == NULL)
+		return;
 	mutex_enter(&kstat_chain_lock);
 	kz->zoneid = zoneid;
 	kz->next = e->e_zone.next;
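
The kstat_zone_add() fix above (bug 6518395) swaps KM_SLEEP for KM_NOSLEEP and tolerates allocation failure, since the caller may not block. A user-space sketch of the same best-effort pattern, with malloc() standing in for kmem_alloc(KM_NOSLEEP) and kz_sketch/zone_track_add as hypothetical names:

	#include <stdlib.h>

	struct kz_sketch {
		int zoneid;
		struct kz_sketch *next;
	};

	/*
	 * Best-effort tracking: on allocation failure we simply skip the
	 * bookkeeping instead of blocking, which is the point of the fix.
	 */
	static void
	zone_track_add(struct kz_sketch **head, int zoneid)
	{
		struct kz_sketch *kz = malloc(sizeof (*kz));

		if (kz == NULL)
			return;		/* caller proceeds without the entry */
		kz->zoneid = zoneid;
		kz->next = *head;
		*head = kz;
	}

	int
	main(void)
	{
		struct kz_sketch *head = NULL;

		zone_track_add(&head, 1);
		return (0);
	}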
--- a/usr/src/uts/common/os/lwp.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/os/lwp.c	Fri Mar 09 15:55:28 2007 -0800
@@ -781,6 +781,7 @@
 
 	t->t_proc_flag |= TP_LWPEXIT;
 	term_mstate(t);
+
 #ifndef NPROBE
 	/* Kernel probe */
 	if (t->t_tnf_tpdp)
@@ -916,10 +917,12 @@
 		 * XXX Should use virtual stop like /proc does instead of
 		 * XXX waking the thread to get it to stop.
 		 */
-		if (t->t_state == TS_SLEEP && (t->t_flag & T_WAKEABLE))
+		if (ISWAKEABLE(t) || ISWAITING(t)) {
 			setrun_locked(t);
-		else if (t->t_state == TS_ONPROC && t->t_cpu != CPU)
+		} else if (t->t_state == TS_ONPROC && t->t_cpu != CPU) {
 			poke_cpu(t->t_cpu->cpu_id);
+		}
+
 		tid = t->t_tid;	 /* remember thread ID */
 		/*
 		 * Wait for lwp to stop
@@ -1360,9 +1363,8 @@
 			continue;
 		thread_lock(t);
 		aston(t);	/* make thread trap or do post_syscall */
-		if (t->t_state == TS_SLEEP) {
-			if (t->t_flag & T_WAKEABLE)
-				setrun_locked(t);
+		if (ISWAKEABLE(t) || ISWAITING(t)) {
+			setrun_locked(t);
 		} else if (t->t_state == TS_STOPPED) {
 			/*
 			 * Ensure that proc_exit() is not blocked by lwps
--- a/usr/src/uts/common/os/msacct.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/os/msacct.c	Fri Mar 09 15:55:28 2007 -0800
@@ -236,6 +236,47 @@
 }
 
 /*
+ * Return an aggregation of user, system, and trap CPU time consumed by
+ * the specified thread in scaled nanoseconds.
+ */
+hrtime_t
+mstate_thread_onproc_time(kthread_t *t)
+{
+	hrtime_t aggr_time;
+	hrtime_t now;
+	hrtime_t state_start;
+	struct mstate *ms;
+	klwp_t *lwp;
+	int	mstate;
+
+	ASSERT(THREAD_LOCK_HELD(t));
+
+	if ((lwp = ttolwp(t)) == NULL)
+		return (0);
+
+	mstate = t->t_mstate;
+	ms = &lwp->lwp_mstate;
+	state_start = ms->ms_state_start;
+
+	aggr_time = ms->ms_acct[LMS_USER] +
+	    ms->ms_acct[LMS_SYSTEM] + ms->ms_acct[LMS_TRAP];
+
+	now = gethrtime_unscaled();
+
+	/*
+	 * NOTE: gethrtime_unscaled() values taken on different CPUs on x86
+	 * can be inconsistent, so it is possible that now < state_start.
+	 */
+	if ((mstate == LMS_USER || mstate == LMS_SYSTEM ||
+	    mstate == LMS_TRAP) && (now > state_start)) {
+		aggr_time += now - state_start;
+	}
+
+	scalehrtime(&aggr_time);
+	return (aggr_time);
+}
+
+/*
  * Return an aggregation of microstate times in scaled nanoseconds (high-res
  * time).  This keeps in mind that p_acct is already scaled, and ms_acct is
  * not.
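
mstate_thread_onproc_time() above sums the finished LMS_USER/LMS_SYSTEM/LMS_TRAP buckets and then, if the thread is currently in one of those states and the timestamps are consistent, adds the still-open interval. The same arithmetic in a compilable miniature (onproc_time and my_hrtime_t are illustrative; the real buckets live in lwp_mstate):

	#include <stdio.h>

	typedef long long my_hrtime_t;

	enum { LMS_USER, LMS_SYSTEM, LMS_TRAP, LMS_SLEEP, NMSTATES };

	/*
	 * Sum the finished on-CPU buckets and, when the thread is currently
	 * in an on-CPU microstate, add the still-open interval, but only if
	 * the timestamps are consistent (now may lag state_start across CPUs).
	 */
	static my_hrtime_t
	onproc_time(const my_hrtime_t acct[NMSTATES], int cur,
	    my_hrtime_t state_start, my_hrtime_t now)
	{
		my_hrtime_t aggr = acct[LMS_USER] + acct[LMS_SYSTEM] +
		    acct[LMS_TRAP];

		if ((cur == LMS_USER || cur == LMS_SYSTEM || cur == LMS_TRAP) &&
		    now > state_start)
			aggr += now - state_start;
		return (aggr);
	}

	int
	main(void)
	{
		my_hrtime_t acct[NMSTATES] = { 100, 40, 10, 0 };

		printf("%lld\n", onproc_time(acct, LMS_USER, 500, 520)); /* 170 */
		return (0);
	}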
--- a/usr/src/uts/common/os/project.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/os/project.c	Fri Mar 09 15:55:28 2007 -0800
@@ -41,6 +41,7 @@
 #include <sys/port_kernel.h>
 #include <sys/task.h>
 #include <sys/zone.h>
+#include <sys/cpucaps.h>
 
 int project_hash_size = 64;
 static kmutex_t project_hash_lock;
@@ -49,6 +50,7 @@
 static kproject_t *projects_list;
 
 rctl_hndl_t rc_project_cpu_shares;
+rctl_hndl_t rc_project_cpu_cap;
 rctl_hndl_t rc_project_nlwps;
 rctl_hndl_t rc_project_ntasks;
 rctl_hndl_t rc_project_msgmni;
@@ -156,6 +158,7 @@
 	kproject_t *kp = (kproject_t *)val;
 
 	ASSERT(kp->kpj_count == 0);
+	ASSERT(kp->kpj_cpucap == NULL);
 	kmem_free(kp, sizeof (kproject_t));
 }
 
@@ -251,6 +254,7 @@
 
 		p = spare_p;
 		p->kpj_id = id;
+		p->kpj_zone = zone;
 		p->kpj_zoneid = zone->zone_id;
 		p->kpj_count = 0;
 		p->kpj_shares = 1;
@@ -304,6 +308,13 @@
 	 * across reboots.
 	 */
 	if (create == B_TRUE) {
+		/*
+		 * Inform CPU caps framework of the new project
+		 */
+		cpucaps_project_add(p);
+		/*
+		 * Set up project kstats
+		 */
 		ksp = project_kstat_create(p, zone);
 		mutex_enter(&project_hash_lock);
 		ASSERT(p->kpj_data.kpd_lockedmem_kstat == NULL);
@@ -343,6 +354,8 @@
 			projects_list = p->kpj_next;
 		mutex_exit(&projects_list_lock);
 
+		cpucaps_project_remove(p);
+
 		rctl_set_free(p->kpj_rctls);
 		project_kstat_delete(p);
 
@@ -431,7 +444,6 @@
 	return (0);
 }
 
-
 static rctl_ops_t project_cpu_shares_ops = {
 	rcop_no_action,
 	project_cpu_shares_usage,
@@ -439,6 +451,43 @@
 	rcop_no_test
 };
 
+
+/*
+ * project.cpu-cap resource control support.
+ */
+/*ARGSUSED*/
+static rctl_qty_t
+project_cpu_cap_get(rctl_t *rctl, struct proc *p)
+{
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	return (cpucaps_project_get(p->p_task->tk_proj));
+}
+
+/*ARGSUSED*/
+static int
+project_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+    rctl_qty_t nv)
+{
+	kproject_t *kpj = e->rcep_p.proj;
+
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	ASSERT(e->rcep_t == RCENTITY_PROJECT);
+	if (kpj == NULL)
+		return (0);
+
+	/*
+	 * Set the cap to the new value.
+	 */
+	return (cpucaps_project_set(kpj, nv));
+}
+
+static rctl_ops_t project_cpu_cap_ops = {
+	rcop_no_action,
+	project_cpu_cap_get,
+	project_cpu_cap_set,
+	rcop_no_test
+};
+
 /*ARGSUSED*/
 static rctl_qty_t
 project_lwps_usage(rctl_t *r, proc_t *p)
@@ -804,6 +853,13 @@
 	rctl_add_default_limit("project.cpu-shares", 1, RCPRIV_PRIVILEGED,
 	    RCTL_LOCAL_NOACTION);
 
+	rc_project_cpu_cap = rctl_register("project.cpu-cap",
+	    RCENTITY_PROJECT, RCTL_GLOBAL_SIGNAL_NEVER |
+	    RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC |
+	    RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER |
+	    RCTL_GLOBAL_INFINITE,
+	    MAXCAP, MAXCAP, &project_cpu_cap_ops);
+
 	rc_project_nlwps = rctl_register("project.max-lwps", RCENTITY_PROJECT,
 	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
 	    INT_MAX, INT_MAX, &project_lwps_ops);
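
The project.cpu-cap registration above follows the standard rctl pattern: an ops vector whose get and set entries simply delegate to the caps framework (cpucaps_project_get()/cpucaps_project_set()). A tiny user-space model of that delegation, with ops_t, caps_get and caps_set as hypothetical stand-ins for rctl_ops_t and the real callbacks:

	#include <stdio.h>

	typedef unsigned long long qty_t;

	static qty_t current_cap;

	static qty_t caps_get(void) { return (current_cap); }
	static int caps_set(qty_t nv) { current_cap = nv; return (0); }

	/* Miniature ops vector in the spirit of rctl_ops_t. */
	typedef struct ops {
		qty_t (*op_get)(void);
		int (*op_set)(qty_t);
	} ops_t;

	static const ops_t cpu_cap_ops = { caps_get, caps_set };

	int
	main(void)
	{
		(void) cpu_cap_ops.op_set(50);	/* hypothetical cap value */
		printf("cap is now %llu\n", cpu_cap_ops.op_get());
		return (0);
	}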
--- a/usr/src/uts/common/os/sig.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/os/sig.c	Fri Mar 09 15:55:28 2007 -0800
@@ -181,7 +181,7 @@
 	 */
 	if (!signal_is_blocked(t, sig)) {
 		t->t_sig_check = 1;	/* have thread do an issig */
-		if (t->t_state == TS_SLEEP && (t->t_flag & T_WAKEABLE)) {
+		if (ISWAKEABLE(t) || ISWAITING(t)) {
 			setrun_locked(t);
 			rval = 1;
 		} else if (t->t_state == TS_STOPPED && sig == SIGKILL &&
@@ -974,6 +974,11 @@
 						notify = 1;
 					}
 				}
+
+				/* Move waiting thread to run queue */
+				if (ISWAITING(tx))
+					setrun_locked(tx);
+
 				/*
 				 * force the thread into the kernel
 				 * if it is not already there.
--- a/usr/src/uts/common/os/task.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/os/task.c	Fri Mar 09 15:55:28 2007 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -388,8 +388,7 @@
 	tk->tk_nlwps = 0;
 	tk->tk_nlwps_ctl = INT_MAX;
 	tk->tk_usage = tu;
-	tk->tk_proj = project_hold_by_id(projid, zone,
-	    PROJECT_HOLD_INSERT);
+	tk->tk_proj = project_hold_by_id(projid, zone, PROJECT_HOLD_INSERT);
 	tk->tk_flags = TASK_NORMAL;
 
 	/*
@@ -670,6 +669,21 @@
 
 			thread_lock(t);
 			oldkpj = ttoproj(t);
+
+			/*
+			 * Kick this thread so that it doesn't sit
+			 * on the wrong wait queue.
+			 */
+			if (ISWAITING(t))
+				setrun_locked(t);
+
+			/*
+			 * The thread wants to go on the project wait queue, but
+			 * the waitq is changing.
+			 */
+			if (t->t_schedflag & TS_PROJWAITQ)
+				t->t_schedflag &= ~TS_PROJWAITQ;
+
 			t->t_proj = kpj;
 			t->t_pre_sys = 1;		/* For cred update */
 			thread_unlock(t);
--- a/usr/src/uts/common/os/timers.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/os/timers.c	Fri Mar 09 15:55:28 2007 -0800
@@ -589,6 +589,7 @@
 			}
 			break;
 		case TS_RUN:
+		case TS_WAIT:
 			mstate = LMS_WAIT_CPU;
 			break;
 		case TS_ONPROC:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/os/waitq.c	Fri Mar 09 15:55:28 2007 -0800
@@ -0,0 +1,386 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/class.h>
+#include <sys/debug.h>
+#include <sys/cpuvar.h>
+#include <sys/waitq.h>
+#include <sys/cmn_err.h>
+#include <sys/time.h>
+#include <sys/dtrace.h>
+#include <sys/sdt.h>
+#include <sys/zone.h>
+
+/*
+ * Wait queue implementation.
+ */
+
+void
+waitq_init(waitq_t *wq)
+{
+	DISP_LOCK_INIT(&wq->wq_lock);
+	wq->wq_first = NULL;
+	wq->wq_count = 0;
+	wq->wq_blocked = B_TRUE;
+}
+
+void
+waitq_fini(waitq_t *wq)
+{
+	ASSERT(wq->wq_count == 0);
+	ASSERT(wq->wq_first == NULL);
+	ASSERT(wq->wq_blocked == B_TRUE);
+	ASSERT(!DISP_LOCK_HELD(&wq->wq_lock));
+
+	DISP_LOCK_DESTROY(&wq->wq_lock);
+}
+
+/*
+ * Operations on waitq_t structures.
+ *
+ * A wait queue is a singly linked NULL-terminated list with doubly
+ * linked circular sublists.  The singly linked list is in descending
+ * priority order and FIFO for threads of the same priority.  It links
+ * through the t_link field of the thread structure.  The doubly linked
+ * sublists link threads of the same priority.  They use the t_priforw
+ * and t_priback fields of the thread structure.
+ *
+ * Graphically (with priorities in parens):
+ *
+ *         ________________           _______                   _______
+ *        /                \         /       \                 /       \
+ *        |                |         |       |                 |       |
+ *        v                v         v       v                 v       v
+ *     t1(60)-->t2(60)-->t3(60)-->t4(50)-->t5(50)-->t6(30)-->t7(0)-->t8(0)
+ *        ^      ^  ^      ^         ^       ^       ^  ^      ^       ^
+ *        |      |  |      |         |       |       |  |      |       |
+ *        \______/  \______/         \_______/       \__/      \_______/
+ *
+ * There are three interesting operations on a waitq list: inserting
+ * a thread into the proper position according to priority; removing a
+ * thread given a pointer to it; and walking the list, possibly
+ * removing threads along the way.  This design allows all three
+ * operations to be performed efficiently and easily.
+ *
+ * To insert a thread, traverse the list looking for the sublist of
+ * the same priority as the thread (or one of a lower priority,
+ * meaning there are no other threads in the list of the same
+ * priority).  This can be done without touching all threads in the
+ * list by following the links between the first threads in each
+ * sublist.  Given a thread t that is the head of a sublist (the first
+ * thread of that priority found when following the t_link pointers),
+ * t->t_priback->t_link points to the head of the next sublist.  It's
+ * important to do this since a waitq may contain thousands of
+ * threads.
+ *
+ * Removing a thread from the list is also efficient.  First, the
+ * t_waitq field contains a pointer to the waitq on which a thread
+ * is waiting (or NULL if it's not on a waitq).  This is used to
+ * determine if the given thread is on the given waitq without
+ * searching the list.  Assuming it is, if it's not the head of a
+ * sublist, just remove it from the sublist and use the t_priback
+ * pointer to find the thread that points to it with t_link.  If it is
+ * the head of a sublist, search for it by walking the sublist heads,
+ * similar to searching for a given priority level when inserting a
+ * thread.
+ *
+ * To walk the list, simply follow the t_link pointers.  Removing
+ * threads along the way can be done easily if the code maintains a
+ * pointer to the t_link field that pointed to the thread being
+ * removed.
+ */
+
+static void
+waitq_link(waitq_t *wq, kthread_t *t)
+{
+	kthread_t *next_tp;
+	kthread_t *last_tp;
+	kthread_t **tpp;
+	pri_t tpri, next_pri, last_pri = -1;
+
+	ASSERT(DISP_LOCK_HELD(&wq->wq_lock));
+
+	tpri = DISP_PRIO(t);
+	tpp = &wq->wq_first;
+	while ((next_tp = *tpp) != NULL) {
+		next_pri = DISP_PRIO(next_tp);
+		if (tpri > next_pri)
+			break;
+		last_tp = next_tp->t_priback;
+		last_pri = next_pri;
+		tpp = &last_tp->t_link;
+	}
+	*tpp = t;
+	t->t_link = next_tp;
+	if (last_pri == tpri) {
+		/* last_tp points to the last thread of this priority */
+		t->t_priback = last_tp;
+		t->t_priforw = last_tp->t_priforw;
+		last_tp->t_priforw->t_priback = t;
+		last_tp->t_priforw = t;
+	} else {
+		t->t_priback = t->t_priforw = t;
+	}
+	wq->wq_count++;
+	t->t_waitq = wq;
+}
+
+static void
+waitq_unlink(waitq_t *wq, kthread_t *t)
+{
+	kthread_t *nt;
+	kthread_t **ptl;
+
+	ASSERT(THREAD_LOCK_HELD(t));
+	ASSERT(DISP_LOCK_HELD(&wq->wq_lock));
+	ASSERT(t->t_waitq == wq);
+
+	ptl = &t->t_priback->t_link;
+	/*
+	 * Is it the head of a priority sublist?  If so, need to walk
+	 * the priorities to find the t_link pointer that points to it.
+	 */
+	if (*ptl != t) {
+		/*
+		 * Find the right priority level.
+		 */
+		ptl = &t->t_waitq->wq_first;
+		while ((nt = *ptl) != t)
+			ptl = &nt->t_priback->t_link;
+	}
+	/*
+	 * Remove thread from the t_link list.
+	 */
+	*ptl = t->t_link;
+
+	/*
+	 * Take it off the priority sublist if there's more than one
+	 * thread there.
+	 */
+	if (t->t_priforw != t) {
+		t->t_priback->t_priforw = t->t_priforw;
+		t->t_priforw->t_priback = t->t_priback;
+	}
+	t->t_link = NULL;
+
+	wq->wq_count--;
+	t->t_waitq = NULL;
+	t->t_priforw = NULL;
+	t->t_priback = NULL;
+}
+
+/*
+ * Put the specified thread on the specified wait queue without dropping
+ * the thread's lock.  Returns 1 if the thread was successfully placed on
+ * the wait queue, or 0 if the wait queue is blocked.
+ */
+int
+waitq_enqueue(waitq_t *wq, kthread_t *t)
+{
+	ASSERT(THREAD_LOCK_HELD(t));
+	ASSERT(t->t_sleepq == NULL);
+	ASSERT(t->t_waitq == NULL);
+	ASSERT(t->t_link == NULL);
+
+	disp_lock_enter_high(&wq->wq_lock);
+
+	/*
+	 * Can't enqueue anything on a blocked wait queue
+	 */
+	if (wq->wq_blocked) {
+		disp_lock_exit_high(&wq->wq_lock);
+		return (0);
+	}
+
+	/*
+	 * Mark the time when thread is placed on wait queue. The microstate
+	 * accounting code uses this timestamp to determine wait times.
+	 */
+	t->t_waitrq = gethrtime_unscaled();
+
+	/*
+	 * Mark thread as not swappable.  If necessary, it will get
+	 * swapped out when it returns to userland.
+	 */
+	t->t_schedflag |= TS_DONT_SWAP;
+	DTRACE_SCHED1(cpucaps__sleep, kthread_t *, t);
+	waitq_link(wq, t);
+
+	THREAD_WAIT(t, &wq->wq_lock);
+	return (1);
+}
+
+/*
+ * Change thread's priority while on the wait queue.
+ * Dequeue and enqueue it again so that it gets placed in the right place.
+ */
+void
+waitq_change_pri(kthread_t *t, pri_t new_pri)
+{
+	waitq_t *wq = t->t_waitq;
+
+	ASSERT(THREAD_LOCK_HELD(t));
+	ASSERT(ISWAITING(t));
+	ASSERT(wq != NULL);
+
+	waitq_unlink(wq, t);
+	t->t_pri = new_pri;
+	waitq_link(wq, t);
+}
+
+static void
+waitq_dequeue(waitq_t *wq, kthread_t *t)
+{
+	ASSERT(THREAD_LOCK_HELD(t));
+	ASSERT(t->t_waitq == wq);
+	ASSERT(ISWAITING(t));
+
+	waitq_unlink(wq, t);
+	DTRACE_SCHED1(cpucaps__wakeup, kthread_t *, t);
+
+	/*
+	 * Change thread to transition state without dropping
+	 * the wait queue lock.
+	 */
+	THREAD_TRANSITION_NOLOCK(t);
+}
+
+/*
+ * Return B_TRUE iff the specified wait queue is empty.
+ * The check is done without holding any locks.
+ */
+boolean_t
+waitq_isempty(waitq_t *wq)
+{
+	return (wq->wq_count == 0);
+}
+
+/*
+ * Take thread off its wait queue and make it runnable.
+ * Returns with thread lock held.
+ */
+void
+waitq_setrun(kthread_t *t)
+{
+	waitq_t *wq = t->t_waitq;
+
+	ASSERT(THREAD_LOCK_HELD(t));
+
+	ASSERT(ISWAITING(t));
+	if (wq == NULL)
+		panic("waitq_setrun: thread %p is not on waitq", t);
+	waitq_dequeue(wq, t);
+
+	disp_lock_exit_high(&wq->wq_lock);
+	CL_SETRUN(t);
+}
+
+/*
+ * Take the first thread off the wait queue and return pointer to it.
+ */
+static kthread_t *
+waitq_takeone(waitq_t *wq)
+{
+	kthread_t *t;
+
+	disp_lock_enter(&wq->wq_lock);
+	if ((t = wq->wq_first) != NULL)
+		waitq_dequeue(wq, wq->wq_first);
+	disp_lock_exit(&wq->wq_lock);
+	return (t);
+}
+
+/*
+ * Take the first thread off the wait queue and make it runnable.
+ * Return a pointer to the thread, or NULL if the waitq is empty.
+ */
+static kthread_t *
+waitq_runfirst(waitq_t *wq)
+{
+	kthread_t *t;
+
+	t = waitq_takeone(wq);
+	if (t != NULL) {
+		CL_SETRUN(t);
+		thread_unlock(t);	/* drops dispq lock */
+	}
+	return (t);
+}
+
+/*
+ * Take the first thread off the wait queue and make it runnable.
+ */
+void
+waitq_runone(waitq_t *wq)
+{
+	(void) waitq_runfirst(wq);
+}
+
+/*
+ * Take all threads off the wait queue and make them runnable.
+ */
+static void
+waitq_runall(waitq_t *wq)
+{
+	while (waitq_runfirst(wq) != NULL)
+		;
+}
+
+/*
+ * Prevent any new threads from entering the wait queue and make all threads
+ * currently on the wait queue runnable. After waitq_block() completes, no
+ * threads should ever appear on the wait queue until it is unblocked.
+ */
+void
+waitq_block(waitq_t *wq)
+{
+	ASSERT(!wq->wq_blocked);
+	disp_lock_enter(&wq->wq_lock);
+	wq->wq_blocked = B_TRUE;
+	disp_lock_exit(&wq->wq_lock);
+	waitq_runall(wq);
+	ASSERT(waitq_isempty(wq));
+}
+
+/*
+ * Allow threads to be placed on the wait queue.
+ */
+void
+waitq_unblock(waitq_t *wq)
+{
+	disp_lock_enter(&wq->wq_lock);
+
+	ASSERT(waitq_isempty(wq));
+	ASSERT(wq->wq_blocked);
+
+	wq->wq_blocked = B_FALSE;
+
+	disp_lock_exit(&wq->wq_lock);
+}
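
The ordering invariant described in the waitq.c block comment above, descending priority with FIFO order within a priority, can be shown in miniature. This sketch keeps only the singly linked t_link-style list and deliberately omits the t_priforw/t_priback sublist shortcut that lets waitq_link() skip whole priority groups; thr_t and wq_insert are illustrative names.

	#include <stdio.h>

	typedef struct thr {
		int pri;
		struct thr *link;	/* next thread, descending priority */
	} thr_t;

	/*
	 * Insert t after every thread of priority >= t->pri, which yields
	 * descending order overall and FIFO order within equal priorities,
	 * the same invariant waitq_link() maintains.
	 */
	static void
	wq_insert(thr_t **head, thr_t *t)
	{
		thr_t **tpp = head;

		while (*tpp != NULL && (*tpp)->pri >= t->pri)
			tpp = &(*tpp)->link;
		t->link = *tpp;
		*tpp = t;
	}

	int
	main(void)
	{
		thr_t a = { 60, NULL }, b = { 50, NULL }, c = { 60, NULL };
		thr_t *head = NULL;

		wq_insert(&head, &a);
		wq_insert(&head, &b);
		wq_insert(&head, &c);	/* lands after a, before b */
		for (thr_t *t = head; t != NULL; t = t->link)
			printf("pri %d\n", t->pri);	/* 60 60 50 */
		return (0);
	}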
--- a/usr/src/uts/common/os/zone.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/os/zone.c	Fri Mar 09 15:55:28 2007 -0800
@@ -240,6 +240,7 @@
 #include <sys/brand.h>
 #include <sys/zone.h>
 #include <net/if.h>
+#include <sys/cpucaps.h>
 #include <vm/seg.h>
 
 /*
@@ -328,6 +329,7 @@
 rctl_hndl_t rc_zone_cpu_shares;
 rctl_hndl_t rc_zone_locked_mem;
 rctl_hndl_t rc_zone_max_swap;
+rctl_hndl_t rc_zone_cpu_cap;
 rctl_hndl_t rc_zone_nlwps;
 rctl_hndl_t rc_zone_shmmax;
 rctl_hndl_t rc_zone_shmmni;
@@ -882,6 +884,43 @@
 	rcop_no_test
 };
 
+/*
+ * zone.cpu-cap resource control support.
+ */
+/*ARGSUSED*/
+static rctl_qty_t
+zone_cpu_cap_get(rctl_t *rctl, struct proc *p)
+{
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	return (cpucaps_zone_get(p->p_zone));
+}
+
+/*ARGSUSED*/
+static int
+zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e,
+    rctl_qty_t nv)
+{
+	zone_t *zone = e->rcep_p.zone;
+
+	ASSERT(MUTEX_HELD(&p->p_lock));
+	ASSERT(e->rcep_t == RCENTITY_ZONE);
+
+	if (zone == NULL)
+		return (0);
+
+	/*
+	 * Set the cap to the new value.
+	 */
+	return (cpucaps_zone_set(zone, nv));
+}
+
+static rctl_ops_t zone_cpu_cap_ops = {
+	rcop_no_action,
+	zone_cpu_cap_get,
+	zone_cpu_cap_set,
+	rcop_no_test
+};
+
 /*ARGSUSED*/
 static rctl_qty_t
 zone_lwps_usage(rctl_t *r, proc_t *p)
@@ -1384,8 +1423,13 @@
 	rc_zone_cpu_shares = rctl_register("zone.cpu-shares",
 	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER |
 	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER,
-	    FSS_MAXSHARES, FSS_MAXSHARES,
-	    &zone_cpu_shares_ops);
+	    FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops);
+
+	rc_zone_cpu_cap = rctl_register("zone.cpu-cap",
+	    RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS |
+	    RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER |
+	    RCTL_GLOBAL_INFINITE,
+	    MAXCAP, MAXCAP, &zone_cpu_cap_ops);
 
 	rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE,
 	    RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT,
@@ -1530,6 +1574,13 @@
 	ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
 	    zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
 
+	/*
+	 * Remove any zone caps.
+	 */
+	cpucaps_zone_remove(zone);
+
+	ASSERT(zone->zone_cpucap == NULL);
+
 	/* remove from deathrow list */
 	if (zone_status_get(zone) == ZONE_IS_DEAD) {
 		ASSERT(zone->zone_ref == 0);
@@ -2501,6 +2552,10 @@
 		zone->zone_kthreads = NULL;
 		if (zone_status_get(zone) == ZONE_IS_EMPTY) {
 			zone_status_set(zone, ZONE_IS_DOWN);
+			/*
+			 * Remove any CPU caps on this zone.
+			 */
+			cpucaps_zone_remove(zone);
 		}
 	} else {
 		t->t_forw->t_back = t->t_back;
@@ -2616,8 +2671,9 @@
 		 * Make sure we are still in the booting state-- we could have
 		 * raced and already be shutting down, or even further along.
 		 */
-		if (zone_status_get(z) == ZONE_IS_BOOTING)
+		if (zone_status_get(z) == ZONE_IS_BOOTING) {
 			zone_status_set(z, ZONE_IS_SHUTTING_DOWN);
+		}
 		mutex_exit(&zone_status_lock);
 		/* It's gone bad, dispose of the process */
 		if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) {
@@ -3879,7 +3935,13 @@
 
 	}
 
-	/* Get rid of the zone's kstats. */
+	/*
+	 * Remove CPU cap for this zone now since we're not going to
+	 * fail below this point.
+	 */
+	cpucaps_zone_remove(zone);
+
+	/* Get rid of the zone's kstats */
 	zone_kstat_delete(zone);
 
 	/* Say goodbye to brand framework. */
@@ -3938,8 +4000,8 @@
 	char *outstr;
 	zone_status_t zone_status;
 	pid_t initpid;
-	boolean_t global = (curproc->p_zone == global_zone);
-	boolean_t curzone = (curproc->p_zone->zone_id == zoneid);
+	boolean_t global = (curzone == global_zone);
+	boolean_t inzone = (curzone->zone_id == zoneid);
 	ushort_t flags;
 
 	mutex_enter(&zonehash_lock);
@@ -3980,7 +4042,7 @@
 			bcopy(zone->zone_rootpath, zonepath, size);
 			zonepath[size - 1] = '\0';
 		} else {
-			if (curzone || !is_system_labeled()) {
+			if (inzone || !is_system_labeled()) {
 				/*
 				 * Caller is not in the global zone.
 				 * if the query is on the current zone
@@ -4011,7 +4073,7 @@
 			if (err != 0 && err != ENAMETOOLONG)
 				error = EFAULT;
 		}
-		if (global || (is_system_labeled() && !curzone))
+		if (global || (is_system_labeled() && !inzone))
 			kmem_free(zonepath, size);
 		break;
 
@@ -4365,6 +4427,7 @@
 	int err = 0;
 	rctl_entity_p_t e;
 	size_t swap;
+	kthread_id_t t;
 
 	if (secpolicy_zone_config(CRED()) != 0)
 		return (set_errno(EPERM));
@@ -4625,6 +4688,28 @@
 	pgjoin(pp, zone->zone_zsched->p_pidp);
 
 	/*
+	 * If any threads are scheduled to be placed on the zone wait queue,
+	 * they should abandon the idea since the wait queue is changing.
+	 * We need to be holding pidlock & p_lock to do this.
+	 */
+	if ((t = pp->p_tlist) != NULL) {
+		do {
+			thread_lock(t);
+			/*
+			 * Kick this thread so that it doesn't sit
+			 * on the wrong wait queue.
+			 */
+			if (ISWAITING(t))
+				setrun_locked(t);
+
+			if (t->t_schedflag & TS_ANYWAITQ)
+				t->t_schedflag &= ~TS_ANYWAITQ;
+
+			thread_unlock(t);
+		} while ((t = t->t_forw) != pp->p_tlist);
+	}
+
+	/*
 	 * If there is a default scheduling class for the zone and it is not
 	 * the class we are currently in, change all of the threads in the
 	 * process to the new class.  We need to be holding pidlock & p_lock
@@ -4633,7 +4718,6 @@
 	if (zone->zone_defaultcid > 0 &&
 	    zone->zone_defaultcid != curthread->t_cid) {
 		pcparms_t pcparms;
-		kthread_id_t t;
 
 		pcparms.pc_cid = zone->zone_defaultcid;
 		pcparms.pc_clparms[0] = 0;
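
zone_create() above and task_change() earlier in the patch apply the same remedy when a thread's wait-queue association goes stale: wake any thread that is already waiting and clear the pending-waitq flag so it cannot queue against the old entity. In sketch form below; thr_sketch, setrun_stub and the TS_ANYWAITQ value are illustrative, not the kernel definitions.

	#include <stdio.h>

	#define	TS_ANYWAITQ	0x4	/* illustrative flag value */

	struct thr_sketch {
		int waiting;		/* stands in for ISWAITING(t) */
		int schedflag;
	};

	static void
	setrun_stub(struct thr_sketch *t)
	{
		t->waiting = 0;		/* back to the run queue */
	}

	/*
	 * Wake a thread off the old wait queue and clear its pending-waitq
	 * flag so it cannot re-queue on the entity it is leaving.
	 */
	static void
	leave_stale_waitq(struct thr_sketch *t)
	{
		if (t->waiting)
			setrun_stub(t);
		t->schedflag &= ~TS_ANYWAITQ;
	}

	int
	main(void)
	{
		struct thr_sketch t = { 1, TS_ANYWAITQ };

		leave_stale_waitq(&t);
		printf("waiting=%d flags=%d\n", t.waiting, t.schedflag);
		return (0);
	}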
--- a/usr/src/uts/common/sys/Makefile	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/sys/Makefile	Fri Mar 09 15:55:28 2007 -0800
@@ -110,6 +110,8 @@
 	byteorder.h		\
 	callb.h			\
 	callo.h			\
+	cpucaps.h		\
+	cpucaps_impl.h		\
 	ccompile.h		\
 	cdio.h			\
 	cladm.h			\
@@ -564,6 +566,7 @@
 	vuid_state.h		\
 	vuid_store.h		\
 	wait.h			\
+	waitq.h			\
 	wanboot_impl.h		\
 	watchpoint.h		\
 	winlockio.h		\
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/sys/cpucaps.h	Fri Mar 09 15:55:28 2007 -0800
@@ -0,0 +1,157 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_CPUCAPS_H
+#define	_SYS_CPUCAPS_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/zone.h>
+#include <sys/project.h>
+#include <sys/time.h>
+#include <sys/rctl.h>
+
+/*
+ * CPU caps provide an absolute hard CPU usage limit which is enforced even if
+ * some CPUs are idle. It can be enforced at the project or zone level.
+ */
+
+#ifdef _KERNEL
+
+/*
+ * Valid cap values go from 1 to MAXCAP - 1. Specifying MAXCAP as the cap
+ * value is equivalent to disabling the cap.
+ */
+#define	MAXCAP		UINT_MAX
+
+/*
+ * cpucaps_enabled is used to quickly check whether any CPU caps specific code
+ * should be invoked. Users outside the CPU caps framework should use the
+ * CPUCAPS_ON() and CPUCAPS_OFF() macros.
+ */
+extern boolean_t cpucaps_enabled;
+
+#define	CPUCAPS_ON()	cpucaps_enabled
+#define	CPUCAPS_OFF()	(!cpucaps_enabled)
+
+/*
+ * Initialize the CPU caps framework.
+ */
+extern void cpucaps_init(void);
+
+/*
+ * Notify caps framework of a new project coming in or existing project
+ * going away
+ */
+extern void cpucaps_project_add(kproject_t *);
+extern void cpucaps_project_remove(kproject_t *);
+
+/*
+ * Notify caps framework when a zone is going away.
+ */
+extern void cpucaps_zone_remove(zone_t *);
+
+/*
+ * Set the project/zone cap to the specified value.  A value of MAXCAP
+ * disables the cap.
+ */
+extern int cpucaps_project_set(kproject_t *, rctl_qty_t);
+extern int cpucaps_zone_set(zone_t *, rctl_qty_t);
+
+/*
+ * Get current CPU usage for a project/zone.
+ */
+extern rctl_qty_t cpucaps_project_get(kproject_t *);
+extern rctl_qty_t cpucaps_zone_get(zone_t *);
+
+/*
+ * Scheduling class hooks into CPU caps framework.
+ */
+
+/*
+ * CPU caps specific data for each scheduling class.
+ *
+ * Each scheduling class keeps a small amount of per-thread accounting data
+ * that is used only by the CPU caps code.  This data is kept in the caps_sc
+ * structure, which is opaque to the scheduling classes.  The fields in the
+ * structure are:
+ *
+ *     csc_cputime -  Total time spent on CPU during thread lifetime, obtained
+ *                    as the sum of user, system and trap time, reported by
+ *                    microstate accounting.
+ */
+typedef struct caps_sc {
+	hrtime_t	csc_cputime;
+} caps_sc_t;
+
+/*
+ * Initialize per-thread cpu-caps specific data.
+ */
+extern void cpucaps_sc_init(caps_sc_t *);
+
+/*
+ * Modus operandi for cpucaps_charge() function.
+ *
+ *   CPUCAPS_CHARGE_ENFORCE - charge a thread for its CPU time and
+ *				flag it to be placed on the wait queue.
+ *
+ *   CPUCAPS_CHARGE_ONLY    - charge a thread for its CPU time.
+ */
+typedef enum {
+	CPUCAPS_CHARGE_ENFORCE,
+	CPUCAPS_CHARGE_ONLY
+} cpucaps_charge_t;
+
+/*
+ * Add accumulated CPU usage of a thread to its cap.
+ * Return B_TRUE if the thread should be placed on the waitq.
+ */
+extern boolean_t cpucaps_charge(kthread_t *, caps_sc_t *, cpucaps_charge_t);
+#define	CPUCAPS_CHARGE(t, scp, flag) \
+	(CPUCAPS_ON() && cpucaps_charge(t, scp, flag))
+
+/*
+ * Request a thread to be placed on a wait queue because the cap is exceeded.
+ */
+extern boolean_t cpucaps_enforce(kthread_t *);
+#define	CPUCAPS_ENFORCE(t) (CPUCAPS_ON() && cpucaps_enforce(t))
+
+/*
+ * CPU Caps hook into clock().
+ */
+extern void (*cpucaps_clock_callout)(void);
+
+#endif	/* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_CPUCAPS_H */
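
Taken together, the scheduling classes in this patch use the hooks declared above at four points: cpucaps_sc_init() when a thread enters the class or forks, cpucaps_charge(..., CPUCAPS_CHARGE_ONLY) at sleep/yield/preempt/exit, and the ENFORCE variants at clock tick and preempt time. The macro wrappers exist so that a disabled facility costs one flag test; a compilable miniature of that short-circuit guard follows (cpucaps_enabled here is a local stand-in, and CHARGE/charge_stub are hypothetical):

	#include <stdio.h>

	static int cpucaps_enabled;	/* mirrors the real global flag */

	static int
	charge_stub(const char *who)
	{
		printf("charging %s\n", who);
		return (0);
	}

	/*
	 * Like CPUCAPS_CHARGE(): the && short-circuits when caps are off,
	 * so charge_stub() is never even called.
	 */
	#define	CHARGE(who)	(cpucaps_enabled && charge_stub(who))

	int
	main(void)
	{
		(void) CHARGE("t1");	/* caps off: prints nothing */
		cpucaps_enabled = 1;
		(void) CHARGE("t1");	/* caps on: the charge runs */
		return (0);
	}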
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/sys/cpucaps_impl.h	Fri Mar 09 15:55:28 2007 -0800
@@ -0,0 +1,102 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_CPUCAPS_IMPL_H
+#define	_SYS_CPUCAPS_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#include <sys/kstat.h>
+#include <sys/cpucaps.h>
+#include <sys/list.h>
+#include <sys/time.h>
+#include <sys/waitq.h>
+
+/*
+ * When the resource control framework sets the cap to the NOCAP value,
+ * the cap is disabled.
+ */
+#define	NOCAP	MAXCAP
+
+/*
+ * Maximum value for the cap usage. Should be the maximum value for hrtime_t.
+ */
+#if defined(_LP64)
+#define	MAX_USAGE LONG_MAX
+#else
+#define	MAX_USAGE 9223372036854775807LL
+#endif
+
+
+/*
+ * Most of the per-project or per-zone state related to CPU caps is kept in the
+ * cpucap_t structure.
+ */
+typedef struct cpucap {
+	list_node_t	cap_link;	/* next/prev capped entity	*/
+	struct kproject	*cap_project;	/* project for the cap		*/
+	struct zone	*cap_zone;	/* zone for the cap		*/
+	waitq_t		cap_waitq;	/* waitq for capped threads	*/
+	kstat_t		*cap_kstat;	/* cpucaps specific kstat	*/
+	int64_t		cap_lbolt;	/* zone cap specific 		*/
+	hrtime_t	cap_value;	/* scaled CPU usage cap		*/
+	hrtime_t	cap_usage;	/* current CPU usage		*/
+	disp_lock_t	cap_usagelock;	/* protects cap_usage above	*/
+	/*
+	 * Per cap statistics.
+	 */
+	hrtime_t	cap_maxusage;	/* maximum cap usage		*/
+	u_longlong_t	cap_below;	/* # of ticks spent below the cap */
+	u_longlong_t	cap_above;	/* # of ticks spent above the cap */
+} cpucap_t;
+
+/*
+ * Wrapper macros for checking cap state.
+ */
+#define	CAP_ENABLED(cap) ((cap)->cap_value != 0)
+#define	CAP_DISABLED(cap) (!CAP_ENABLED(cap))
+
+#define	PROJECT_IS_CAPPED(project) \
+	(((project)->kpj_cpucap != NULL) && \
+	CAP_ENABLED((project)->kpj_cpucap))
+
+#define	ZONE_IS_CAPPED(zone) \
+	(((zone)->zone_cpucap != NULL) && \
+	CAP_ENABLED((zone)->zone_cpucap))
+
+#endif	/* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_CPUCAPS_IMPL_H */
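
The wrapper macros keep the cpucap_t layout private to the framework; a
thread is subject to capping when its project or its zone carries an
enabled cap. A brief sketch, with a hypothetical helper name:

#include <sys/cpucaps_impl.h>
#include <sys/project.h>
#include <sys/thread.h>
#include <sys/zone.h>

static boolean_t
thread_is_capped(kthread_t *t)
{
	/* Capped if either the project's or the zone's cap is enabled. */
	return (PROJECT_IS_CAPPED(ttoproj(t)) ||
	    ZONE_IS_CAPPED(ttozone(t)));
}
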
--- a/usr/src/uts/common/sys/cpuvar.h	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/sys/cpuvar.h	Fri Mar 09 15:55:28 2007 -0800
@@ -508,6 +508,7 @@
 extern int		boot_max_ncpus;	/* like max_ncpus but for real */
 extern processorid_t	max_cpuid;	/* maximum CPU number */
 extern struct cpu	*cpu_inmotion;	/* offline or partition move target */
+extern cpu_t		*clock_cpu_list;
 
 #if defined(__i386) || defined(__amd64)
 extern struct cpu *curcpup(void);
--- a/usr/src/uts/common/sys/fss.h	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/sys/fss.h	Fri Mar 09 15:55:28 2007 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -21,7 +20,7 @@
  */
 
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -33,11 +32,14 @@
 #include <sys/types.h>
 #include <sys/thread.h>
 #include <sys/project.h>
+#include <sys/cpucaps.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
+#ifdef	_KERNEL
+
 typedef uint64_t fsspri_t;
 typedef	uint64_t fssusage_t;
 struct cpupart;
@@ -74,7 +76,7 @@
 typedef struct fsspset {
 	kmutex_t	fssps_lock;	/* lock to protect per-pset	*/
 					/* list of fssproj structures	*/
-	disp_lock_t	fssps_displock;	/* lock for fsps_maxfspri	*/
+	disp_lock_t	fssps_displock;	/* lock for fsps_maxfsspri	*/
 	struct cpupart	*fssps_cpupart;	/* ptr to our cpu partition	*/
 					/* protected by fsspsets_lock	*/
 	fsspri_t	fssps_maxfsspri; /* maximum fsspri value among	*/
@@ -113,7 +115,7 @@
  */
 typedef struct fssproc {
 	kthread_t *fss_tp;	/* pointer back to our thread		*/
-	fssproj_t *fss_proj;	/* pointer to our project FS data	*/
+	fssproj_t *fss_proj;	/* pointer to our project FSS data	*/
 	uchar_t fss_flags;	/* flags defined below			*/
 	int	fss_timeleft;	/* time remaining in procs quantum	*/
 	uint32_t fss_ticks;	/* ticks accumulated by this thread	*/
@@ -126,20 +128,22 @@
 	int	fss_runnable;	/* to indicate runnable/sleeping thread	*/
 	struct fssproc *fss_next; /* pointer to next fssproc_t struct	*/
 	struct fssproc *fss_prev; /* pointer to prev fssproc_t struct	*/
+	caps_sc_t fss_caps;	/* CPU caps specific data		*/
 } fssproc_t;
 
 /*
- * One of these structures is allocated to each zone running within each active
- * cpu partition.
+ * One of these structures is allocated to each zone running within
+ * each active cpu partition.  This means that a zone that spans more
+ * than one cpu partition has one of these structures per partition.
  */
 typedef struct fsszone {
-	struct zone 	*fssz_zone;	/* ptr to our zone structure */
-	struct fsszone	*fssz_next;	/* ptr to next fsszone in fsspset */
-	struct fsszone	*fssz_prev;	/* ptr to prev fsszone in fsspset */
-	uint32_t	fssz_shares;	/* total #shares for projs in zone */
-	uint32_t	fssz_nproj;	/* # fssproj_t's in this fsszone */
-	uint32_t	fssz_rshares;	/* "real" shares given to zone */
-	uint32_t	fssz_runnable;	/* # projects with runnable threads */
+	struct zone 	*fssz_zone;	/* ptr to our zone structure	*/
+	struct fsszone	*fssz_next;	/* next fsszone_t in fsspset_t	*/
+	struct fsszone	*fssz_prev;	/* prev fsszone_t in fsspset_t	*/
+	uint32_t	fssz_shares;	/* sum of all project shares	*/
+	uint32_t	fssz_nproj;	/* # of projects		*/
+	uint32_t	fssz_rshares;	/* "real" shares given to zone	*/
+	uint32_t	fssz_runnable;	/* # of runnable projects	*/
 } fsszone_t;
 
 #define	FSSPROC(tx)		((fssproc_t *)(tx->t_cldata))
@@ -158,6 +162,9 @@
 				/* the dispatch queue if preempted */
 #define	FSSRESTORE	0x04	/* thread was not preempted, due to schedctl */
 				/* restore priority from fss_scpri */
+
+#endif	/* _KERNEL */
+
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/common/sys/fx.h	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/sys/fx.h	Fri Mar 09 15:55:28 2007 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 2001 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
 #ifndef _SYS_FX_H
@@ -33,6 +32,7 @@
 #include <sys/thread.h>
 #include <sys/ddi.h>
 #include <sys/sunddi.h>
+#include <sys/cpucaps.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -114,9 +114,6 @@
 	char		fx_nice;	/* nice value for compatibility */
 	uchar_t 	fx_flags;	/* flags defined below */
 	kthread_t 	*fx_tp;		/* pointer to thread */
-	struct fxproc 	*fx_next;	/* pointer to next fxproc */
-
-	struct fxproc 	*fx_prev;	/* pointer to previous fxproc */
 
 	/* the following are used only when we have callbacks registered */
 	kt_did_t	fx_ktid;
@@ -128,6 +125,7 @@
 	fx_cookie_t	fx_cookie;	/* cookie with which callback */
 					/* was registered */
 	fx_callbacks_t 	*fx_callback;	/* pointer to callback structure */
+	caps_sc_t	fx_caps;	/* CPU caps specific data */
 } fxproc_t;
 
 
--- a/usr/src/uts/common/sys/proc.h	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/sys/proc.h	Fri Mar 09 15:55:28 2007 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -424,11 +424,12 @@
 /* stat codes */
 
 #define	SSLEEP	1		/* awaiting an event */
-#define	SRUN	2		/* running */
+#define	SRUN	2		/* runnable */
 #define	SZOMB	3		/* process terminated but not waited for */
 #define	SSTOP	4		/* process stopped by debugger */
 #define	SIDL	5		/* intermediate state in process creation */
 #define	SONPROC	6		/* process is being run on a processor */
+#define	SWAIT	7		/* process is waiting to become runnable */
 
 /* p_pidflag codes */
 #define	CLDPEND		0x0001	/* have yet to post a SIGCHLD to the parent */
@@ -639,6 +640,7 @@
 extern	void	estimate_msacct(kthread_t *, hrtime_t);
 extern	void	disable_msacct(proc_t *);
 extern	hrtime_t mstate_aggr_state(proc_t *, int);
+extern	hrtime_t mstate_thread_onproc_time(kthread_t *);
 extern	void	syscall_mstate(int, int);
 
 extern	uint_t	cpu_update_pct(kthread_t *, hrtime_t);
--- a/usr/src/uts/common/sys/project.h	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/sys/project.h	Fri Mar 09 15:55:28 2007 -0800
@@ -39,6 +39,7 @@
 #include <sys/mutex.h>
 #include <sys/rctl.h>
 #include <sys/ipc_rctl.h>
+#include <sys/zone.h>
 
 typedef struct kproject_kstat {
 	kstat_named_t kpk_zonename;
@@ -58,25 +59,29 @@
 
 } kproject_data_t;
 
+struct cpucap;
+
 /*
  * The first two fields of this structure must not be reordered.
  */
 typedef struct kproject {
 	projid_t 	kpj_id;		/* project ID		*/
 	zoneid_t	kpj_zoneid;	/* zone ID		*/
+	struct zone	*kpj_zone;	/* zone pointer		*/
 	uint_t		kpj_count;	/* reference counter	*/
 	uint32_t	kpj_shares;	/* number of shares	*/
 	rctl_set_t	*kpj_rctls;	/* resource control set */
 	struct kproject	*kpj_prev;	/* previous project	*/
 	struct kproject	*kpj_next;	/* next project		*/
 	kproject_data_t	kpj_data;	/* subsystem-specfic data */
-	kmutex_t	kpj_poolbind;	/* synch. with pools	*/
+	kmutex_t	kpj_poolbind;	/* synchronization with pools	*/
 	rctl_qty_t	kpj_nlwps;	/* protected by project's zone's */
 					/* zone_nlwps_lock */
 	rctl_qty_t	kpj_nlwps_ctl;	/* protected by kpj_rctls->rcs_lock */
 	rctl_qty_t	kpj_ntasks;	/* protected by project's zone's */
 					/* zone_nlwps_lock */
 	rctl_qty_t	kpj_ntasks_ctl;	/* protected by kpj_rctls->rcs_lock */
+	struct cpucap	*kpj_cpucap;	/* CPU cap data			*/
 } kproject_t;
 
 #ifdef _KERNEL
@@ -87,8 +92,6 @@
 #define	PROJECT_HOLD_FIND	1
 #define	PROJECT_HOLD_INSERT	2
 
-struct zone;
-
 void project_init(void);
 kproject_t *project_hold(kproject_t *);
 kproject_t *project_hold_by_id(projid_t, struct zone *, int);
--- a/usr/src/uts/common/sys/schedctl.h	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/sys/schedctl.h	Fri Mar 09 15:55:28 2007 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 1997-2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -81,6 +80,7 @@
 #define	SC_RUN		0x02
 #define	SC_ONPROC	0x04
 #define	SC_STOPPED	0x10
+#define	SC_WAIT		0x20
 
 /* preemption control settings */
 #define	SC_MAX_TICKS	2		/* max time preemption can be blocked */
--- a/usr/src/uts/common/sys/thread.h	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/sys/thread.h	Fri Mar 09 15:55:28 2007 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -57,6 +57,7 @@
 #define	TS_ONPROC	0x04	/* Thread is being run on a processor */
 #define	TS_ZOMB		0x08	/* Thread has died but hasn't been reaped */
 #define	TS_STOPPED	0x10	/* Stopped, initial state */
+#define	TS_WAIT		0x20	/* Waiting to become runnable */
 
 typedef struct ctxop {
 	void	(*save_op)(void *);	/* function to invoke to save context */
@@ -98,6 +99,7 @@
 struct upimutex;
 struct kproject;
 struct on_trap_data;
+struct waitq;
 
 /* Definition for kernel thread identifier type */
 typedef uint64_t kt_did_t;
@@ -333,6 +335,7 @@
 #endif
 	hrtime_t	t_hrtime;	/* high-res last time on cpu */
 	kmutex_t	t_ctx_lock;	/* protects t_ctx in removectx() */
+	struct waitq	*t_waitq;	/* wait queue */
 } kthread_t;
 
 /*
@@ -391,6 +394,8 @@
 #define	TS_SWAPENQ	0x0004	/* swap thread when it reaches a safe point */
 #define	TS_ON_SWAPQ	0x0008	/* thread is on the swap queue */
 #define	TS_SIGNALLED	0x0010	/* thread was awakened by cv_signal() */
+#define	TS_PROJWAITQ	0x0020	/* thread is on its project's waitq */
+#define	TS_ZONEWAITQ	0x0040	/* thread is on its zone's waitq */
 #define	TS_CSTART	0x0100	/* setrun() by continuelwps() */
 #define	TS_UNPAUSE	0x0200	/* setrun() by unpauselwps() */
 #define	TS_XSTART	0x0400	/* setrun() by SIGCONT */
@@ -400,6 +405,7 @@
 #define	TS_RUNQMATCH	0x4000	/* exact run queue balancing by setbackdq() */
 #define	TS_ALLSTART	\
 	(TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE)
+#define	TS_ANYWAITQ	(TS_PROJWAITQ|TS_ZONEWAITQ)
 
 /*
  * No locking needed for AST field.
@@ -411,6 +417,13 @@
 #define	ISTOPPED(t) ((t)->t_state == TS_STOPPED && \
 			!((t)->t_schedflag & TS_PSTART))
 
+/* True if thread is asleep and wakeable */
+#define	ISWAKEABLE(t) (((t)->t_state == TS_SLEEP && \
+			((t)->t_flag & T_WAKEABLE)))
+
+/* True if thread is on the wait queue */
+#define	ISWAITING(t) ((t)->t_state == TS_WAIT)
+
 /* similar to ISTOPPED except the event of interest is CPR */
 #define	CPR_ISTOPPED(t) ((t)->t_state == TS_STOPPED && \
 			!((t)->t_schedflag & TS_RESUME))
@@ -465,6 +478,9 @@
  * ttoproj(x)
  * 	convert a thread pointer to its project pointer.
  *
+ * ttozone(x)
+ * 	convert a thread pointer to its zone pointer.
+ *
  * lwptot(x)
  *	convert a lwp pointer to its thread pointer.
  *
@@ -476,6 +492,7 @@
 #define	ttolwp(x)	((x)->t_lwp)
 #define	ttoproc(x)	((x)->t_procp)
 #define	ttoproj(x)	((x)->t_proj)
+#define	ttozone(x)	((x)->t_procp->p_zone)
 #define	lwptot(x)	((x)->lwp_thread)
 #define	lwptoproc(x)	((x)->lwp_procp)
 
@@ -488,6 +505,7 @@
 #define	curthread	(threadp())		/* current thread pointer */
 #define	curproc		(ttoproc(curthread))	/* current process pointer */
 #define	curproj		(ttoproj(curthread))	/* current project pointer */
+#define	curzone		(curproc->p_zone)	/* current zone pointer */
 
 extern	struct _kthread	t0;		/* the scheduler thread */
 extern	kmutex_t	pidlock;	/* global process lock */
@@ -583,6 +601,12 @@
 #define	THREAD_RUN(tp, lp)	THREAD_SET_STATE(tp, TS_RUN, lp)
 
 /*
+ * Put thread in wait state, and set the lock pointer to the wait queue
+ * lock pointer provided.  This lock should be held.
+ */
+#define	THREAD_WAIT(tp, lp)	THREAD_SET_STATE(tp, TS_WAIT, lp)
+
+/*
  * Put thread in run state, and set the lock pointer to the dispatcher queue
  * lock pointer provided (i.e., the "swapped_lock").  This lock should be held.
  */
@@ -620,7 +644,6 @@
 #define	THREAD_FREEINTR(tp, cpu)	\
 		THREAD_SET_STATE(tp, TS_FREE, &(cpu)->cpu_thread_lock)
 
-
 #ifdef	__cplusplus
 }
 #endif
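
With the new TS_WAIT state, code that wakes threads must treat a thread
parked on a wait queue like a wakeable sleeper, as the quiesce drivers
later in this changeset do. A condensed sketch of that pattern (helper
name hypothetical; the caller holds the thread lock):

/* Wake a thread whether it is in wakeable sleep or parked on a waitq. */
static void
wake_if_parked(kthread_t *tp)
{
	if (ISWAKEABLE(tp) || ISWAITING(tp))
		setrun_locked(tp);
}
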
--- a/usr/src/uts/common/sys/ts.h	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/sys/ts.h	Fri Mar 09 15:55:28 2007 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -21,7 +20,7 @@
  */
 
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -36,6 +35,7 @@
 
 #include <sys/types.h>
 #include <sys/thread.h>
+#include <sys/cpucaps.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -57,36 +57,38 @@
 				/*  ts_maxwait */
 } tsdpent_t;
 
-
+#ifdef _KERNEL
 /*
  * time-sharing class specific thread structure
  */
 typedef struct tsproc {
-	int		ts_timeleft;	/* time remaining in procs quantum */
-	uint_t		ts_dispwait;	/* wall clock seconds since start */
-				/*   of quantum (not reset upon preemption */
+	int	ts_timeleft;	/* time remaining in procs quantum	*/
+	uint_t	ts_dispwait;	/* wall clock seconds since start	*/
+				/* of quantum (not reset upon preempt)	*/
 	pri_t	ts_cpupri;	/* system controlled component of ts_umdpri */
-	pri_t	ts_uprilim;	/* user priority limit */
-	pri_t	ts_upri;	/* user priority */
-	pri_t	ts_umdpri;	/* user mode priority within ts class */
-	pri_t	ts_scpri;	/* remembered priority, for schedctl */
-	char	ts_nice;	/* nice value for compatibility */
-	char	ts_boost;	/* interactive priority offset */
-	uchar_t	ts_flags;	/* flags defined below */
-	kthread_t *ts_tp;	/* pointer to thread */
-	struct tsproc *ts_next;	/* link to next tsproc on list */
-	struct tsproc *ts_prev;	/* link to previous tsproc on list */
+	pri_t	ts_uprilim;	/* user priority limit			*/
+	pri_t	ts_upri;	/* user priority			*/
+	pri_t	ts_umdpri;	/* user mode priority within ts class	*/
+	pri_t	ts_scpri;	/* remembered priority, for schedctl	*/
+	char	ts_nice;	/* nice value for compatibility		*/
+	char	ts_boost;	/* interactive priority offset		*/
+	uchar_t	ts_flags;	/* flags defined below			*/
+	kthread_t *ts_tp;	/* pointer to thread			*/
+	struct tsproc *ts_next;	/* link to next tsproc on list		*/
+	struct tsproc *ts_prev;	/* link to previous tsproc on list	*/
+	caps_sc_t ts_caps;	/* CPU caps specific data		*/
 } tsproc_t;
 
+/* flags */
+#define	TSKPRI		0x01	/* thread at kernel mode priority	*/
+#define	TSBACKQ		0x02	/* thread goes to back of dispq if preempted */
+#define	TSIA		0x04	/* thread is interactive		*/
+#define	TSIASET		0x08	/* interactive thread is "on"		*/
+#define	TSIANICED	0x10	/* interactive thread has been niced	*/
+#define	TSRESTORE	0x20	/* thread was not preempted, due to schedctl */
+				/* restore priority from ts_scpri	*/
 
-/* flags */
-#define	TSKPRI	0x01	/* thread at kernel mode priority */
-#define	TSBACKQ	0x02	/* thread goes to back of disp q when preempted */
-#define	TSIA	0x04	/* thread is interactive */
-#define	TSIASET	0x08	/* interactive thread is "on" */
-#define	TSIANICED	0x10	/* interactive thread has been niced */
-#define	TSRESTORE	0x20	/* thread was not preempted, due to schedctl */
-				/* restore priority from ts_scpri */
+#endif /* _KERNEL */
 
 #ifdef	__cplusplus
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/sys/waitq.h	Fri Mar 09 15:55:28 2007 -0800
@@ -0,0 +1,90 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_WAITQ_H
+#define	_SYS_WAITQ_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifdef	_KERNEL
+
+#include <sys/types.h>
+#include <sys/machlock.h>
+#include <sys/thread.h>
+
+typedef struct waitq {
+	disp_lock_t	wq_lock;	/* protects all fields */
+	kthread_t	*wq_first;	/* first thread on the queue */
+	int		wq_count;	/* number of threads on the queue */
+	boolean_t	wq_blocked;	/* True if threads can't be enqueued */
+} waitq_t;
+
+extern void		waitq_init(waitq_t *);
+extern void		waitq_fini(waitq_t *);
+
+/*
+ * Place the thread on the wait queue. An attempt to enqueue a thread onto a
+ * blocked queue fails and returns zero. A successful enqueue returns a
+ * non-zero value.
+ */
+extern int		waitq_enqueue(waitq_t *, kthread_t *);
+
+/*
+ * Take thread off its wait queue and make it runnable.
+ */
+extern void		waitq_setrun(kthread_t *t);
+
+/*
+ * Change priority for the thread on wait queue.
+ */
+extern void		waitq_change_pri(kthread_t *, pri_t);
+
+/*
+ * Take the first thread off the wait queue and make it runnable.
+ */
+extern void		waitq_runone(waitq_t *);
+
+/*
+ * Return True if there are no threads on the queue.
+ */
+extern boolean_t	waitq_isempty(waitq_t *);
+
+/*
+ * Prevent and allow placing new threads on wait queue.
+ */
+extern void		waitq_block(waitq_t *);
+extern void		waitq_unblock(waitq_t *);
+
+#endif	/* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_WAITQ_H */
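
A minimal sketch of the intended calling pattern for this interface,
assuming a hypothetical caller that parks capped threads and later
releases them (the cap_wq, park_thread, release_one, and drain_all
names are illustrative):

#include <sys/waitq.h>

static waitq_t cap_wq;

static void
cap_wq_setup(void)
{
	waitq_init(&cap_wq);
}

/* Try to park a thread; fails (returns 0) if the queue is blocked. */
static int
park_thread(kthread_t *t)
{
	return (waitq_enqueue(&cap_wq, t));
}

/* Let one waiter run, typically once usage drops back under the cap. */
static void
release_one(void)
{
	if (!waitq_isempty(&cap_wq))
		waitq_runone(&cap_wq);
}

/* During teardown, stop new arrivals, then drain the queue. */
static void
drain_all(void)
{
	waitq_block(&cap_wq);		/* prevent new arrivals */
	while (!waitq_isempty(&cap_wq))
		waitq_runone(&cap_wq);	/* make each waiter runnable */
	waitq_fini(&cap_wq);
}
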
--- a/usr/src/uts/common/sys/zone.h	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/common/sys/zone.h	Fri Mar 09 15:55:28 2007 -0800
@@ -305,6 +305,8 @@
 	kstat_named_t zk_value;
 } zone_kstat_t;
 
+struct cpucap;
+
 typedef struct zone {
 	/*
 	 * zone_name is never modified once set.
@@ -416,6 +418,7 @@
 	 */
 	struct dlnamelist *zone_dl_list;
 	netstack_t	*zone_netstack;
+	struct cpucap	*zone_cpucap;	/* CPU caps data */
 } zone_t;
 
 /*
--- a/usr/src/uts/i86pc/os/trap.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/i86pc/os/trap.c	Fri Mar 09 15:55:28 2007 -0800
@@ -1408,7 +1408,7 @@
 		CL_TRAPRET(ct);
 		thread_unlock(ct);
 	}
-	if (CPU->cpu_runrun)
+	if (CPU->cpu_runrun || curthread->t_schedflag & TS_ANYWAITQ)
 		preempt();
 	(void) new_mstate(ct, mstate);
 
--- a/usr/src/uts/intel/ia32/os/syscall.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/intel/ia32/os/syscall.c	Fri Mar 09 15:55:28 2007 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -761,7 +761,7 @@
 		CL_TRAPRET(t);
 		thread_unlock(t);
 	}
-	if (CPU->cpu_runrun)
+	if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ)
 		preempt();
 
 	lwp->lwp_errno = 0;		/* clear error for next time */
--- a/usr/src/uts/sparc/os/syscall.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/sparc/os/syscall.c	Fri Mar 09 15:55:28 2007 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -857,7 +857,7 @@
 		CL_TRAPRET(t);
 		thread_unlock(t);
 	}
-	if (CPU->cpu_runrun)
+	if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ)
 		preempt();
 
 	/*
--- a/usr/src/uts/sun4/os/trap.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/sun4/os/trap.c	Fri Mar 09 15:55:28 2007 -0800
@@ -1564,7 +1564,7 @@
 		CL_TRAPRET(curthread);
 		thread_unlock(curthread);
 	}
-	if (CPU->cpu_runrun)
+	if (CPU->cpu_runrun || curthread->t_schedflag & TS_ANYWAITQ)
 		preempt();
 	if (lwp->lwp_pcb.pcb_step != STEP_NONE)
 		prdostep();
--- a/usr/src/uts/sun4u/ngdr/io/dr_quiesce.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/sun4u/ngdr/io/dr_quiesce.c	Fri Mar 09 15:55:28 2007 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -566,8 +566,7 @@
 
 				aston(tp);
 
-				if (tp->t_state == TS_SLEEP &&
-				    (tp->t_flag & T_WAKEABLE)) {
+				if (ISWAKEABLE(tp) || ISWAITING(tp)) {
 					setrun_locked(tp);
 				}
 
--- a/usr/src/uts/sun4u/serengeti/io/sbdp_quiesce.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/sun4u/serengeti/io/sbdp_quiesce.c	Fri Mar 09 15:55:28 2007 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -541,11 +541,9 @@
 
 				aston(tp);
 
-				if (tp->t_state == TS_SLEEP &&
-				    (tp->t_flag & T_WAKEABLE)) {
+				if (ISWAKEABLE(tp) || ISWAITING(tp)) {
 					setrun_locked(tp);
 				}
-
 			}
 
 			/* grab thread if needed */
--- a/usr/src/uts/sun4u/sunfire/io/sysctrl_quiesce.c	Fri Mar 09 15:42:52 2007 -0800
+++ b/usr/src/uts/sun4u/sunfire/io/sysctrl_quiesce.c	Fri Mar 09 15:55:28 2007 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -372,8 +372,7 @@
 
 				aston(tp);
 
-				if (tp->t_state == TS_SLEEP &&
-				    (tp->t_flag & T_WAKEABLE)) {
+				if (ISWAKEABLE(tp) || ISWAITING(tp)) {
 					setrun_locked(tp);
 				}