PSARC/2009/099 dladm show-usage modifications
authorVenugopal Iyer <Venu.Iyer@Sun.COM>
Tue, 17 Feb 2009 01:31:30 -0800
changeset 8833 8adf20bc60e3
parent 8832 dbf480d38c0d
child 8834 00075a1c20bb
PSARC/2009/099 dladm show-usage modifications 6726676 flow should not be seen by flowadm show-usage after the flow been removed by flowadm remove-flow 6766669 "dladm show-vnic -o" can't accept MACADDRESS 6773854 Per Tx ring flow control for UDP 6777547 mac_tx() should compute the hash if the passed hint is zero 6778557 nxge m_tx() should fanout to multiple rings for vnet scalability 6779356 sometimes packets are not classified to the correct flow 6783011 pre-existing subflows not initialized on a non-dls client when brought up 6786734 acctadm dladm_start_usagelog() calls need some work 6789760 mac perimeter deadlock due to dls_devnet_stat_update() 6789883 dladm show-link -s is adrift again. 6791099 mac_tx() frees the message but returns non-NULL cookie which causes panic 6791109 maxbw set on a link should not apply if this link is the underlying port of an aggregation 6791118 panic in mac_bcast_delete() unplumbing an IP interface 6791456 deleting last vnic interface causes bge interface to stop working 6791678 xvm guests don't communicate through vnics configured on vlan 6792164 race between mac_tx_is_flow_blocked() and mac_srs_group_teardown() could cause panic 6792546 paniced in bge_ring_tx()/freemsg() due to mp->b_next == NULL && mp->b_prev == NULL 6792555 paniced in mac_flow_walk_nolock() due to assertion failed: cnt == ft->ft_flow_count 6792871 multiple VLANs per MAC client cause hang in mac_flow_wait() 6792942 60% regression for Guest-to-Guest network throughput on snv106 6793278 the multicast addresses are not added to the aggregation port in certain scenarios 6793436 panic in mac_fini_macaddr() on mac_register() failure 6796850 SUNWcnetr postinstall script spews errors due to bad interface matching 6803378 need support for dls_bypass and rx fanout on non-ethernet media
usr/src/cmd/acctadm/main.c
usr/src/cmd/dladm/dladm.c
usr/src/cmd/flowadm/flowadm.c
usr/src/lib/libdladm/common/usage.c
usr/src/pkgdefs/SUNWcnetr/postinstall
usr/src/tools/scripts/bfu.sh
usr/src/uts/common/inet/ip.h
usr/src/uts/common/inet/ip/ip.c
usr/src/uts/common/inet/ip/ip6.c
usr/src/uts/common/inet/ip/ip_if.c
usr/src/uts/common/inet/ip_impl.h
usr/src/uts/common/inet/ip_stack.h
usr/src/uts/common/inet/ipclassifier.h
usr/src/uts/common/inet/tcp/tcp.c
usr/src/uts/common/inet/udp/udp.c
usr/src/uts/common/inet/udp_impl.h
usr/src/uts/common/io/aggr/aggr_grp.c
usr/src/uts/common/io/aggr/aggr_port.c
usr/src/uts/common/io/aggr/aggr_send.c
usr/src/uts/common/io/dld/dld_proto.c
usr/src/uts/common/io/dls/dls.c
usr/src/uts/common/io/dls/dls_link.c
usr/src/uts/common/io/dls/dls_mgmt.c
usr/src/uts/common/io/e1000g/e1000g_main.c
usr/src/uts/common/io/e1000g/e1000g_rx.c
usr/src/uts/common/io/e1000g/e1000g_sw.h
usr/src/uts/common/io/mac/mac.c
usr/src/uts/common/io/mac/mac_bcast.c
usr/src/uts/common/io/mac/mac_client.c
usr/src/uts/common/io/mac/mac_datapath_setup.c
usr/src/uts/common/io/mac/mac_flow.c
usr/src/uts/common/io/mac/mac_provider.c
usr/src/uts/common/io/mac/mac_sched.c
usr/src/uts/common/io/mac/mac_soft_ring.c
usr/src/uts/common/io/mac/mac_util.c
usr/src/uts/common/io/nxge/nxge_send.c
usr/src/uts/common/io/softmac/softmac_main.c
usr/src/uts/common/sys/aggr_impl.h
usr/src/uts/common/sys/dld.h
usr/src/uts/common/sys/dld_impl.h
usr/src/uts/common/sys/dls_impl.h
usr/src/uts/common/sys/mac.h
usr/src/uts/common/sys/mac_client.h
usr/src/uts/common/sys/mac_client_impl.h
usr/src/uts/common/sys/mac_impl.h
usr/src/uts/common/sys/mac_provider.h
usr/src/uts/common/sys/mac_soft_ring.h
usr/src/uts/common/xen/io/xnbo.c
--- a/usr/src/cmd/acctadm/main.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/cmd/acctadm/main.c	Tue Feb 17 01:31:30 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -41,6 +41,8 @@
 #include "aconf.h"
 #include "res.h"
 
+#define	ACCTADM_NET_LOG_INTERVAL	20
+
 static const char USAGE[] = "\
 Usage:\n\
     acctadm [ {process | task | flow | net} ]\n\
@@ -126,6 +128,7 @@
 	int optcnt = 0;
 	int state;
 	const char *fmri;	/* FMRI for this instance */
+	int err = 0;
 
 	setup_privs();
 
@@ -309,10 +312,14 @@
 		if (type & AC_NET) {
 			(void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
 			    PRIV_SYS_DL_CONFIG, NULL);
-			(void) dladm_stop_usagelog(dld_handle,
+			err = dladm_stop_usagelog(dld_handle,
 			    DLADM_LOGTYPE_FLOW);
 			(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
 			    PRIV_SYS_DL_CONFIG, NULL);
+			if (err != DLADM_STATUS_OK) {
+				die(gettext("failed to stop logging network "
+				    "information, error %d\n"), errno);
+			}
 		}
 		state = AC_OFF;
 
@@ -356,18 +363,44 @@
 			if (type & AC_NET) {
 				(void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
 				    PRIV_SYS_DL_CONFIG, NULL);
-				(void) dladm_stop_usagelog(dld_handle,
-				    strncmp(disabled, "basic", strlen("basic"))
-				    == 0 ? DLADM_LOGTYPE_LINK :
-				    DLADM_LOGTYPE_FLOW);
+				err = dladm_stop_usagelog(dld_handle,
+				    strcmp(disabled, "basic") == 0 ?
+				    DLADM_LOGTYPE_LINK : DLADM_LOGTYPE_FLOW);
 				(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
 				    PRIV_SYS_DL_CONFIG, NULL);
+				if (err != DLADM_STATUS_OK) {
+					die(gettext("failed to stop logging "
+					    "network information, error %d\n"),
+					    errno);
+				}
 			}
 			str2buf(buf, disabled, AC_OFF, type);
 		}
-		if (enabled)
+		if (enabled) {
+			/*
+			 * Let's get network logging started.
+			 */
+			if (type & AC_NET) {
+				/*
+				 * Default logging interval for AC_NET is
+				 * ACCTADM_NET_LOG_INTERVAL.
+				 */
+				(void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
+				    PRIV_SYS_DL_CONFIG, NULL);
+				err = dladm_start_usagelog(dld_handle,
+				    strcmp(enabled, "basic") == 0 ?
+				    DLADM_LOGTYPE_LINK : DLADM_LOGTYPE_FLOW,
+				    ACCTADM_NET_LOG_INTERVAL);
+				(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
+				    PRIV_SYS_DL_CONFIG, NULL);
+				if (err != DLADM_STATUS_OK) {
+					die(gettext("failed to start logging "
+					    "network information, error %d\n"),
+					    errno);
+				}
+			}
 			str2buf(buf, enabled, AC_ON, type);
-
+		}
 		(void) priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_SYS_ACCT, NULL);
 		if (acctctl(type | AC_RES_SET, buf, AC_BUFSIZE) == -1) {
 			free(buf);
@@ -384,24 +417,6 @@
 		if (aconf_set_string(AC_PROP_UNTRACKED, untracked) == -1)
 			die(gettext("cannot update %s property\n"),
 			    AC_PROP_UNTRACKED);
-		/*
-		 * We will enable net logging after turning it on so that
-		 * it can immediately start writing log.
-		 */
-		if (type & AC_NET && enabled != NULL) {
-			/*
-			 * Default logging interval for AC_NET is 20.
-			 * XXX need to find the right place to
-			 * configure it.
-			 */
-			(void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
-			    PRIV_SYS_DL_CONFIG, NULL);
-			(void) dladm_start_usagelog(dld_handle,
-			    strncmp(enabled, "basic", strlen("basic")) == 0 ?
-			    DLADM_LOGTYPE_LINK : DLADM_LOGTYPE_FLOW, 20);
-			(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
-			    PRIV_SYS_DL_CONFIG, NULL);
-		}
 		free(tracked);
 		free(untracked);
 		free(buf);
@@ -445,10 +460,14 @@
 		if (type & AC_NET) {
 			(void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
 			    PRIV_SYS_DL_CONFIG, NULL);
-			(void) dladm_stop_usagelog(dld_handle,
+			err = dladm_stop_usagelog(dld_handle,
 			    DLADM_LOGTYPE_FLOW);
 			(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
 			    PRIV_SYS_DL_CONFIG, NULL);
+			if (err != DLADM_STATUS_OK) {
+				die(gettext("failed to stop logging "
+				    "network information, error %d\n"), errno);
+			}
 		}
 		state = AC_OFF;
 
@@ -468,6 +487,26 @@
 		/*
 		 * Enable accounting
 		 */
+
+		/*
+		 * Let's get network logging started.
+		 */
+		if (type & AC_NET) {
+			/*
+			 * Default logging interval for AC_NET is
+			 * ACCTADM_NET_LOG_INTERVAL.
+			 */
+			(void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+			err = dladm_start_usagelog(dld_handle,
+			    DLADM_LOGTYPE_FLOW, ACCTADM_NET_LOG_INTERVAL);
+			(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+			if (err != DLADM_STATUS_OK) {
+				die(gettext("failed to start logging "
+				    "network information, error %d\n"), errno);
+			}
+		}
 		state = AC_ON;
 
 		(void) priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_SYS_ACCT, NULL);
@@ -480,18 +519,6 @@
 			die(gettext("cannot update %s property\n"),
 			    AC_PROP_STATE);
 		modified++;
-		if (type & AC_NET) {
-			/*
-			 * Default logging interval for AC_NET is 20,
-			 * XXX need to find the right place to configure it.
-			 */
-			(void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
-			    PRIV_SYS_DL_CONFIG, NULL);
-			(void) dladm_start_usagelog(dld_handle,
-			    DLADM_LOGTYPE_FLOW, 20);
-			(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
-			    PRIV_SYS_DL_CONFIG, NULL);
-		}
 	}
 	(void) priv_set(PRIV_OFF, PRIV_PERMITTED, PRIV_SYS_ACCT, NULL);
 
--- a/usr/src/cmd/dladm/dladm.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/cmd/dladm/dladm.c	Tue Feb 17 01:31:30 2009 -0800
@@ -252,6 +252,7 @@
 	boolean_t	us_parseable;
 	boolean_t	us_printheader;
 	boolean_t	us_first;
+	boolean_t	us_showall;
 	print_state_t	us_print;
 } show_usage_state_t;
 
@@ -392,7 +393,7 @@
 	{ "show-etherstub",	do_show_etherstub,
 	    "    show-etherstub   [-t] [<link>]\n"			},
 	{ "show-usage",		do_show_usage,
-	    "    show-usage       [-d|-p -F <format>] "
+	    "    show-usage       [-a] [-d | -F <format>] "
 	    "[-s <DD/MM/YYYY,HH:MM:SS>]\n"
 	    "\t\t     [-e <DD/MM/YYYY,HH:MM:SS>] -f <logfile> [<link>]"	}
 };
@@ -480,6 +481,14 @@
 	{ 0, 0, 0, 0 }
 };
 
+static const struct option usage_opts[] = {
+	{"file",	required_argument,	0, 'f'	},
+	{"format",	required_argument,	0, 'F'	},
+	{"start",	required_argument,	0, 's'	},
+	{"stop",	required_argument,	0, 'e'	},
+	{ 0, 0, 0, 0 }
+};
+
 /*
  * structures for 'dladm show-ether'
  */
@@ -527,26 +536,33 @@
  * structures for 'dladm show-link -s' (print statistics)
  */
 typedef enum {
-	DEVS_LINK,
-	DEVS_IPKTS,
-	DEVS_RBYTES,
-	DEVS_IERRORS,
-	DEVS_OPKTS,
-	DEVS_OBYTES,
-	DEVS_OERRORS
-} devs_field_index_t;
-
-static print_field_t devs_fields[] = {
+	LINK_S_LINK,
+	LINK_S_IPKTS,
+	LINK_S_RBYTES,
+	LINK_S_IERRORS,
+	LINK_S_OPKTS,
+	LINK_S_OBYTES,
+	LINK_S_OERRORS
+} link_s_field_index_t;
+
+static print_field_t link_s_fields[] = {
 /* name,	header,		field width,	index,		cmdtype	*/
-{ "link",	"LINK",			15,	DEVS_LINK,	CMD_TYPE_ANY},
-{ "ipackets",	"IPACKETS",		10,	DEVS_IPKTS,	CMD_TYPE_ANY},
-{ "rbytes",	"RBYTES",		8,	DEVS_RBYTES,	CMD_TYPE_ANY},
-{ "ierrors",	"IERRORS",		10,	DEVS_IERRORS,	CMD_TYPE_ANY},
-{ "opackets",	"OPACKETS",		12,	DEVS_OPKTS,	CMD_TYPE_ANY},
-{ "obytes",	"OBYTES",		12,	DEVS_OBYTES,	CMD_TYPE_ANY},
-{ "oerrors",	"OERRORS",		8,	DEVS_OERRORS,	CMD_TYPE_ANY}}
+{ "link",	"LINK",			15,	LINK_S_LINK,	CMD_TYPE_ANY},
+{ "ipackets",	"IPACKETS",		10,	LINK_S_IPKTS,	CMD_TYPE_ANY},
+{ "rbytes",	"RBYTES",		8,	LINK_S_RBYTES,	CMD_TYPE_ANY},
+{ "ierrors",	"IERRORS",		10,	LINK_S_IERRORS,	CMD_TYPE_ANY},
+{ "opackets",	"OPACKETS",		12,	LINK_S_OPKTS,	CMD_TYPE_ANY},
+{ "obytes",	"OBYTES",		12,	LINK_S_OBYTES,	CMD_TYPE_ANY},
+{ "oerrors",	"OERRORS",		8,	LINK_S_OERRORS,	CMD_TYPE_ANY}}
 ;
-#define	DEVS_MAX_FIELDS	(sizeof (devs_fields) / sizeof (print_field_t))
+#define	LINK_S_MAX_FIELDS \
+	(sizeof (link_s_fields) / sizeof (print_field_t))
+
+typedef struct link_args_s {
+	char		*link_s_link;
+	pktsum_t	*link_s_psum;
+} link_args_t;
+static char *print_link_stats(print_field_t *, void *);
 
 /*
  * buffer used by print functions for show-{link,phys,vlan} commands.
@@ -925,7 +941,7 @@
     offsetof(vnic_fields_buf_t, vnic_over),		CMD_TYPE_ANY},
 { "speed",		"SPEED",	6,
     offsetof(vnic_fields_buf_t, vnic_speed),		CMD_TYPE_ANY},
-{ "macaddr",		"MACADDRESS",	20,
+{ "macaddress",		"MACADDRESS",	20,
     offsetof(vnic_fields_buf_t, vnic_macaddr),		CMD_TYPE_ANY},
 { "macaddrtype",	"MACADDRTYPE",	19,
     offsetof(vnic_fields_buf_t, vnic_macaddrtype),	CMD_TYPE_ANY},
@@ -1077,9 +1093,24 @@
 static int
 show_usage_date(dladm_usage_t *usage, void *arg)
 {
-
-	time_t	stime;
-	char	timebuf[20];
+	show_usage_state_t	*state = (show_usage_state_t *)arg;
+	time_t			stime;
+	char			timebuf[20];
+	dladm_status_t		status;
+	uint32_t		flags;
+
+	/*
+	 * Only show usage information for existing links unless '-a'
+	 * is specified.
+	 */
+	if (!state->us_showall) {
+		if ((status = dladm_name2info(handle, usage->du_name,
+		    NULL, &flags, NULL, NULL)) != DLADM_STATUS_OK) {
+			return (status);
+		}
+		if ((flags & DLADM_OPT_ACTIVE) == 0)
+			return (DLADM_STATUS_LINKINVAL);
+	}
 
 	stime = usage->du_stime;
 	(void) strftime(timebuf, sizeof (timebuf), "%m/%d/%Y",
@@ -1097,6 +1128,21 @@
 	usage_l_fields_buf_t 	ubuf;
 	time_t			time;
 	double			bw;
+	dladm_status_t		status;
+	uint32_t		flags;
+
+	/*
+	 * Only show usage information for existing links unless '-a'
+	 * is specified.
+	 */
+	if (!state->us_showall) {
+		if ((status = dladm_name2info(handle, usage->du_name,
+		    NULL, &flags, NULL, NULL)) != DLADM_STATUS_OK) {
+			return (status);
+		}
+		if ((flags & DLADM_OPT_ACTIVE) == 0)
+			return (DLADM_STATUS_LINKINVAL);
+	}
 
 	if (state->us_plot) {
 		if (!state->us_printheader) {
@@ -1164,6 +1210,21 @@
 	show_usage_state_t	*state = (show_usage_state_t *)arg;
 	char			buf[DLADM_STRSIZE];
 	usage_fields_buf_t	ubuf;
+	dladm_status_t		status;
+	uint32_t		flags;
+
+	/*
+	 * Only show usage information for existing links unless '-a'
+	 * is specified.
+	 */
+	if (!state->us_showall) {
+		if ((status = dladm_name2info(handle, usage->du_name,
+		    NULL, &flags, NULL, NULL)) != DLADM_STATUS_OK) {
+			return (status);
+		}
+		if ((flags & DLADM_OPT_ACTIVE) == 0)
+			return (DLADM_STATUS_LINKINVAL);
+	}
 
 	bzero(&ubuf, sizeof (ubuf));
 
@@ -1210,7 +1271,6 @@
 	int			opt;
 	dladm_status_t		status;
 	boolean_t		d_arg = B_FALSE;
-	boolean_t		p_arg = B_FALSE;
 	char			*stime = NULL;
 	char			*etime = NULL;
 	char			*resource = NULL;
@@ -1232,13 +1292,14 @@
 	state.us_plot = B_FALSE;
 	state.us_first = B_TRUE;
 
-	while ((opt = getopt(argc, argv, "dps:e:o:f:F:")) != -1) {
+	while ((opt = getopt_long(argc, argv, "das:e:o:f:F:",
+	    usage_opts, NULL)) != -1) {
 		switch (opt) {
 		case 'd':
 			d_arg = B_TRUE;
 			break;
-		case 'p':
-			state.us_plot = p_arg = B_TRUE;
+		case 'a':
+			state.us_showall = B_TRUE;
 			break;
 		case 'f':
 			file = optarg;
@@ -1254,7 +1315,7 @@
 			fields_str = optarg;
 			break;
 		case 'F':
-			F_arg = B_TRUE;
+			state.us_plot = F_arg = B_TRUE;
 			formatspec_str = optarg;
 			break;
 		default:
@@ -1267,7 +1328,15 @@
 		die("show-usage requires a file");
 
 	if (optind == (argc-1)) {
+		uint32_t 	flags;
+
 		resource = argv[optind];
+		if (!state.us_showall &&
+		    (((status = dladm_name2info(handle, resource, NULL, &flags,
+		    NULL, NULL)) != DLADM_STATUS_OK) ||
+		    ((flags & DLADM_OPT_ACTIVE) == 0))) {
+			die("invalid link: '%s'", resource);
+		}
 	}
 
 	if (resource == NULL && stime == NULL && etime == NULL) {
@@ -1289,11 +1358,8 @@
 	state.us_print.ps_fields = fields;
 	state.us_print.ps_nfields = nfields;
 
-	if (p_arg && d_arg)
-		die("plot and date options are incompatible");
-
-	if (p_arg && !F_arg)
-		die("specify format speicifier: -F <format>");
+	if (F_arg && d_arg)
+		die("incompatible -d and -F options");
 
 	if (F_arg && valid_formatspec(formatspec_str) == B_FALSE)
 		die("Format specifier %s not supported", formatspec_str);
@@ -1303,7 +1369,7 @@
 		status = dladm_usage_dates(show_usage_date,
 		    DLADM_LOGTYPE_LINK, file, resource, &state);
 	} else if (resource == NULL && stime == NULL && etime == NULL &&
-	    !p_arg) {
+	    !F_arg) {
 		/* Print summary */
 		status = dladm_usage_summary(show_usage_res,
 		    DLADM_LOGTYPE_LINK, file, &state);
@@ -2320,6 +2386,48 @@
 	return (DLADM_WALK_CONTINUE);
 }
 
+/*
+ * Output callback for 'dladm show-link -s': format the column selected
+ * by pf->pf_index from the link_args_t passed in arg.  The text is
+ * written into a static buffer, so the returned pointer is only valid
+ * until the next call.
+ */
+static char *
+print_link_stats(print_field_t *pf, void *arg)
+{
+	static char	out[DLADM_STRSIZE];
+	link_args_t	*la = arg;
+	pktsum_t	*ps = la->link_s_psum;
+
+	switch (pf->pf_index) {
+	case LINK_S_LINK:
+		(void) snprintf(out, sizeof (out), "%s", la->link_s_link);
+		break;
+	case LINK_S_IPKTS:
+		(void) snprintf(out, sizeof (out), "%llu", ps->ipackets);
+		break;
+	case LINK_S_RBYTES:
+		(void) snprintf(out, sizeof (out), "%llu", ps->rbytes);
+		break;
+	case LINK_S_IERRORS:
+		(void) snprintf(out, sizeof (out), "%u", ps->ierrors);
+		break;
+	case LINK_S_OPKTS:
+		(void) snprintf(out, sizeof (out), "%llu", ps->opackets);
+		break;
+	case LINK_S_OBYTES:
+		(void) snprintf(out, sizeof (out), "%llu", ps->obytes);
+		break;
+	case LINK_S_OERRORS:
+		(void) snprintf(out, sizeof (out), "%u", ps->oerrors);
+		break;
+	default:
+		die("invalid input");
+		break;
+	}
+	return (out);
+}
+
 static int
 show_link_stats(dladm_handle_t dh, datalink_id_t linkid, void *arg)
 {
@@ -2328,6 +2436,7 @@
 	show_state_t		*state = (show_state_t *)arg;
 	pktsum_t		stats, diff_stats;
 	dladm_phys_attr_t	dpa;
+	link_args_t		largs;
 
 	if (state->ls_firstonly) {
 		if (state->ls_donefirst)
@@ -2356,13 +2465,10 @@
 	}
 	dladm_stats_diff(&diff_stats, &stats, &state->ls_prevstats);
 
-	(void) printf("%-12s", link);
-	(void) printf("%-10llu", diff_stats.ipackets);
-	(void) printf("%-12llu", diff_stats.rbytes);
-	(void) printf("%-8llu", diff_stats.ierrors);
-	(void) printf("%-10llu", diff_stats.opackets);
-	(void) printf("%-12llu", diff_stats.obytes);
-	(void) printf("%-8llu\n", diff_stats.oerrors);
+	largs.link_s_link = link;
+	largs.link_s_psum = &diff_stats;
+	dladm_print_output(&state->ls_print, state->ls_parseable,
+	    print_link_stats, &largs);
 
 	state->ls_prevstats = stats;
 	return (DLADM_WALK_CONTINUE);
@@ -4200,7 +4306,7 @@
 	int			pfmax;
 	uint_t			nfields;
 	char			*all_fields =
-	    "link,over,speed,macaddr,macaddrtype,vid";
+	    "link,over,speed,macaddress,macaddrtype,vid";
 	char			*all_e_fields =
 	    "link";
 
@@ -4408,8 +4514,8 @@
 	print_field_t	**fields;
 	uint_t		nfields;
 
-	fields = parse_output_fields(fields_str, devs_fields, DEVS_MAX_FIELDS,
-	    CMD_TYPE_ANY, &nfields);
+	fields = parse_output_fields(fields_str, link_s_fields,
+	    LINK_S_MAX_FIELDS, CMD_TYPE_ANY, &nfields);
 	if (fields == NULL) {
 		die("invalid field(s) specified");
 		return;
--- a/usr/src/cmd/flowadm/flowadm.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/cmd/flowadm/flowadm.c	Tue Feb 17 01:31:30 2009 -0800
@@ -83,6 +83,7 @@
 	boolean_t	us_parseable;
 	boolean_t	us_printheader;
 	boolean_t	us_first;
+	boolean_t	us_showall;
 	print_state_t	us_print;
 } show_usage_state_t;
 
@@ -105,6 +106,22 @@
 
 #define	MAX_FIELD_LEN	32
 
+typedef struct show_flow_state {
+	boolean_t		fs_firstonly;
+	boolean_t		fs_donefirst;
+	pktsum_t		fs_prevstats;
+	uint32_t		fs_flags;
+	dladm_status_t		fs_status;
+	print_state_t		fs_print;
+	const char		*fs_flow;
+	const char		*fs_link;
+	boolean_t		fs_parseable;
+	boolean_t		fs_printheader;
+	boolean_t		fs_persist;
+	boolean_t		fs_stats;
+	uint64_t		fs_mask;
+} show_flow_state_t;
+
 typedef void cmdfunc_t(int, char **);
 
 static cmdfunc_t do_add_flow, do_remove_flow, do_init_flow, do_show_flow;
@@ -114,7 +131,8 @@
 static int	show_flow(dladm_flow_attr_t *, void *);
 static int	show_flows_onelink(dladm_handle_t, datalink_id_t, void *);
 
-static void	flow_stats(const char *, datalink_id_t,  uint_t);
+static void	flow_stats(const char *, datalink_id_t,  uint_t, char *,
+		    show_flow_state_t *);
 static void	get_flow_stats(const char *, pktsum_t *);
 static int	show_flow_stats(dladm_flow_attr_t *, void *);
 static int	show_link_flow_stats(dladm_handle_t, datalink_id_t, void *);
@@ -168,26 +186,6 @@
 };
 
 /*
- * structures for 'flowadm show-flow'
- */
-
-typedef struct show_flow_state {
-	boolean_t		fs_firstonly;
-	boolean_t		fs_donefirst;
-	pktsum_t		fs_prevstats;
-	uint32_t		fs_flags;
-	dladm_status_t		fs_status;
-	print_state_t		fs_print;
-	const char		*fs_flow;
-	const char		*fs_link;
-	boolean_t		fs_parseable;
-	boolean_t		fs_printheader;
-	boolean_t		fs_persist;
-	boolean_t		fs_stats;
-	uint64_t		fs_mask;
-} show_flow_state_t;
-
-/*
  * structures for 'flowadm remove-flow'
  */
 
@@ -197,15 +195,6 @@
 	dladm_status_t	fs_status;
 } remove_flow_state_t;
 
-typedef struct flow_args_s {
-	const char		*fa_link;
-	int			fa_attrno;	/* -1 indicates flow itself */
-	uint64_t		fa_mask;
-	dladm_flow_attr_t	*fa_finfop;
-	dladm_status_t		*fa_status;
-	boolean_t		fa_parseable;
-} flow_args_t;
-
 #define	PROTO_MAXSTR_LEN	7
 #define	PORT_MAXSTR_LEN		6
 #define	DSFIELD_MAXSTR_LEN	10
@@ -288,9 +277,40 @@
 	char			*fs_propname;
 	char			*fs_flowname;
 } flowprop_args_t;
+/*
+ * structures for 'flowadm show-flow -s' (print statistics)
+ */
+typedef enum {
+	FLOW_S_FLOW,
+	FLOW_S_IPKTS,
+	FLOW_S_RBYTES,
+	FLOW_S_IERRORS,
+	FLOW_S_OPKTS,
+	FLOW_S_OBYTES,
+	FLOW_S_OERRORS
+} flow_s_field_index_t;
+
+static print_field_t flow_s_fields[] = {
+/* name,	header,		field width,	index,		cmdtype	*/
+{ "flow",	"FLOW",			15,	FLOW_S_FLOW,	CMD_TYPE_ANY},
+{ "ipackets",	"IPACKETS",		10,	FLOW_S_IPKTS,	CMD_TYPE_ANY},
+{ "rbytes",	"RBYTES",		8,	FLOW_S_RBYTES,	CMD_TYPE_ANY},
+{ "ierrors",	"IERRORS",		10,	FLOW_S_IERRORS,	CMD_TYPE_ANY},
+{ "opackets",	"OPACKETS",		12,	FLOW_S_OPKTS,	CMD_TYPE_ANY},
+{ "obytes",	"OBYTES",		12,	FLOW_S_OBYTES,	CMD_TYPE_ANY},
+{ "oerrors",	"OERRORS",		8,	FLOW_S_OERRORS,	CMD_TYPE_ANY}}
+;
+#define	FLOW_S_MAX_FIELDS \
+	(sizeof (flow_s_fields) / sizeof (print_field_t))
+
+typedef struct flow_args_s {
+	char		*flow_s_flow;
+	pktsum_t	*flow_s_psum;
+} flow_args_t;
+static char *print_flow_stats(print_field_t *, void *);
 
 /*
- * structures for 'flow show-usage'
+ * structures for 'flowadm show-usage'
  */
 
 typedef struct  usage_fields_buf_s {
@@ -392,7 +412,7 @@
 	    "    reset-flowprop [-t] [-p <prop>,...] <flow>\n"
 	    "    show-flowprop  [-cP] [-l <link>] [-p <prop>,...] "
 	    "[<flow>]\n\n"
-	    "    show-usage     [-d|-p -F <format>] "
+	    "    show-usage     [-a] [-d | -F <format>] "
 	    "[-s <DD/MM/YYYY,HH:MM:SS>]\n"
 	    "\t\t   [-e <DD/MM/YYYY,HH:MM:SS>] -f <logfile> [<flow>]\n"));
 
@@ -476,9 +496,20 @@
 static int
 show_usage_date(dladm_usage_t *usage, void *arg)
 {
+	show_usage_state_t	*state = (show_usage_state_t *)arg;
+	time_t			stime;
+	char			timebuf[20];
+	dladm_flow_attr_t	attr;
+	dladm_status_t		status;
 
-	time_t	stime;
-	char	timebuf[20];
+	/*
+	 * Only show usage information for existing flows unless '-a'
+	 * is specified.
+	 */
+	if (!state->us_showall && ((status = dladm_flow_info(handle,
+	    usage->du_name, &attr)) != DLADM_STATUS_OK)) {
+		return (status);
+	}
 
 	stime = usage->du_stime;
 	(void) strftime(timebuf, sizeof (timebuf), "%m/%d/%Y",
@@ -496,6 +527,17 @@
 	usage_l_fields_buf_t 	ubuf;
 	time_t			time;
 	double			bw;
+	dladm_flow_attr_t	attr;
+	dladm_status_t		status;
+
+	/*
+	 * Only show usage information for existing flows unless '-a'
+	 * is specified.
+	 */
+	if (!state->us_showall && ((status = dladm_flow_info(handle,
+	    usage->du_name, &attr)) != DLADM_STATUS_OK)) {
+		return (status);
+	}
 
 	if (state->us_plot) {
 		if (!state->us_printheader) {
@@ -563,6 +605,17 @@
 	show_usage_state_t	*state = (show_usage_state_t *)arg;
 	char			buf[DLADM_STRSIZE];
 	usage_fields_buf_t	ubuf;
+	dladm_flow_attr_t	attr;
+	dladm_status_t		status;
+
+	/*
+	 * Only show usage information for existing flows unless '-a'
+	 * is specified.
+	 */
+	if (!state->us_showall && ((status = dladm_flow_info(handle,
+	    usage->du_name, &attr)) != DLADM_STATUS_OK)) {
+		return (status);
+	}
 
 	bzero(&ubuf, sizeof (ubuf));
 
@@ -608,7 +661,6 @@
 	int			opt;
 	dladm_status_t		status;
 	boolean_t		d_arg = B_FALSE;
-	boolean_t		p_arg = B_FALSE;
 	char			*stime = NULL;
 	char			*etime = NULL;
 	char			*resource = NULL;
@@ -630,13 +682,13 @@
 	state.us_plot = B_FALSE;
 	state.us_first = B_TRUE;
 
-	while ((opt = getopt(argc, argv, "dps:e:o:f:F:")) != -1) {
+	while ((opt = getopt(argc, argv, "das:e:o:f:F:")) != -1) {
 		switch (opt) {
 		case 'd':
 			d_arg = B_TRUE;
 			break;
-		case 'p':
-			state.us_plot = p_arg = B_TRUE;
+		case 'a':
+			state.us_showall = B_TRUE;
 			break;
 		case 'f':
 			file = optarg;
@@ -652,7 +704,7 @@
 			fields_str = optarg;
 			break;
 		case 'F':
-			F_arg = B_TRUE;
+			state.us_plot = F_arg = B_TRUE;
 			formatspec_str = optarg;
 			break;
 		default:
@@ -664,6 +716,13 @@
 		die("show-usage requires a file");
 
 	if (optind == (argc-1)) {
+		dladm_flow_attr_t	attr;
+
+		/* Set the flow name first; the lookup below needs it. */
 		resource = argv[optind];
+		if (!state.us_showall && dladm_flow_info(handle, resource,
+		    &attr) != DLADM_STATUS_OK) {
+			die("invalid flow: '%s'", resource);
+		}
 	}
 
@@ -686,11 +745,8 @@
 	state.us_print.ps_fields = fields;
 	state.us_print.ps_nfields = nfields;
 
-	if (p_arg && d_arg)
-		die("plot and date options are incompatible");
-
-	if (p_arg && !F_arg)
-		die("specify format speicifier: -F <format>");
+	if (F_arg && d_arg)
+		die("incompatible -d and -F options");
 
 	if (F_arg && valid_formatspec(formatspec_str) == B_FALSE)
 		die("Format specifier %s not supported", formatspec_str);
@@ -700,7 +756,7 @@
 		status = dladm_usage_dates(show_usage_date,
 		    DLADM_LOGTYPE_FLOW, file, resource, &state);
 	} else if (resource == NULL && stime == NULL && etime == NULL &&
-	    !p_arg) {
+	    !F_arg) {
 		/* Print summary */
 		status = dladm_usage_summary(show_usage_res,
 		    DLADM_LOGTYPE_FLOW, file, &state);
@@ -997,13 +1053,56 @@
 	(void) kstat_close(kcp);
 }
 
+
+/*
+ * Output callback for 'flowadm show-flow -s': format the column selected
+ * by pf->pf_index from the flow_args_t passed in arg.  The text is
+ * written into a static buffer, so the returned pointer is only valid
+ * until the next call.
+ */
+static char *
+print_flow_stats(print_field_t *pf, void *arg)
+{
+	static char	out[DLADM_STRSIZE];
+	flow_args_t	*fa = arg;
+	pktsum_t	*ps = fa->flow_s_psum;
+
+	switch (pf->pf_index) {
+	case FLOW_S_FLOW:
+		(void) snprintf(out, sizeof (out), "%s", fa->flow_s_flow);
+		break;
+	case FLOW_S_IPKTS:
+		(void) snprintf(out, sizeof (out), "%llu", ps->ipackets);
+		break;
+	case FLOW_S_RBYTES:
+		(void) snprintf(out, sizeof (out), "%llu", ps->rbytes);
+		break;
+	case FLOW_S_IERRORS:
+		(void) snprintf(out, sizeof (out), "%u", ps->ierrors);
+		break;
+	case FLOW_S_OPKTS:
+		(void) snprintf(out, sizeof (out), "%llu", ps->opackets);
+		break;
+	case FLOW_S_OBYTES:
+		(void) snprintf(out, sizeof (out), "%llu", ps->obytes);
+		break;
+	case FLOW_S_OERRORS:
+		(void) snprintf(out, sizeof (out), "%u", ps->oerrors);
+		break;
+	default:
+		die("invalid input");
+		break;
+	}
+	return (out);
+}
 /* ARGSUSED */
 static int
 show_flow_stats(dladm_flow_attr_t *attr, void *arg)
 {
-	show_flow_state_t *state = (show_flow_state_t *)arg;
-	const char *name = attr->fa_flowname;
-	pktsum_t stats, diff_stats;
+	show_flow_state_t	*state = (show_flow_state_t *)arg;
+	char			*name = attr->fa_flowname;
+	pktsum_t		stats, diff_stats;
+	flow_args_t		fargs;
 
 	if (state->fs_firstonly) {
 		if (state->fs_donefirst)
@@ -1016,13 +1115,10 @@
 	get_flow_stats(name, &stats);
 	dladm_stats_diff(&diff_stats, &stats, &state->fs_prevstats);
 
-	(void) printf("%-12s", name);
-	(void) printf("%-10llu", diff_stats.ipackets);
-	(void) printf("%-12llu", diff_stats.rbytes);
-	(void) printf("%-8llu", diff_stats.ierrors);
-	(void) printf("%-10llu", diff_stats.opackets);
-	(void) printf("%-12llu", diff_stats.obytes);
-	(void) printf("%-8llu\n", diff_stats.oerrors);
+	fargs.flow_s_flow = name;
+	fargs.flow_s_psum = &diff_stats;
+	flowadm_print_output(&state->fs_print, state->fs_parseable,
+	    print_flow_stats, &fargs);
 
 	state->fs_prevstats = stats;
 
@@ -1046,45 +1142,52 @@
 
 /* ARGSUSED */
 static void
-flow_stats(const char *flow, datalink_id_t linkid,  uint_t interval)
+flow_stats(const char *flow, datalink_id_t linkid,  uint_t interval,
+    char *fields_str, show_flow_state_t *state)
 {
-	show_flow_state_t	state;
 	dladm_flow_attr_t	attr;
+	print_field_t		**fields;
+	uint_t			nfields;
+
+	fields = parse_output_fields(fields_str, flow_s_fields,
+	    FLOW_S_MAX_FIELDS, CMD_TYPE_ANY, &nfields);
+	if (fields == NULL) {
+		die("invalid field(s) specified");
+		return;
+	}
+
+	state->fs_print.ps_fields = fields;
+	state->fs_print.ps_nfields = nfields;
 
 	if (flow != NULL &&
 	    dladm_flow_info(handle, flow, &attr) != DLADM_STATUS_OK)
 		die("invalid flow %s", flow);
 
-	bzero(&state, sizeof (state));
-
 	/*
 	 * If an interval is specified, continuously show the stats
 	 * for only the first flow.
 	 */
-	state.fs_firstonly = (interval != 0);
+	state->fs_firstonly = (interval != 0);
 
+	if (!state->fs_parseable)
+		print_header(&state->fs_print);
 	for (;;) {
-		if (!state.fs_donefirst)
-			(void) printf("%-12s%-10s%-12s%-8s%-10s%-12s%-8s\n",
-			    "FLOW", "IPACKETS", "RBYTES", "IERRORS",
-			    "OPACKETS", "OBYTES", "OERRORS");
-
-		state.fs_donefirst = B_FALSE;
+		state->fs_donefirst = B_FALSE;
 
 		/* Show stats for named flow */
 		if (flow != NULL)  {
-			state.fs_flow = flow;
-			(void) show_flow_stats(&attr, &state);
+			state->fs_flow = flow;
+			(void) show_flow_stats(&attr, state);
 
 		/* Show all stats on a link */
 		} else if (linkid != DATALINK_INVALID_LINKID) {
 			(void) dladm_walk_flow(show_flow_stats, handle, linkid,
-			    &state, B_FALSE);
+			    state, B_FALSE);
 
 		/* Show all stats by datalink */
 		} else {
 			(void) dladm_walk_datalink_id(show_link_flow_stats,
-			    handle, &state, DATALINK_CLASS_ALL,
+			    handle, state, DATALINK_CLASS_ALL,
 			    DATALINK_ANY_MEDIATYPE, DLADM_OPT_ACTIVE);
 		}
 
@@ -1115,6 +1218,8 @@
 	uint_t			nfields;
 	char			*all_fields =
 	    "flow,link,ipaddr,proto,port,dsfld";
+	char			*allstat_fields =
+	    "flow,ipackets,rbytes,ierrors,opackets,obytes,oerrors";
 
 	bzero(&state, sizeof (state));
 
@@ -1173,11 +1278,6 @@
 			break;
 		}
 	}
-	if (state.fs_parseable && !o_arg)
-		die("-p requires -o");
-
-	if (state.fs_parseable && strcasecmp(fields_str, "all") == 0)
-		die("\"-o all\" is invalid with -p");
 
 	if (i_arg && !(s_arg || S_arg))
 		die("the -i option can be used only with -s or -S");
@@ -1193,19 +1293,23 @@
 		state.fs_flow = flowname;
 	}
 
-	if (s_arg) {
-		flow_stats(state.fs_flow, linkid, interval);
-		return;
-	}
-
 	if (S_arg) {
 		dladm_continuous(handle, linkid, state.fs_flow, interval,
 		    FLOW_REPORT);
 		return;
 	}
 
-	if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0))
-		fields_str = all_fields;
+	if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0)) {
+		if (s_arg)
+			fields_str = allstat_fields;
+		else
+			fields_str = all_fields;
+	}
+
+	if (s_arg) {
+		flow_stats(state.fs_flow, linkid, interval, fields_str, &state);
+		return;
+	}
 
 	fields = parse_output_fields(fields_str, flow_fields, FLOW_MAX_FIELDS,
 	    CMD_TYPE_ANY, &nfields);
--- a/usr/src/lib/libdladm/common/usage.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/lib/libdladm/common/usage.c	Tue Feb 17 01:31:30 2009 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -580,12 +580,8 @@
 	int	count;
 
 	for (count = 0; count < nentries; count++) {
-		if ((strlen(nns->net_stat_name) ==
-		    strlen(pe[count].net_pe_name)) &&
-		    (strncmp(pe[count].net_pe_name, nns->net_stat_name,
-		    strlen(nns->net_stat_name)) == 0)) {
+		if (strcmp(pe[count].net_pe_name, nns->net_stat_name) == 0)
 			break;
-		}
 	}
 	if (count == nentries)
 		return;
@@ -638,10 +634,8 @@
 
 	for (count = 0; count < net_table->net_entries; count++) {
 		nd = ne->net_entry_desc;
-		if ((strlen(name) == strlen(nd->net_desc_name)) &&
-		    (strncmp(name, nd->net_desc_name, strlen(name)) == 0)) {
+		if (strcmp(name, nd->net_desc_name) == 0)
 			return (ne);
-		}
 		ne = ne->net_entry_next;
 	}
 	return (NULL);
@@ -657,13 +651,8 @@
 
 	for (count = 0; count < net_table->net_entries; count++) {
 		nd1 = ne->net_entry_desc;
-		if (strlen(nd1->net_desc_name) == strlen(nd->net_desc_name) &&
-		    strlen(nd1->net_desc_devname) ==
-		    strlen(nd->net_desc_devname) &&
-		    strncmp(nd1->net_desc_name, nd->net_desc_name,
-		    strlen(nd1->net_desc_name)) == 0 &&
-		    strncmp(nd1->net_desc_devname, nd->net_desc_devname,
-		    strlen(nd1->net_desc_devname)) == 0 &&
+		if (strcmp(nd1->net_desc_name, nd->net_desc_name) == 0 &&
+		    strcmp(nd1->net_desc_devname, nd->net_desc_devname) == 0 &&
 		    bcmp(nd1->net_desc_ehost, nd->net_desc_ehost,
 		    ETHERADDRL) == 0 &&
 		    bcmp(nd1->net_desc_edest, nd->net_desc_edest,
@@ -841,10 +830,8 @@
 			    NET_DATE_GREATER) {
 				break;
 			}
-			if ((strlen(ns1->net_stat_name) ==
-			    strlen(ns->net_stat_name)) &&
-			    (strncmp(ns1->net_stat_name, ns->net_stat_name,
-			    strlen(ns1->net_stat_name)) == 0)) {
+			if (strcmp(ns1->net_stat_name, ns->net_stat_name) ==
+			    0) {
 				ntc->net_time_entry_next =
 				    end->net_time_entry_next;
 				if (end->net_time_entry_next != NULL) {
@@ -1084,9 +1071,7 @@
 		nns = start->my_time_stat;
 
 		/* Get to the resource we are interested in */
-		if ((strlen(resource) != strlen(nns->net_stat_name)) ||
-		    (strncmp(resource, nns->net_stat_name,
-		    strlen(nns->net_stat_name)) != 0)) {
+		if (strcmp(resource, nns->net_stat_name) != 0) {
 			start = start->net_time_entry_next;
 			continue;
 		}
@@ -1400,9 +1385,7 @@
 
 		/* get to the resource we are interested in */
 		if (resource != NULL) {
-			if ((strlen(resource) != strlen(nns->net_stat_name)) ||
-			    (strncmp(resource, nns->net_stat_name,
-			    strlen(nns->net_stat_name)) != 0)) {
+			if (strcmp(resource, nns->net_stat_name) != 0) {
 				start = start->net_time_entry_next;
 				continue;
 			}
@@ -1422,6 +1405,8 @@
 		    compare_date(&nns->net_stat_time, lasttime) ==
 		    NET_DATE_GREATER) {
 			bzero(&usage, sizeof (dladm_usage_t));
+			(void) strlcpy(usage.du_name, nns->net_stat_name,
+			    sizeof (usage.du_name));
 			bcopy(&nns->net_stat_ctime, &usage.du_stime,
 			    sizeof (usage.du_stime));
 			fn(&usage, arg);
--- a/usr/src/pkgdefs/SUNWcnetr/postinstall	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/pkgdefs/SUNWcnetr/postinstall	Tue Feb 17 01:31:30 2009 -0800
@@ -18,7 +18,7 @@
 #
 # CDDL HEADER END
 #
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 
@@ -109,9 +109,31 @@
 	removef -f $PKGINST > /dev/null 2>&1
 fi
 
-# Convert hostname.xxx and zonecfg vlan entries
-host_ifs=`ls -1 $rootprefix/etc | egrep -e '^hostname.|^hostname6.|^dhcp.'| \
-    cut -d . -f2 | sort -u` 
+# Convert (hostname|hostname6|dhcp).xxx and zonecfg vlan entries
+
+for iftype in hostname hostname6 dhcp
+do
+	interface_names="`echo $rootprefix/etc/$iftype.*[0-9] 2>/dev/null`"
+	if [ "$interface_names" != "$rootprefix/etc/$iftype.*[0-9]" ]; then
+		ORIGIFS="$IFS"
+		IFS="$IFS."
+		set -- $interface_names
+		IFS="$ORIGIFS"
+		while [ $# -ge 2 ]; do
+			shift
+			if [ $# -gt 1 -a \
+			    "$2" != "$rootprefix/etc/$iftype" ]; then
+				while [ $# -gt 1 -a \
+				    "$1" != "$rootprefix/etc/$iftype" ]; do
+					shift
+				done
+			else
+				host_ifs="$host_ifs $1"
+				shift
+			fi
+		done
+	fi
+done
 
 zones=`zoneadm list -c | grep -v global`
 for zone in $zones
--- a/usr/src/tools/scripts/bfu.sh	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/tools/scripts/bfu.sh	Tue Feb 17 01:31:30 2009 -0800
@@ -4708,8 +4708,31 @@
 	if [[ ! -f $root/sbin/flowadm ]] && \
 	    archive_file_exists generic.sbin "sbin/flowadm"; then
 		flowadm_status="new"
-		host_ifs=`ls -1 $rootprefix/etc | egrep -e \
-	  	  '^hostname.|^hostname6.|^dhcp.'|  cut -d . -f2 | sort -u` 
+		# Collect xxx from each /etc/(hostname|hostname6|dhcp).xxx in host_ifs
+		for iftype in hostname hostname6 dhcp
+		do
+			interface_names="`echo /etc/$iftype.*[0-9] 2>/dev/null`"
+			if [ "$interface_names" != "/etc/$iftype.*[0-9]" ]; then
+				ORIGIFS="$IFS"
+				IFS="$IFS."
+				set -- $interface_names
+				IFS="$ORIGIFS"
+				while [ $# -ge 2 ]; do
+					shift
+					if [ $# -gt 1 -a \
+					    "$2" != "/etc/$iftype" ]; then
+						while [ $# -gt 1 -a \
+						    "$1" != "/etc/$iftype" ]; do
+							shift
+						done
+					else
+						host_ifs="$host_ifs $1"
+						shift
+					fi
+				done
+			fi
+		done
+
 		zones=`zoneadm list -c | grep -v global`
 		for zone in $zones
 		do
--- a/usr/src/uts/common/inet/ip.h	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/inet/ip.h	Tue Feb 17 01:31:30 2009 -0800
@@ -1128,7 +1128,7 @@
 extern const iulp_t ire_uinfo_null;
 
 /*
- * The conn drain list structure.
+ * The conn drain list structure (idl_t).
  * The list is protected by idl_lock. Each conn_t inserted in the list
  * points back at this idl_t using conn_idl. IP primes the draining of the
  * conns queued in these lists, by qenabling the 1st conn of each list. This
@@ -1137,8 +1137,27 @@
  * idl_lock protects all other members of idl_t and conn_drain_next
  * and conn_drain_prev of conn_t. The conn_lock protects IPCF_DRAIN_DISABLED
  * flag of the conn_t and conn_idl.
+ *
+ * The conn drain list, idl_t, itself is part of tx cookie list structure.
+ * A tx cookie list points to a blocked Tx ring and contains the list of
+ * all conn's that are blocked due to the flow-controlled Tx ring (via
+ * the idl drain list). Note that a link can have multiple Tx rings. The
+ * drain list will store the conn's blocked due to Tx ring being flow
+ * controlled.
  */
-typedef struct idl_s {
+
+typedef uintptr_t ip_mac_tx_cookie_t;
+typedef	struct idl_s idl_t;
+typedef	struct idl_tx_list_s idl_tx_list_t;
+
+struct idl_tx_list_s {
+	ip_mac_tx_cookie_t	txl_cookie;	/* cookie of blocked Tx ring */
+	kmutex_t		txl_lock;	/* Lock for this list */
+	idl_t			*txl_drain_list; /* array of drain lists */
+	int			txl_drain_index; /* next drain list to use */
+};
+
+struct idl_s {
 	conn_t		*idl_conn;		/* Head of drain list */
 	kmutex_t	idl_lock;		/* Lock for this list */
 	conn_t		*idl_conn_draining;	/* conn that is draining */
@@ -1146,7 +1165,8 @@
 		idl_repeat : 1,			/* Last conn must re-enable */
 						/* drain list again */
 		idl_unused : 31;
-} idl_t;
+	idl_tx_list_t	*idl_itl;
+};
 
 #define	CONN_DRAIN_LIST_LOCK(connp)	(&((connp)->conn_idl->idl_lock))
 /*
@@ -3336,8 +3356,8 @@
 extern boolean_t ipmp_ipif_is_dataaddr(const ipif_t *);
 extern boolean_t ipmp_ipif_is_stubaddr(const ipif_t *);
 
-extern void	conn_drain_insert(conn_t *connp);
-extern int	conn_ipsec_length(conn_t *connp);
+extern void	conn_drain_insert(conn_t *, idl_tx_list_t *);
+extern int	conn_ipsec_length(conn_t *);
 extern void	ip_wput_ipsec_out(queue_t *, mblk_t *, ipha_t *, ill_t *,
     ire_t *);
 extern ipaddr_t	ip_get_dst(ipha_t *);
@@ -3587,13 +3607,16 @@
  * we need to duplicate the definitions here because we cannot
  * include mac/dls header files here.
  */
-typedef void	*ip_mac_tx_cookie_t;
-typedef void	(*ip_mac_intr_disable_t)(void *);
-typedef void	(*ip_mac_intr_enable_t)(void *);
-typedef void	*(*ip_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t);
-typedef	void	(*ip_flow_enable_t)(void *, ip_mac_tx_cookie_t);
-typedef void	*(*ip_dld_callb_t)(void *, ip_flow_enable_t, void *);
-typedef int	(*ip_capab_func_t)(void *, uint_t, void *, uint_t);
+typedef void			(*ip_mac_intr_disable_t)(void *);
+typedef void			(*ip_mac_intr_enable_t)(void *);
+typedef ip_mac_tx_cookie_t	(*ip_dld_tx_t)(void *, mblk_t *,
+    uint64_t, uint16_t);
+typedef	void			(*ip_flow_enable_t)(void *, ip_mac_tx_cookie_t);
+typedef void			*(*ip_dld_callb_t)(void *,
+    ip_flow_enable_t, void *);
+typedef boolean_t		(*ip_dld_fctl_t)(void *, ip_mac_tx_cookie_t);
+typedef int			(*ip_capab_func_t)(void *, uint_t,
+    void *, uint_t);
 
 /*
  * POLLING README
@@ -3640,6 +3663,8 @@
 	void			*idd_tx_dh;	/* dld_str_t *dsp */
 	ip_dld_callb_t		idd_tx_cb_df;	/* mac_tx_srs_notify */
 	void			*idd_tx_cb_dh;	/* mac_client_handle_t *mch */
+	ip_dld_fctl_t		idd_tx_fctl_df;	/* mac_tx_is_flow_blocked */
+	void			*idd_tx_fctl_dh;	/* mac_client_handle */
 } ill_dld_direct_t;
 
 /* IP - DLD polling capability */
--- a/usr/src/uts/common/inet/ip/ip.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/inet/ip/ip.c	Tue Feb 17 01:31:30 2009 -0800
@@ -451,29 +451,115 @@
  * policy change may affect them.
  *
  * IP Flow control notes:
- *
- * Non-TCP streams are flow controlled by IP. On the send side, if the packet
- * cannot be sent down to the driver by IP, because of a canput failure, IP
- * does a putq on the conn_wq. This will cause ip_wsrv to run on the conn_wq.
- * ip_wsrv in turn, inserts the conn in a list of conn's that need to be drained
- * when the flowcontrol condition subsides. Ultimately STREAMS backenables the
- * ip_wsrv on the IP module, which in turn does a qenable of the conn_wq of the
- * first conn in the list of conn's to be drained. ip_wsrv on this conn drains
- * the queued messages, and removes the conn from the drain list, if all
- * messages were drained. It also qenables the next conn in the drain list to
- * continue the drain process.
+ * ---------------------
+ * Non-TCP streams are flow controlled by IP. The way this is accomplished
+ * differs when ILL_CAPAB_DLD_DIRECT is enabled for that IP instance. When
+ * ILL_DIRECT_CAPABLE(ill) is TRUE, IP can do direct function calls into
+ * GLDv3. Otherwise packets are sent down to lower layers using STREAMS
+ * functions.
+ *
+ * Per Tx ring udp flow control:
+ * This is applicable only when ILL_CAPAB_DLD_DIRECT capability is set in
+ * the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true).
+ *
+ * The underlying link can expose multiple Tx rings to the GLDv3 mac layer.
+ * To achieve best performance, outgoing traffic needs to be fanned out among
+ * these Tx rings. mac_tx() is called (via str_mdata_fastpath_put()) to send
+ * traffic out of the NIC and it takes a fanout hint. UDP connections pass
+ * the address of connp as fanout hint to mac_tx(). Under flow controlled
+ * condition, mac_tx() returns a non-NULL cookie (ip_mac_tx_cookie_t). This
+ * cookie points to a specific Tx ring that is blocked. The cookie is used to
+ * hash into an entry of the ips_idl_tx_list[] array. Each idl_tx_list_t
+ * points to drain lists (idl_t's). These drain lists store the blocked UDP
+ * connp's. The drain list is not a single list but a configurable number of
+ * lists.
+ *
+ * The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t
+ * has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE
+ * which is equal to 128. This array in turn contains a pointer to idl_t[],
+ * the ip drain list. The idl_t[] array size is MIN(max_ncpus, 8). The drain
+ * list will point to the list of connp's that are flow controlled.
+ *
+ *                      ---------------   -------   -------   -------
+ *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
+ *                   |  ---------------   -------   -------   -------
+ *                   |  ---------------   -------   -------   -------
+ *                   |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
+ * ----------------  |  ---------------   -------   -------   -------
+ * |idl_tx_list[0]|->|  ---------------   -------   -------   -------
+ * ----------------  |->|drain_list[2]|-->|connp|-->|connp|-->|connp|-->
+ *                   |  ---------------   -------   -------   -------
+ *                   .        .              .         .         .
+ *                   |  ---------------   -------   -------   -------
+ *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
+ *                      ---------------   -------   -------   -------
+ *                      ---------------   -------   -------   -------
+ *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
+ *                   |  ---------------   -------   -------   -------
+ *                   |  ---------------   -------   -------   -------
+ * ----------------  |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
+ * |idl_tx_list[1]|->|  ---------------   -------   -------   -------
+ * ----------------  |        .              .         .         .
+ *                   |  ---------------   -------   -------   -------
+ *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
+ *                      ---------------   -------   -------   -------
+ *     .....
+ * ----------------
+ * |idl_tx_list[n]|-> ...
+ * ----------------
+ *
+ * When mac_tx() returns a cookie, the cookie is used to hash into an
+ * idl_tx_list in the ips_idl_tx_list[] array. Then conn_drain_insert() is
+ * called passing idl_tx_list. The connp gets inserted in a drain list
+ * pointed to by idl_tx_list. conn_drain_insert() asserts flow control for
+ * non-STREAMS sockets; for other conns it sets QFULL on conn_wq.
+ * connp->conn_direct_blocked will be set to indicate the blocked
+ * condition.
+ *
+ * GLDv3 mac layer calls ill_flow_enable() when flow control is relieved.
+ * A cookie is passed in the call to ill_flow_enable() that identifies the
+ * blocked Tx ring. This cookie is used to get to the idl_tx_list that
+ * contains the blocked connp's. conn_walk_drain() uses the idl_tx_list_t
+ * and goes through each of the drain lists, (q)enabling the conn_wq of the
+ * first conn in each drain list. This causes ip_wsrv to run for the
+ * conn. ip_wsrv drains the queued messages, and removes the conn from the
+ * drain list, if all messages were drained. It also qenables the next conn
+ * in the drain list to continue the drain process.
  *
  * In reality the drain list is not a single list, but a configurable number
- * of lists. The ip_wsrv on the IP module, qenables the first conn in each
- * list. If the ip_wsrv of the next qenabled conn does not run, because the
+ * of lists. conn_walk_drain() in the IP module, qenables the first conn in
+ * each list. If the ip_wsrv of the next qenabled conn does not run, because
+ * the stream closes, ip_close takes responsibility to qenable the next conn
+ * in the drain list. conn_drain_insert and conn_drain_tail are the only
+ * functions that manipulate this drain list. conn_drain_insert is called in
+ * ip_wput context itself (as opposed to from ip_wsrv context for STREAMS
+ * case -- see below). The synchronization between drain insertion and flow
+ * control wakeup is handled by using idl_txl->txl_lock.
+ *
+ * Flow control using STREAMS:
+ * When ILL_DIRECT_CAPABLE() is not TRUE, STREAMS flow control mechanism
+ * is used. On the send side, if the packet cannot be sent down to the
+ * driver by IP, because of a canput failure, IP does a putq on the conn_wq.
+ * This will cause ip_wsrv to run on the conn_wq. ip_wsrv in turn, inserts
+ * the conn in a list of conn's that need to be drained when the flow
+ * control condition subsides. The blocked connps are put in first member
+ * of ips_idl_tx_list[] array. Ultimately STREAMS backenables the ip_wsrv
+ * on the IP module. It calls conn_walk_drain() passing ips_idl_tx_list[0].
+ * ips_idl_tx_list[0] contains the drain lists of blocked conns. The
+ * conn_wq of the first conn in the drain lists is (q)enabled to run.
+ * ip_wsrv on this conn drains the queued messages, and removes the conn
+ * from the drain list, if all messages were drained. It also qenables the
+ * next conn in the drain list to continue the drain process.
+ *
+ * If the ip_wsrv of the next qenabled conn does not run, because the
  * stream closes, ip_close takes responsibility to qenable the next conn in
  * the drain list. The directly called ip_wput path always does a putq, if
  * it cannot putnext. Thus synchronization problems are handled between
  * ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only
  * functions that manipulate this drain list. Furthermore conn_drain_insert
- * is called only from ip_wsrv, and there can be only 1 instance of ip_wsrv
- * running on a queue at any time. conn_drain_tail can be simultaneously called
- * from both ip_wsrv and ip_close.
+ * is called only from ip_wsrv for the STREAMS case, and there can be only 1
+ * instance of ip_wsrv running on a queue at any time. conn_drain_tail can
+ * be simultaneously called from both ip_wsrv and ip_close.
  *
  * IPQOS notes:
  *
@@ -732,9 +818,11 @@
 static void	conn_drain_fini(ip_stack_t *);
 static void	conn_drain_tail(conn_t *connp, boolean_t closing);
 
-static void	conn_walk_drain(ip_stack_t *);
+static void	conn_walk_drain(ip_stack_t *, idl_tx_list_t *);
 static void	conn_walk_fanout_table(connf_t *, uint_t, pfv_t, void *,
     zoneid_t);
+static void	conn_setqfull(conn_t *);
+static void	conn_clrqfull(conn_t *);
 
 static void	*ip_stack_init(netstackid_t stackid, netstack_t *ns);
 static void	ip_stack_shutdown(netstackid_t stackid, void *arg);
@@ -5372,6 +5460,7 @@
 	ipif_t	*ipif;
 	queue_t	*q = ill->ill_rq;
 	ip_stack_t	*ipst = ill->ill_ipst;
+	int	i;
 
 	/*
 	 * The punlink prior to this may have initiated a capability
@@ -5463,7 +5552,9 @@
 	 * get unblocked.
 	 */
 	ip1dbg(("ip_wsrv: walking\n"));
-	conn_walk_drain(ipst);
+	for (i = 0; i < TX_FANOUT_SIZE; i++) {
+		conn_walk_drain(ipst, &ipst->ips_idl_tx_list[i]);
+	}
 
 	mutex_enter(&ipst->ips_ip_mi_lock);
 	mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill);
@@ -13908,8 +13999,7 @@
 			ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone,
 			    ALL_ZONES, ill, IPV4_VERSION, hlen, ipst);
 		}
-
-		ILL_SEND_TX(stq_ill, ire, dst, mp, IP_DROP_ON_NO_DESC);
+		ILL_SEND_TX(stq_ill, ire, dst, mp, IP_DROP_ON_NO_DESC, NULL);
 	}
 	return (ire);
 
@@ -22341,8 +22431,13 @@
 	if (!IP_FLOW_CONTROLLED_ULP(PROTO)) {
 		queue_t *dev_q = stq->q_next;
 
-		/* flow controlled */
-		if (DEV_Q_FLOW_BLOCKED(dev_q))
+		/*
+		 * For DIRECT_CAPABLE, we do flow control at
+		 * the time of sending the packet. See
+		 * ILL_SEND_TX().
+		 */
+		if (!ILL_DIRECT_CAPABLE((ill_t *)stq->q_ptr) &&
+		    (DEV_Q_FLOW_BLOCKED(dev_q)))
 			goto blocked;
 
 		if ((PROTO == IPPROTO_UDP) &&
@@ -22765,7 +22860,8 @@
 		} else {
 			queue_t	*dev_q = stq->q_next;
 
-			if (DEV_Q_FLOW_BLOCKED(dev_q)) {
+			if (!ILL_DIRECT_CAPABLE((ill_t *)stq->q_ptr) &&
+			    (DEV_Q_FLOW_BLOCKED(dev_q))) {
 blocked:
 				ipha->ipha_ident = ip_hdr_included;
 				/*
@@ -22780,10 +22876,15 @@
 				    connp != NULL &&
 				    caller != IRE_SEND) {
 					if (caller == IP_WSRV) {
+						idl_tx_list_t *idl_txl;
+
+						idl_txl =
+						    &ipst->ips_idl_tx_list[0];
 						connp->conn_did_putbq = 1;
 						(void) putbq(connp->conn_wq,
 						    first_mp);
-						conn_drain_insert(connp);
+						conn_drain_insert(connp,
+						    idl_txl);
 						/*
 						 * This is the service thread,
 						 * and the queue is already
@@ -24401,7 +24502,7 @@
 			    void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill,
 			    ipha_t *, ipha, ip6_t *, NULL, int, 0);
 
-			ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0);
+			ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0, connp);
 
 			BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits);
 			UPDATE_MIB(out_ill->ill_ip_mib,
@@ -24708,7 +24809,8 @@
 				    __dtrace_ipsr_ill_t *, out_ill, ipha_t *,
 				    ipha, ip6_t *, NULL, int, 0);
 
-				ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0);
+				ILL_SEND_TX(out_ill, ire, connp,
+				    xmit_mp, 0, connp);
 
 				BUMP_MIB(out_ill->ill_ip_mib,
 				    ipIfStatsHCOutTransmits);
@@ -27921,7 +28023,8 @@
 static void
 conn_drain_init(ip_stack_t *ipst)
 {
-	int i;
+	int i, j;
+	idl_tx_list_t *itl_tx;
 
 	ipst->ips_conn_drain_list_cnt = conn_drain_nthreads;
 
@@ -27937,12 +28040,19 @@
 			ipst->ips_conn_drain_list_cnt = MIN(max_ncpus, 8);
 	}
 
-	ipst->ips_conn_drain_list = kmem_zalloc(ipst->ips_conn_drain_list_cnt *
-	    sizeof (idl_t), KM_SLEEP);
-
-	for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) {
-		mutex_init(&ipst->ips_conn_drain_list[i].idl_lock, NULL,
-		    MUTEX_DEFAULT, NULL);
+	ipst->ips_idl_tx_list =
+	    kmem_zalloc(TX_FANOUT_SIZE * sizeof (idl_tx_list_t), KM_SLEEP);
+	for (i = 0; i < TX_FANOUT_SIZE; i++) {
+		itl_tx =  &ipst->ips_idl_tx_list[i];
+		itl_tx->txl_drain_list =
+		    kmem_zalloc(ipst->ips_conn_drain_list_cnt *
+		    sizeof (idl_t), KM_SLEEP);
+		mutex_init(&itl_tx->txl_lock, NULL, MUTEX_DEFAULT, NULL);
+		for (j = 0; j < ipst->ips_conn_drain_list_cnt; j++) {
+			mutex_init(&itl_tx->txl_drain_list[j].idl_lock, NULL,
+			    MUTEX_DEFAULT, NULL);
+			itl_tx->txl_drain_list[j].idl_itl = itl_tx;
+		}
 	}
 }
 
@@ -27950,12 +28060,21 @@
 conn_drain_fini(ip_stack_t *ipst)
 {
 	int i;
-
-	for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++)
-		mutex_destroy(&ipst->ips_conn_drain_list[i].idl_lock);
-	kmem_free(ipst->ips_conn_drain_list,
-	    ipst->ips_conn_drain_list_cnt * sizeof (idl_t));
-	ipst->ips_conn_drain_list = NULL;
+	int j;
+	idl_tx_list_t *itl_tx;
+
+	for (i = 0; i < TX_FANOUT_SIZE; i++) {
+		itl_tx = &ipst->ips_idl_tx_list[i];
+		/* Undo every mutex_init() done by conn_drain_init(). */
+		for (j = 0; j < ipst->ips_conn_drain_list_cnt; j++)
+			mutex_destroy(&itl_tx->txl_drain_list[j].idl_lock);
+		mutex_destroy(&itl_tx->txl_lock);
+		kmem_free(itl_tx->txl_drain_list,
+		    ipst->ips_conn_drain_list_cnt * sizeof (idl_t));
+	}
+	kmem_free(ipst->ips_idl_tx_list,
+	    TX_FANOUT_SIZE * sizeof (idl_tx_list_t));
+	ipst->ips_idl_tx_list = NULL;
 }
 
 /*
@@ -27968,16 +28082,11 @@
  * the first conn in each of these drain lists. Each of these qenabled conns
  * in turn enables the next in the list, after it runs, or when it closes,
  * thus sustaining the drain process.
- *
- * The only possible calling sequence is ip_wsrv (on conn) -> ip_wput ->
- * conn_drain_insert. Thus there can be only 1 instance of conn_drain_insert
- * running at any time, on a given conn, since there can be only 1 service proc
- * running on a queue at any time.
- */
-void
-conn_drain_insert(conn_t *connp)
-{
-	idl_t	*idl;
+ */
+void
+conn_drain_insert(conn_t *connp, idl_tx_list_t *tx_list)
+{
+	idl_t	*idl = tx_list->txl_drain_list;
 	uint_t	index;
 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
 
@@ -27996,13 +28105,13 @@
 		 * Atomicity of load/stores is enough to make sure that
 		 * conn_drain_list_index is always within bounds.
 		 */
-		index = ipst->ips_conn_drain_list_index;
+		index = tx_list->txl_drain_index;
 		ASSERT(index < ipst->ips_conn_drain_list_cnt);
-		connp->conn_idl = &ipst->ips_conn_drain_list[index];
+		connp->conn_idl = &tx_list->txl_drain_list[index];
 		index++;
 		if (index == ipst->ips_conn_drain_list_cnt)
 			index = 0;
-		ipst->ips_conn_drain_list_index = index;
+		tx_list->txl_drain_index = index;
 	}
 	mutex_exit(&connp->conn_lock);
 
@@ -28044,8 +28153,12 @@
 	 * For non streams based sockets assert flow control.
 	 */
 	if (IPCL_IS_NONSTR(connp)) {
+		DTRACE_PROBE1(su__txq__full, conn_t *, connp);
 		(*connp->conn_upcalls->su_txq_full)
 		    (connp->conn_upper_handle, B_TRUE);
+	} else {
+		conn_setqfull(connp);
+		noenable(connp->conn_wq);
 	}
 	mutex_exit(CONN_DRAIN_LIST_LOCK(connp));
 }
@@ -28167,6 +28280,9 @@
 		if (IPCL_IS_NONSTR(connp)) {
 			(*connp->conn_upcalls->su_txq_full)
 			    (connp->conn_upper_handle, B_FALSE);
+		} else {
+			conn_clrqfull(connp);
+			enableok(connp->conn_wq);
 		}
 	}
 
@@ -28194,6 +28310,8 @@
 	if (q->q_next) {
 		ill = (ill_t *)q->q_ptr;
 		if (ill->ill_state_flags == 0) {
+			ip_stack_t *ipst = ill->ill_ipst;
+
 			/*
 			 * The device flow control has opened up.
 			 * Walk through conn drain lists and qenable the
@@ -28202,7 +28320,7 @@
 			 * Hence the if check above.
 			 */
 			ip1dbg(("ip_wsrv: walking\n"));
-			conn_walk_drain(ill->ill_ipst);
+			conn_walk_drain(ipst, &ipst->ips_idl_tx_list[0]);
 		}
 		return;
 	}
@@ -28229,12 +28347,14 @@
 	 *    (causing an infinite loop).
 	 */
 	ASSERT(!connp->conn_did_putbq);
+
 	while ((q->q_first != NULL) && !connp->conn_did_putbq) {
 		connp->conn_draining = 1;
 		noenable(q);
 		while ((mp = getq(q)) != NULL) {
 			ASSERT(CONN_Q(q));
 
+			DTRACE_PROBE1(ip__wsrv__ip__output, conn_t *, connp);
 			ip_output(Q_TO_CONN(q), mp, q, IP_WSRV);
 			if (connp->conn_did_putbq) {
 				/* ip_wput did a putbq */
@@ -28253,12 +28373,23 @@
 		 */
 		connp->conn_draining = 0;
 		enableok(q);
-
 	}
 
 	/* Enable the next conn for draining */
 	conn_drain_tail(connp, B_FALSE);
 
+	/*
+	 * conn_direct_blocked is used to indicate blocked
+	 * condition for direct path (ILL_DIRECT_CAPABLE()).
+	 * This is the only place where it is set without
+	 * checking for ILL_DIRECT_CAPABLE() and setting it
+	 * to 0 is ok even if it is not ILL_DIRECT_CAPABLE().
+	 */
+	if (!connp->conn_did_putbq && connp->conn_direct_blocked) {
+		DTRACE_PROBE1(ip__wsrv__direct__blocked, conn_t *, connp);
+		connp->conn_direct_blocked = B_FALSE;
+	}
+
 	connp->conn_did_putbq = 0;
 }
 
@@ -28274,11 +28405,18 @@
  * function and wakes up corresponding mac worker threads, which in turn
  * calls this callback function, and disables flow control.
  */
-/* ARGSUSED */
-void
-ill_flow_enable(void *ill, ip_mac_tx_cookie_t cookie)
-{
-	qenable(((ill_t *)ill)->ill_wq);
+void
+ill_flow_enable(void *arg, ip_mac_tx_cookie_t cookie)
+{
+	ill_t *ill = (ill_t *)arg;
+	ip_stack_t *ipst = ill->ill_ipst;
+	idl_tx_list_t *idl_txl;
+
+	idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
+	mutex_enter(&idl_txl->txl_lock);
+	/* TODO: set a flag to indicate that idl_txl is enabled */
+	conn_walk_drain(ipst, idl_txl);
+	mutex_exit(&idl_txl->txl_lock);
 }
 
 /*
@@ -28315,7 +28453,7 @@
  * in turn qenable the next conn, when it is done/blocked/closing.
  */
 static void
-conn_walk_drain(ip_stack_t *ipst)
+conn_walk_drain(ip_stack_t *ipst, idl_tx_list_t *tx_list)
 {
 	int i;
 	idl_t *idl;
@@ -28323,7 +28461,7 @@
 	IP_STAT(ipst, ip_conn_walk_drain);
 
 	for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) {
-		idl = &ipst->ips_conn_drain_list[i];
+		idl = &tx_list->txl_drain_list[i];
 		mutex_enter(&idl->idl_lock);
 		if (idl->idl_conn == NULL) {
 			mutex_exit(&idl->idl_lock);
@@ -28521,6 +28659,41 @@
 	return (found);
 }
 
+/*
+ * Set QFULL on connp's conn_wq. The flag is tested without QLOCK first and
+ * again with QLOCK held, so the lock is skipped when QFULL is already set.
+ */
+static void
+conn_setqfull(conn_t *connp)
+{
+	queue_t *wq = connp->conn_wq;
+
+	if (wq->q_flag & QFULL)
+		return;
+	mutex_enter(QLOCK(wq));
+	if (!(wq->q_flag & QFULL))
+		wq->q_flag |= QFULL;
+	mutex_exit(QLOCK(wq));
+}
+
+/* Clear QFULL on conn_wq; qbackenable() if we cleared it and QWANTW is set. */
+static void
+conn_clrqfull(conn_t *connp)
+{
+	queue_t *wq = connp->conn_wq;
+	boolean_t cleared = B_FALSE;
+
+	if (wq->q_flag & QFULL) {
+		mutex_enter(QLOCK(wq));
+		cleared = (wq->q_flag & QFULL) ? B_TRUE : B_FALSE;
+		if (cleared)
+			wq->q_flag &= ~QFULL;
+		mutex_exit(QLOCK(wq));
+	}
+	if (cleared && (wq->q_flag & QWANTW))
+		qbackenable(wq, 0);
+}
+
 /*
  * Finish processing of "arp_up" when AR_DLPIOP_DONE is received from arp.
  */
@@ -29666,7 +29839,7 @@
 					    0);
 
 					ILL_SEND_TX(out_ill,
-					    ire, connp, first_mp, 0);
+					    ire, connp, first_mp, 0, connp);
 				} else {
 					BUMP_MIB(out_ill->ill_ip_mib,
 					    ipIfStatsOutDiscards);
--- a/usr/src/uts/common/inet/ip/ip6.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/inet/ip/ip6.c	Tue Feb 17 01:31:30 2009 -0800
@@ -10807,9 +10807,12 @@
 			if (ipst->ips_ip_output_queue && connp != NULL &&
 			    !mctl_present && caller != IRE_SEND) {
 				if (caller == IP_WSRV) {
+					idl_tx_list_t *idl_txl;
+
+					idl_txl = &ipst->ips_idl_tx_list[0];
 					connp->conn_did_putbq = 1;
 					(void) putbq(connp->conn_wq, mp);
-					conn_drain_insert(connp);
+					conn_drain_insert(connp, idl_txl);
 					/*
 					 * caller == IP_WSRV implies we are
 					 * the service thread, and the
--- a/usr/src/uts/common/inet/ip/ip_if.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/inet/ip/ip_if.c	Tue Feb 17 01:31:30 2009 -0800
@@ -3083,6 +3083,8 @@
 		idd->idd_tx_dh = direct.di_tx_dh;
 		idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df;
 		idd->idd_tx_cb_dh = direct.di_tx_cb_dh;
+		idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df;
+		idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh;
 		/*
 		 * One time registration of flow enable callback function
 		 */
--- a/usr/src/uts/common/inet/ip_impl.h	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/inet/ip_impl.h	Tue Feb 17 01:31:30 2009 -0800
@@ -503,24 +503,72 @@
 #define	ILL_DIRECT_CAPABLE(ill)						\
 	(((ill)->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0)
 
-#define	ILL_SEND_TX(ill, ire, hint, mp, flag) {			\
-	if (ILL_DIRECT_CAPABLE(ill) && DB_TYPE(mp) == M_DATA) {	\
-		ill_dld_direct_t *idd;				\
-								\
-		idd = &(ill)->ill_dld_capab->idc_direct;	\
-		/*						\
-		 * Send the packet directly to DLD, where it	\
-		 * may be queued depending on the availability	\
-		 * of transmit resources at the media layer.	\
-		 * Ignore the returned value for the time being \
-		 * In future, we may want to take this into	\
-		 * account and flow control the TCP.		\
-		 */						\
-		(void) idd->idd_tx_df(idd->idd_tx_dh, mp,	\
-		    (uintptr_t)(hint), flag);			\
-	} else {						\
-		putnext((ire)->ire_stq, mp);			\
-	}							\
+#define	ILL_SEND_TX(ill, ire, hint, mp, flag, connp) {			\
+	if (ILL_DIRECT_CAPABLE(ill) && DB_TYPE(mp) == M_DATA) {		\
+		ill_dld_direct_t *idd;					\
+		uintptr_t	cookie;					\
+		conn_t		*udp_connp = (conn_t *)connp;		\
+									\
+		idd = &(ill)->ill_dld_capab->idc_direct;		\
+		/*							\
+		 * Send the packet directly to DLD, where it		\
+		 * may be queued depending on the availability		\
+		 * of transmit resources at the media layer.		\
+		 * The returned cookie is only acted upon for UDP	\
+		 * (see below). In future, we may want to take this	\
+		 * into account and flow control the TCP as well.	\
+		 */							\
+		cookie = idd->idd_tx_df(idd->idd_tx_dh, mp,		\
+		    (uintptr_t)(hint), flag);				\
+									\
+		/*							\
+		 * non-NULL cookie indicates flow control situation	\
+		 * and the cookie itself identifies this specific	\
+		 * Tx ring that is blocked. This cookie is used to	\
+		 * block the UDP conn that is sending packets over	\
+		 * this specific Tx ring.				\
+		 */							\
+		if ((cookie != NULL) && (udp_connp != NULL) &&		\
+		    (udp_connp->conn_ulp == IPPROTO_UDP)) {		\
+			idl_tx_list_t *idl_txl;				\
+			ip_stack_t *ipst;				\
+									\
+			/*						\
+			 * Flow controlled.				\
+			 */						\
+			DTRACE_PROBE2(ill__send__tx__cookie,		\
+			    uintptr_t, cookie, conn_t *, udp_connp);	\
+			ipst = udp_connp->conn_netstack->netstack_ip;	\
+			idl_txl =					\
+			    &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];\
+			mutex_enter(&idl_txl->txl_lock);		\
+			if (udp_connp->conn_direct_blocked ||		\
+			    (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh,	\
+			    cookie) == 0)) {				\
+				DTRACE_PROBE1(ill__tx__not__blocked,	\
+				    boolean,				\
+				    udp_connp->conn_direct_blocked);	\
+			} else if (idl_txl->txl_cookie != NULL &&	\
+			    idl_txl->txl_cookie != cookie) {		\
+				udp_t *udp = udp_connp->conn_udp;	\
+				udp_stack_t *us = udp->udp_us;		\
+									\
+				DTRACE_PROBE2(ill__send__tx__collision,	\
+				    uintptr_t, cookie,			\
+				    uintptr_t, idl_txl->txl_cookie);	\
+				UDP_STAT(us, udp_cookie_coll);		\
+			} else {					\
+				udp_connp->conn_direct_blocked = B_TRUE;\
+				idl_txl->txl_cookie = cookie;		\
+				conn_drain_insert(udp_connp, idl_txl);	\
+				DTRACE_PROBE1(ill__send__tx__insert,	\
+				    conn_t *, udp_connp);		\
+			}						\
+			mutex_exit(&idl_txl->txl_lock);			\
+		}							\
+	} else {							\
+		putnext((ire)->ire_stq, mp);				\
+	}								\
 }
 
 #define	MBLK_RX_FANOUT_SLOWPATH(mp, ipha)				\
--- a/usr/src/uts/common/inet/ip_stack.h	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/inet/ip_stack.h	Tue Feb 17 01:31:30 2009 -0800
@@ -131,6 +131,9 @@
 	uint64_t ire_stats_deleted;	/* # of ires deleted from the bucket */
 } ire_stats_t;
 
+#define	TX_FANOUT_SIZE	128
+#define	IDLHASHINDEX(X)	\
+	((((uintptr_t)(X) >> 2) + ((uintptr_t)(X) >> 9)) & (TX_FANOUT_SIZE - 1))
 
 /*
  * IP stack instances
@@ -348,9 +351,9 @@
 
 	kstat_t		*ips_loopback_ksp;
 
-	struct idl_s	*ips_conn_drain_list;	/* Array of conn drain lists */
+	/* Array of conn drain lists */
+	struct idl_tx_list_s	*ips_idl_tx_list;
 	uint_t		ips_conn_drain_list_cnt; /* Count of conn_drain_list */
-	int		ips_conn_drain_list_index; /* Next drain_list */
 
 	/*
 	 * ID used to assign next free one.
--- a/usr/src/uts/common/inet/ipclassifier.h	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/inet/ipclassifier.h	Tue Feb 17 01:31:30 2009 -0800
@@ -245,6 +245,7 @@
 
 	unsigned int
 		conn_lso_ok : 1;		/* LSO is usable */
+	boolean_t conn_direct_blocked;		/* conn is flow-controlled */
 
 	squeue_t	*conn_initial_sqp;	/* Squeue at open time */
 	squeue_t	*conn_final_sqp;	/* Squeue after connect */
--- a/usr/src/uts/common/inet/tcp/tcp.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/inet/tcp/tcp.c	Tue Feb 17 01:31:30 2009 -0800
@@ -86,6 +86,7 @@
 #include <inet/kstatcom.h>
 #include <inet/tcp.h>
 #include <inet/tcp_impl.h>
+#include <inet/udp_impl.h>
 #include <net/pfkeyv2.h>
 #include <inet/ipsec_info.h>
 #include <inet/ipdrop.h>
@@ -19431,7 +19432,7 @@
 			    ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst);
 		}
 
-		ILL_SEND_TX(ill, ire, connp, mp, 0);
+		ILL_SEND_TX(ill, ire, connp, mp, 0, NULL);
 	}
 
 	IRE_REFRELE(ire);
@@ -21418,7 +21419,7 @@
 			    ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst);
 		}
 
-		ILL_SEND_TX(ill, ire, tcp->tcp_connp, mp, 0);
+		ILL_SEND_TX(ill, ire, tcp->tcp_connp, mp, 0, NULL);
 	}
 }
 
--- a/usr/src/uts/common/inet/udp/udp.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/inet/udp/udp.c	Tue Feb 17 01:31:30 2009 -0800
@@ -5604,6 +5604,7 @@
 	udp_stack_t	*us = udp->udp_us;
 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
 	boolean_t	ll_multicast = B_FALSE;
+	boolean_t	direct_send;
 
 	dev_q = ire->ire_stq->q_next;
 	ASSERT(dev_q != NULL);
@@ -5611,16 +5612,24 @@
 	ill = ire_to_ill(ire);
 	ASSERT(ill != NULL);
 
+	/*
+	 * For the direct send case, if resetting of conn_direct_blocked
+	 * was missed, it is still ok because the putq() would enable
+	 * the queue and write service will drain it out.
+	 */
+	direct_send = ILL_DIRECT_CAPABLE(ill);
+
 	/* is queue flow controlled? */
-	if (q->q_first != NULL || connp->conn_draining ||
-	    DEV_Q_FLOW_BLOCKED(dev_q)) {
+	if ((!direct_send) && (q->q_first != NULL || connp->conn_draining ||
+	    DEV_Q_FLOW_BLOCKED(dev_q))) {
 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
-
-		if (ipst->ips_ip_output_queue)
+		if (ipst->ips_ip_output_queue) {
+			DTRACE_PROBE1(udp__xmit__putq, conn_t *, connp);
 			(void) putq(connp->conn_wq, mp);
-		else
+		} else {
 			freemsg(mp);
+		}
 		ire_refrele(ire);
 		return;
 	}
@@ -5718,20 +5727,60 @@
 		    ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst);
 	}
 
-	if (mp != NULL) {
-		DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL,
-		    void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill,
-		    ipha_t *, ipha, ip6_t *, NULL, int, 0);
-
-		if (ILL_DIRECT_CAPABLE(ill)) {
-			ill_dld_direct_t *idd = &ill->ill_dld_capab->idc_direct;
-
-			(void) idd->idd_tx_df(idd->idd_tx_dh, mp,
-			    (uintptr_t)connp, 0);
-		} else {
-			putnext(ire->ire_stq, mp);
-		}
-	}
+	if (mp == NULL)
+		goto bail;
+
+	DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL,
+	    void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill,
+	    ipha_t *, ipha, ip6_t *, NULL, int, 0);
+
+	if (direct_send) {
+		uintptr_t cookie;
+		ill_dld_direct_t *idd = &ill->ill_dld_capab->idc_direct;
+
+		cookie = idd->idd_tx_df(idd->idd_tx_dh, mp,
+		    (uintptr_t)connp, 0);
+		if (cookie != NULL) {
+			idl_tx_list_t *idl_txl;
+
+			/*
+			 * Flow controlled.
+			 */
+			DTRACE_PROBE2(non__null__cookie, uintptr_t,
+			    cookie, conn_t *, connp);
+			idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
+			mutex_enter(&idl_txl->txl_lock);
+			/*
+			 * Check again after holding txl_lock to see if Tx
+			 * ring is still blocked and only then insert the
+			 * connp into the drain list.
+			 */
+			if (connp->conn_direct_blocked ||
+			    (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh,
+			    cookie) == 0)) {
+				mutex_exit(&idl_txl->txl_lock);
+				goto bail;
+			}
+			if (idl_txl->txl_cookie != NULL &&
+			    idl_txl->txl_cookie != cookie) {
+				DTRACE_PROBE2(udp__xmit__collision,
+				    uintptr_t, cookie,
+				    uintptr_t, idl_txl->txl_cookie);
+				UDP_STAT(us, udp_cookie_coll);
+			} else {
+				connp->conn_direct_blocked = B_TRUE;
+				idl_txl->txl_cookie = cookie;
+				conn_drain_insert(connp, idl_txl);
+				DTRACE_PROBE1(udp__xmit__insert,
+				    conn_t *, connp);
+			}
+			mutex_exit(&idl_txl->txl_lock);
+		}
+	} else {
+		DTRACE_PROBE1(udp__xmit__putnext, mblk_t *, mp);
+		putnext(ire->ire_stq, mp);
+	}
+bail:
 	IRE_REFRELE(ire);
 }
 
--- a/usr/src/uts/common/inet/udp_impl.h	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/inet/udp_impl.h	Tue Feb 17 01:31:30 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -192,10 +192,7 @@
 	kstat_named_t	udp_in_recvtclass;
 	kstat_named_t	udp_in_timestamp;
 	kstat_named_t	udp_ip_rcvpktinfo;
-	kstat_named_t	udp_direct_send;
-	kstat_named_t	udp_bwsq_send;
-	kstat_named_t	udp_connected_direct_send;
-	kstat_named_t	udp_connected_bwsq_send;
+	kstat_named_t	udp_cookie_coll;
 #ifdef DEBUG
 	kstat_named_t	udp_data_conn;
 	kstat_named_t	udp_data_notconn;
--- a/usr/src/uts/common/io/aggr/aggr_grp.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/aggr/aggr_grp.c	Tue Feb 17 01:31:30 2009 -0800
@@ -313,13 +313,13 @@
 		link_state_changed = B_TRUE;
 	}
 
-	aggr_grp_multicst_port(port, B_TRUE);
-
 	/*
 	 * Update port's state.
 	 */
 	port->lp_state = AGGR_PORT_STATE_ATTACHED;
 
+	aggr_grp_multicst_port(port, B_TRUE);
+
 	/*
 	 * Set port's receive callback
 	 */
@@ -2028,8 +2028,10 @@
 /*
  * Add or remove the multicast addresses that are defined for the group
  * to or from the specified port.
- * This function is called before stopping a port, before a port
- * is detached from a group, and when attaching a port to a group.
+ *
+ * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
+ * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
+ * called when the port is either stopped or detached.
  */
 void
 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
@@ -2039,7 +2041,7 @@
 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 
-	if (!port->lp_started)
+	if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
 		return;
 
 	mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
@@ -2055,8 +2057,10 @@
 
 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
-		if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
+		if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
+		    !port->lp_started) {
 			continue;
+		}
 		cerr = aggr_port_multicst(port, add, addrp);
 		if (cerr != 0 && err == 0)
 			err = cerr;
--- a/usr/src/uts/common/io/aggr/aggr_port.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/aggr/aggr_port.c	Tue Feb 17 01:31:30 2009 -0800
@@ -493,9 +493,11 @@
 {
 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
 
-	if (!port->lp_started)
-		port->lp_started = B_TRUE;
+	if (port->lp_started)
+		return (0);
 
+	port->lp_started = B_TRUE;
+	aggr_grp_multicst_port(port, B_TRUE);
 	return (0);
 }
 
@@ -507,8 +509,7 @@
 	if (!port->lp_started)
 		return;
 
-	if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
-		aggr_grp_multicst_port(port, B_FALSE);
+	aggr_grp_multicst_port(port, B_FALSE);
 
 	/* update the port state */
 	port->lp_started = B_FALSE;
--- a/usr/src/uts/common/io/aggr/aggr_send.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/aggr/aggr_send.c	Tue Feb 17 01:31:30 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -35,6 +35,7 @@
 #include <sys/vlan.h>
 #include <sys/strsun.h>
 #include <sys/strsubr.h>
+#include <sys/dlpi.h>
 
 #include <inet/common.h>
 #include <inet/led.h>
@@ -42,184 +43,29 @@
 #include <inet/ip6.h>
 #include <inet/tcp.h>
 #include <netinet/udp.h>
-#include <inet/ipsec_impl.h>
-#include <inet/sadb.h>
-#include <inet/ipsecesp.h>
-#include <inet/ipsecah.h>
 
 #include <sys/aggr.h>
 #include <sys/aggr_impl.h>
 
-#define	HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
-#define	HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
-
-static uint16_t aggr_send_ip6_hdr_len(mblk_t *, ip6_t *);
-
-static uint64_t
-aggr_send_hash(aggr_grp_t *grp, mblk_t *mp)
-{
-	struct ether_header *ehp;
-	uint16_t sap;
-	uint_t skip_len;
-	uint8_t proto;
-	uint32_t policy = grp->lg_tx_policy;
-	uint64_t hash = 0;
-
-	ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
-	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
-	ASSERT(RW_READ_HELD(&grp->lg_tx_lock));
-
-	/* compute MAC hash */
-
-	ehp = (struct ether_header *)mp->b_rptr;
-
-	if (policy & AGGR_POLICY_L2) {
-		uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
-		uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
-		hash = HASH_MAC(mac_src) ^ HASH_MAC(mac_dst);
-		policy &= ~AGGR_POLICY_L2;
-	}
-
-	if (policy == 0)
-		goto done;
-
-	/* skip ethernet header */
-
-	if (ntohs(ehp->ether_type) == ETHERTYPE_VLAN) {
-		struct ether_vlan_header *evhp;
-		mblk_t *newmp = NULL;
-
-		skip_len = sizeof (struct ether_vlan_header);
-		if (MBLKL(mp) < skip_len) {
-			/* the vlan tag is the payload, pull up first */
-			newmp = msgpullup(mp, -1);
-			if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
-				goto done;
-			}
-			evhp = (struct ether_vlan_header *)newmp->b_rptr;
-		} else {
-			evhp = (struct ether_vlan_header *)mp->b_rptr;
-		}
-
-		sap = ntohs(evhp->ether_type);
-		freemsg(newmp);
-	} else {
-		sap = ntohs(ehp->ether_type);
-		skip_len = sizeof (struct ether_header);
-	}
-
-	/* if ethernet header is in its own mblk, skip it */
-	if (MBLKL(mp) <= skip_len) {
-		skip_len -= MBLKL(mp);
-		mp = mp->b_cont;
-	}
-
-	sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
-
-	/* compute IP src/dst addresses hash and skip IPv{4,6} header */
-
-	switch (sap) {
-	case ETHERTYPE_IP: {
-		ipha_t *iphp;
-
-		if (MBLKL(mp) < (skip_len + sizeof (ipha_t)))
-			goto done;
-
-		iphp = (ipha_t *)(mp->b_rptr + skip_len);
-		proto = iphp->ipha_protocol;
-		skip_len += IPH_HDR_LENGTH(iphp);
-
-		if (policy & AGGR_POLICY_L3) {
-			uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
-			uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
-
-			hash ^= (HASH_4BYTES(ip_src) ^ HASH_4BYTES(ip_dst));
-			policy &= ~AGGR_POLICY_L3;
-		}
-		break;
-	}
-	case ETHERTYPE_IPV6: {
-		ip6_t *ip6hp;
-
-		/*
-		 * if ipv6 packet has options, the proto will not be one of the
-		 * ones handled by the ULP processor below, and will return 0
-		 * as the index
-		 */
-		if (MBLKL(mp) < (skip_len + sizeof (ip6_t)))
-			goto done;
-
-		ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
-		proto = ip6hp->ip6_nxt;
-		skip_len += aggr_send_ip6_hdr_len(mp, ip6hp);
-
-		if (policy & AGGR_POLICY_L3) {
-			uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
-			uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
-
-			hash ^= (HASH_4BYTES(ip_src) ^ HASH_4BYTES(ip_dst));
-			policy &= ~AGGR_POLICY_L3;
-		}
-		break;
-	}
-	default:
-		goto done;
-	}
-
-	if (!(policy & AGGR_POLICY_L4))
-		goto done;
-
-	/* if ip header is in its own mblk, skip it */
-	if (MBLKL(mp) <= skip_len) {
-		skip_len -= MBLKL(mp);
-		mp = mp->b_cont;
-	}
-
-	/* parse ULP header */
-again:
-	switch (proto) {
-	case IPPROTO_TCP:
-	case IPPROTO_UDP:
-	case IPPROTO_ESP:
-	case IPPROTO_SCTP:
-		/*
-		 * These Internet Protocols are intentionally designed
-		 * for hashing from the git-go.  Port numbers are in the first
-		 * word for transports, SPI is first for ESP.
-		 */
-		hash ^= HASH_4BYTES((mp->b_rptr + skip_len));
-		break;
-
-	case IPPROTO_AH: {
-		ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
-
-		uint_t ah_length = AH_TOTAL_LEN(ah);
-		proto = ah->ah_nexthdr;
-		skip_len += ah_length;
-
-		/* if ip header is in its own mblk, skip it */
-		if (MBLKL(mp) <= skip_len) {
-			skip_len -= MBLKL(mp);
-			mp = mp->b_cont;
-		}
-
-		goto again;
-	}
-	}
-
-done:
-	return (hash);
-}
-
 /*
  * Update the TX load balancing policy of the specified group.
  */
 void
 aggr_send_update_policy(aggr_grp_t *grp, uint32_t policy)
 {
+	uint8_t mac_policy = 0;
+
 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 
+	if ((policy & AGGR_POLICY_L2) != 0)
+		mac_policy |= MAC_PKT_HASH_L2;
+	if ((policy & AGGR_POLICY_L3) != 0)
+		mac_policy |= MAC_PKT_HASH_L3;
+	if ((policy & AGGR_POLICY_L4) != 0)
+		mac_policy |= MAC_PKT_HASH_L4;
+
 	grp->lg_tx_policy = policy;
+	grp->lg_mac_tx_policy = mac_policy;
 }
 
 /*
@@ -250,7 +96,8 @@
 		nextp = mp->b_next;
 		mp->b_next = NULL;
 
-		hash = aggr_send_hash(grp, mp);
+		hash = mac_pkt_hash(DL_ETHER, mp, grp->lg_mac_tx_policy,
+		    B_TRUE);
 		port = grp->lg_tx_ports[hash % grp->lg_ntx_ports];
 
 		/*
@@ -266,7 +113,7 @@
 			 */
 			freemsg(mp);
 		} else {
-			mblk_t	*ret_mp;
+			mblk_t	*ret_mp = NULL;
 
 			/*
 			 * It is fine that the port state changes now.
@@ -385,51 +232,3 @@
 
 	port->lp_tx_enabled = B_FALSE;
 }
-
-static uint16_t
-aggr_send_ip6_hdr_len(mblk_t *mp, ip6_t *ip6h)
-{
-	uint16_t length;
-	uint_t	ehdrlen;
-	uint8_t	*nexthdrp;
-	uint8_t *whereptr;
-	uint8_t *endptr;
-	ip6_dest_t *desthdr;
-	ip6_rthdr_t *rthdr;
-	ip6_frag_t *fraghdr;
-
-	length = IPV6_HDR_LEN;
-	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
-	endptr = mp->b_wptr;
-
-	nexthdrp = &ip6h->ip6_nxt;
-	while (whereptr < endptr) {
-		switch (*nexthdrp) {
-		case IPPROTO_HOPOPTS:
-		case IPPROTO_DSTOPTS:
-			/* Assumes the headers are identical for hbh and dst */
-			desthdr = (ip6_dest_t *)whereptr;
-			ehdrlen = 8 * (desthdr->ip6d_len + 1);
-			nexthdrp = &desthdr->ip6d_nxt;
-			break;
-		case IPPROTO_ROUTING:
-			rthdr = (ip6_rthdr_t *)whereptr;
-			ehdrlen =  8 * (rthdr->ip6r_len + 1);
-			nexthdrp = &rthdr->ip6r_nxt;
-			break;
-		case IPPROTO_FRAGMENT:
-			fraghdr = (ip6_frag_t *)whereptr;
-			ehdrlen = sizeof (ip6_frag_t);
-			nexthdrp = &fraghdr->ip6f_nxt;
-			break;
-		case IPPROTO_NONE:
-			/* No next header means we're finished */
-		default:
-			return (length);
-		}
-		length += ehdrlen;
-		whereptr += ehdrlen;
-	}
-
-	return (length);
-}
--- a/usr/src/uts/common/io/dld/dld_proto.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/dld/dld_proto.c	Tue Feb 17 01:31:30 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -1334,25 +1334,14 @@
 	case DLD_ENABLE:
 		dls_rx_set(dsp, (dls_rx_t)direct->di_rx_cf,
 		    direct->di_rx_ch);
-		/*
-		 * TODO: XXXGopi
-		 *
-		 * Direct pointer to functions in the MAC layer
-		 * should be passed here:
-		 *
-		 * 1) pass mac_tx() and mac_client_handle instead
-		 * of str_mdata_fastpath_put() and dld_str_t. But
-		 * not done presently because of some VLAN
-		 * processing stuff in str_mdata_fastpath_put().
-		 *
-		 * 2) pass a MAC layer callback instead of
-		 * dld_flow_ctl_callb().
-		 */
+
 		direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put;
 		direct->di_tx_dh = dsp;
-
 		direct->di_tx_cb_df = (uintptr_t)mac_client_tx_notify;
 		direct->di_tx_cb_dh = dsp->ds_mch;
+		direct->di_tx_fctl_df = (uintptr_t)mac_tx_is_flow_blocked;
+		direct->di_tx_fctl_dh = dsp->ds_mch;
+
 		dsp->ds_direct = B_TRUE;
 
 		return (0);
--- a/usr/src/uts/common/io/dls/dls.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/dls/dls.c	Tue Feb 17 01:31:30 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -607,14 +607,6 @@
 		 * Set the function to start receiving packets.
 		 */
 		mac_rx_set(dlp->dl_mch, i_dls_link_rx, dlp);
-
-		/*
-		 * We've got a MAC client for this link now.
-		 * Push down the flows that were defined on this link
-		 * hitherto. The flows are added to the active flow table
-		 * and SRS, softrings etc. are created as needed.
-		 */
-		mac_link_init_flows(dlp->dl_mch);
 	}
 	dlp->dl_nactive++;
 	return (0);
@@ -625,20 +617,6 @@
 {
 	if (--dlp->dl_nactive == 0) {
 		ASSERT(dlp->dl_mah != NULL);
-		/*
-		 * We would have initialized subflows etc. only if we
-		 * brought up the primary client and set the unicast
-		 * unicast address etc. Deactivate the flows. The flow
-		 * entry will be removed from the active flow tables,
-		 * and the associated SRS, softrings etc will be
-		 * deleted. But the flow entry itself won't be
-		 * destroyed, instead it will continue to be
-		 * archived off the  the global flow hash list, for a
-		 * possible future activation when say
-		 * IP is plumbed again
-		 */
-
-		mac_link_release_flows(dlp->dl_mch);
 		(void) mac_unicast_remove(dlp->dl_mch, dlp->dl_mah);
 		dlp->dl_mah = NULL;
 		mac_rx_clear(dlp->dl_mch);
--- a/usr/src/uts/common/io/dls/dls_link.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/dls/dls_link.c	Tue Feb 17 01:31:30 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -36,7 +36,7 @@
 #include	<sys/atomic.h>
 
 static kmem_cache_t	*i_dls_link_cachep;
-static mod_hash_t	*i_dls_link_hash;
+mod_hash_t		*i_dls_link_hash;
 static uint_t		i_dls_link_count;
 
 #define		LINK_HASHSZ	67	/* prime */
--- a/usr/src/uts/common/io/dls/dls_mgmt.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/dls/dls_mgmt.c	Tue Feb 17 01:31:30 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -60,10 +60,15 @@
 /* Upcall door handle */
 static door_handle_t	dls_mgmt_dh = NULL;
 
-#define	DD_CONDEMNED	0x1
+#define	DD_CONDEMNED		0x1
+#define	DD_KSTAT_CHANGING	0x2
 
 /*
  * This structure is used to keep the <linkid, macname> mapping.
+ * This structure itself is not protected by the mac perimeter, but is
+ * protected by the dd_mutex and i_dls_devnet_lock. Thus most of the
+ * functions manipulating this structure such as dls_devnet_set/unset etc.
+ * may be called while not holding the mac perimeter.
  */
 typedef struct dls_devnet_s {
 	datalink_id_t	dd_linkid;
@@ -614,6 +619,11 @@
 
 /*
  * Query the "link" kstats.
+ *
+ * We may be called from the kstat subsystem in an arbitrary context.
+ * If the caller is the stack, the context could be an upcall data
+ * thread. Hence we can't acquire the mac perimeter in this function
+ * for fear of deadlock.
  */
 static int
 dls_devnet_stat_update(kstat_t *ksp, int rw)
@@ -621,21 +631,34 @@
 	dls_devnet_t	*ddp = ksp->ks_private;
 	dls_link_t	*dlp;
 	int		err;
-	mac_perim_handle_t	mph;
 
-	err = mac_perim_enter_by_macname(ddp->dd_mac, &mph);
-	if (err != 0)
-		return (err);
+	/*
+	 * Check if the link is being renamed or if the link is going away
+	 * before incrementing dd_tref which in turn prevents the link
+	 * from being renamed or deleted until we finish.
+	 */
+	mutex_enter(&ddp->dd_mutex);
+	if (ddp->dd_flags & (DD_CONDEMNED | DD_KSTAT_CHANGING)) {
+		mutex_exit(&ddp->dd_mutex);
+		return (ENOENT);
+	}
+	ddp->dd_tref++;
+	mutex_exit(&ddp->dd_mutex);
 
-	err = dls_link_hold(ddp->dd_mac, &dlp);
-	if (err != 0) {
-		mac_perim_exit(mph);
-		return (err);
+	/*
+	 * If a device detach happens at this time, it will block in
+	 * dls_devnet_unset since the dd_tref has been bumped up above. So the
+	 * access to 'dlp' is safe even though we don't hold the mac perimeter.
+	 */
+	if (mod_hash_find(i_dls_link_hash, (mod_hash_key_t)ddp->dd_mac,
+	    (mod_hash_val_t *)&dlp) != 0) {
+		dls_devnet_rele_tmp(ddp);
+		return (ENOENT);
 	}
 
 	err = dls_stat_update(ksp, dlp, rw);
-	dls_link_rele(dlp);
-	mac_perim_exit(mph);
+
+	dls_devnet_rele_tmp(ddp);
 	return (err);
 }
 
@@ -707,6 +730,7 @@
 	dls_devnet_t		*ddp = NULL;
 	datalink_class_t	class;
 	int			err;
+	boolean_t		stat_create = B_FALSE;
 
 	rw_enter(&i_dls_devnet_lock, RW_WRITER);
 	if ((err = mod_hash_find(i_dls_devnet_hash,
@@ -748,8 +772,7 @@
 		    (mod_hash_key_t)(uintptr_t)linkid,
 		    (mod_hash_val_t)ddp) == 0);
 		devnet_need_rebuild = B_TRUE;
-		dls_devnet_stat_create(ddp);
-
+		stat_create = B_TRUE;
 		mutex_enter(&ddp->dd_mutex);
 		if (!ddp->dd_prop_loaded && (ddp->dd_prop_taskid == NULL)) {
 			ddp->dd_prop_taskid = taskq_dispatch(system_taskq,
@@ -761,6 +784,20 @@
 	err = 0;
 done:
 	rw_exit(&i_dls_devnet_lock);
+	/*
+	 * It is safe to drop the i_dls_devnet_lock at this point. In the case
+	 * of physical devices, the softmac framework will fail the device
+	 * detach based on the smac_state or smac_hold_cnt. Other cases like
+	 * vnic and aggr use their own scheme to serialize creates and deletes
+	 * and ensure that *ddp is valid.
+	 *
+	 * The kstat subsystem holds its own locks (or rather, perimeter)
+	 * before calling the ks_update (dls_devnet_stat_update) entry point
+	 * which in turn grabs the i_dls_devnet_lock. So the lock hierarchy
+	 * is kstat locks -> i_dls_devnet_lock.
+	 */
+	if (stat_create)
+		dls_devnet_stat_create(ddp);
 	if (err == 0 && ddpp != NULL)
 		*ddpp = ddp;
 	return (err);
@@ -815,7 +852,6 @@
 		VERIFY(mod_hash_remove(i_dls_devnet_id_hash,
 		    (mod_hash_key_t)(uintptr_t)ddp->dd_linkid, &val) == 0);
 
-		dls_devnet_stat_destroy(ddp);
 		devnet_need_rebuild = B_TRUE;
 	}
 	rw_exit(&i_dls_devnet_lock);
@@ -830,6 +866,9 @@
 		ASSERT(ddp->dd_tref == 0 && ddp->dd_prop_taskid == NULL);
 	}
 
+	if (ddp->dd_linkid != DATALINK_INVALID_LINKID)
+		dls_devnet_stat_destroy(ddp);
+
 	ddp->dd_prop_loaded = B_FALSE;
 	ddp->dd_linkid = DATALINK_INVALID_LINKID;
 	ddp->dd_zid = GLOBAL_ZONEID;
@@ -1112,6 +1151,7 @@
 	mac_perim_handle_t	mph = NULL;
 	mac_handle_t		mh;
 	mod_hash_val_t		val;
+	boolean_t		clear_dd_flag = B_FALSE;
 
 	/*
 	 * In the second case, id2 must be a REMOVED physical link.
@@ -1134,8 +1174,10 @@
 	 * mac perimeter, hence enter the perimeter first. This also waits
 	 * for the property loading to finish.
 	 */
-	if ((err = mac_perim_enter_by_linkid(id1, &mph)) != 0)
-		goto done;
+	if ((err = mac_perim_enter_by_linkid(id1, &mph)) != 0) {
+		softmac_rele_device(ddh);
+		return (err);
+	}
 
 	rw_enter(&i_dls_devnet_lock, RW_WRITER);
 	if ((err = mod_hash_find(i_dls_devnet_id_hash,
@@ -1146,13 +1188,22 @@
 	}
 
 	/*
-	 * Return EBUSY if any applications have this link open.
+	 * Return EBUSY if any applications have this link open or if any
+	 * thread is currently accessing the link kstats. Then set the
+	 * DD_KSTAT_CHANGING flag to prevent any access to the kstats
+	 * while we delete and recreate kstats below.
 	 */
+	mutex_enter(&ddp->dd_mutex);
 	if (ddp->dd_ref > 1) {
+		mutex_exit(&ddp->dd_mutex);
 		err = EBUSY;
 		goto done;
 	}
 
+	ddp->dd_flags |= DD_KSTAT_CHANGING;
+	clear_dd_flag = B_TRUE;
+	mutex_exit(&ddp->dd_mutex);
+
 	if (id2 == DATALINK_INVALID_LINKID) {
 		(void) strlcpy(linkname, link, sizeof (linkname));
 
@@ -1225,11 +1276,21 @@
 done:
 	/*
 	 * Change the name of the kstat based on the new link name.
+	 * We can't hold the i_dls_devnet_lock across calls to the kstat
+	 * subsystem. Instead the DD_KSTAT_CHANGING flag set above in this
+	 * function prevents any access to the dd_ksp while we delete and
+	 * recreate it below.
 	 */
+	rw_exit(&i_dls_devnet_lock);
 	if (err == 0)
 		dls_devnet_stat_rename(ddp, linkname);
 
-	rw_exit(&i_dls_devnet_lock);
+	if (clear_dd_flag) {
+		mutex_enter(&ddp->dd_mutex);
+		ddp->dd_flags &= ~DD_KSTAT_CHANGING;
+		mutex_exit(&ddp->dd_mutex);
+	}
+
 	if (mph != NULL)
 		mac_perim_exit(mph);
 	softmac_rele_device(ddh);
@@ -1388,6 +1449,11 @@
 	int		err;
 	mac_perim_handle_t mph;
 
+	/*
+	 * Holding the mac perimeter ensures that the downcall from the
+	 * dlmgmt daemon which does the property loading does not proceed
+	 * until we relinquish the perimeter.
+	 */
 	mac_perim_enter_by_mh(mh, &mph);
 
 	/*
@@ -1400,8 +1466,8 @@
 		return (err);
 	}
 	if ((err = dls_link_hold_create(mac_name(mh), &dlp)) != 0) {
+		mac_perim_exit(mph);
 		(void) dls_devnet_unset(mac_name(mh), &linkid, B_TRUE);
-		mac_perim_exit(mph);
 		return (err);
 	}
 	mac_perim_exit(mph);
--- a/usr/src/uts/common/io/e1000g/e1000g_main.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/e1000g/e1000g_main.c	Tue Feb 17 01:31:30 2009 -0800
@@ -1618,7 +1618,6 @@
 	e1000g_rx_ring_t	*rx_ring = (e1000g_rx_ring_t *)arg;
 	mblk_t			*mp = NULL;
 	mblk_t			*tail;
-	uint_t			sz = 0;
 	struct e1000g 		*adapter;
 
 	adapter = rx_ring->adapter;
@@ -1631,68 +1630,7 @@
 	}
 
 	mutex_enter(&rx_ring->rx_lock);
-	ASSERT(rx_ring->poll_flag);
-
-	/*
-	 * Get any packets that have arrived. Works only if we
-	 * actually disable the physical adapter/rx_ring interrupt.
-	 * (e1000g_poll_mode == 1). In case e1000g_poll_mode == 0,
-	 * packets will have already been added to the poll list
-	 * by the interrupt (see e1000g_intr_work()).
-	 */
-	if (adapter->poll_mode) {
-		mp = e1000g_receive(rx_ring, &tail, &sz);
-		if (mp != NULL) {
-			if (rx_ring->poll_list_head == NULL)
-				rx_ring->poll_list_head = mp;
-			else
-				rx_ring->poll_list_tail->b_next = mp;
-			rx_ring->poll_list_tail = tail;
-			rx_ring->poll_list_sz += sz;
-		}
-	}
-
-	mp = rx_ring->poll_list_head;
-	if (mp == NULL) {
-		mutex_exit(&rx_ring->rx_lock);
-		rw_exit(&adapter->chip_lock);
-		return (NULL);
-	}
-
-	/* Check if we can sendup the entire chain */
-	if (bytes_to_pickup >= rx_ring->poll_list_sz) {
-		mp = rx_ring->poll_list_head;
-		rx_ring->poll_list_head = NULL;
-		rx_ring->poll_list_tail = NULL;
-		rx_ring->poll_list_sz = 0;
-		mutex_exit(&rx_ring->rx_lock);
-		rw_exit(&adapter->chip_lock);
-		return (mp);
-	}
-
-	/*
-	 * We need to find out how much chain we can send up. We
-	 * are guaranteed that atleast one packet will go up since
-	 * we already checked that.
-	 */
-	tail = mp;
-	sz = 0;
-	while (mp != NULL) {
-		sz += MBLKL(mp);
-		if (sz > bytes_to_pickup) {
-			sz -= MBLKL(mp);
-			break;
-		}
-		tail = mp;
-		mp = mp->b_next;
-	}
-
-	mp = rx_ring->poll_list_head;
-	rx_ring->poll_list_head = tail->b_next;
-	if (rx_ring->poll_list_head == NULL)
-		rx_ring->poll_list_tail = NULL;
-	rx_ring->poll_list_sz -= sz;
-	tail->b_next = NULL;
+	mp = e1000g_receive(rx_ring, &tail, bytes_to_pickup);
 	mutex_exit(&rx_ring->rx_lock);
 	rw_exit(&adapter->chip_lock);
 	return (mp);
@@ -2118,79 +2056,26 @@
 	}
 
 	if (icr & E1000_ICR_RXT0) {
-		mblk_t			*mp;
-		uint_t			sz = 0;
-		mblk_t			*tmp, *tail = NULL;
+		mblk_t			*mp = NULL;
+		mblk_t			*tail = NULL;
 		e1000g_rx_ring_t	*rx_ring;
 
 		rx_ring = Adapter->rx_ring;
 		mutex_enter(&rx_ring->rx_lock);
-
 		/*
-		 * If the real interrupt for the Rx ring was
-		 * not disabled (e1000g_poll_mode == 0), then
-		 * we still pick up the packets and queue them
-		 * on Rx ring if we were in polling mode. this
-		 * enables the polling thread to pick up packets
-		 * really fast in polling mode and helps improve
-		 * latency.
+		 * Sometimes with legacy interrupts, it is possible that
+		 * there is a single interrupt for Rx/Tx. In that case,
+		 * if the poll flag is set, we shouldn't really be doing
+		 * Rx processing.
 		 */
-		mp = e1000g_receive(rx_ring, &tail, &sz);
+		if (!rx_ring->poll_flag)
+			mp = e1000g_receive(rx_ring, &tail,
+			    E1000G_CHAIN_NO_LIMIT);
+		mutex_exit(&rx_ring->rx_lock);
 		rw_exit(&Adapter->chip_lock);
-
-		if (mp != NULL) {
-			ASSERT(tail != NULL);
-			if (!rx_ring->poll_flag) {
-				/*
-				 * If not polling, see if something was
-				 * already queued. Take care not to
-				 * reorder packets.
-				 */
-				if (rx_ring->poll_list_head == NULL) {
-					mutex_exit(&rx_ring->rx_lock);
-					mac_rx_ring(Adapter->mh, rx_ring->mrh,
-					    mp, rx_ring->ring_gen_num);
-				} else {
-					tmp = rx_ring->poll_list_head;
-					rx_ring->poll_list_head = NULL;
-					rx_ring->poll_list_tail->b_next = mp;
-					rx_ring->poll_list_tail = NULL;
-					rx_ring->poll_list_sz = 0;
-					mutex_exit(&rx_ring->rx_lock);
-					mac_rx_ring(Adapter->mh, rx_ring->mrh,
-					    tmp, rx_ring->ring_gen_num);
-				}
-			} else {
-				/*
-				 * We are in a polling mode. Put the
-				 * processed packets on the poll list.
-				 */
-				if (rx_ring->poll_list_head == NULL)
-					rx_ring->poll_list_head = mp;
-				else
-					rx_ring->poll_list_tail->b_next = mp;
-				rx_ring->poll_list_tail = tail;
-				rx_ring->poll_list_sz += sz;
-				mutex_exit(&rx_ring->rx_lock);
-			}
-		} else if (!rx_ring->poll_flag &&
-		    rx_ring->poll_list_head != NULL) {
-			/*
-			 * Nothing new has arrived (then why
-			 * was the interrupt raised??). Check
-			 * if something queued from the last
-			 * time.
-			 */
-			tmp = rx_ring->poll_list_head;
-			rx_ring->poll_list_head = NULL;
-			rx_ring->poll_list_tail = NULL;
-			rx_ring->poll_list_sz = 0;
-			mutex_exit(&rx_ring->rx_lock);
+		if (mp != NULL)
 			mac_rx_ring(Adapter->mh, rx_ring->mrh,
-			    tmp, rx_ring->ring_gen_num);
-		} else {
-			mutex_exit(&rx_ring->rx_lock);
-		}
+			    mp, rx_ring->ring_gen_num);
 	} else
 		rw_exit(&Adapter->chip_lock);
 
@@ -2698,7 +2583,6 @@
 	struct e1000g 		*adapter = rx_ring->adapter;
 	struct e1000_hw 	*hw = &adapter->shared;
 	uint32_t		intr_mask;
-	boolean_t		poll_mode;
 
 	rw_enter(&adapter->chip_lock, RW_READER);
 
@@ -2709,20 +2593,17 @@
 
 	mutex_enter(&rx_ring->rx_lock);
 	rx_ring->poll_flag = 0;
-	poll_mode = adapter->poll_mode;
 	mutex_exit(&rx_ring->rx_lock);
 
-	if (poll_mode) {
-		/* Rx interrupt enabling for MSI and legacy */
-		intr_mask = E1000_READ_REG(hw, E1000_IMS);
-		intr_mask |= E1000_IMS_RXT0;
-		E1000_WRITE_REG(hw, E1000_IMS, intr_mask);
-		E1000_WRITE_FLUSH(hw);
-
-		/* Trigger a Rx interrupt to check Rx ring */
-		E1000_WRITE_REG(hw, E1000_ICS, E1000_IMS_RXT0);
-		E1000_WRITE_FLUSH(hw);
-	}
+	/* Rx interrupt enabling for MSI and legacy */
+	intr_mask = E1000_READ_REG(hw, E1000_IMS);
+	intr_mask |= E1000_IMS_RXT0;
+	E1000_WRITE_REG(hw, E1000_IMS, intr_mask);
+	E1000_WRITE_FLUSH(hw);
+
+	/* Trigger a Rx interrupt to check Rx ring */
+	E1000_WRITE_REG(hw, E1000_ICS, E1000_IMS_RXT0);
+	E1000_WRITE_FLUSH(hw);
 
 	rw_exit(&adapter->chip_lock);
 	return (0);
@@ -2734,7 +2615,6 @@
 	e1000g_rx_ring_t	*rx_ring = (e1000g_rx_ring_t *)intrh;
 	struct e1000g 		*adapter = rx_ring->adapter;
 	struct e1000_hw 	*hw = &adapter->shared;
-	boolean_t		poll_mode;
 
 	rw_enter(&adapter->chip_lock, RW_READER);
 
@@ -2742,22 +2622,13 @@
 		rw_exit(&adapter->chip_lock);
 		return (0);
 	}
-
-	/*
-	 * Once the adapter can support per Rx ring interrupt,
-	 * we should disable the real interrupt instead of just setting
-	 * the flag.
-	 */
 	mutex_enter(&rx_ring->rx_lock);
 	rx_ring->poll_flag = 1;
-	poll_mode = adapter->poll_mode;
 	mutex_exit(&rx_ring->rx_lock);
 
-	if (poll_mode) {
-		/* Rx interrupt disabling for MSI and legacy */
-		E1000_WRITE_REG(hw, E1000_IMC, E1000_IMS_RXT0);
-		E1000_WRITE_FLUSH(hw);
-	}
+	/* Rx interrupt disabling for MSI and legacy */
+	E1000_WRITE_REG(hw, E1000_IMC, E1000_IMS_RXT0);
+	E1000_WRITE_FLUSH(hw);
 
 	rw_exit(&adapter->chip_lock);
 	return (0);
--- a/usr/src/uts/common/io/e1000g/e1000g_rx.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/e1000g/e1000g_rx.c	Tue Feb 17 01:31:30 2009 -0800
@@ -452,7 +452,7 @@
  * This routine will process packets received in an interrupt
  */
 mblk_t *
-e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t *sz)
+e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t sz)
 {
 	struct e1000_hw *hw;
 	mblk_t *nmp;
@@ -471,13 +471,13 @@
 	struct e1000g *Adapter;
 	dma_buffer_t *rx_buf;
 	uint16_t cksumflags;
+	uint_t chain_sz = 0;
 
 	ret_mp = NULL;
 	ret_nmp = NULL;
 	pkt_count = 0;
 	desc_count = 0;
 	cksumflags = 0;
-	*sz = 0;
 
 	Adapter = rx_ring->adapter;
 	hw = &Adapter->shared;
@@ -505,7 +505,8 @@
 	 * descriptor owned by the hardware that begins a packet.
 	 */
 	while ((current_desc->status & E1000_RXD_STAT_DD) &&
-	    (pkt_count < Adapter->rx_limit_onintr)) {
+	    (pkt_count < Adapter->rx_limit_onintr) &&
+	    ((sz == E1000G_CHAIN_NO_LIMIT) || (chain_sz <= sz))) {
 
 		desc_count++;
 		/*
@@ -832,7 +833,7 @@
 		}
 		ret_nmp->b_next = NULL;
 		*tail = ret_nmp;
-		*sz += length;
+		chain_sz += length;
 
 		rx_ring->rx_mblk = NULL;
 		rx_ring->rx_mblk_tail = NULL;
--- a/usr/src/uts/common/io/e1000g/e1000g_sw.h	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/e1000g/e1000g_sw.h	Tue Feb 17 01:31:30 2009 -0800
@@ -198,6 +198,8 @@
 #define	E1000G_RX_SW_STOP		0x2
 #define	E1000G_RX_SW_DETACH		0x3
 
+#define	E1000G_CHAIN_NO_LIMIT		0
+
 /*
  * definitions for smartspeed workaround
  */
@@ -786,9 +788,6 @@
 	mac_ring_handle_t mrh;
 	mac_ring_handle_t mrh_init;
 	uint64_t ring_gen_num;
-	mblk_t *poll_list_head;
-	mblk_t *poll_list_tail;
-	uint_t poll_list_sz;
 	boolean_t poll_flag;
 
 	/*
@@ -998,7 +997,7 @@
 void e1000g_tx_freemsg(e1000g_tx_ring_t *tx_ring);
 uint_t e1000g_tx_softint_worker(caddr_t arg1, caddr_t arg2);
 mblk_t *e1000g_m_tx(void *arg, mblk_t *mp);
-mblk_t *e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t *sz);
+mblk_t *e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t sz);
 void e1000g_rxfree_func(p_rx_sw_packet_t packet);
 
 int e1000g_m_stat(void *arg, uint_t stat, uint64_t *val);
--- a/usr/src/uts/common/io/mac/mac.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/mac/mac.c	Tue Feb 17 01:31:30 2009 -0800
@@ -504,6 +504,7 @@
 	ASSERT(mip->mi_kstat_count == 0);
 	ASSERT(mip->mi_nclients == 0);
 	ASSERT(mip->mi_nactiveclients == 0);
+	ASSERT(mip->mi_single_active_client == NULL);
 	ASSERT(mip->mi_state_flags == 0);
 	ASSERT(mip->mi_factory_addr == NULL);
 	ASSERT(mip->mi_factory_addr_num == 0);
@@ -1712,6 +1713,12 @@
 	mac_tx_lock_all(mcip);
 	mcip->mci_tx_flag &= ~MCI_TX_QUIESCE;
 	mac_tx_unlock_all(mcip);
+	/*
+	 * We may fail to disable flow control for the last MAC_NOTE_TX
+	 * notification because the MAC client is quiesced. Send the
+	 * notification again.
+	 */
+	i_mac_notify(mcip->mci_mip, MAC_NOTE_TX);
 }
 
 /*
@@ -2350,10 +2357,8 @@
 	    cclient = cclient->mci_client_next) {
 		if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL)
 			mac_tx_srs_wakeup(mac_srs, ring);
-		if (!FLOW_TAB_EMPTY(cclient->mci_subflow_tab)) {
-			(void) mac_flow_walk_nolock(cclient->mci_subflow_tab,
-			    mac_tx_flow_srs_wakeup, ring);
-		}
+		(void) mac_flow_walk(cclient->mci_subflow_tab,
+		    mac_tx_flow_srs_wakeup, ring);
 	}
 	rw_exit(&mip->mi_rw_lock);
 	rw_exit(&i_mac_impl_lock);
@@ -4107,8 +4112,13 @@
 {
 	mac_address_t *map = mip->mi_addresses;
 
-	/* there should be exactly one entry left on the list */
-	ASSERT(map != NULL);
+	if (map == NULL)
+		return;
+
+	/*
+	 * If mi_addresses is initialized, there should be exactly one
+	 * entry left on the list with no users.
+	 */
 	ASSERT(map->ma_nusers == 0);
 	ASSERT(map->ma_next == NULL);
 
--- a/usr/src/uts/common/io/mac/mac_bcast.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/mac/mac_bcast.c	Tue Feb 17 01:31:30 2009 -0800
@@ -124,14 +124,6 @@
 
 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
 
-	if (grp->mbg_addrtype == MAC_ADDRTYPE_MULTICAST) {
-		/*
-		 * The address is a multicast address, have the
-		 * underlying NIC leave the multicast group.
-		 */
-		(void) mip->mi_multicst(mip->mi_driver, B_FALSE, grp->mbg_addr);
-	}
-
 	ASSERT(grp->mbg_addr != NULL);
 	kmem_free(grp->mbg_addr, mip->mi_type->mt_addr_length);
 	kmem_free(grp->mbg_clients,
@@ -271,15 +263,69 @@
 	size_t			addr_len = mip->mi_type->mt_addr_length;
 	int			rc = 0;
 	int			i, index = -1;
-	mac_mcast_addrs_t	*mci_maddr = NULL;
-	mac_mcast_addrs_t	*mi_maddr = NULL;
-	mac_mcast_addrs_t	**last_maddr;
+	mac_mcast_addrs_t	**prev_mi_addr = NULL;
+	mac_mcast_addrs_t	**prev_mci_addr = NULL;
 
 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
 
 	ASSERT(addrtype == MAC_ADDRTYPE_MULTICAST ||
 	    addrtype == MAC_ADDRTYPE_BROADCAST);
 
+	/*
+	 * For multicast addresses, record the address in the per-MAC
+	 * instance and per-MAC client multicast address lists.
+	 */
+	if (addrtype == MAC_ADDRTYPE_MULTICAST) {
+		mac_mcast_addrs_t	*maddr;
+
+		/*
+		 * In case of a driver (say aggr), we need this information
+		 * on a per MAC instance basis.
+		 */
+		prev_mi_addr = &mip->mi_mcast_addrs;
+		for (maddr = *prev_mi_addr; maddr != NULL;
+		    prev_mi_addr = &maddr->mma_next, maddr = maddr->mma_next) {
+			if (bcmp(maddr->mma_addr, addr, addr_len) == 0)
+				break;
+		}
+		if (maddr == NULL) {
+			/*
+			 * For multicast addresses, have the underlying MAC
+			 * join the corresponding multicast group.
+			 */
+			rc = mip->mi_multicst(mip->mi_driver, B_TRUE, addr);
+			if (rc != 0)
+				return (rc);
+			maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t),
+			    KM_SLEEP);
+			bcopy(addr, maddr->mma_addr, addr_len);
+			*prev_mi_addr = maddr;
+		} else {
+			prev_mi_addr = NULL;
+		}
+		maddr->mma_ref++;
+
+		/*
+		 * We maintain a separate list for each MAC client. Get
+		 * the entry or add, if it is not present.
+		 */
+		prev_mci_addr = &mcip->mci_mcast_addrs;
+		for (maddr = *prev_mci_addr; maddr != NULL;
+		    prev_mci_addr = &maddr->mma_next, maddr = maddr->mma_next) {
+			if (bcmp(maddr->mma_addr, addr, addr_len) == 0)
+				break;
+		}
+		if (maddr == NULL) {
+			maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t),
+			    KM_SLEEP);
+			bcopy(addr, maddr->mma_addr, addr_len);
+			*prev_mci_addr = maddr;
+		} else {
+			prev_mci_addr = NULL;
+		}
+		maddr->mma_ref++;
+	}
+
 	/* The list is protected by the perimeter */
 	last_grp = &mip->mi_bcast_grp;
 	for (grp = *last_grp; grp != NULL;
@@ -331,7 +377,7 @@
 		if (rc != 0) {
 			kmem_free(grp->mbg_addr, addr_len);
 			kmem_cache_free(mac_bcast_grp_cache, grp);
-			return (rc);
+			goto fail;
 		}
 		grp->mbg_flow_ent->fe_mbg = grp;
 		mip->mi_bcast_ngrps++;
@@ -366,23 +412,7 @@
 		rc = mac_flow_add(mip->mi_flow_tab, grp->mbg_flow_ent);
 		if (rc != 0) {
 			FLOW_FINAL_REFRELE(grp->mbg_flow_ent);
-			return (rc);
-		}
-
-		/*
-		 * For multicast addresses, have the underlying MAC
-		 * join the corresponsing multicast group.
-		 */
-		if (addrtype == MAC_ADDRTYPE_MULTICAST) {
-			rc = mip->mi_multicst(mip->mi_driver, B_TRUE, addr);
-			if (rc != 0) {
-				mac_flow_remove(mip->mi_flow_tab,
-				    grp->mbg_flow_ent, B_FALSE);
-				mac_flow_wait(grp->mbg_flow_ent,
-				    FLOW_DRIVER_UPCALL);
-				FLOW_FINAL_REFRELE(grp->mbg_flow_ent);
-				return (rc);
-			}
+			goto fail;
 		}
 
 		*last_grp = grp;
@@ -395,45 +425,6 @@
 	 * with the group.
 	 */
 	rw_enter(&mip->mi_rw_lock, RW_WRITER);
-	if (addrtype == MAC_ADDRTYPE_MULTICAST) {
-		/*
-		 * We maintain a separate list for each MAC client. Get
-		 * the entry or add, if it is not present.
-		 */
-		last_maddr = &mcip->mci_mcast_addrs;
-		for (mci_maddr = *last_maddr; mci_maddr != NULL;
-		    last_maddr = &mci_maddr->mma_next,
-		    mci_maddr = mci_maddr->mma_next) {
-			if (bcmp(mci_maddr->mma_addr, addr, addr_len) == 0)
-				break;
-		}
-		if (mci_maddr == NULL) {
-			mci_maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t),
-			    KM_SLEEP);
-			bcopy(addr, mci_maddr->mma_addr, addr_len);
-			*last_maddr = mci_maddr;
-		}
-		mci_maddr->mma_ref++;
-
-		/*
-		 * In case of a driver (say aggr), we also need this
-		 * information on a per MAC instance basis.
-		 */
-		last_maddr = &mip->mi_mcast_addrs;
-		for (mi_maddr = *last_maddr; mi_maddr != NULL;
-		    last_maddr = &mi_maddr->mma_next,
-		    mi_maddr = mi_maddr->mma_next) {
-			if (bcmp(mi_maddr->mma_addr, addr, addr_len) == 0)
-				break;
-		}
-		if (mi_maddr == NULL) {
-			mi_maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t),
-			    KM_SLEEP);
-			bcopy(addr, mi_maddr->mma_addr, addr_len);
-			*last_maddr = mi_maddr;
-		}
-		mi_maddr->mma_ref++;
-	}
 	for (i = 0; i < grp->mbg_nclients_alloc; i++) {
 		/*
 		 * The MAC client was already added, say when we have
@@ -442,7 +433,8 @@
 		 */
 		if (grp->mbg_clients[i].mgb_client == mcip) {
 			grp->mbg_clients[i].mgb_client_ref++;
-			goto add_done;
+			rw_exit(&mip->mi_rw_lock);
+			return (0);
 		} else if (grp->mbg_clients[i].mgb_client == NULL &&
 		    index == -1) {
 			index = i;
@@ -478,10 +470,20 @@
 	 * to detect that condition after re-acquiring the lock.
 	 */
 	grp->mbg_clients_gen++;
-add_done:
 	rw_exit(&mip->mi_rw_lock);
+	return (0);
 
-	return (0);
+fail:
+	if (prev_mi_addr != NULL) {
+		kmem_free(*prev_mi_addr, sizeof (mac_mcast_addrs_t));
+		*prev_mi_addr = NULL;
+		(void) mip->mi_multicst(mip->mi_driver, B_FALSE, addr);
+	}
+	if (prev_mci_addr != NULL) {
+		kmem_free(*prev_mci_addr, sizeof (mac_mcast_addrs_t));
+		*prev_mci_addr = NULL;
+	}
+	return (rc);
 }
 
 /*
@@ -559,6 +561,8 @@
 		*prev = grp->mbg_next;
 	}
 update_maddr:
+	rw_exit(&mip->mi_rw_lock);
+
 	if (grp->mbg_addrtype == MAC_ADDRTYPE_MULTICAST) {
 		mprev = &mcip->mci_mcast_addrs;
 		for (maddr = mcip->mci_mcast_addrs; maddr != NULL;
@@ -583,12 +587,12 @@
 		}
 		ASSERT(maddr != NULL);
 		if (--maddr->mma_ref == 0) {
+			(void) mip->mi_multicst(mip->mi_driver, B_FALSE, addr);
 			*mprev = maddr->mma_next;
 			maddr->mma_next = NULL;
 			kmem_free(maddr, sizeof (mac_mcast_addrs_t));
 		}
 	}
-	rw_exit(&mip->mi_rw_lock);
 
 	/*
 	 * If the group itself is being removed, remove the
--- a/usr/src/uts/common/io/mac/mac_client.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/mac/mac_client.c	Tue Feb 17 01:31:30 2009 -0800
@@ -1159,18 +1159,6 @@
 		 */
 
 		mcip = mac_vnic_lower(mip);
-		/*
-		 * If there are multiple MAC clients of the VNIC, they
-		 * all share the same underlying MAC client handle.
-		 */
-		if ((flags & MAC_OPEN_FLAGS_TAG_DISABLE) != 0)
-			mcip->mci_state_flags |= MCIS_TAG_DISABLE;
-
-		if ((flags & MAC_OPEN_FLAGS_STRIP_DISABLE) != 0)
-			mcip->mci_state_flags |= MCIS_STRIP_DISABLE;
-
-		if ((flags & MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK) != 0)
-			mcip->mci_state_flags |= MCIS_DISABLE_TX_VID_CHECK;
 
 		/*
 		 * Note that multiple mac clients share the same mcip in
@@ -1328,13 +1316,6 @@
 		 * when the VNIC is deleted.
 		 */
 
-		/*
-		 * Clear the flags set when the upper client initiated
-		 * open.
-		 */
-		mcip->mci_state_flags &= ~(MCIS_TAG_DISABLE |
-		    MCIS_STRIP_DISABLE | MCIS_DISABLE_TX_VID_CHECK);
-
 		i_mac_perim_exit(mip);
 		return;
 	}
@@ -1377,12 +1358,11 @@
 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
 
 	/*
-	 * If the mac_client is a VLAN or native media is non ethernet, we
-	 * should not do DLS bypass and instead let the packets go via the
-	 * default mac_rx_deliver route so vlan header can be stripped etc.
+	 * If the mac_client is a VLAN, we should not do DLS bypass and
+	 * instead let the packets come up via mac_rx_deliver so the vlan
+	 * header can be stripped.
 	 */
-	if (mcip->mci_nvids > 0 ||
-	    mip->mi_info.mi_nativemedia != DL_ETHER)
+	if (mcip->mci_nvids > 0)
 		return (B_FALSE);
 
 	/*
@@ -1606,6 +1586,37 @@
 	}
 }
 
+static void
+mac_update_single_active_client(mac_impl_t *mip)
+{
+	mac_client_impl_t *client = NULL;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	rw_enter(&mip->mi_rw_lock, RW_WRITER);
+	if (mip->mi_nactiveclients == 1) {
+		/*
+		 * Find the one active MAC client from the list of MAC
+		 * clients. The active MAC client has at least one
+		 * unicast address.
+		 */
+		for (client = mip->mi_clients_list; client != NULL;
+		    client = client->mci_client_next) {
+			if (client->mci_unicast_list != NULL)
+				break;
+		}
+		ASSERT(client != NULL);
+	}
+
+	/*
+	 * mi_single_active_client is protected by the MAC impl's reader/writer
+	 * lock, which allows mac_rx() to check the value of that pointer
+	 * as a reader.
+	 */
+	mip->mi_single_active_client = client;
+	rw_exit(&mip->mi_rw_lock);
+}
+
 /*
  * Add a new unicast address to the MAC client.
  *
@@ -1712,11 +1723,13 @@
 		mip->mi_state_flags |= MIS_EXCLUSIVE;
 
 	bzero(&mrp, sizeof (mac_resource_props_t));
-	if (is_primary && !(mcip->mci_state_flags & MCIS_IS_VNIC)) {
+	if (is_primary && !(mcip->mci_state_flags & (MCIS_IS_VNIC |
+	    MCIS_IS_AGGR_PORT))) {
 		/*
 		 * Apply the property cached in the mac_impl_t to the primary
-		 * mac client. If the mac client is a VNIC, its property were
-		 * already set in the mcip when the VNIC was created.
+		 * mac client. If the mac client is a VNIC or an aggregation
+		 * port, its property should be set in the mcip when the
+		 * VNIC/aggr was created.
 		 */
 		mac_get_resources((mac_handle_t)mip, &mrp);
 		(void) mac_client_set_resources(mch, &mrp);
@@ -1781,8 +1794,13 @@
 			goto bail;
 		bcast_added = B_TRUE;
 	}
-	flent = mcip->mci_flent;
-	ASSERT(flent != NULL);
+
+	/*
+	 * If this is the first unicast address addition for this
+	 * client, reuse the pre-allocated larval flow entry associated with
+	 * the MAC client.
+	 */
+	flent = (mcip->mci_nflents == 0) ? mcip->mci_flent : NULL;
 
 	/* We are configuring the unicast flow now */
 	if (!MCIP_DATAPATH_SETUP(mcip)) {
@@ -1806,6 +1824,7 @@
 
 		mip->mi_nactiveclients++;
 		nactiveclients_added = B_TRUE;
+
 		/*
 		 * This will allocate the RX ring group if possible for the
 		 * flow and program the software classifier as needed.
@@ -1817,6 +1836,12 @@
 		 * The unicast MAC address must have been added successfully.
 		 */
 		ASSERT(mcip->mci_unicast != NULL);
+		/*
+		 * Push down the sub-flows that were defined on this link
+		 * hitherto. The flows are added to the active flow table
+		 * and SRS, softrings etc. are created as needed.
+		 */
+		mac_link_init_flows(mch);
 	} else {
 		mac_address_t *map = mcip->mci_unicast;
 
@@ -1871,6 +1896,9 @@
 	mcip->mci_unicast_list = muip;
 	rw_exit(&mcip->mci_rw_lock);
 
+	if (nactiveclients_added)
+		mac_update_single_active_client(mip);
+
 	*mah = (mac_unicast_handle_t)muip;
 
 	/* add it to the flow list of this mcip */
@@ -1906,8 +1934,11 @@
 	if (mac_started)
 		mac_stop(mip);
 
-	if (nactiveclients_added)
+	if (nactiveclients_added) {
 		mip->mi_nactiveclients--;
+		mac_update_single_active_client(mip);
+	}
+
 	if (mcip->mci_state_flags & MCIS_EXCLUSIVE)
 		mip->mi_state_flags &= ~MIS_EXCLUSIVE;
 	kmem_free(muip, sizeof (mac_unicast_impl_t));
@@ -1983,9 +2014,9 @@
 	 * Remove the VID from the list of client's VIDs.
 	 */
 	pre = mcip->mci_unicast_list;
-	if (muip == pre)
+	if (muip == pre) {
 		mcip->mci_unicast_list = muip->mui_next;
-	else {
+	} else {
 		while ((pre->mui_next != NULL) && (pre->mui_next != muip))
 			pre = pre->mui_next;
 		ASSERT(pre->mui_next == muip);
@@ -1997,14 +2028,16 @@
 	if ((mcip->mci_flags & MAC_CLIENT_FLAGS_PRIMARY) && muip->mui_vid == 0)
 		mcip->mci_flags &= ~MAC_CLIENT_FLAGS_PRIMARY;
 
-	/*
-	 * This MAC client is shared, so we will just remove the flent
-	 * corresponding to the address being removed. We don't invoke
-	 * mac_rx_classify_flow_rem() since the additional flow is
-	 * not associated with its own separate set of SRS and rings,
-	 * and these constructs are still needed for the remaining flows.
-	 */
 	if (!mac_client_single_rcvr(mcip)) {
+		/*
+		 * This MAC client is shared by more than one unicast
+		 * address, so we will just remove the flent
+		 * corresponding to the address being removed. We don't invoke
+		 * mac_rx_classify_flow_rem() since the additional flow is
+		 * not associated with its own separate set of SRS and rings,
+		 * and these constructs are still needed for the remaining
+		 * flows.
+		 */
 		flent = mac_client_get_flow(mcip, muip);
 		ASSERT(flent != NULL);
 
@@ -2037,7 +2070,20 @@
 		return (0);
 	}
 
+	/*
+	 * We would have initialized subflows etc. only if we brought up
+	 * the primary client and set the unicast address etc.
+	 * Deactivate the flows. The flow entry will be removed from the
+	 * active flow tables, and the associated SRS, softrings etc will
+	 * be deleted. But the flow entry itself won't be destroyed, instead
+	 * it will continue to be archived off the global flow hash
+	 * list, for a possible future activation when say IP is plumbed
+	 * again.
+	 */
+	mac_link_release_flows(mch);
+
 	mip->mi_nactiveclients--;
+	mac_update_single_active_client(mip);
 
 	/* Tear down the Data path */
 	mac_datapath_teardown(mcip, mcip->mci_flent, SRST_LINK);
@@ -2252,6 +2298,8 @@
 	mpip->mpi_mcip = mcip;
 	mpip->mpi_no_tx_loop = ((flags & MAC_PROMISC_FLAGS_NO_TX_LOOP) != 0);
 	mpip->mpi_no_phys = ((flags & MAC_PROMISC_FLAGS_NO_PHYS) != 0);
+	mpip->mpi_strip_vlan_tag =
+	    ((flags & MAC_PROMISC_FLAGS_VLAN_TAG_STRIP) != 0);
 
 	mcbi = &mip->mi_promisc_cb_info;
 	mutex_enter(mcbi->mcbi_lockp);
@@ -2503,44 +2551,65 @@
  * mac_tx_is_blocked
  *
  * Given a cookie, it returns if the ring identified by the cookie is
- * flow-controlled or not (this is not implemented yet). If NULL is
- * passed in place of a cookie, then it finds out if any of the
- * underlying rings belonging to the SRS is flow controlled or not
- * and returns that status.
+ * flow-controlled or not. If NULL is passed in place of a cookie,
+ * then it finds out if any of the underlying rings belonging to the
+ * SRS is flow controlled or not and returns that status.
  */
 /* ARGSUSED */
 boolean_t
 mac_tx_is_flow_blocked(mac_client_handle_t mch, mac_tx_cookie_t cookie)
 {
 	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
-	mac_soft_ring_set_t *mac_srs = MCIP_TX_SRS(mcip);
+	mac_soft_ring_set_t *mac_srs;
 	mac_soft_ring_t *sringp;
 	boolean_t blocked = B_FALSE;
+	mac_tx_percpu_t *mytx;
+	int err;
 	int i;
 
 	/*
-	 * On etherstubs, there won't be a Tx SRS or an Rx
-	 * SRS. Infact there won't even be a flow_entry.
+	 * Bump the reference count so that mac_srs won't be deleted.
+	 * If the client is currently quiesced and we failed to bump
+	 * the reference, return B_TRUE so that flow control stays
+	 * enabled.
+	 *
+	 * Flow control will then be disabled once the client is no
+	 * longer quiesced.
 	 */
-	if (mac_srs == NULL)
+	MAC_TX_TRY_HOLD(mcip, mytx, err);
+	if (err != 0)
+		return (B_TRUE);
+
+	if ((mac_srs = MCIP_TX_SRS(mcip)) == NULL) {
+		MAC_TX_RELE(mcip, mytx);
 		return (B_FALSE);
+	}
 
 	mutex_enter(&mac_srs->srs_lock);
 	if (mac_srs->srs_tx.st_mode == SRS_TX_FANOUT) {
-		for (i = 0; i < mac_srs->srs_oth_ring_count; i++) {
-			sringp = mac_srs->srs_oth_soft_rings[i];
+		if (cookie != NULL) {
+			sringp = (mac_soft_ring_t *)cookie;
 			mutex_enter(&sringp->s_ring_lock);
-			if (sringp->s_ring_state & S_RING_TX_HIWAT) {
+			if (sringp->s_ring_state & S_RING_TX_HIWAT)
 				blocked = B_TRUE;
+			mutex_exit(&sringp->s_ring_lock);
+		} else {
+			for (i = 0; i < mac_srs->srs_oth_ring_count; i++) {
+				sringp = mac_srs->srs_oth_soft_rings[i];
+				mutex_enter(&sringp->s_ring_lock);
+				if (sringp->s_ring_state & S_RING_TX_HIWAT) {
+					blocked = B_TRUE;
+					mutex_exit(&sringp->s_ring_lock);
+					break;
+				}
 				mutex_exit(&sringp->s_ring_lock);
-				break;
 			}
-			mutex_exit(&sringp->s_ring_lock);
 		}
 	} else {
 		blocked = (mac_srs->srs_state & SRS_TX_HIWAT);
 	}
 	mutex_exit(&mac_srs->srs_lock);
+	MAC_TX_RELE(mcip, mytx);
 	return (blocked);
 }
 
@@ -2846,6 +2915,10 @@
 		return;
 	mp_copy->b_next = NULL;
 
+	if (mpip->mpi_strip_vlan_tag) {
+		if ((mp_copy = mac_strip_vlan_tag_chain(mp_copy)) == NULL)
+			return;
+	}
 	mpip->mpi_fn(mpip->mpi_arg, NULL, mp_copy, loopback);
 }
 
@@ -3218,7 +3291,7 @@
 	 */
 	bcopy(mrp, &tmrp, sizeof (mac_resource_props_t));
 	mcip = mac_primary_client_handle(mip);
-	if (mcip != NULL) {
+	if (mcip != NULL && (mcip->mci_state_flags & MCIS_IS_AGGR_PORT) == 0) {
 		err =
 		    mac_client_set_resources((mac_client_handle_t)mcip, &tmrp);
 	}
--- a/usr/src/uts/common/io/mac/mac_datapath_setup.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c	Tue Feb 17 01:31:30 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -73,16 +73,24 @@
  * The duration in msec we wait before signalling the soft ring
  * worker thread in case packets get queued.
  */
-static uint32_t mac_soft_ring_worker_wait = 0;
+uint32_t mac_soft_ring_worker_wait = 0;
+
+/*
+ * A global tunable for turning polling on/off. By default, dynamic
+ * polling is always on and is always very beneficial. It should be
+ * turned off with absolute care and for the rare workload (very
+ * low latency sensitive traffic).
+ */
+int mac_poll_enable = B_TRUE;
 
 /*
  * Need to set mac_soft_ring_max_q_cnt based on bandwidth and perhaps latency.
  * Large values could end up in consuming lot of system memory and cause
  * system hang.
  */
-static int mac_soft_ring_max_q_cnt = 1024;
-static int mac_soft_ring_min_q_cnt = 256;
-static int mac_soft_ring_poll_thres = 16;
+int mac_soft_ring_max_q_cnt = 1024;
+int mac_soft_ring_min_q_cnt = 256;
+int mac_soft_ring_poll_thres = 16;
 
 /*
  * Default value of number of TX rings to be assigned to a MAC client.
@@ -91,8 +99,8 @@
  * If no TX rings are available, then MAC client(s) will be assigned the
  * default Tx ring. Default Tx ring can be shared among multiple MAC clients.
  */
-static uint32_t mac_tx_ring_count = 8;
-static boolean_t mac_tx_serialize = B_FALSE;
+uint32_t mac_tx_ring_count = 8;
+boolean_t mac_tx_serialize = B_FALSE;
 
 /*
  * mac_tx_srs_hiwat is the queue depth threshold at which callers of
@@ -105,8 +113,8 @@
  * Note that mac_tx_srs_hiwat is always be lesser than
  * mac_tx_srs_max_q_cnt.
  */
-static uint32_t mac_tx_srs_max_q_cnt = 100000;
-static uint32_t mac_tx_srs_hiwat = 1000;
+uint32_t mac_tx_srs_max_q_cnt = 100000;
+uint32_t mac_tx_srs_hiwat = 1000;
 
 /*
  * mac_rx_soft_ring_count, mac_soft_ring_10gig_count:
@@ -131,8 +139,8 @@
  * rings is based on specified bandwidth, CPU speed and number of CPUs in
  * the system.
  */
-static uint_t mac_rx_soft_ring_count = 8;
-static uint_t mac_rx_soft_ring_10gig_count = 8;
+uint_t mac_rx_soft_ring_count = 8;
+uint_t mac_rx_soft_ring_10gig_count = 8;
 
 /*
  * Every Tx and Rx mac_soft_ring_set_t (mac_srs) created gets added
@@ -146,18 +154,12 @@
 /*
  * Whether the SRS threads should be bound, or not.
  */
-static boolean_t mac_srs_thread_bind = B_TRUE;
+boolean_t mac_srs_thread_bind = B_TRUE;
 
 /*
  * CPU to fallback to, used by mac_next_bind_cpu().
  */
-static processorid_t srs_bind_cpu = 0;
-
-/*
- * Possible setting for soft_ring_process_flag is
- * 0 or ST_RING_WORKER_ONLY.
- */
-static int soft_ring_process_flag = ST_RING_WORKER_ONLY;
+processorid_t srs_bind_cpu = 0;
 
 /*
  * If cpu bindings are specified by user, then Tx SRS and its soft
@@ -503,7 +505,7 @@
 	    (ring->mr_classify_type == MAC_HW_CLASSIFIER)) {
 		if (turn_off_poll_capab)
 			mac_srs->srs_state &= ~SRS_POLLING_CAPAB;
-		else
+		else if (mac_poll_enable)
 			mac_srs->srs_state |= SRS_POLLING_CAPAB;
 	}
 	srs_rx->sr_lower_proc = rx_func;
@@ -1498,7 +1500,7 @@
     mac_soft_ring_set_t *mac_tx_srs)
 {
 	mac_soft_ring_t *softring;
-	uint32_t soft_ring_flag = soft_ring_process_flag;
+	uint32_t soft_ring_flag = 0;
 	processorid_t cpuid = -1;
 	boolean_t user_specified;
 	int i, srings_present, new_fanout_cnt;
@@ -1606,7 +1608,7 @@
 {
 	int		i;
 	processorid_t	cpuid, worker_cpuid, poll_cpuid;
-	uint32_t	soft_ring_flag = soft_ring_process_flag;
+	uint32_t	soft_ring_flag = 0;
 	int soft_ring_cnt;
 	boolean_t user_specified = B_FALSE;
 	mac_cpus_t *srs_cpu = &mac_rx_srs->srs_cpu;
@@ -1917,7 +1919,8 @@
 		    (srs_rx->sr_lowat >> 1) ? mac_soft_ring_poll_thres :
 		    (srs_rx->sr_lowat >> 1);
 		if (mac_latency_optimize)
-			mac_srs->srs_state |= SRS_LATENCY_OPT;
+			mac_srs->srs_state |=
+			    (SRS_LATENCY_OPT|SRS_SOFTRING_QUEUE);
 	}
 
 	mac_srs->srs_worker = thread_create(NULL, 0,
@@ -1956,12 +1959,21 @@
 		ring->mr_classify_type = MAC_HW_CLASSIFIER;
 		ring->mr_flag |= MR_INCIPIENT;
 
-		if (FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
+		if (FLOW_TAB_EMPTY(mcip->mci_subflow_tab) && mac_poll_enable)
 			mac_srs->srs_state |= SRS_POLLING_CAPAB;
 
 		mac_srs->srs_poll_thr = thread_create(NULL, 0,
 		    mac_rx_srs_poll_ring, mac_srs, 0, &p0, TS_RUN,
 		    mac_srs->srs_pri);
+		/*
+		 * Some drivers require serialization and don't send
+		 * packet chains in interrupt context. For such
+		 * drivers, we should always queue in soft ring
+		 * so that we get a chance to switch into a polling
+		 * mode under backlog.
+		 */
+		if (mcip->mci_mip->mi_v12n_level & MAC_VIRT_SERIALIZE)
+			mac_srs->srs_state |= SRS_SOFTRING_QUEUE;
 	}
 	return (mac_srs);
 }
@@ -2131,10 +2143,6 @@
 				mac_srs = mac_srs_create(mcip, flent,
 				    fanout_type | link_type,
 				    mac_rx_deliver, mcip, NULL, ring);
-				if (mip->mi_v12n_level & MAC_VIRT_SERIALIZE) {
-					mac_srs->srs_rx.sr_enqueue_always =
-					    B_TRUE;
-				}
 				break;
 			default:
 				cmn_err(CE_PANIC, "srs_setup: mcip = %p "
@@ -2706,6 +2714,7 @@
 			mac_srs_group_setup(grp_only_mcip,
 			    grp_only_mcip->mci_flent,
 			    default_group, SRST_LINK);
+			mac_rx_group_unmark(default_group, MR_INCIPIENT);
 		}
 	}
 }
@@ -3173,7 +3182,7 @@
 {
 	mac_impl_t *mip = mcip->mci_mip;
 	mac_soft_ring_set_t *tx_srs;
-	int i, tx_ring_count = 0, tx_rings_reserved;
+	int i, tx_ring_count = 0, tx_rings_reserved = 0;
 	mac_ring_handle_t *tx_ring = NULL;
 	uint32_t soft_ring_type;
 	mac_group_t *grp = NULL;
--- a/usr/src/uts/common/io/mac/mac_flow.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/mac/mac_flow.c	Tue Feb 17 01:31:30 2009 -0800
@@ -479,8 +479,8 @@
 	int		i, err;
 
 	s.fs_flags = flags;
+retry:
 	s.fs_mp = mp;
-retry:
 
 	/*
 	 * Walk the list of predeclared accept functions.
@@ -489,6 +489,8 @@
 	 */
 	for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
 		if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
+			mblk_t	*last;
+
 			/*
 			 * ENOBUFS indicates that the mp could be too short
 			 * and may need a pullup.
@@ -497,11 +499,13 @@
 				return (err);
 
 			/*
-			 * Don't modify the mblk if there are references to it.
-			 * Also, there is no point pulling up if b_cont is NULL.
+			 * The pullup is done on the last processed mblk, not
+			 * the starting one. pullup is not done if the mblk
+			 * has references or if b_cont is NULL.
 			 */
-			if (DB_REF(mp) > 1 || mp->b_cont == NULL ||
-			    pullupmsg(mp, -1) == 0)
+			last = s.fs_mp;
+			if (DB_REF(last) > 1 || last->b_cont == NULL ||
+			    pullupmsg(last, -1) == 0)
 				return (EINVAL);
 
 			retried = B_TRUE;
@@ -1209,10 +1213,11 @@
 
 	/*
 	 * Add the subflow to the subflow table. Also instantiate the flow
-	 * in the mac if there is an active DLS user. The dl_mah is set when
-	 * dls_active_set() is called, typically during interface plumb.
+	 * in the mac if there is an active user (we check if the MAC client's
+	 * datapath has been setup).
 	 */
-	err = mac_flow_add_subflow(dlp->dl_mch, flent, dlp->dl_mah != NULL);
+	err = mac_flow_add_subflow(dlp->dl_mch, flent,
+	    MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch));
 	if (err != 0)
 		goto bail;
 
@@ -1514,6 +1519,17 @@
 
 #define	PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end))
 
+#define	CHECK_AND_ADJUST_START_PTR(s, start) {		\
+	if ((s)->fs_mp->b_wptr == (start)) {		\
+		mblk_t	*next = (s)->fs_mp->b_cont;	\
+		if (next == NULL)			\
+			return (EINVAL);		\
+							\
+		(s)->fs_mp = next;			\
+		(start) = next->b_rptr;			\
+	}						\
+}
+
 /* ARGSUSED */
 static boolean_t
 flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
@@ -1830,7 +1846,14 @@
 	uint16_t	sap = l2info->l2_sap;
 	uchar_t		*l3_start;
 
-	l3info->l3_start = l3_start = l2info->l2_start + l2info->l2_hdrsize;
+	l3_start = l2info->l2_start + l2info->l2_hdrsize;
+
+	/*
+	 * Adjust start pointer if we're at the end of an mblk.
+	 */
+	CHECK_AND_ADJUST_START_PTR(s, l3_start);
+
+	l3info->l3_start = l3_start;
 	if (!OK_32PTR(l3_start))
 		return (EINVAL);
 
@@ -2193,7 +2216,14 @@
 	uint8_t		proto = l3info->l3_protocol;
 	uchar_t		*l4_start;
 
-	l4info->l4_start = l4_start = l3info->l3_start + l3info->l3_hdrsize;
+	l4_start = l3info->l3_start + l3info->l3_hdrsize;
+
+	/*
+	 * Adjust start pointer if we're at the end of an mblk.
+	 */
+	CHECK_AND_ADJUST_START_PTR(s, l4_start);
+
+	l4info->l4_start = l4_start;
 	if (!OK_32PTR(l4_start))
 		return (EINVAL);
 
--- a/usr/src/uts/common/io/mac/mac_provider.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/mac/mac_provider.c	Tue Feb 17 01:31:30 2009 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -668,6 +668,24 @@
 			return;
 		}
 		/* We'll fall through to software classification */
+	} else {
+		flow_entry_t *flent;
+		int err;
+
+		rw_enter(&mip->mi_rw_lock, RW_READER);
+		if (mip->mi_single_active_client != NULL) {
+			flent = mip->mi_single_active_client->mci_flent_list;
+			FLOW_TRY_REFHOLD(flent, err);
+			rw_exit(&mip->mi_rw_lock);
+			if (err == 0) {
+				(flent->fe_cb_fn)(flent->fe_cb_arg1,
+				    flent->fe_cb_arg2, mp_chain, B_FALSE);
+				FLOW_REFRELE(flent);
+				return;
+			}
+		} else {
+			rw_exit(&mip->mi_rw_lock);
+		}
 	}
 
 	if (!FLOW_TAB_EMPTY(mip->mi_flow_tab)) {
--- a/usr/src/uts/common/io/mac/mac_sched.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/mac/mac_sched.c	Tue Feb 17 01:31:30 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -515,25 +515,27 @@
 mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
 {
 	struct ether_header		*ehp;
-	uint16_t			etype;
+	struct ether_vlan_header	*evhp;
+	uint32_t			sap;
 	ipha_t				*ipha;
-	mac_soft_ring_t			*softring;
-	size_t				ether_hlen;
+	uint8_t				*dstaddr;
+	size_t				hdrsize;
 	mblk_t				*mp;
 	mblk_t				*headmp[MAX_SR_TYPES];
 	mblk_t				*tailmp[MAX_SR_TYPES];
 	int				cnt[MAX_SR_TYPES];
 	size_t				sz[MAX_SR_TYPES];
 	size_t				sz1;
-	boolean_t			bw_ctl = B_FALSE;
+	boolean_t			bw_ctl;
 	boolean_t			hw_classified;
-	boolean_t			dls_bypass = B_TRUE;
-	enum				pkt_type type;
+	boolean_t			dls_bypass;
+	boolean_t			is_ether;
+	boolean_t			is_unicast;
+	enum pkt_type			type;
 	mac_client_impl_t		*mcip = mac_srs->srs_mcip;
-	struct ether_vlan_header	*evhp;
-
-	if (mac_srs->srs_type & SRST_BW_CONTROL)
-		bw_ctl = B_TRUE;
+
+	is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
+	bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
 
 	/*
 	 * If we don't have a Rx ring, S/W classification would have done
@@ -550,8 +552,7 @@
 	 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
 	 * such SRSs.
 	 */
-	if (!(mac_srs->srs_type & SRST_DLS_BYPASS))
-		dls_bypass = B_FALSE;
+	dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0);
 
 	bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
 	bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
@@ -570,68 +571,62 @@
 		mp->b_next = NULL;
 
 		type = OTH;
-		sz1 = msgdsize(mp);
-
-		if (!dls_bypass) {
-			mac_impl_t	*mip = mcip->mci_mip;
-
+		sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
+
+		if (is_ether) {
+			/*
+			 * At this point we can be sure the packet at least
+			 * has an ether header.
+			 */
+			if (sz1 < sizeof (struct ether_header)) {
+				mac_rx_drop_pkt(mac_srs, mp);
+				continue;
+			}
 			ehp = (struct ether_header *)mp->b_rptr;
 
 			/*
-			 * For VLAN packets, if the VLAN id doesn't belong
-			 * to this client, we drop the packet.
+			 * Determine if this is a VLAN or non-VLAN packet.
 			 */
-			if (mip->mi_info.mi_nativemedia == DL_ETHER &&
-			    ntohs(ehp->ether_type) == VLAN_TPID) {
+			if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
+				evhp = (struct ether_vlan_header *)mp->b_rptr;
+				sap = ntohs(evhp->ether_type);
+				hdrsize = sizeof (struct ether_vlan_header);
 				/*
-				 * LINTED: cast may result in improper
-				 * alignment
+				 * Check if the VID of the packet, if any,
+				 * belongs to this client.
 				 */
-				evhp = (struct ether_vlan_header *)ehp;
 				if (!mac_client_check_flow_vid(mcip,
 				    VLAN_ID(ntohs(evhp->ether_tci)))) {
 					mac_rx_drop_pkt(mac_srs, mp);
 					continue;
 				}
+			} else {
+				hdrsize = sizeof (struct ether_header);
 			}
+			is_unicast =
+			    ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
+			dstaddr = (uint8_t *)&ehp->ether_dhost;
+		} else {
+			mac_header_info_t		mhi;
+
+			if (mac_header_info((mac_handle_t)mcip->mci_mip,
+			    mp, &mhi) != 0) {
+				mac_rx_drop_pkt(mac_srs, mp);
+				continue;
+			}
+			hdrsize = mhi.mhi_hdrsize;
+			sap = mhi.mhi_bindsap;
+			is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
+			dstaddr = (uint8_t *)mhi.mhi_daddr;
+		}
+
+		if (!dls_bypass) {
 			FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
 			    cnt[type], bw_ctl, sz[type], sz1, mp);
 			continue;
 		}
 
-		/*
-		 * At this point we can be sure the packet at least
-		 * has an ether header.
-		 */
-		if (sz1 < sizeof (struct ether_header)) {
-			mac_rx_drop_pkt(mac_srs, mp);
-			continue;
-		}
-		/* LINTED: cast may result in improper alignment */
-		ehp = (struct ether_header *)mp->b_rptr;
-
-		/*
-		 * Determine if this is a VLAN or non-VLAN packet.
-		 */
-		if ((etype = ntohs(ehp->ether_type)) == VLAN_TPID) {
-			/* LINTED: cast may result in improper alignment */
-			evhp = (struct ether_vlan_header *)mp->b_rptr;
-			etype = ntohs(evhp->ether_type);
-			ether_hlen = sizeof (struct ether_vlan_header);
-			/*
-			 * Check if the VID of the packet, if any, belongs
-			 * to this client.
-			 */
-			if (!mac_client_check_flow_vid(mcip,
-			    VLAN_ID(ntohs(evhp->ether_tci)))) {
-				mac_rx_drop_pkt(mac_srs, mp);
-				continue;
-			}
-		} else {
-			ether_hlen = sizeof (struct ether_header);
-		}
-
-		if (etype == ETHERTYPE_IP) {
+		if (sap == ETHERTYPE_IP) {
 			/*
 			 * If we are H/W classified, but we have promisc
 			 * on, then we need to check for the unicast address.
@@ -641,12 +636,11 @@
 
 				rw_enter(&mcip->mci_rw_lock, RW_READER);
 				map = mcip->mci_unicast;
-				if (bcmp(&ehp->ether_dhost, map->ma_addr,
+				if (bcmp(dstaddr, map->ma_addr,
 				    map->ma_len) == 0)
 					type = UNDEF;
 				rw_exit(&mcip->mci_rw_lock);
-			} else if (((((uint8_t *)&ehp->ether_dhost)[0] &
-			    0x01) == 0)) {
+			} else if (is_unicast) {
 				type = UNDEF;
 			}
 		}
@@ -665,8 +659,7 @@
 		 * the 'OTH' type path without DLS bypass.
 		 */
 
-		/* LINTED: cast may result in improper alignment */
-		ipha = (ipha_t *)(mp->b_rptr + ether_hlen);
+		ipha = (ipha_t *)(mp->b_rptr + hdrsize);
 		if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
 			type = OTH;
 
@@ -686,25 +679,25 @@
 		switch (ipha->ipha_protocol) {
 		case IPPROTO_TCP:
 			type = V4_TCP;
-			mp->b_rptr += ether_hlen;
+			mp->b_rptr += hdrsize;
 			break;
 		case IPPROTO_UDP:
 			type = V4_UDP;
-			mp->b_rptr += ether_hlen;
+			mp->b_rptr += hdrsize;
 			break;
 		default:
 			type = OTH;
 			break;
 		}
 
-		ASSERT(type != UNDEF);
-
 		FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
 		    bw_ctl, sz[type], sz1, mp);
 	}
 
 	for (type = V4_TCP; type < UNDEF; type++) {
 		if (headmp[type] != NULL) {
+			mac_soft_ring_t			*softring;
+
 			ASSERT(tailmp[type]->b_next == NULL);
 			switch (type) {
 			case V4_TCP:
@@ -716,7 +709,7 @@
 			case OTH:
 				softring = mac_srs->srs_oth_soft_rings[0];
 			}
-			mac_rx_soft_ring_process(mac_srs->srs_mcip, softring,
+			mac_rx_soft_ring_process(mcip, softring,
 			    headmp[type], tailmp[type], cnt[type], sz[type]);
 		}
 	}
@@ -731,7 +724,7 @@
  */
 static int
 mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
-    uint16_t etype, enum pkt_type *type, uint_t *indx)
+    uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
 {
 	ip6_t		*ip6h;
 	uint8_t		*whereptr;
@@ -740,18 +733,18 @@
 	uint8_t		nexthdr;
 	uint16_t	hdr_len;
 
-	if (etype == ETHERTYPE_IPV6) {
+	if (sap == ETHERTYPE_IPV6) {
 		boolean_t	modifiable = B_TRUE;
 
-		ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
-
-		ip6h = (ip6_t *)(mp->b_rptr + sizeof (struct ether_header));
+		ASSERT(MBLKL(mp) >= hdrsize);
+
+		ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
 		if ((unsigned char *)ip6h == mp->b_wptr) {
 			/*
-			 * The first mblk_t only includes the ethernet header.
+			 * The first mblk_t only includes the mac header.
 			 * Note that it is safe to change the mp pointer here,
 			 * as the subsequent operation does not assume mp
-			 * points to the start of the ethernet header.
+			 * points to the start of the mac header.
 			 */
 			mp = mp->b_cont;
 
@@ -900,32 +893,32 @@
 mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
 {
 	struct ether_header		*ehp;
-	uint16_t			etype;
+	struct ether_vlan_header	*evhp;
+	uint32_t			sap;
 	ipha_t				*ipha;
+	uint8_t				*dstaddr;
 	uint_t				indx;
-	int				ports_offset = -1;
-	int				ipha_len;
+	size_t				ports_offset;
+	size_t				ipha_len;
+	size_t				hdrsize;
 	uint_t				hash;
-	mac_soft_ring_t			*softring;
-	size_t				ether_hlen;
-	uint16_t			frag_offset_flags;
 	mblk_t				*mp;
 	mblk_t				*headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
 	mblk_t				*tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
 	int				cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
 	size_t				sz[MAX_SR_TYPES][MAX_SR_FANOUT];
 	size_t				sz1;
-	boolean_t			bw_ctl = B_FALSE;
+	boolean_t			bw_ctl;
 	boolean_t			hw_classified;
-	boolean_t			dls_bypass = B_TRUE;
-	int				i;
+	boolean_t			dls_bypass;
+	boolean_t			is_ether;
+	boolean_t			is_unicast;
 	int				fanout_cnt;
-	enum 				pkt_type type;
+	enum pkt_type			type;
 	mac_client_impl_t		*mcip = mac_srs->srs_mcip;
-	struct ether_vlan_header	*evhp;
-
-	if (mac_srs->srs_type & SRST_BW_CONTROL)
-		bw_ctl = B_TRUE;
+
+	is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
+	bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
 
 	/*
 	 * If we don't have a Rx ring, S/W classification would have done
@@ -942,8 +935,7 @@
 	 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
 	 * such SRSs.
 	 */
-	if (!(mac_srs->srs_type & SRST_DLS_BYPASS))
-		dls_bypass = B_FALSE;
+	dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0);
 
 	/*
 	 * Since the softrings are never destroyed and we always
@@ -972,37 +964,60 @@
 		mp->b_next = NULL;
 
 		type = OTH;
-		sz1 = msgdsize(mp);
-
-		if (!dls_bypass) {
-			mac_impl_t	*mip = mcip->mci_mip;
-
-			indx = 0;
-			if (mip->mi_info.mi_nativemedia == DL_ETHER) {
-				ehp = (struct ether_header *)mp->b_rptr;
-				etype = ntohs(ehp->ether_type);
+		sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
+
+		if (is_ether) {
+			/*
+			 * At this point we can be sure the packet at least
+			 * has an ether header.
+			 */
+			if (sz1 < sizeof (struct ether_header)) {
+				mac_rx_drop_pkt(mac_srs, mp);
+				continue;
+			}
+			ehp = (struct ether_header *)mp->b_rptr;
+
+			/*
+			 * Determine if this is a VLAN or non-VLAN packet.
+			 */
+			if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
+				evhp = (struct ether_vlan_header *)mp->b_rptr;
+				sap = ntohs(evhp->ether_type);
+				hdrsize = sizeof (struct ether_vlan_header);
 				/*
-				 * For VLAN packets, if the VLAN id doesn't
-				 * belong to this client, we drop the packet.
+				 * Check if the VID of the packet, if any,
+				 * belongs to this client.
 				 */
-				if (etype == VLAN_TPID) {
-					/*
-					 * LINTED: cast may result in improper
-					 * alignment
-					 */
-					evhp = (struct ether_vlan_header *)
-					    mp->b_rptr;
-					if (!mac_client_check_flow_vid(mcip,
-					    VLAN_ID(ntohs(evhp->ether_tci)))) {
-						mac_rx_drop_pkt(mac_srs, mp);
-						continue;
-					}
-				}
-				if (mac_rx_srs_long_fanout(mac_srs, mp, etype,
-				    &type, &indx) == -1) {
+				if (!mac_client_check_flow_vid(mcip,
+				    VLAN_ID(ntohs(evhp->ether_tci)))) {
 					mac_rx_drop_pkt(mac_srs, mp);
 					continue;
 				}
+			} else {
+				hdrsize = sizeof (struct ether_header);
+			}
+			is_unicast =
+			    ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
+			dstaddr = (uint8_t *)&ehp->ether_dhost;
+		} else {
+			mac_header_info_t		mhi;
+
+			if (mac_header_info((mac_handle_t)mcip->mci_mip,
+			    mp, &mhi) != 0) {
+				mac_rx_drop_pkt(mac_srs, mp);
+				continue;
+			}
+			hdrsize = mhi.mhi_hdrsize;
+			sap = mhi.mhi_bindsap;
+			is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
+			dstaddr = (uint8_t *)mhi.mhi_daddr;
+		}
+
+		if (!dls_bypass) {
+			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
+			    hdrsize, &type, &indx) == -1) {
+				mac_rx_drop_pkt(mac_srs, mp);
+				continue;
 			}
 
 			FANOUT_ENQUEUE_MP(headmp[type][indx],
@@ -1011,47 +1026,13 @@
 			continue;
 		}
 
-		/*
-		 * At this point we can be sure the packet at least
-		 * has an ether header. On the outbound side, GLD/stack
-		 * ensure this. On the inbound side, the driver needs
-		 * to ensure this.
-		 */
-		if (sz1 < sizeof (struct ether_header)) {
-			mac_rx_drop_pkt(mac_srs, mp);
-			continue;
-		}
-		/* LINTED: cast may result in improper alignment */
-		ehp = (struct ether_header *)mp->b_rptr;
-
-		/*
-		 * Determine if this is a VLAN or non-VLAN packet.
-		 */
-		if ((etype = ntohs(ehp->ether_type)) == VLAN_TPID) {
-			/* LINTED: cast may result in improper alignment */
-			evhp = (struct ether_vlan_header *)mp->b_rptr;
-			etype = ntohs(evhp->ether_type);
-			ether_hlen = sizeof (struct ether_vlan_header);
-			/*
-			 * Check if the VID of the packet, if any, belongs
-			 * to this client.
-			 */
-			if (!mac_client_check_flow_vid(mcip,
-			    VLAN_ID(ntohs(evhp->ether_tci)))) {
-				mac_rx_drop_pkt(mac_srs, mp);
-				continue;
-			}
-		} else {
-			ether_hlen = sizeof (struct ether_header);
-		}
-
 
 		/*
 		 * If we are using the default Rx ring where H/W or S/W
 		 * classification has not happened, we need to verify if
 		 * this unicast packet really belongs to us.
 		 */
-		if (etype == ETHERTYPE_IP) {
+		if (sap == ETHERTYPE_IP) {
 			/*
 			 * If we are H/W classified, but we have promisc
 			 * on, then we need to check for the unicast address.
@@ -1061,12 +1042,11 @@
 
 				rw_enter(&mcip->mci_rw_lock, RW_READER);
 				map = mcip->mci_unicast;
-				if (bcmp(&ehp->ether_dhost, map->ma_addr,
+				if (bcmp(dstaddr, map->ma_addr,
 				    map->ma_len) == 0)
 					type = UNDEF;
 				rw_exit(&mcip->mci_rw_lock);
-			} else if (((((uint8_t *)&ehp->ether_dhost)[0] &
-			    0x01) == 0)) {
+			} else if (is_unicast) {
 				type = UNDEF;
 			}
 		}
@@ -1076,14 +1056,15 @@
 		 * the fast path.
 		 */
 
-		/* LINTED: cast may result in improper alignment */
-		ipha = (ipha_t *)(mp->b_rptr + ether_hlen);
+		ipha = (ipha_t *)(mp->b_rptr + hdrsize);
 		if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
 			type = OTH;
 			fanout_oth1++;
 		}
 
 		if (type != OTH) {
+			uint16_t	frag_offset_flags;
+
 			switch (ipha->ipha_protocol) {
 			case IPPROTO_TCP:
 			case IPPROTO_UDP:
@@ -1103,7 +1084,7 @@
 					fanout_oth3++;
 					break;
 				}
-				ports_offset = ether_hlen + ipha_len;
+				ports_offset = hdrsize + ipha_len;
 				break;
 			default:
 				type = OTH;
@@ -1113,8 +1094,8 @@
 		}
 
 		if (type == OTH) {
-			if (mac_rx_srs_long_fanout(mac_srs, mp, etype,
-			    &type, &indx) == -1) {
+			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
+			    hdrsize, &type, &indx) == -1) {
 				mac_rx_drop_pkt(mac_srs, mp);
 				continue;
 			}
@@ -1146,7 +1127,7 @@
 			    *(uint32_t *)(mp->b_rptr + ports_offset));
 			indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
 			type = V4_TCP;
-			mp->b_rptr += ether_hlen;
+			mp->b_rptr += hdrsize;
 			break;
 		case IPPROTO_UDP:
 		case IPPROTO_SCTP:
@@ -1162,19 +1143,24 @@
 				mac_srs->srs_ind++;
 			}
 			type = V4_UDP;
-			mp->b_rptr += ether_hlen;
+			mp->b_rptr += hdrsize;
 			break;
+		default:
+			indx = 0;
+			type = OTH;
 		}
 
-		ASSERT(type != UNDEF);
-
 		FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
 		    cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
 	}
 
 	for (type = V4_TCP; type < UNDEF; type++) {
+		int	i;
+
 		for (i = 0; i < fanout_cnt; i++) {
 			if (headmp[type][i] != NULL) {
+				mac_soft_ring_t	*softring;
+
 				ASSERT(tailmp[type][i]->b_next == NULL);
 				switch (type) {
 				case V4_TCP:
@@ -1190,7 +1176,7 @@
 					    mac_srs->srs_oth_soft_rings[i];
 					break;
 				}
-				mac_rx_soft_ring_process(mac_srs->srs_mcip,
+				mac_rx_soft_ring_process(mcip,
 				    softring, headmp[type][i], tailmp[type][i],
 				    cnt[type][i], sz[type][i]);
 			}
@@ -1373,46 +1359,39 @@
 		    (mac_srs->srs_first != NULL)) {
 			/*
 			 * We have packets to process and worker thread
-			 * is not running.  Check to see if poll thread is
-			 * allowed to process. Let it do processing only if it
-			 * picked up some packets from the NIC otherwise
-			 * wakeup the worker thread.
+			 * is not running. Check to see if poll thread is
+			 * allowed to process.
 			 */
-			if ((mac_srs->srs_state & SRS_LATENCY_OPT) &&
-			    (head != NULL)) {
+			if (mac_srs->srs_state & SRS_LATENCY_OPT) {
 				mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC);
 				if (srs_rx->sr_poll_pkt_cnt <=
 				    srs_rx->sr_lowat) {
 					srs_rx->sr_poll_again++;
 					goto check_again;
-				} else {
-					/*
-					 * We are already above low water mark
-					 * so stay in the polling mode but no
-					 * need to poll. Once we dip below
-					 * the polling threshold, the processing
-					 * thread (soft ring) will signal us
-					 * to poll again (MAC_UPDATE_SRS_COUNT)
-					 */
-					srs_rx->sr_poll_drain_no_poll++;
-					mac_srs->srs_state &=
-					    ~(SRS_PROC|SRS_GET_PKTS);
-					/*
-					 * In B/W control case, its possible
-					 * that the backlog built up due to
-					 * B/W limit being reached and packets
-					 * are queued only in SRS. In this case,
-					 * we should schedule worker thread
-					 * since no one else will wake us up.
-					 */
-					if ((mac_srs->srs_type &
-					    SRST_BW_CONTROL) &&
-					    (mac_srs->srs_tid == NULL)) {
-						mac_srs->srs_tid =
-						    timeout(mac_srs_fire,
-						    mac_srs, 1);
-						srs_rx->sr_poll_worker_wakeup++;
-					}
+				}
+				/*
+				 * We are already above low water mark
+				 * so stay in the polling mode but no
+				 * need to poll. Once we dip below
+				 * the polling threshold, the processing
+				 * thread (soft ring) will signal us
+				 * to poll again (MAC_UPDATE_SRS_COUNT)
+				 */
+				srs_rx->sr_poll_drain_no_poll++;
+				mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
+				/*
+				 * In B/W control case, it's possible
+				 * that the backlog built up due to
+				 * B/W limit being reached and packets
+				 * are queued only in SRS. In this case,
+				 * we should schedule worker thread
+				 * since no one else will wake us up.
+				 */
+				if ((mac_srs->srs_type & SRST_BW_CONTROL) &&
+				    (mac_srs->srs_tid == NULL)) {
+					mac_srs->srs_tid =
+					    timeout(mac_srs_fire, mac_srs, 1);
+					srs_rx->sr_poll_worker_wakeup++;
 				}
 			} else {
 				/*
@@ -1598,7 +1577,7 @@
 
 	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
 	ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));
-again:
+
 	/* If we are blanked i.e. can't do upcalls, then we are done */
 	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
 		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
@@ -1609,6 +1588,26 @@
 	if (mac_srs->srs_first == NULL)
 		goto out;
 
+	if (!(mac_srs->srs_state & SRS_LATENCY_OPT) &&
+	    (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) {
+		/*
+		 * In the normal case, the SRS worker thread does no
+		 * work and we wait for a backlog to build up before
+		 * we switch into polling mode. In case we are
+		 * optimizing for throughput, we use the worker thread
+		 * as well. The goal is to let worker thread process
+		 * the queue and poll thread to feed packets into
+		 * the queue. As such, we should signal the poll
+		 * thread to try and get more packets.
+		 *
+		 * We could have pulled this check in the POLL_RING
+		 * macro itself but keeping it explicit here makes
+		 * the architecture more human understandable.
+		 */
+		MAC_SRS_POLL_RING(mac_srs);
+	}
+
+again:
 	head = mac_srs->srs_first;
 	mac_srs->srs_first = NULL;
 	tail = mac_srs->srs_last;
@@ -1624,10 +1623,7 @@
 
 	mac_srs->srs_state |= (SRS_PROC|proc_type);
 
-	/* Switch to polling mode */
-	MAC_SRS_WORKER_POLLING_ON(mac_srs);
-	if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
-		MAC_SRS_POLL_RING(mac_srs);
+
 	/*
 	 * mcip is NULL for broadcast and multicast flows. The promisc
 	 * callbacks for broadcast and multicast packets are delivered from
@@ -1696,37 +1692,27 @@
 		mutex_enter(&mac_srs->srs_lock);
 	}
 
-	/*
-	 * Send the poll thread to pick up any packets arrived
-	 * so far. This also serves as the last check in case
-	 * nothing else is queued in the SRS. The poll thread
-	 * is signalled only in the case the drain was done
-	 * by the worker thread and SRS_WORKER is set. The
-	 * worker thread can run in parallel as long as the
-	 * SRS_WORKER flag is set. We we have nothing else to
-	 * process, we can exit while leaving SRS_PROC set
-	 * which gives the poll thread control to process and
-	 * cleanup once it returns from the NIC.
-	 *
-	 * If we have nothing else to process, we need to
-	 * ensure that we keep holding the srs_lock till
-	 * all the checks below are done and control is
-	 * handed to the poll thread if it was running.
-	 */
-	if (mac_srs->srs_first != NULL) {
-		if (proc_type == SRS_WORKER) {
-			if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
-				MAC_SRS_POLL_RING(mac_srs);
+	if (!(mac_srs->srs_state & (SRS_LATENCY_OPT|SRS_BLANK|SRS_PAUSE))) {
+		/*
+		 * In case we are optimizing for throughput, we
+		 * should try and keep the worker thread running
+		 * as much as possible. Send the poll thread down
+		 * to check one more time if something else
+		 * arrived. In the meanwhile, if poll thread had
+		 * collected something due to earlier signal,
+		 * process it now.
+		 */
+		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) {
+			srs_rx->sr_drain_poll_sig++;
+			MAC_SRS_POLL_RING(mac_srs);
+		}
+		if (mac_srs->srs_first != NULL) {
 			srs_rx->sr_drain_again++;
 			goto again;
-		} else {
-			srs_rx->sr_drain_worker_sig++;
-			cv_signal(&mac_srs->srs_async);
 		}
 	}
 
 out:
-
 	if (mac_srs->srs_state & SRS_GET_PKTS) {
 		/*
 		 * Poll thread is already running. Leave the
@@ -1885,12 +1871,6 @@
 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
 	}
 
-	/*
-	 * We can continue processing the queue.
-	 * We need to figure out if there is a fanout needed or
-	 * we can just process this here.
-	 */
-
 	if ((tid = mac_srs->srs_tid) != 0)
 		mac_srs->srs_tid = 0;
 
@@ -2405,8 +2385,7 @@
 		 * optimizing for latency, we should signal the
 		 * worker thread.
 		 */
-		if (loopback || ((count > 1) &&
-		    !(mac_srs->srs_state & SRS_LATENCY_OPT))) {
+		if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT)) {
 			/*
 			 * For loopback, We need to let the worker take
 			 * over as we don't want to continue in the same
@@ -2502,6 +2481,12 @@
 	mblk_t *tail;
 	boolean_t wakeup_worker = B_TRUE;
 
+	/*
+	 * Ignore fanout hint if we don't have multiple tx rings.
+	 */
+	if (!TX_MULTI_RING_MODE(mac_srs))
+		fanout_hint = 0;
+
 	if (mac_srs->srs_first != NULL)
 		wakeup_worker = B_FALSE;
 	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
@@ -2753,18 +2738,89 @@
  * the soft ring associated with that Tx ring. The srs itself will not
  * queue any packets.
  */
+
+#define	MAC_TX_SOFT_RING_PROCESS(chain) {				\
+	index = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);	\
+	softring = mac_srs->srs_oth_soft_rings[index];			\
+	cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \
+	DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index);	\
+}
+
 static mac_tx_cookie_t
 mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
 {
 	mac_soft_ring_t		*softring;
-	uint_t			indx, hash;
+	uint64_t		hash;
+	uint_t			index;
+	mac_tx_cookie_t		cookie = NULL;
 
 	ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT);
-	hash = HASH_HINT(fanout_hint);
-	indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
-	softring = mac_srs->srs_oth_soft_rings[indx];
-	return (mac_tx_soft_ring_process(softring, mp_chain, flag, ret_mp));
+	if (fanout_hint != 0) {
+		/*
+		 * The hint is specified by the caller, simply pass the
+		 * whole chain to the soft ring.
+		 */
+		hash = HASH_HINT(fanout_hint);
+		MAC_TX_SOFT_RING_PROCESS(mp_chain);
+	} else {
+		mblk_t *last_mp, *cur_mp, *sub_chain;
+		uint64_t cur_hash;
+		uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media;
+
+		/*
+		 * Compute the hash from the contents (headers) of the
+		 * packets of the mblk chain. Split the chains into
+		 * subchains of the same conversation.
+		 *
+		 * Since there may be more than one ring used for
+		 * sub-chains of the same call, and since the caller
+		 * does not maintain per conversation state since it
+		 * passed a zero hint, unsent subchains will be
+		 * dropped.
+		 */
+
+		flag |= MAC_DROP_ON_NO_DESC;
+		ret_mp = NULL;
+
+		sub_chain = NULL;
+		last_mp = NULL;
+		hash = 0;
+
+		for (cur_mp = mp_chain; cur_mp != NULL;
+		    cur_mp = cur_mp->b_next) {
+			cur_hash = mac_pkt_hash(media, cur_mp,
+			    MAC_PKT_HASH_L4, B_TRUE);
+			if (hash != 0 && cur_hash != hash) {
+				/*
+				 * Starting a different subchain, send the
+				 * current chain out on the ring selected by
+				 * its own hash, which 'hash' still holds.
+				 */
+				ASSERT(last_mp != NULL);
+				last_mp->b_next = NULL;
+				MAC_TX_SOFT_RING_PROCESS(sub_chain);
+				sub_chain = NULL;
+			}
+
+			/* add packet to subchain */
+			if (sub_chain == NULL)
+				sub_chain = cur_mp;
+			last_mp = cur_mp;
+			hash = cur_hash;
+		}
+
+		if (sub_chain != NULL) {
+			/* send last subchain */
+			ASSERT(last_mp != NULL);
+			last_mp->b_next = NULL;
+			MAC_TX_SOFT_RING_PROCESS(sub_chain);
+		}
+
+		cookie = NULL;
+	}
+
+	return (cookie);
 }
 
 /*
@@ -2788,8 +2844,17 @@
 	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
 	mutex_enter(&mac_srs->srs_lock);
 	if (mac_srs->srs_bw->mac_bw_limit == 0) {
-		/* zero bandwidth: drop all */
-		MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+		/*
+		 * zero bandwidth, no traffic is sent: drop the packets,
+		 * or return the whole chain if the caller requests all
+		 * unsent packets back.
+		 */
+		if (flag & MAC_TX_NO_ENQUEUE) {
+			cookie = (mac_tx_cookie_t)mac_srs;
+			*ret_mp = mp_chain;
+		} else {
+			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+		}
 		mutex_exit(&mac_srs->srs_lock);
 		return (cookie);
 	} else if ((mac_srs->srs_first != NULL) ||
@@ -3223,9 +3288,6 @@
 	DTRACE_PROBE3(slowpath, mac_client_impl_t *,
 	    src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain);
 
-	if (mip->mi_promisc_list != NULL)
-		mac_promisc_dispatch(mip, mp_chain, src_mcip);
-
 	mp = mp_chain;
 	while (mp != NULL) {
 		flow_entry_t *dst_flow_ent;
@@ -3241,6 +3303,12 @@
 		CHECK_VID_AND_ADD_TAG(mp);
 
 		/*
+		 * Check if there are promiscuous mode callbacks defined.
+		 */
+		if (mip->mi_promisc_list != NULL)
+			mac_promisc_dispatch(mip, mp, src_mcip);
+
+		/*
 		 * Find the destination.
 		 */
 		dst_flow_ent = mac_tx_classify(mip, mp);
@@ -3516,9 +3584,8 @@
 
 	mutex_enter(&ringp->s_ring_lock);
 	ringp->s_ring_total_inpkt += cnt;
-	if ((ringp->s_ring_type & ST_RING_ANY) ||
-	    ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
-	    !mac_srs->srs_rx.sr_enqueue_always)) {
+	if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
+	    !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) {
 		/* If on processor or blanking on, then enqueue and return */
 		if (ringp->s_ring_state & S_RING_BLANK ||
 		    ringp->s_ring_state & S_RING_PROC) {
@@ -3526,7 +3593,6 @@
 			mutex_exit(&ringp->s_ring_lock);
 			return;
 		}
-
 		proc = ringp->s_ring_rx_func;
 		arg1 = ringp->s_ring_rx_arg1;
 		arg2 = ringp->s_ring_rx_arg2;
--- a/usr/src/uts/common/io/mac/mac_soft_ring.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/mac/mac_soft_ring.c	Tue Feb 17 01:31:30 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -207,6 +207,8 @@
 		ringp->s_ring_rx_func = rx_func;
 		ringp->s_ring_rx_arg1 = x_arg1;
 		ringp->s_ring_rx_arg2 = x_arg2;
+		if (mac_srs->srs_state & SRS_SOFTRING_QUEUE)
+			ringp->s_ring_type |= ST_RING_WORKER_ONLY;
 	}
 	if (cpuid != -1)
 		(void) mac_soft_ring_bind(ringp, cpuid);
--- a/usr/src/uts/common/io/mac/mac_util.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/mac/mac_util.c	Tue Feb 17 01:31:30 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -44,6 +44,10 @@
 #include <sys/vtrace.h>
 #include <sys/dlpi.h>
 #include <sys/sunndi.h>
+#include <inet/ipsec_impl.h>
+#include <inet/sadb.h>
+#include <inet/ipsecesp.h>
+#include <inet/ipsecah.h>
 
 /*
  * Copy an mblk, preserving its hardware checksum flags.
@@ -821,3 +825,211 @@
 
 	return ((void *)mip->mi_dip);
 }
+
+#define	PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
+#define	PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
+
+/*
+ * Compute a hash over the headers of the outbound ethernet packet 'mp'.
+ * 'policy' selects the layers to hash; the per-layer values are XORed:
+ *
+ *	MAC_PKT_HASH_L2	source and destination MAC address bytes
+ *	MAC_PKT_HASH_L3	IPv4 source and destination address bytes, or the
+ *			last four bytes of each IPv6 address
+ *	other bits	(e.g. MAC_PKT_HASH_L4) first four bytes of the TCP,
+ *			UDP, SCTP or ESP header, looking past AH headers
+ *
+ * Returns 0 if 'media' is not DL_ETHER. Otherwise parsing stops, and the
+ * hash accumulated so far is returned, as soon as a header cannot be
+ * parsed in place (unknown SAP or protocol, unaligned IP header, or a
+ * header that does not fit in the current mblk).
+ */
+uint64_t
+mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
+{
+	struct ether_header *ehp;
+	uint64_t hash = 0;
+	uint16_t sap;
+	uint_t skip_len;
+	uint8_t proto;
+
+	/*
+	 * We may want to have one of these per MAC type plugin in the
+	 * future. For now supports only ethernet.
+	 */
+	if (media != DL_ETHER)
+		return (0L);
+
+	/* for now we support only outbound packets */
+	ASSERT(is_outbound);
+	ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
+	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
+
+	/* compute L2 hash */
+
+	ehp = (struct ether_header *)mp->b_rptr;
+
+	if ((policy & MAC_PKT_HASH_L2) != 0) {
+		uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
+		uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
+		hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
+		policy &= ~MAC_PKT_HASH_L2;
+	}
+
+	if (policy == 0)
+		goto done;
+
+	/* skip ethernet header */
+
+	sap = ntohs(ehp->ether_type);
+	if (sap == ETHERTYPE_VLAN) {
+		struct ether_vlan_header *evhp;
+		mblk_t *newmp = NULL;
+
+		skip_len = sizeof (struct ether_vlan_header);
+		if (MBLKL(mp) < skip_len) {
+			/* the vlan tag is the payload, pull up first */
+			newmp = msgpullup(mp, -1);
+			if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
+				/* free the too-short pulled-up copy */
+				freemsg(newmp);
+				goto done;
+			}
+			evhp = (struct ether_vlan_header *)newmp->b_rptr;
+		} else {
+			evhp = (struct ether_vlan_header *)mp->b_rptr;
+		}
+
+		sap = ntohs(evhp->ether_type);
+		freemsg(newmp);
+	} else {
+		skip_len = sizeof (struct ether_header);
+	}
+
+	/* if ethernet header is in its own mblk, skip it */
+	if (MBLKL(mp) <= skip_len) {
+		skip_len -= MBLKL(mp);
+		mp = mp->b_cont;
+		if (mp == NULL)
+			goto done;
+	}
+
+	sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
+
+	/* compute IP src/dst addresses hash and skip IPv{4,6} header */
+
+	switch (sap) {
+	case ETHERTYPE_IP: {
+		ipha_t *iphp;
+
+		/*
+		 * If the header is not aligned or the header doesn't fit
+		 * in the mblk, bail now. Note that this may cause packets
+		 * reordering.
+		 */
+		iphp = (ipha_t *)(mp->b_rptr + skip_len);
+		if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
+		    !OK_32PTR((char *)iphp))
+			goto done;
+
+		proto = iphp->ipha_protocol;
+		skip_len += IPH_HDR_LENGTH(iphp);
+
+		if ((policy & MAC_PKT_HASH_L3) != 0) {
+			uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
+			uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
+
+			hash ^= (PKT_HASH_4BYTES(ip_src) ^
+			    PKT_HASH_4BYTES(ip_dst));
+			policy &= ~MAC_PKT_HASH_L3;
+		}
+		break;
+	}
+	case ETHERTYPE_IPV6: {
+		ip6_t *ip6hp;
+		uint16_t hdr_length;
+
+		/*
+		 * If the header is not aligned or the header doesn't fit
+		 * in the mblk, bail now. Note that this may cause packets
+		 * reordering.
+		 */
+
+		ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
+		if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
+		    !OK_32PTR((char *)ip6hp))
+			goto done;
+
+		if (!mac_ip_hdr_length_v6(mp, ip6hp, &hdr_length, &proto))
+			goto done;
+		skip_len += hdr_length;
+
+		if ((policy & MAC_PKT_HASH_L3) != 0) {
+			uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
+			uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
+
+			hash ^= (PKT_HASH_4BYTES(ip_src) ^
+			    PKT_HASH_4BYTES(ip_dst));
+			policy &= ~MAC_PKT_HASH_L3;
+		}
+		break;
+	}
+	default:
+		goto done;
+	}
+
+	if (policy == 0)
+		goto done;
+
+	/* if ip header is in its own mblk, skip it */
+	if (MBLKL(mp) <= skip_len) {
+		skip_len -= MBLKL(mp);
+		mp = mp->b_cont;
+		if (mp == NULL)
+			goto done;
+	}
+
+	/* parse ULP header */
+again:
+	switch (proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_ESP:
+	case IPPROTO_SCTP:
+		/*
+		 * These Internet Protocols are intentionally designed
+		 * for hashing from the get-go.  Port numbers are in the first
+		 * word for transports, SPI is first for ESP.
+		 */
+		if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
+			goto done;
+		hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
+		break;
+
+	case IPPROTO_AH: {
+		ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
+		uint_t ah_length;
+
+		if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
+			goto done;
+
+		/* only read the header once it is known to fit */
+		ah_length = AH_TOTAL_LEN(ah);
+		proto = ah->ah_nexthdr;
+		skip_len += ah_length;
+
+		/* if AH header is in its own mblk, skip it */
+		if (MBLKL(mp) <= skip_len) {
+			skip_len -= MBLKL(mp);
+			mp = mp->b_cont;
+			if (mp == NULL)
+				goto done;
+		}
+
+		goto again;
+	}
+	}
+
+done:
+	return (hash);
+}
--- a/usr/src/uts/common/io/nxge/nxge_send.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/nxge/nxge_send.c	Tue Feb 17 01:31:30 2009 -0800
@@ -19,10 +19,11 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
+#include <sys/mac_provider.h>
 #include <sys/nxge/nxge_impl.h>
 #include <sys/nxge/nxge_hio.h>
 #include <npi_tx_wr64.h>
@@ -32,6 +33,9 @@
 #include <inet/ip_impl.h>
 #include <inet/tcp.h>
 
+extern uint64_t mac_pkt_hash(uint_t, mblk_t *mp, uint8_t policy,
+    boolean_t is_outbound);
+
 static mblk_t *nxge_lso_eliminate(mblk_t *);
 static mblk_t *nxge_do_softlso(mblk_t *mp, uint32_t mss);
 static void nxge_lso_info_get(mblk_t *, uint32_t *, uint32_t *);
@@ -121,8 +125,17 @@
 #if defined(sun4v)
 
 /*
+ * Hashing policy for load balancing over the set of TX rings
+ * available to the driver.
+ */
+static uint8_t nxge_tx_hash_policy = MAC_PKT_HASH_L4;
+
+/*
  * nxge_m_tx() is needed for Hybrid I/O operation of the vnet in
  *	the guest domain.  See CR 6778758 for long term solution.
+ *
+ *	For now, the guest domain driver hashes the packet to pick
+ *	a DMA channel from the only group it has, group 0.
  */
 
 mblk_t *
@@ -130,15 +143,23 @@
 {
 	p_nxge_t		nxgep = (p_nxge_t)arg;
 	mblk_t			*next;
+	uint64_t		rindex;
 	p_tx_ring_t		tx_ring_p;
 	int			status;
 
 	NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_m_tx"));
 
 	/*
-	 * Get the default ring handle.
+	 * Hash to pick a ring from Group 0, the only TX group
+	 * for a guest domain driver.
 	 */
-	tx_ring_p = nxgep->tx_rings->rings[0];
+	rindex = mac_pkt_hash(DL_ETHER, mp, nxge_tx_hash_policy, B_TRUE);
+	rindex = rindex % nxgep->pt_config.tdc_grps[0].max_tdcs;
+
+	/*
+	 * Get the ring handle.
+	 */
+	tx_ring_p = nxgep->tx_rings->rings[rindex];
 
 	while (mp != NULL) {
 		next = mp->b_next;
--- a/usr/src/uts/common/io/softmac/softmac_main.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/io/softmac/softmac_main.c	Tue Feb 17 01:31:30 2009 -0800
@@ -1042,17 +1042,26 @@
 		return (MH_WALK_CONTINUE);
 	}
 
+	/*
+	 * Bumping up the smac_hold_cnt allows us to drop the lock. It also
+	 * makes softmac_destroy() return failure on an attempted device detach.
+	 * We don't want to hold the lock across calls to other subsystems
+	 * like kstats, which will happen in the call to dls_devnet_recreate().
+	 */
+	softmac->smac_hold_cnt++;
+	mutex_exit(&softmac->smac_mutex);
+
 	if (dls_mgmt_create(softmac->smac_devname,
 	    makedevice(softmac->smac_umajor, softmac->smac_uppa + 1),
 	    DATALINK_CLASS_PHYS, softmac->smac_media, B_TRUE, &linkid) != 0) {
-		mutex_exit(&softmac->smac_mutex);
+		softmac_rele_device((dls_dev_handle_t)softmac);
 		return (MH_WALK_CONTINUE);
 	}
 
 	if ((err = softmac_update_info(softmac, &linkid)) != 0) {
 		cmn_err(CE_WARN, "softmac: softmac_update_info() for %s "
 		    "failed (%d)", softmac->smac_devname, err);
-		mutex_exit(&softmac->smac_mutex);
+		softmac_rele_device((dls_dev_handle_t)softmac);
 		return (MH_WALK_CONTINUE);
 	}
 
@@ -1069,7 +1078,10 @@
 		}
 	}
 
+	mutex_enter(&softmac->smac_mutex);
 	softmac->smac_flags &= ~SOFTMAC_NEED_RECREATE;
+	ASSERT(softmac->smac_hold_cnt != 0);
+	softmac->smac_hold_cnt--;
 	mutex_exit(&softmac->smac_mutex);
 
 	return (MH_WALK_CONTINUE);
--- a/usr/src/uts/common/sys/aggr_impl.h	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/sys/aggr_impl.h	Tue Feb 17 01:31:30 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -157,6 +157,7 @@
 	aggr_port_t	**lg_tx_ports;		/* array of tx ports */
 	uint_t		lg_tx_ports_size;	/* size of lg_tx_ports */
 	uint32_t	lg_tx_policy;		/* outbound policy */
+	uint8_t		lg_mac_tx_policy;
 	uint64_t	lg_ifspeed;
 	link_state_t	lg_link_state;
 	link_duplex_t	lg_link_duplex;
--- a/usr/src/uts/common/sys/dld.h	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/sys/dld.h	Tue Feb 17 01:31:30 2009 -0800
@@ -372,6 +372,10 @@
 	/* flow control notification callback */
 	uintptr_t	di_tx_cb_df; /* callback registration/de-registration */
 	void		*di_tx_cb_dh;
+
+	/* flow control "can I put on a ring" callback */
+	uintptr_t	di_tx_fctl_df; /* canput-like callback */
+	void		*di_tx_fctl_dh;
 } dld_capab_direct_t;
 
 /*
--- a/usr/src/uts/common/sys/dld_impl.h	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/sys/dld_impl.h	Tue Feb 17 01:31:30 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -323,7 +323,7 @@
 	mutex_exit(&(dsp)->ds_lock);					\
 }
 
-#define	DLD_TX(dsp, mp, f_hint, flag)					\
+#define	DLD_TX(dsp, mp, f_hint, flag)				\
 	mac_tx(dsp->ds_mch, mp, f_hint, flag, NULL)
 
 #ifdef DEBUG
--- a/usr/src/uts/common/sys/dls_impl.h	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/sys/dls_impl.h	Tue Feb 17 01:31:30 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -70,6 +70,8 @@
 	uint_t			dh_removing;		/* dh_lock */
 } dls_head_t;
 
+extern mod_hash_t	*i_dls_link_hash;
+
 extern void		dls_link_init(void);
 extern int		dls_link_fini(void);
 extern int		dls_link_hold(const char *, dls_link_t **);
--- a/usr/src/uts/common/sys/mac.h	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/sys/mac.h	Tue Feb 17 01:31:30 2009 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -565,21 +565,28 @@
 extern int			mac_margin_remove(mac_handle_t, uint32_t);
 extern int			mac_margin_add(mac_handle_t, uint32_t *,
 				    boolean_t);
-extern void			mac_init_ops(struct dev_ops *, const char *);
-extern void			mac_fini_ops(struct dev_ops *);
-extern uint32_t			mac_no_notification(mac_handle_t);
 
 extern mactype_register_t	*mactype_alloc(uint_t);
 extern void			mactype_free(mactype_register_t *);
 extern int			mactype_register(mactype_register_t *);
 extern int			mactype_unregister(const char *);
-extern void			mac_set_ring(void *, void *);
 
 extern void			mac_start_logusage(mac_logtype_t, uint_t);
 extern void			mac_stop_logusage(mac_logtype_t);
 
 extern mac_handle_t		mac_get_lower_mac_handle(mac_handle_t);
 
+/*
+ * Packet hashing for distribution to multiple ports and rings.
+ */
+
+#define	MAC_PKT_HASH_L2		0x01
+#define	MAC_PKT_HASH_L3		0x02
+#define	MAC_PKT_HASH_L4		0x04
+
+extern uint64_t			mac_pkt_hash(uint_t, mblk_t *, uint8_t,
+				    boolean_t);
+
 #endif	/* _KERNEL */
 
 #ifdef	__cplusplus
--- a/usr/src/uts/common/sys/mac_client.h	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/sys/mac_client.h	Tue Feb 17 01:31:30 2009 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -98,8 +98,9 @@
 #define	MAC_CLOSE_FLAGS_IS_AGGR_PORT	0x0004
 
 /* flags passed to mac_promisc_add() */
-#define	MAC_PROMISC_FLAGS_NO_TX_LOOP	0x0001
-#define	MAC_PROMISC_FLAGS_NO_PHYS	0x0002
+#define	MAC_PROMISC_FLAGS_NO_TX_LOOP		0x0001
+#define	MAC_PROMISC_FLAGS_NO_PHYS		0x0002
+#define	MAC_PROMISC_FLAGS_VLAN_TAG_STRIP	0x0004
 
 /* flags passed to mac_tx() */
 #define	MAC_DROP_ON_NO_DESC	0x01 /* freemsg() if no tx descs */
@@ -175,6 +176,12 @@
 extern void mac_get_hwgrp_info(mac_handle_t, int, uint_t *, uint_t *,
     uint_t *, uint_t *, char *);
 
+extern uint32_t mac_no_notification(mac_handle_t);
+extern int mac_set_prop(mac_handle_t, mac_prop_t *, void *, uint_t);
+extern int mac_get_prop(mac_handle_t, mac_prop_t *, void *, uint_t, uint_t *);
+
+extern boolean_t mac_is_vnic(mac_handle_t);
+
 #endif	/* _KERNEL */
 
 #ifdef	__cplusplus
--- a/usr/src/uts/common/sys/mac_client_impl.h	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/sys/mac_client_impl.h	Tue Feb 17 01:31:30 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -74,6 +74,7 @@
 	struct mac_client_impl_s	*mpi_mcip;	/* WO */
 	boolean_t			mpi_no_tx_loop;	/* WO */
 	boolean_t			mpi_no_phys;	/* WO */
+	boolean_t			mpi_strip_vlan_tag;	/* WO */
 } mac_promisc_impl_t;
 
 typedef union mac_tx_percpu_s {
--- a/usr/src/uts/common/sys/mac_impl.h	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/sys/mac_impl.h	Tue Feb 17 01:31:30 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -424,6 +424,7 @@
 	/* list of MAC clients which opened this MAC */
 	struct mac_client_impl_s *mi_clients_list;	/* mi_rw_lock */
 	uint_t			mi_nclients;		/* mi_rw_lock */
+	struct mac_client_impl_s *mi_single_active_client; /* mi_rw_lock */
 
 	uint32_t		mi_margin;		/* mi_rw_lock */
 	uint_t			mi_sdu_min;		/* mi_rw_lock */
--- a/usr/src/uts/common/sys/mac_provider.h	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/sys/mac_provider.h	Tue Feb 17 01:31:30 2009 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -414,10 +414,6 @@
  */
 extern void			mac_sdu_get(mac_handle_t, uint_t *, uint_t *);
 extern int			mac_maxsdu_update(mac_handle_t, uint_t);
-extern int			mac_set_prop(mac_handle_t, mac_prop_t *,
-				    void *, uint_t);
-extern int			mac_get_prop(mac_handle_t, mac_prop_t *,
-				    void *, uint_t, uint_t *);
 
 extern mac_register_t		*mac_alloc(uint_t);
 extern void			mac_free(mac_register_t *);
@@ -452,19 +448,15 @@
 				    boolean_t);
 extern void			mac_init_ops(struct dev_ops *, const char *);
 extern void			mac_fini_ops(struct dev_ops *);
-extern uint32_t			mac_no_notification(mac_handle_t);
 
 extern mactype_register_t	*mactype_alloc(uint_t);
 extern void			mactype_free(mactype_register_t *);
 extern int			mactype_register(mactype_register_t *);
 extern int			mactype_unregister(const char *);
-extern void			mac_set_ring(void *, void *);
 
 extern boolean_t		mac_unicst_verify(mac_handle_t,
 				    const uint8_t *, uint_t);
 
-extern boolean_t		mac_is_vnic(mac_handle_t);
-
 extern int			mac_group_add_ring(mac_group_handle_t, int);
 extern void			mac_group_rem_ring(mac_group_handle_t,
 				    mac_ring_handle_t);
--- a/usr/src/uts/common/sys/mac_soft_ring.h	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/sys/mac_soft_ring.h	Tue Feb 17 01:31:30 2009 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -181,7 +181,6 @@
 	void			*sr_arg1;	/* srs_lock */
 	mac_resource_handle_t 	sr_arg2;	/* srs_lock */
 	mac_rx_func_t		sr_lower_proc;	/* Atomically changed */
-	boolean_t		sr_enqueue_always; /* enqueue at soft ring */
 	uint32_t		sr_poll_pkt_cnt;
 	uint32_t		sr_poll_thres;
 
@@ -233,7 +232,7 @@
 	/* Worker thread goes back to draining the queue */
 	uint32_t		sr_drain_again;
 	/* More Packets in queue so signal the worker thread to drain */
-	uint32_t		sr_drain_worker_sig;
+	uint32_t		sr_drain_poll_sig;
 	/* Poll thread is already running so worker has nothing to do */
 	uint32_t		sr_drain_poll_running;
 	/* We have packets already queued so keep polling */
@@ -485,6 +484,7 @@
 
 #define	SRS_QUIESCE_PERM	0x10000000
 #define	SRS_LATENCY_OPT		0x20000000
+#define	SRS_SOFTRING_QUEUE	0x40000000
 
 #define	SRS_QUIESCED(srs)	(srs->srs_state & SRS_QUIESCE_DONE)
 
--- a/usr/src/uts/common/xen/io/xnbo.c	Tue Feb 17 16:22:10 2009 +0800
+++ b/usr/src/uts/common/xen/io/xnbo.c	Tue Feb 17 01:31:30 2009 -0800
@@ -312,7 +312,8 @@
 		mac_rx_set(xnbop->o_mch, rx_fn, xnbp);
 	} else {
 		err = mac_promisc_add(xnbop->o_mch, MAC_CLIENT_PROMISC_ALL,
-		    rx_fn, xnbp, &xnbop->o_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP);
+		    rx_fn, xnbp, &xnbop->o_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP |
+		    MAC_PROMISC_FLAGS_VLAN_TAG_STRIP);
 		if (err != 0) {
 			cmn_err(CE_WARN, "xnbo_open_mac: "
 			    "cannot enable promiscuous mode of %s: %d",