PSARC/2009/590 Socket Filter Framework
authorAnders Persson <Anders.Persson@Sun.COM>
Thu, 17 Jun 2010 17:22:09 -0700
changeset 12643 044ff822d212
parent 12642 3d43062bca80
child 12644 4f9a0cd40c5f
PSARC/2009/590 Socket Filter Framework 6939085 Socket Filter Framework 6802067 connect_failed kernel socket callback is not triggered 6776450 time spent in tcp_close could be reduced/deferred to a worker thread 6828586 assertion failed: family == 26, file: ../../common/fs/sockfs/socksyscalls.c, line: 1608 6802078 kernel socket 'newconn' callback is passing rcv queue size as an argument
exception_lists/packaging
usr/src/cmd/cmd-inet/usr.sbin/Makefile
usr/src/cmd/cmd-inet/usr.sbin/soconfig.c
usr/src/cmd/cmd-inet/usr.sbin/svc-sockfilter
usr/src/cmd/ptools/pfiles/pfiles.c
usr/src/cmd/truss/expound.c
usr/src/cmd/truss/print.c
usr/src/cmd/truss/print.h
usr/src/cmd/truss/systable.c
usr/src/lib/libc/common/sys/_sockconfig.s
usr/src/pkg/manifests/SUNWcs.mf
usr/src/uts/common/Makefile.files
usr/src/uts/common/c2/audit_event.c
usr/src/uts/common/fs/sockfs/sockcommon.c
usr/src/uts/common/fs/sockfs/sockcommon.h
usr/src/uts/common/fs/sockfs/sockcommon_sops.c
usr/src/uts/common/fs/sockfs/sockcommon_subr.c
usr/src/uts/common/fs/sockfs/sockfilter.c
usr/src/uts/common/fs/sockfs/sockfilter_impl.h
usr/src/uts/common/fs/sockfs/socknotify.c
usr/src/uts/common/fs/sockfs/sockparams.c
usr/src/uts/common/fs/sockfs/socksubr.c
usr/src/uts/common/fs/sockfs/socksyscalls.c
usr/src/uts/common/fs/sockfs/socktpi.c
usr/src/uts/common/fs/sockfs/socktpi.h
usr/src/uts/common/fs/sockfs/sodirect.c
usr/src/uts/common/inet/inetddi.c
usr/src/uts/common/inet/ip/icmp.c
usr/src/uts/common/inet/ip/icmpddi.c
usr/src/uts/common/inet/rawip_impl.h
usr/src/uts/common/inet/sockmods/socksctp.c
usr/src/uts/common/inet/sockmods/socksdp.c
usr/src/uts/common/inet/tcp/tcp.c
usr/src/uts/common/inet/tcp/tcp_fusion.c
usr/src/uts/common/inet/tcp/tcp_input.c
usr/src/uts/common/inet/tcp/tcp_output.c
usr/src/uts/common/inet/tcp/tcp_socket.c
usr/src/uts/common/inet/tcp/tcp_tpi.c
usr/src/uts/common/inet/tcp/tcpddi.c
usr/src/uts/common/inet/tcp_impl.h
usr/src/uts/common/inet/udp/udp.c
usr/src/uts/common/inet/udp/udpddi.c
usr/src/uts/common/inet/udp_impl.h
usr/src/uts/common/io/ksocket/ksocket.c
usr/src/uts/common/io/sock_conf.c
usr/src/uts/common/os/sysent.c
usr/src/uts/common/sys/Makefile
usr/src/uts/common/sys/ksocket.h
usr/src/uts/common/sys/socket.h
usr/src/uts/common/sys/socket_proto.h
usr/src/uts/common/sys/socketvar.h
usr/src/uts/common/sys/sockfilter.h
usr/src/uts/common/syscall/sendfile.c
--- a/exception_lists/packaging	Thu Jun 17 16:29:23 2010 -0700
+++ b/exception_lists/packaging	Thu Jun 17 17:22:09 2010 -0700
@@ -926,3 +926,7 @@
 #
 opt/onbld/bin/i386/elfsign	i386
 opt/onbld/bin/sparc/elfsign	sparc
+#
+# Private socket filter API
+#
+usr/include/sys/sockfilter.h
--- a/usr/src/cmd/cmd-inet/usr.sbin/Makefile	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/cmd/cmd-inet/usr.sbin/Makefile	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
 #
 
 #
-# Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
 #
 
 SYNCPROG=	syncinit syncloop syncstat
@@ -38,6 +37,7 @@
 
 MANIFEST=	rarp.xml telnet.xml comsat.xml finger.xml \
 		login.xml shell.xml rexec.xml 
+SVCMETHOD=	svc-sockfilter
 
 ROOTFS_PROG=	hostconfig route soconfig
 SBINLINKS=	hostconfig route
@@ -106,7 +106,8 @@
 #
 # Message catalog
 #
-POFILES=	6to4relay.po if_mpadm.po in.comsat.po ipaddrsel.po route.po
+POFILES=	6to4relay.po if_mpadm.po in.comsat.po ipaddrsel.po route.po \
+		soconfig.po
 POFILE=		usr.sbin.po
 
 all:=		TARGET= all
@@ -199,7 +200,7 @@
 
 install: $(PROG) $(ROOTFS_PROG) $(SUBDIRS) .WAIT $(ROOTUSRSBINPROG) \
 	$(ROOTSBINPROG) $(ROOTUSRSBINLINKS) $(ROOTETCDEFAULTFILES) \
-	$(ROOTMANIFEST) THIRDPARTYLICENSE.arp
+	$(ROOTMANIFEST) $(ROOTSVCMETHOD) THIRDPARTYLICENSE.arp
 
 THIRDPARTYLICENSE.arp: arp.c
 	$(SED) -n '/University of California/,/SUCH DAMAGE/p' arp.c > $@
--- a/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c	Thu Jun 17 17:22:09 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <stdio.h>
@@ -30,6 +29,9 @@
 #include <string.h>
 #include <ctype.h>
 #include <locale.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <errno.h>
 
 #define	MAXLINELEN	4096
 
@@ -47,6 +49,15 @@
  *
  *	soconfig <fam> <type> <protocol>
  *		deregisters
+ *
+ * Filter Operations (Consolidation Private):
+ *
+ *	soconfig -F <name> <modname> {auto [top | bottom | before:filter |
+ *		after:filter] | prog} <fam>:<type>:<proto>,...
+ *		configure filter
+ *
+ *	soconfig -F <name>
+ *		unconfigures filter
  */
 
 static int	parse_file(char *filename);
@@ -60,6 +71,8 @@
 
 static void	usage(void);
 
+static int	parse_filter_params(int argc, char **argv);
+
 int
 main(argc, argv)
 	int argc;
@@ -75,6 +88,11 @@
 #endif
 	(void) textdomain(TEXT_DOMAIN);
 
+	if (argc >= 2 && strcmp(argv[0], "-F") == 0) {
+		argc--; argv++;
+		ret = parse_filter_params(argc, argv);
+		exit(ret);
+	}
 	if (argc == 2 && strcmp(argv[0], "-f") == 0) {
 		ret = parse_file(argv[1]);
 		exit(ret);
@@ -213,7 +231,7 @@
 static int
 parse_params(char *famstr, char *typestr, char *protostr, char *path, int line)
 {
-	int fam, type, protocol;
+	int cmd, fam, type, protocol;
 
 	fam = parse_int(famstr);
 	if (fam == -1) {
@@ -272,13 +290,17 @@
 			}
 			return (1);
 		}
+
+		cmd = SOCKCONFIG_ADD_SOCK;
+	} else {
+		cmd = SOCKCONFIG_REMOVE_SOCK;
 	}
 
 #ifdef DEBUG
-	printf("not calling sockconfig(%d, %d, %d, %s)\n",
-	    fam, type, protocol, path == NULL ? "(null)" : path);
+	printf("not calling sockconfig(%d, %d, %d, %d, %s)\n",
+	    cmd, fam, type, protocol, path == NULL ? "(null)" : path);
 #else
-	if (_sockconfig(fam, type, protocol, path) == -1) {
+	if (_sockconfig(cmd, fam, type, protocol, path) == -1) {
 		perror("sockconfig");
 		return (1);
 	}
@@ -297,3 +319,181 @@
 		return (-1);
 	return (res);
 }
+
+/*
+ * Add and remove socket filters.
+ */
+static int
+parse_filter_params(int argc, char **argv)
+{
+	struct sockconfig_filter_props filprop;
+	sof_socktuple_t *socktuples;
+	size_t tupcnt, nalloc;
+	char *hintarg, *socktup, *tupstr;
+	int i;
+
+	if (argc == 1) {
+		if (_sockconfig(SOCKCONFIG_REMOVE_FILTER, argv[0], 0,
+		    0, 0) < 0) {
+			switch (errno) {
+			case ENXIO:
+				fprintf(stderr,
+				    gettext("socket filter is not configured "
+				    "'%s'\n"), argv[0]);
+				break;
+			default:
+				perror("sockconfig");
+				break;
+			}
+			return (1);
+		}
+		return (0);
+	}
+
+	if (argc < 4 || argc > 5)
+		return (1);
+
+
+	if (strlen(argv[1]) >= MODMAXNAMELEN) {
+		fprintf(stderr,
+		    gettext("invalid module name '%s': name too long\n"),
+		    argv[1]);
+		return (1);
+	}
+	filprop.sfp_modname = argv[1];
+
+	/* Check the attach semantics */
+	if (strcmp(argv[2], "auto") == 0) {
+		filprop.sfp_autoattach = B_TRUE;
+		if (argc == 5) {
+			/* placement hint */
+			if (strcmp(argv[3], "top") == 0) {
+				filprop.sfp_hint = SOF_HINT_TOP;
+			} else if (strcmp(argv[3], "bottom") == 0) {
+				filprop.sfp_hint = SOF_HINT_BOTTOM;
+			} else {
+				if (strncmp(argv[3], "before", 6) == 0) {
+					filprop.sfp_hint = SOF_HINT_BEFORE;
+				} else if (strncmp(argv[3], "after", 5) == 0) {
+					filprop.sfp_hint = SOF_HINT_AFTER;
+				} else {
+					fprintf(stderr,
+					    gettext("invalid placement hint "
+					    "'%s'\n"), argv[3]);
+					return (1);
+				}
+
+				hintarg = strchr(argv[3], ':');
+				if (hintarg == NULL ||
+				    (strlen(++hintarg) == 0) ||
+				    (strlen(hintarg) >= FILNAME_MAX)) {
+					fprintf(stderr,
+					    gettext("invalid placement hint "
+					    "argument '%s': name too long\n"),
+					    argv[3]);
+					return (1);
+				}
+
+				filprop.sfp_hintarg = hintarg;
+			}
+		} else {
+			filprop.sfp_hint = SOF_HINT_NONE;
+		}
+	} else if (strcmp(argv[2], "prog") == 0) {
+		filprop.sfp_autoattach = B_FALSE;
+		filprop.sfp_hint = SOF_HINT_NONE;
+		/* cannot specify placement hint for programmatic filter */
+		if (argc == 5) {
+			fprintf(stderr,
+			    gettext("placement hint specified for programmatic "
+			    "filter\n"));
+			return (1);
+		}
+	} else {
+		fprintf(stderr, gettext("invalid attach semantic '%s'\n"),
+		    argv[2]);
+		return (1);
+	}
+
+	/* parse the socket tuples */
+	nalloc = 4;
+	socktuples = calloc(nalloc, sizeof (sof_socktuple_t));
+	if (socktuples == NULL) {
+		perror("calloc");
+		return (1);
+	}
+
+	tupcnt = 0;
+	tupstr = argv[(argc == 4) ? 3 : 4];
+	while ((socktup = strsep(&tupstr, ",")) != NULL) {
+		int val;
+		char *valstr;
+
+		if (tupcnt == nalloc) {
+			sof_socktuple_t *new;
+
+			nalloc *= 2;
+			new = realloc(socktuples,
+			    nalloc * sizeof (sof_socktuple_t));
+			if (new == NULL) {
+				perror("realloc");
+				free(socktuples);
+				return (1);
+			}
+			socktuples = new;
+		}
+		i = 0;
+		while ((valstr = strsep(&socktup, ":")) != NULL && i < 3) {
+			val = parse_int(valstr);
+			if (val == -1) {
+				fprintf(stderr, gettext("bad socket tuple\n"));
+				free(socktuples);
+				return (1);
+			}
+			switch (i) {
+			case 0:	socktuples[tupcnt].sofst_family = val; break;
+			case 1:	socktuples[tupcnt].sofst_type = val; break;
+			case 2:	socktuples[tupcnt].sofst_protocol = val; break;
+			}
+			i++;
+		}
+		if (i != 3) {
+			fprintf(stderr, gettext("bad socket tuple\n"));
+			free(socktuples);
+			return (1);
+		}
+		tupcnt++;
+	}
+	if (tupcnt == 0) {
+		fprintf(stderr, gettext("no socket tuples specified\n"));
+		free(socktuples);
+		return (1);
+	}
+	filprop.sfp_socktuple_cnt = tupcnt;
+	filprop.sfp_socktuple = socktuples;
+
+	if (_sockconfig(SOCKCONFIG_ADD_FILTER, argv[0], &filprop, 0, 0) < 0) {
+		switch (errno) {
+		case EINVAL:
+			fprintf(stderr,
+			    gettext("invalid socket filter configuration\n"));
+			break;
+		case EEXIST:
+			fprintf(stderr,
+			    gettext("socket filter is already configured "
+			    "'%s'\n"), argv[0]);
+			break;
+		case ENOSPC:
+			fprintf(stderr, gettext("unable to satisfy placement "
+			    "constraint\n"));
+			break;
+		default:
+			perror("sockconfig");
+			break;
+		}
+		free(socktuples);
+		return (1);
+	}
+	free(socktuples);
+	return (0);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/cmd-inet/usr.sbin/svc-sockfilter	Thu Jun 17 17:22:09 2010 -0700
@@ -0,0 +1,55 @@
+#!/sbin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+#
+
+. /lib/svc/share/smf_include.sh
+
+filter_name=`svcprop -p socket-filter/name $SMF_FMRI 2>/dev/null`
+if [ -z "$filter_name" ]; then
+	echo "socket-filter/name is missing"
+	exit $SMF_EXIT_ERR_CONFIG
+fi
+
+case "$1" in
+start)
+	mod_name=`svcprop -p socket-filter/module_name $SMF_FMRI 2>/dev/null`
+	type=`svcprop -p socket-filter/attach_semantics $SMF_FMRI 2>/dev/null`
+	order=`svcprop -p socket-filter/order_hint $SMF_FMRI 2>/dev/null`
+	socktups=`svcprop -p socket-filter/socket_tuples $SMF_FMRI 2>/dev/null`
+
+	/sbin/soconfig -F $filter_name $mod_name $type $order $socktups
+	if [ $? -ne 0 ]; then
+		exit $SMF_EXIT_ERR_FATAL
+	fi
+	;;
+stop)
+	/sbin/soconfig -F $filter_name
+	;;
+*)
+	echo "Usage: $0 { start | stop }"
+	exit 1
+	;;
+esac
+
+exit $SMF_EXIT_OK
--- a/usr/src/cmd/ptools/pfiles/pfiles.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/cmd/ptools/pfiles/pfiles.c	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <stdio.h>
@@ -650,6 +649,70 @@
 		(void) printf("\t%s\n", buf+1);
 }
 
+#define	MAXNALLOC	32
+static void
+show_sockfilters(struct ps_prochandle *Pr, int fd)
+{
+	struct fil_info *fi;
+	int i = 0, nalloc = 2, len = nalloc * sizeof (*fi);
+	boolean_t printhdr = B_TRUE;
+
+	fi = calloc(nalloc, sizeof (*fi));
+	if (fi == NULL) {
+		perror("calloc");
+		return;
+	}
+	/* CONSTCOND */
+	while (1) {
+		if (pr_getsockopt(Pr, fd, SOL_FILTER, FIL_LIST, fi, &len) != 0)
+			break;
+		/* No filters */
+		if (len == 0)
+			break;
+		/* Make sure buffer was large enough */
+		if (fi->fi_pos >= nalloc) {
+			struct fil_info *new;
+
+			nalloc = fi->fi_pos + 1;
+			if (nalloc > MAXNALLOC)
+				break;
+			len = nalloc * sizeof (*fi);
+			new = realloc(fi, nalloc * sizeof (*fi));
+			if (new == NULL) {
+				perror("realloc");
+				break;
+			}
+			fi = new;
+			continue;
+		}
+
+		for (i = 0; (i + 1) * sizeof (*fi) <= len; i++) {
+			if (fi[i].fi_flags & FILF_BYPASS)
+				continue;
+			if (printhdr) {
+				(void) printf("\tfilters: ");
+				printhdr = B_FALSE;
+			}
+			(void) printf("%s", fi[i].fi_name);
+			if (fi[i].fi_flags != 0) {
+				(void) printf("(");
+				if (fi[i].fi_flags & FILF_AUTO)
+					(void) printf("auto,");
+				if (fi[i].fi_flags & FILF_PROG)
+					(void) printf("prog,");
+				(void) printf("\b)");
+			}
+			if (fi[i].fi_pos == 0) /* last one */
+				break;
+			(void) printf(",");
+		}
+		if (!printhdr)
+			(void) printf("\n");
+		break;
+	}
+	free(fi);
+}
+
 /* the file is a socket */
 static void
 dosocket(struct ps_prochandle *Pr, int fd)
@@ -666,6 +729,7 @@
 		show_socktype((uint_t)type);
 
 	show_sockopts(Pr, fd);
+	show_sockfilters(Pr, fd);
 
 	len = sizeof (buf);
 	if (pr_getsockname(Pr, fd, sa, &len) == 0)
--- a/usr/src/cmd/truss/expound.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/cmd/truss/expound.c	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -90,6 +89,7 @@
 #include <sys/nvpair.h>
 #include <libnvpair.h>
 #include <sys/rctl_impl.h>
+#include <sys/socketvar.h>
 
 #include "ramdata.h"
 #include "systable.h"
@@ -4721,6 +4721,132 @@
 	}
 }
 
+#ifdef _LP64
+static void
+show_sockconfig_filter_prop32(private_t *pri, long addr)
+{
+	struct sockconfig_filter_props32 props;
+	const char *s = NULL;
+	char buf[MAX(FILNAME_MAX, MODMAXNAMELEN)];
+	sof_socktuple32_t *tup;
+	size_t sz;
+	int i;
+
+	if (Pread(Proc, &props, sizeof (props), addr) == sizeof (props)) {
+		if (Pread_string(Proc, buf, sizeof (buf),
+		    (uintptr_t)props.sfp_modname) == -1)
+			(void) strcpy(buf, "<?>");
+		(void) printf("%s\tmodule name: %s\n", pri->pname, buf);
+		(void) printf("%s\tattach semantics: %s", pri->pname,
+		    props.sfp_autoattach ? "automatic" : "programmatic");
+		if (props.sfp_autoattach) {
+			buf[0] = '\0';
+			switch (props.sfp_hint) {
+			case SOF_HINT_TOP:	s = "top"; break;
+			case SOF_HINT_BOTTOM:	s = "bottom"; break;
+			case SOF_HINT_BEFORE:
+			case SOF_HINT_AFTER:
+				s = (props.sfp_hint == SOF_HINT_BEFORE) ?
+				    "before" : "after";
+				if (Pread_string(Proc, buf, sizeof (buf),
+				    (uintptr_t)props.sfp_hintarg) == -1)
+					(void) strcpy(buf, "<?>");
+			}
+			if (s != NULL) {
+				(void) printf(", placement: %s %s", s, buf);
+			}
+		}
+		(void) printf("\n");
+		(void) printf("%s\tsocket tuples:\n", pri->pname);
+		if (props.sfp_socktuple_cnt == 0) {
+			(void) printf("\t\t<empty>\n");
+			return;
+		}
+		sz = props.sfp_socktuple_cnt * sizeof (*tup);
+		tup = my_malloc(sz, "socket tuple buffer");
+		if (Pread(Proc, tup, sz, (uintptr_t)props.sfp_socktuple) == sz)
+			for (i = 0; i < props.sfp_socktuple_cnt; i++) {
+				(void) printf(
+				    "\t\tfamily: %d, type: %d, proto: %d\n",
+				    tup[i].sofst_family, tup[i].sofst_type,
+				    tup[i].sofst_protocol);
+			}
+	}
+}
+#endif	/* _LP64 */
+static void
+show_sockconfig_filter_prop(private_t *pri, long addr)
+{
+	struct sockconfig_filter_props props;
+	const char *s = NULL;
+	char buf[MAX(FILNAME_MAX, MODMAXNAMELEN)];
+	sof_socktuple_t *tup;
+	size_t sz;
+	int i;
+
+	if (Pread(Proc, &props, sizeof (props), addr) == sizeof (props)) {
+		if (Pread_string(Proc, buf, sizeof (buf),
+		    (uintptr_t)props.sfp_modname) == -1)
+			(void) strcpy(buf, "<?>");
+		(void) printf("%s\tmodule name: %s\n", pri->pname, buf);
+		(void) printf("%s\tattach semantics: %s", pri->pname,
+		    props.sfp_autoattach ? "automatic" : "programmatic");
+		if (props.sfp_autoattach) {
+			buf[0] = '\0';
+			switch (props.sfp_hint) {
+			case SOF_HINT_TOP:	s = "top"; break;
+			case SOF_HINT_BOTTOM:	s = "bottom"; break;
+			case SOF_HINT_BEFORE:
+			case SOF_HINT_AFTER:
+				s = (props.sfp_hint == SOF_HINT_BEFORE) ?
+				    "before" : "after";
+				if (Pread_string(Proc, buf, sizeof (buf),
+				    (uintptr_t)props.sfp_hintarg) == -1)
+					(void) strcpy(buf, "<?>");
+			}
+			if (s != NULL) {
+				(void) printf(", placement: %s", s);
+			}
+		}
+		(void) printf("\n");
+		(void) printf("%s\tsocket tuples:\n", pri->pname);
+		if (props.sfp_socktuple_cnt == 0) {
+			(void) printf("\t\t<empty>\n");
+			return;
+		}
+		sz = props.sfp_socktuple_cnt * sizeof (*tup);
+		tup = my_malloc(sz, "socket tuple buffer");
+		if (Pread(Proc, tup, sz, (uintptr_t)props.sfp_socktuple) == sz)
+			for (i = 0; i < props.sfp_socktuple_cnt; i++) {
+				(void) printf(
+				    "\t\tfamily: %d, type: %d, proto: %d\n",
+				    tup[i].sofst_family, tup[i].sofst_type,
+				    tup[i].sofst_protocol);
+			}
+	}
+}
+
+void
+show_sockconfig(private_t *pri)
+{
+	switch (pri->sys_args[0]) {
+	case SOCKCONFIG_ADD_FILTER:
+#ifdef _LP64
+		if (data_model == PR_MODEL_LP64)
+			show_sockconfig_filter_prop(pri,
+			    (long)pri->sys_args[2]);
+		else
+			show_sockconfig_filter_prop32(pri,
+			    (long)pri->sys_args[2]);
+#else
+		show_sockconfig_filter_prop(pri, (long)pri->sys_args[2]);
+#endif
+		break;
+	default:
+		break;
+	}
+}
+
 /* expound verbosely upon syscall arguments */
 /*ARGSUSED*/
 void
@@ -5199,5 +5325,8 @@
 	case SYS_utimesys:
 		show_utimesys(pri);
 		break;
+	case SYS_sockconfig:
+		show_sockconfig(pri);
+		break;
 	}
 }
--- a/usr/src/cmd/truss/print.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/cmd/truss/print.c	Thu Jun 17 17:22:09 2010 -0700
@@ -1649,7 +1649,32 @@
 	}
 }
 
+/*
+ * Print sockconfig() subcode.
+ */
+/*ARGSUSED*/
+void
+prt_skc(private_t *pri, int raw, long val)
+{
+	const char *s = NULL;
 
+	if (!raw) {
+		switch (val) {
+		case SOCKCONFIG_ADD_SOCK:
+			s = "SOCKCONFIG_ADD_SOCK"; break;
+		case SOCKCONFIG_REMOVE_SOCK:
+			s = "SOCKCONFIG_REMOVE_SOCK"; break;
+		case SOCKCONFIG_ADD_FILTER:
+			s = "SOCKCONFIG_ADD_FILTER"; break;
+		case SOCKCONFIG_REMOVE_FILTER:
+			s = "SOCKCONFIG_REMOVE_FILTER"; break;
+		}
+	}
+	if (s == NULL)
+		prt_dec(pri, 0, val);
+	else
+		outstring(pri, s);
+}
 /*
  * Print so_socket() 2nd argument.
  */
@@ -2709,5 +2734,6 @@
 	prt_un1,	/* UN1 -- as prt_uns except for -1 */
 	prt_mob,	/* MOB -- print mmapobj() flags */
 	prt_utf,	/* UTF -- print utimensat() flag */
+	prt_skc,	/* SKC -- print sockconfig() subcode */
 	prt_dec,	/* HID -- hidden argument, make this the last one */
 };
--- a/usr/src/cmd/truss/print.h	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/cmd/truss/print.h	Thu Jun 17 17:22:09 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -135,7 +134,8 @@
 #define	UN1	95		/* unsigned except for -1 */
 #define	MOB	96		/* print mmapobj() flags */
 #define	UTF	97		/* print utimensat() flag */
-#define	HID	98		/* hidden argument, don't print */
+#define	SKC	98		/* print sockconfig subcode */
+#define	HID	99		/* hidden argument, don't print */
 				/* make sure HID is always the last member */
 
 /*
--- a/usr/src/cmd/truss/systable.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/cmd/truss/systable.c	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -464,7 +463,7 @@
 {"getsockname", 4, DEC, NOV, DEC, HEX, HEX, SKV},		/* 244 */
 {"getsockopt",	6, DEC, NOV, DEC, SOL, SON, HEX, HEX, SKV},	/* 245 */
 {"setsockopt",	6, DEC, NOV, DEC, SOL, SON, HEX, DEC, SKV},	/* 246 */
-{"sockconfig",	4, DEC, NOV, DEC, DEC, DEC, STG},		/* 247 */
+{"sockconfig",	5, DEC, NOV, DEC, HEX, HEX, HEX, HEX},		/* 247 */
 {"ntp_gettime",	1, DEC, NOV, HEX},				/* 248 */
 {"ntp_adjtime",	1, DEC, NOV, HEX},				/* 249 */
 {"lwp_mutex_unlock", 1, DEC, NOV, HEX},				/* 250 */
@@ -873,6 +872,14 @@
 };
 #define	NUTIMESYSCODE	(sizeof (utimesystable) / sizeof (struct systable))
 
+const	struct systable sockconfigtable[] = {
+{"sockconfig", 5, DEC, NOV, SKC, DEC, DEC, DEC, STG},	/* 0 */
+{"sockconfig", 4, DEC, NOV, SKC, DEC, DEC, DEC},	/* 1 */
+{"sockconfig", 3, DEC, NOV, SKC, STG, HEX },		/* 2 */
+{"sockconfig", 2, DEC, NOV, SKC, STG },			/* 3 */
+};
+#define	NSOCKCONFIGCODE	(sizeof (sockconfigtable) / sizeof (struct systable))
+
 const	struct sysalias sysalias[] = {
 	{ "exit",	SYS_exit	},
 	{ "fork",	SYS_forksys	},
@@ -1204,6 +1211,10 @@
 			if ((unsigned)subcode < NUTIMESYSCODE)
 				stp = &utimesystable[subcode];
 			break;
+		case SYS_sockconfig:	/* sockconfig family */
+			if ((unsigned)subcode < NSOCKCONFIGCODE)
+				stp = &sockconfigtable[subcode];
+			break;
 		}
 	}
 
@@ -1383,6 +1394,7 @@
 		case SYS_rctlsys:	/* rctlsys */
 		case SYS_sidsys:	/* sidsys */
 		case SYS_utimesys:	/* utimesys */
+		case SYS_sockconfig:	/* sockconfig */
 			subcode = arg0;
 			break;
 		case SYS_fcntl:		/* fcntl() */
@@ -1453,7 +1465,8 @@
 	    + NRCTLCODE - 1
 	    + NFORKCODE - 1
 	    + NSIDSYSCODE - 1
-	    + NUTIMESYSCODE - 1);
+	    + NUTIMESYSCODE - 1
+	    + NSOCKCONFIGCODE - 1);
 }
 
 /*
@@ -1545,6 +1558,8 @@
 		return (NSIDSYSCODE);
 	case SYS_utimesys:
 		return (NUTIMESYSCODE);
+	case SYS_sockconfig:
+		return (NSOCKCONFIGCODE);
 	default:
 		return (1);
 	}
--- a/usr/src/lib/libc/common/sys/_sockconfig.s	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/lib/libc/common/sys/_sockconfig.s	Thu Jun 17 17:22:09 2010 -0700
@@ -23,16 +23,14 @@
 
 
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 	.file	"_sockconfig.s"
 
 /* C library -- _sockconfig					*/
 /*
- * int _sockconfig (int domain, int type, int protocol,
- *			dev_t dev, int version);
+ * int _sockconfig (int cmd, void *arg1, void *arg2, void *arg3, void *arg4);
  */
 
 #include "SYS.h"
--- a/usr/src/pkg/manifests/SUNWcs.mf	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/pkg/manifests/SUNWcs.mf	Thu Jun 17 17:22:09 2010 -0700
@@ -629,6 +629,7 @@
 file path=lib/svc/method/svc-legacy-routing mode=0555
 file path=lib/svc/method/svc-nscd mode=0555
 file path=lib/svc/method/svc-rbac mode=0555
+file path=lib/svc/method/svc-sockfilter mode=0555
 file path=lib/svc/method/svc-utmpd mode=0555
 file path=lib/svc/method/system-log mode=0555
 file path=lib/svc/method/vtdaemon mode=0555
--- a/usr/src/uts/common/Makefile.files	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/Makefile.files	Thu Jun 17 17:22:09 2010 -0700
@@ -1263,7 +1263,7 @@
 		sockcommon_sops.o	sockcommon.o	\
 		sock_notsupp.o	socknotify.o \
 		nl7c.o		nl7curi.o	nl7chttp.o	nl7clogd.o \
-		nl7cnca.o	sodirect.o
+		nl7cnca.o	sodirect.o	sockfilter.o
 
 TMPFS_OBJS +=	tmp_dir.o	tmp_subr.o	tmp_tnode.o	tmp_vfsops.o \
 		tmp_vnops.o
--- a/usr/src/uts/common/c2/audit_event.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/c2/audit_event.c	Thu Jun 17 17:22:09 2010 -0700
@@ -3822,38 +3822,60 @@
 	struct t_audit_data *tad;
 {
 	struct a {
-		long	domain;
-		long	type;
-		long	protocol;
-		long	devpath;
+		long	cmd;
+		long	arg1;
+		long	arg2;
+		long	arg3;
+		long	arg4;
 	} *uap = (struct a *)ttolwp(curthread)->lwp_ap;
 
-	char	*kdevpath;
-	int	kdevpathlen = MAXPATHLEN + 1;
+	char	*buf;
+	int	buflen;
 	size_t	size;
 
-	au_uwrite(au_to_arg32(1, "domain", (uint32_t)uap->domain));
-	au_uwrite(au_to_arg32(2, "type", (uint32_t)uap->type));
-	au_uwrite(au_to_arg32(3, "protocol", (uint32_t)uap->protocol));
-
-	if (uap->devpath == 0) {
-		au_uwrite(au_to_arg32(3, "devpath", (uint32_t)0));
-	} else {
-		kdevpath = kmem_alloc(kdevpathlen, KM_SLEEP);
-
-		if (copyinstr((caddr_t)uap->devpath, kdevpath, kdevpathlen,
-			&size)) {
-			kmem_free(kdevpath, kdevpathlen);
+	au_uwrite(au_to_arg32(1, "cmd", (uint_t)uap->cmd));
+	switch (uap->cmd) {
+	case SOCKCONFIG_ADD_SOCK:
+	case SOCKCONFIG_REMOVE_SOCK:
+		au_uwrite(au_to_arg32(2, "domain", (uint32_t)uap->arg1));
+		au_uwrite(au_to_arg32(3, "type", (uint32_t)uap->arg2));
+		au_uwrite(au_to_arg32(4, "protocol", (uint32_t)uap->arg3));
+
+		if (uap->arg4 == 0) {
+			au_uwrite(au_to_arg32(5, "devpath", (uint32_t)0));
+		} else {
+			buflen = MAXPATHLEN + 1;
+			buf = kmem_alloc(buflen, KM_SLEEP);
+			if (copyinstr((caddr_t)uap->arg4, buf, buflen,
+			    &size)) {
+				kmem_free(buf, buflen);
+				return;
+			}
+
+			if (size > MAXPATHLEN) {
+				kmem_free(buf, buflen);
+				return;
+			}
+
+			au_uwrite(au_to_text(buf));
+			kmem_free(buf, buflen);
+		}
+		break;
+	case SOCKCONFIG_ADD_FILTER:
+	case SOCKCONFIG_REMOVE_FILTER:
+		buflen = FILNAME_MAX;
+		buf = kmem_alloc(buflen, KM_SLEEP);
+
+		if (copyinstr((caddr_t)uap->arg1, buf, buflen, &size)) {
+			kmem_free(buf, buflen);
 			return;
 		}
 
-		if (size > MAXPATHLEN) {
-			kmem_free(kdevpath, kdevpathlen);
-			return;
-		}
-
-		au_uwrite(au_to_text(kdevpath));
-		kmem_free(kdevpath, kdevpathlen);
+		au_uwrite(au_to_text(buf));
+		kmem_free(buf, buflen);
+		break;
+	default:
+		break;
 	}
 }
 
--- a/usr/src/uts/common/fs/sockfs/sockcommon.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/fs/sockfs/sockcommon.c	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -45,6 +44,7 @@
 
 #include <inet/ipclassifier.h>
 #include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/sockfilter_impl.h>
 #include <fs/sockfs/nl7c.h>
 #include <fs/sockfs/socktpi.h>
 #include <fs/sockfs/sodirect.h>
@@ -216,7 +216,7 @@
  * Active open.
  */
 int
-socket_connect(struct sonode *so, const struct sockaddr *name,
+socket_connect(struct sonode *so, struct sockaddr *name,
     socklen_t namelen, int fflag, int flags, cred_t *cr)
 {
 	int error;
@@ -471,14 +471,23 @@
 	so->so_rcv_timer_tid	= 0;
 	so->so_rcv_thresh	= 0;
 
-	so->so_acceptq_head	= NULL;
-	so->so_acceptq_tail	= &so->so_acceptq_head;
-	so->so_acceptq_next	= NULL;
+	list_create(&so->so_acceptq_list, sizeof (struct sonode),
+	    offsetof(struct sonode, so_acceptq_node));
+	list_create(&so->so_acceptq_defer, sizeof (struct sonode),
+	    offsetof(struct sonode, so_acceptq_node));
+	list_link_init(&so->so_acceptq_node);
 	so->so_acceptq_len	= 0;
 	so->so_backlog		= 0;
+	so->so_listener		= NULL;
 
 	so->so_snd_qfull	= B_FALSE;
 
+	so->so_filter_active	= 0;
+	so->so_filter_tx	= 0;
+	so->so_filter_defertime = 0;
+	so->so_filter_top	= NULL;
+	so->so_filter_bottom	= NULL;
+
 	mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
 	rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
@@ -509,9 +518,15 @@
 
 	ASSERT(so->so_rcv_q_head == NULL);
 
-	ASSERT(so->so_acceptq_head == NULL);
-	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
-	ASSERT(so->so_acceptq_next == NULL);
+	list_destroy(&so->so_acceptq_list);
+	list_destroy(&so->so_acceptq_defer);
+	ASSERT(!list_link_active(&so->so_acceptq_node));
+	ASSERT(so->so_listener == NULL);
+
+	ASSERT(so->so_filter_active == 0);
+	ASSERT(so->so_filter_tx == 0);
+	ASSERT(so->so_filter_top == NULL);
+	ASSERT(so->so_filter_bottom == NULL);
 
 	ASSERT(vp->v_data == so);
 	ASSERT(vn_matchops(vp, socket_vnodeops));
@@ -581,21 +596,11 @@
 
 	so->so_copyflag = 0;
 
-	ASSERT(so->so_acceptq_head == NULL);
-	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
-	ASSERT(so->so_acceptq_next == NULL);
-
 	vn_reinit(vp);
 	vp->v_vfsp	= rootvfs;
 	vp->v_type	= VSOCK;
 	vp->v_rdev	= sockdev;
 
-	so->so_rcv_queued = 0;
-	so->so_rcv_q_head = NULL;
-	so->so_rcv_q_last_head = NULL;
-	so->so_rcv_head	= NULL;
-	so->so_rcv_last_head = NULL;
-
 	so->so_snd_qfull = B_FALSE;
 	so->so_minpsz = 0;
 
@@ -620,7 +625,6 @@
 void
 sonode_fini(struct sonode *so)
 {
-	mblk_t *mp;
 	vnode_t *vp;
 
 	ASSERT(so->so_count == 0);
@@ -631,15 +635,6 @@
 		so->so_rcv_timer_tid = 0;
 	}
 
-	so_acceptq_flush(so, B_FALSE);
-
-	if ((mp = so->so_oobmsg) != NULL) {
-		freemsg(mp);
-		so->so_oobmsg = NULL;
-		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
-		    SS_RCVATMARK);
-	}
-
 	if (so->so_poll_list.ph_list != NULL) {
 		pollwakeup(&so->so_poll_list, POLLERR);
 		pollhead_clean(&so->so_poll_list);
@@ -655,4 +650,17 @@
 		crfree(so->so_peercred);
 		so->so_peercred = NULL;
 	}
+	/* Detach and destroy filters */
+	if (so->so_filter_top != NULL)
+		sof_sonode_cleanup(so);
+
+	ASSERT(list_is_empty(&so->so_acceptq_list));
+	ASSERT(list_is_empty(&so->so_acceptq_defer));
+	ASSERT(!list_link_active(&so->so_acceptq_node));
+
+	ASSERT(so->so_rcv_queued == 0);
+	ASSERT(so->so_rcv_q_head == NULL);
+	ASSERT(so->so_rcv_q_last_head == NULL);
+	ASSERT(so->so_rcv_head == NULL);
+	ASSERT(so->so_rcv_last_head == NULL);
 }
--- a/usr/src/uts/common/fs/sockfs/sockcommon.h	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/fs/sockfs/sockcommon.h	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SOCKCOMMON_H_
@@ -54,7 +53,7 @@
     struct cred *);
 extern int socket_accept(struct sonode *, int, struct cred *, struct sonode **);
 extern int socket_listen(struct sonode *, int, struct cred *);
-extern int socket_connect(struct sonode *, const struct sockaddr *,
+extern int socket_connect(struct sonode *, struct sockaddr *,
     socklen_t, int, int, struct cred *);
 extern int socket_getpeername(struct sonode *, struct sockaddr *, socklen_t *,
     boolean_t, struct cred *);
@@ -120,7 +119,7 @@
 extern int so_bind(struct sonode *, struct sockaddr *, socklen_t, int,
     struct cred *);
 extern int so_listen(struct sonode *, int, struct cred *);
-extern int so_connect(struct sonode *, const struct sockaddr *,
+extern int so_connect(struct sonode *, struct sockaddr *,
     socklen_t, int, int, struct cred *);
 extern int so_getsockopt(struct sonode *, int, int, void *,
     socklen_t *, int, struct cred *);
@@ -136,6 +135,8 @@
     struct pollhead **);
 extern int so_sendmsg(struct sonode *, struct nmsghdr *, struct uio *,
     struct cred *);
+extern int so_sendmblk_impl(struct sonode *, struct nmsghdr *, int,
+    struct cred *, mblk_t **, struct sof_instance *, boolean_t);
 extern int so_sendmblk(struct sonode *, struct nmsghdr *, int,
     struct cred *, mblk_t **);
 extern int so_recvmsg(struct sonode *, struct nmsghdr *, struct uio *,
@@ -153,6 +154,8 @@
 	struct sock_proto_props *);
 extern ssize_t	so_queue_msg(sock_upper_handle_t, mblk_t *, size_t, int,
     int *, boolean_t *);
+extern ssize_t	so_queue_msg_impl(struct sonode *, mblk_t *, size_t, int,
+    int *, boolean_t *, struct sof_instance *);
 extern void	so_signal_oob(sock_upper_handle_t, ssize_t);
 
 extern void	so_connected(sock_upper_handle_t, sock_connid_t, struct cred *,
@@ -183,6 +186,7 @@
     rval_t *, int);
 extern void	so_enqueue_msg(struct sonode *, mblk_t *, size_t);
 extern void	so_process_new_message(struct sonode *, mblk_t *, mblk_t *);
+extern void	so_check_flow_control(struct sonode *);
 
 extern mblk_t	*socopyinuio(uio_t *, ssize_t, size_t, ssize_t, size_t, int *);
 extern mblk_t 	*socopyoutuio(mblk_t *, struct uio *, ssize_t, int *);
@@ -213,7 +217,7 @@
 /* Notification functions */
 extern void	so_notify_connected(struct sonode *);
 extern void	so_notify_disconnecting(struct sonode *);
-extern void	so_notify_disconnected(struct sonode *, int);
+extern void	so_notify_disconnected(struct sonode *, boolean_t, int);
 extern void	so_notify_writable(struct sonode *);
 extern void	so_notify_data(struct sonode *, size_t);
 extern void	so_notify_oobsig(struct sonode *);
--- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c	Thu Jun 17 17:22:09 2010 -0700
@@ -46,6 +46,7 @@
 #include <inet/ip.h>
 
 #include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/sockfilter_impl.h>
 
 #include <sys/socket_proto.h>
 
@@ -59,7 +60,7 @@
 extern int xnet_skip_checks;
 extern int xnet_check_print;
 
-static void so_queue_oob(sock_upper_handle_t, mblk_t *, size_t);
+static void so_queue_oob(struct sonode *, mblk_t *, size_t);
 
 
 /*ARGSUSED*/
@@ -291,8 +292,11 @@
 	}
 
 dobind:
-	error = (*so->so_downcalls->sd_bind)
-	    (so->so_proto_handle, name, namelen, cr);
+	if (so->so_filter_active == 0 ||
+	    (error = sof_filter_bind(so, name, &namelen, cr)) < 0) {
+		error = (*so->so_downcalls->sd_bind)
+		    (so->so_proto_handle, name, namelen, cr);
+	}
 done:
 	SO_UNBLOCK_FALLBACK(so);
 
@@ -307,8 +311,10 @@
 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 	SO_BLOCK_FALLBACK(so, SOP_LISTEN(so, backlog, cr));
 
-	error = (*so->so_downcalls->sd_listen)(so->so_proto_handle, backlog,
-	    cr);
+	if ((so)->so_filter_active == 0 ||
+	    (error = sof_filter_listen(so, &backlog, cr)) < 0)
+		error = (*so->so_downcalls->sd_listen)(so->so_proto_handle,
+		    backlog, cr);
 
 	SO_UNBLOCK_FALLBACK(so);
 
@@ -317,7 +323,7 @@
 
 
 int
-so_connect(struct sonode *so, const struct sockaddr *name,
+so_connect(struct sonode *so, struct sockaddr *name,
     socklen_t namelen, int fflag, int flags, struct cred *cr)
 {
 	int error = 0;
@@ -339,12 +345,16 @@
 			goto done;
 	}
 
-	error = (*so->so_downcalls->sd_connect)(so->so_proto_handle,
-	    name, namelen, &id, cr);
+	if (so->so_filter_active == 0 ||
+	    (error = sof_filter_connect(so, (struct sockaddr *)name,
+	    &namelen, cr)) < 0) {
+		error = (*so->so_downcalls->sd_connect)(so->so_proto_handle,
+		    name, namelen, &id, cr);
 
-	if (error == EINPROGRESS)
-		error = so_wait_connected(so, fflag & (FNONBLOCK|FNDELAY), id);
-
+		if (error == EINPROGRESS)
+			error = so_wait_connected(so,
+			    fflag & (FNONBLOCK|FNDELAY), id);
+	}
 done:
 	SO_UNBLOCK_FALLBACK(so);
 	return (error);
@@ -371,9 +381,10 @@
 		ASSERT(nso != NULL);
 
 		/* finish the accept */
-		error = (*so->so_downcalls->sd_accept)(so->so_proto_handle,
-		    nso->so_proto_handle, (sock_upper_handle_t)nso, cr);
-		if (error != 0) {
+		if ((so->so_filter_active > 0 &&
+		    (error = sof_filter_accept(nso, cr)) > 0) ||
+		    (error = (*so->so_downcalls->sd_accept)(so->so_proto_handle,
+		    nso->so_proto_handle, (sock_upper_handle_t)nso, cr)) != 0) {
 			(void) socket_close(nso, 0, cr);
 			socket_destroy(nso);
 		} else {
@@ -442,7 +453,7 @@
 				error = EOPNOTSUPP;
 				break;
 			}
-		} else if (so->so_snd_qfull) {
+		} else if (SO_SND_FLOWCTRLD(so)) {
 			/*
 			 * Need to wait until the protocol is ready to receive
 			 * more data for transmission.
@@ -474,6 +485,13 @@
 			}
 			ASSERT(uiop->uio_resid >= 0);
 
+			if (so->so_filter_active > 0 &&
+			    ((mp = SOF_FILTER_DATA_OUT(so, mp, msg, cr,
+			    &error)) == NULL)) {
+				if (error != 0)
+					break;
+				continue;
+			}
 			error = (*so->so_downcalls->sd_send)
 			    (so->so_proto_handle, mp, msg, cr);
 			if (error != 0) {
@@ -495,27 +513,23 @@
 }
 
 int
-so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
-    struct cred *cr, mblk_t **mpp)
+so_sendmblk_impl(struct sonode *so, struct nmsghdr *msg, int fflag,
+    struct cred *cr, mblk_t **mpp, sof_instance_t *fil,
+    boolean_t fil_inject)
 {
 	int error;
 	boolean_t dontblock;
 	size_t size;
 	mblk_t *mp = *mpp;
 
-	SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp));
+	if (so->so_downcalls->sd_send == NULL)
+		return (EOPNOTSUPP);
 
 	error = 0;
 	dontblock = (msg->msg_flags & MSG_DONTWAIT) ||
 	    (fflag & (FNONBLOCK|FNDELAY));
 	size = msgdsize(mp);
 
-	if ((so->so_mode & SM_SENDFILESUPP) == 0 ||
-	    so->so_downcalls->sd_send == NULL) {
-		SO_UNBLOCK_FALLBACK(so);
-		return (EOPNOTSUPP);
-	}
-
 	if ((so->so_mode & SM_ATOMIC) &&
 	    size > so->so_proto_props.sopp_maxpsz &&
 	    so->so_proto_props.sopp_maxpsz != -1) {
@@ -538,7 +552,8 @@
 			if (error != 0)
 				break;
 		}
-		if (so->so_snd_qfull) {
+		/* Socket filters are not flow controlled */
+		if (SO_SND_FLOWCTRLD(so) && !fil_inject) {
 			/*
 			 * Need to wait until the protocol is ready to receive
 			 * more data for transmission.
@@ -564,6 +579,14 @@
 			nmp = nmp->b_cont;
 		}
 
+		if (so->so_filter_active > 0 &&
+		    (mp = SOF_FILTER_DATA_OUT_FROM(so, fil, mp, msg,
+		    cr, &error)) == NULL) {
+			*mpp = mp = nmp;
+			if (error != 0)
+				break;
+			continue;
+		}
 		error = (*so->so_downcalls->sd_send)
 		    (so->so_proto_handle, mp, msg, cr);
 		if (error != 0) {
@@ -578,6 +601,30 @@
 
 		*mpp = mp = nmp;
 	}
+	/* Let the filter know whether the protocol is flow controlled */
+	if (fil_inject && error == 0 && SO_SND_FLOWCTRLD(so))
+		error = ENOSPC;
+
+	return (error);
+}
+
+#pragma inline(so_sendmblk_impl)
+
+int
+so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
+    struct cred *cr, mblk_t **mpp)
+{
+	int error;
+
+	SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp));
+
+	if ((so->so_mode & SM_SENDFILESUPP) == 0) {
+		SO_UNBLOCK_FALLBACK(so);
+		return (EOPNOTSUPP);
+	}
+
+	error = so_sendmblk_impl(so, msg, fflag, cr, mpp, so->so_filter_top,
+	    B_FALSE);
 
 	SO_UNBLOCK_FALLBACK(so);
 
@@ -607,8 +654,10 @@
 		goto done;
 	}
 
-	error = ((*so->so_downcalls->sd_shutdown)(so->so_proto_handle,
-	    how, cr));
+	if (so->so_filter_active == 0 ||
+	    (error = sof_filter_shutdown(so, &how, cr)) < 0)
+		error = ((*so->so_downcalls->sd_shutdown)(so->so_proto_handle,
+		    how, cr));
 
 	/*
 	 * Protocol agreed to shutdown. We need to flush the
@@ -638,8 +687,10 @@
 
 	SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr));
 
-	error = (*so->so_downcalls->sd_getsockname)
-	    (so->so_proto_handle, addr, addrlen, cr);
+	if (so->so_filter_active == 0 ||
+	    (error = sof_filter_getsockname(so, addr, addrlen, cr)) < 0)
+		error = (*so->so_downcalls->sd_getsockname)
+		    (so->so_proto_handle, addr, addrlen, cr);
 
 	SO_UNBLOCK_FALLBACK(so);
 	return (error);
@@ -664,7 +715,8 @@
 		if (xnet_check_print) {
 			printf("sockfs: X/Open getpeername check => EINVAL\n");
 		}
-	} else {
+	} else if (so->so_filter_active == 0 ||
+	    (error = sof_filter_getpeername(so, addr, addrlen, cr)) < 0) {
 		error = (*so->so_downcalls->sd_getpeername)
 		    (so->so_proto_handle, addr, addrlen, cr);
 	}
@@ -679,13 +731,17 @@
 {
 	int error = 0;
 
-	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+	if (level == SOL_FILTER)
+		return (sof_getsockopt(so, option_name, optval, optlenp, cr));
+
 	SO_BLOCK_FALLBACK(so,
 	    SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr));
 
-	error = socket_getopt_common(so, level, option_name, optval, optlenp,
-	    flags);
-	if (error < 0) {
+	if ((so->so_filter_active == 0 ||
+	    (error = sof_filter_getsockopt(so, level, option_name, optval,
+	    optlenp, cr)) < 0) &&
+	    (error = socket_getopt_common(so, level, option_name, optval,
+	    optlenp, flags)) < 0) {
 		error = (*so->so_downcalls->sd_getsockopt)
 		    (so->so_proto_handle, level, option_name, optval, optlenp,
 		    cr);
@@ -764,6 +820,9 @@
 	struct timeval tl;
 	const void *opt = optval;
 
+	if (level == SOL_FILTER)
+		return (sof_setsockopt(so, option_name, optval, optlen, cr));
+
 	SO_BLOCK_FALLBACK(so,
 	    SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
 
@@ -775,6 +834,11 @@
 		return (EINVAL);
 	}
 
+	if (so->so_filter_active > 0 &&
+	    (error = sof_filter_setsockopt(so, level, option_name,
+	    (void *)optval, &optlen, cr)) >= 0)
+		goto done;
+
 	if (level == SOL_SOCKET) {
 		switch (option_name) {
 		case SO_RCVTIMEO:
@@ -856,7 +920,10 @@
 	 * calling strioc can result in the socket falling back to TPI,
 	 * if that is supported.
 	 */
-	if ((error = socket_ioctl_common(so, cmd, arg, mode, cr, rvalp)) < 0 &&
+	if ((so->so_filter_active == 0 ||
+	    (error = sof_filter_ioctl(so, cmd, arg, mode,
+	    rvalp, cr)) < 0) &&
+	    (error = socket_ioctl_common(so, cmd, arg, mode, cr, rvalp)) < 0 &&
 	    (error = socket_strioc_common(so, cmd, arg, mode, cr, rvalp)) < 0) {
 		error = (*so->so_downcalls->sd_ioctl)(so->so_proto_handle,
 		    cmd, arg, mode, rvalp, cr);
@@ -894,7 +961,7 @@
 		 * is flow controlled
 		 */
 		*reventsp |= POLLWRBAND & events;
-		if (!so->so_snd_qfull) {
+		if (!SO_SND_FLOWCTRLD(so)) {
 			/*
 			 * As long as there is buffer to send data
 			 * turn on POLLOUT events
@@ -915,7 +982,7 @@
 	 */
 
 	/* Pending connections */
-	if (so->so_acceptq_len > 0)
+	if (!list_is_empty(&so->so_acceptq_list))
 		*reventsp |= (POLLIN|POLLRDNORM) & events;
 
 	/* Data */
@@ -941,7 +1008,8 @@
 		/* Check for read events again, but this time under lock */
 		if (events & (POLLIN|POLLRDNORM)) {
 			mutex_enter(&so->so_lock);
-			if (SO_HAVE_DATA(so) || so->so_acceptq_len > 0) {
+			if (SO_HAVE_DATA(so) ||
+			    !list_is_empty(&so->so_acceptq_list)) {
 				mutex_exit(&so->so_lock);
 				*reventsp |= (POLLIN|POLLRDNORM) & events;
 				return (0);
@@ -987,12 +1055,13 @@
 so_disconnected(sock_upper_handle_t sock_handle, sock_connid_t id, int error)
 {
 	struct sonode *so = (struct sonode *)sock_handle;
+	boolean_t connect_failed;
 
 	mutex_enter(&so->so_lock);
-
+	connect_failed = so->so_state & SS_ISCONNECTED;
 	so->so_proto_connid = id;
 	soisdisconnected(so, error);
-	so_notify_disconnected(so, error);
+	so_notify_disconnected(so, connect_failed, error);
 
 	return (0);
 }
@@ -1019,6 +1088,16 @@
 		mutex_enter(&so->so_lock);
 		so->so_state |= SS_ACCEPTCONN;
 		so->so_backlog = (unsigned int)arg;
+		/*
+		 * The protocol can stop generating newconn upcalls when
+		 * the backlog is full, so to make sure the listener does
+		 * not end up with a queue full of deferred connections
+		 * we reduce the backlog by one. Thus the listener will
+		 * start closing deferred connections before the backlog
+		 * is full.
+		 */
+		if (so->so_filter_active > 0)
+			so->so_backlog = MAX(1, so->so_backlog - 1);
 		mutex_exit(&so->so_lock);
 		break;
 	default:
@@ -1037,6 +1116,7 @@
 	} else {
 		so_snd_qnotfull(so);
 		mutex_enter(&so->so_lock);
+		/* so_notify_writable drops so_lock */
 		so_notify_writable(so);
 	}
 }
@@ -1053,8 +1133,10 @@
 	ASSERT(proto_handle != NULL);
 
 	if ((so->so_state & SS_ACCEPTCONN) == 0 ||
-	    so->so_acceptq_len >= so->so_backlog)
-		return (NULL);
+	    (so->so_acceptq_len >= so->so_backlog &&
+	    (so->so_filter_active == 0 || !sof_sonode_drop_deferred(so)))) {
+			return (NULL);
+	}
 
 	nso = socket_newconn(so, proto_handle, sock_downcalls, SOCKET_NOSLEEP,
 	    &error);
@@ -1066,6 +1148,7 @@
 		nso->so_peercred = peer_cred;
 		nso->so_cpid = peer_cpid;
 	}
+	nso->so_listener = so;
 
 	/*
 	 * The new socket (nso), proto_handle and sock_upcallsp are all
@@ -1075,12 +1158,30 @@
 	 */
 	*sock_upcallsp = &so_upcalls;
 
-	(void) so_acceptq_enqueue(so, nso);
+	mutex_enter(&so->so_acceptq_lock);
+	if (so->so_state & (SS_CLOSING|SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) {
+		mutex_exit(&so->so_acceptq_lock);
+		ASSERT(nso->so_count == 1);
+		nso->so_count--;
+		/* drop proto ref */
+		VN_RELE(SOTOV(nso));
+		socket_destroy(nso);
+		return (NULL);
+	} else {
+		so->so_acceptq_len++;
+		if (nso->so_state & SS_FIL_DEFER) {
+			list_insert_tail(&so->so_acceptq_defer, nso);
+			mutex_exit(&so->so_acceptq_lock);
+		} else {
+			list_insert_tail(&so->so_acceptq_list, nso);
+			cv_signal(&so->so_acceptq_cv);
+			mutex_exit(&so->so_acceptq_lock);
+			mutex_enter(&so->so_lock);
+			so_notify_newconn(so);
+		}
 
-	mutex_enter(&so->so_lock);
-	so_notify_newconn(so);
-
-	return ((sock_upper_handle_t)nso);
+		return ((sock_upper_handle_t)nso);
+	}
 }
 
 void
@@ -1132,6 +1233,27 @@
 
 	mutex_exit(&so->so_lock);
 
+	if (so->so_filter_active > 0) {
+		sof_instance_t *inst;
+		ssize_t maxblk;
+		ushort_t wroff, tail;
+		maxblk = so->so_proto_props.sopp_maxblk;
+		wroff = so->so_proto_props.sopp_wroff;
+		tail = so->so_proto_props.sopp_tail;
+		for (inst = so->so_filter_bottom; inst != NULL;
+		    inst = inst->sofi_prev) {
+			if (SOF_INTERESTED(inst, mblk_prop)) {
+				(*inst->sofi_ops->sofop_mblk_prop)(
+				    (sof_handle_t)inst, inst->sofi_cookie,
+				    &maxblk, &wroff, &tail);
+			}
+		}
+		mutex_enter(&so->so_lock);
+		so->so_proto_props.sopp_maxblk = maxblk;
+		so->so_proto_props.sopp_wroff = wroff;
+		so->so_proto_props.sopp_tail = tail;
+		mutex_exit(&so->so_lock);
+	}
 #ifdef DEBUG
 	soppp->sopp_flags &= ~(SOCKOPT_MAXBLK | SOCKOPT_WROFF | SOCKOPT_TAIL |
 	    SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | SOCKOPT_MAXPSZ |
@@ -1144,10 +1266,10 @@
 
 /* ARGSUSED */
 ssize_t
-so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp,
-    size_t msg_size, int flags, int *errorp,  boolean_t *force_pushp)
+so_queue_msg_impl(struct sonode *so, mblk_t *mp,
+    size_t msg_size, int flags, int *errorp,  boolean_t *force_pushp,
+    sof_instance_t *filter)
 {
-	struct sonode *so = (struct sonode *)sock_handle;
 	boolean_t force_push = B_TRUE;
 	int space_left;
 	sodirect_t *sodp = so->so_direct;
@@ -1165,31 +1287,14 @@
 			return (0);
 		}
 		ASSERT(msg_size == 0);
-		/*
-		 * recv space check
-		 */
 		mutex_enter(&so->so_lock);
-		space_left = so->so_rcvbuf - so->so_rcv_queued;
-		if (space_left <= 0) {
-			so->so_flowctrld = B_TRUE;
-			*errorp = ENOSPC;
-			space_left = -1;
-		}
-		goto done_unlock;
+		goto space_check;
 	}
 
 	ASSERT(mp->b_next == NULL);
 	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO);
 	ASSERT(msg_size == msgdsize(mp));
 
-	if (flags & MSG_OOB) {
-		so_queue_oob(sock_handle, mp, msg_size);
-		return (0);
-	}
-
-	if (force_pushp != NULL)
-		force_push = *force_pushp;
-
 	if (DB_TYPE(mp) == M_PROTO && !__TPI_PRIM_ISALIGNED(mp->b_rptr)) {
 		/* The read pointer is not aligned correctly for TPI */
 		zcmn_err(getzoneid(), CE_WARN,
@@ -1199,11 +1304,36 @@
 		mutex_enter(&so->so_lock);
 		if (sodp != NULL)
 			SOD_UIOAFINI(sodp);
-		mutex_exit(&so->so_lock);
+		goto space_check;
+	}
 
-		return (so->so_rcvbuf - so->so_rcv_queued);
+	if (so->so_filter_active > 0) {
+		for (; filter != NULL; filter = filter->sofi_prev) {
+			if (!SOF_INTERESTED(filter, data_in))
+				continue;
+			mp = (*filter->sofi_ops->sofop_data_in)(
+			    (sof_handle_t)filter, filter->sofi_cookie, mp,
+			    flags, &msg_size);
+			ASSERT(msgdsize(mp) == msg_size);
+			DTRACE_PROBE2(filter__data, (sof_instance_t), filter,
+			    (mblk_t *), mp);
+			/* Data was consumed/dropped, just do space check */
+			if (msg_size == 0) {
+				mutex_enter(&so->so_lock);
+				goto space_check;
+			}
+		}
 	}
 
+	if (flags & MSG_OOB) {
+		so_queue_oob(so, mp, msg_size);
+		mutex_enter(&so->so_lock);
+		goto space_check;
+	}
+
+	if (force_pushp != NULL)
+		force_push = *force_pushp;
+
 	mutex_enter(&so->so_lock);
 	if (so->so_state & (SS_FALLBACK_DRAIN | SS_FALLBACK_COMP)) {
 		if (sodp != NULL)
@@ -1212,7 +1342,7 @@
 		*errorp = EOPNOTSUPP;
 		return (-1);
 	}
-	if (so->so_state & SS_CANTRCVMORE) {
+	if (so->so_state & (SS_CANTRCVMORE | SS_CLOSING)) {
 		freemsg(mp);
 		if (sodp != NULL)
 			SOD_DISABLE(sodp);
@@ -1270,6 +1400,27 @@
 	mutex_exit(&so->so_lock);
 done:
 	return (space_left);
+
+space_check:
+	space_left = so->so_rcvbuf - so->so_rcv_queued;
+	if (space_left <= 0) {
+		so->so_flowctrld = B_TRUE;
+		*errorp = ENOSPC;
+		space_left = -1;
+	}
+	goto done_unlock;
+}
+
+#pragma	inline(so_queue_msg_impl)
+
+ssize_t
+so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp,
+    size_t msg_size, int flags, int *errorp,  boolean_t *force_pushp)
+{
+	struct sonode *so = (struct sonode *)sock_handle;
+
+	return (so_queue_msg_impl(so, mp, msg_size, flags, errorp, force_pushp,
+	    so->so_filter_bottom));
 }
 
 /*
@@ -1320,11 +1471,8 @@
  * Queue the OOB byte
  */
 static void
-so_queue_oob(sock_upper_handle_t sock_handle, mblk_t *mp, size_t len)
+so_queue_oob(struct sonode *so, mblk_t *mp, size_t len)
 {
-	struct sonode *so;
-
-	so = (struct sonode *)sock_handle;
 	mutex_enter(&so->so_lock);
 	if (so->so_direct != NULL)
 		SOD_UIOAFINI(so->so_direct);
@@ -1345,21 +1493,62 @@
 {
 	int error;
 
-	error = (*so->so_downcalls->sd_close)(so->so_proto_handle, flag, cr);
-
 	/*
-	 * At this point there will be no more upcalls from the protocol
+	 * No new data will be enqueued once the CLOSING flag is set.
 	 */
 	mutex_enter(&so->so_lock);
-
+	so->so_state |= SS_CLOSING;
 	ASSERT(so_verify_oobstate(so));
-
 	so_rcv_flush(so);
 	mutex_exit(&so->so_lock);
 
+	if (so->so_state & SS_ACCEPTCONN) {
+		/*
+		 * We grab and release the accept lock to ensure that any
+		 * thread about to insert a socket in so_newconn completes
+		 * before we flush the queue. Any thread calling so_newconn
+		 * after we drop the lock will observe the SS_CLOSING flag,
+		 * which will stop it from inserting the socket in the queue.
+		 */
+		mutex_enter(&so->so_acceptq_lock);
+		mutex_exit(&so->so_acceptq_lock);
+
+		so_acceptq_flush(so, B_TRUE);
+	}
+
+	if (so->so_filter_active > 0)
+		sof_sonode_closing(so);
+
+	error = (*so->so_downcalls->sd_close)(so->so_proto_handle, flag, cr);
+	switch (error) {
+	default:
+		/* Protocol made a synchronous close; remove proto ref */
+		VN_RELE(SOTOV(so));
+		break;
+	case EINPROGRESS:
+		/*
+		 * Protocol is in the process of closing, it will make a
+		 * 'closed' upcall to remove the reference.
+		 */
+		error = 0;
+		break;
+	}
+
 	return (error);
 }
 
+/*
+ * Upcall made by the protocol when it's doing an asynchronous close. It
+ * will drop the protocol's reference on the socket.
+ */
+void
+so_closed(sock_upper_handle_t sock_handle)
+{
+	struct sonode *so = (struct sonode *)sock_handle;
+
+	VN_RELE(SOTOV(so));
+}
+
 void
 so_zcopy_notify(sock_upper_handle_t sock_handle)
 {
@@ -1759,5 +1948,6 @@
 	so_txq_full,
 	so_signal_oob,
 	so_zcopy_notify,
-	so_set_error
+	so_set_error,
+	so_closed
 };
--- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -39,6 +38,7 @@
 #include <sys/tihdr.h>
 
 #include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/sockfilter_impl.h>
 #include <fs/sockfs/socktpi.h>
 #include <fs/sockfs/sodirect.h>
 #include <sys/ddi.h>
@@ -59,46 +59,6 @@
 static boolean_t so_check_length(sonode_t *so);
 #endif
 
-int
-so_acceptq_enqueue_locked(struct sonode *so, struct sonode *nso)
-{
-	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
-	ASSERT(nso->so_acceptq_next == NULL);
-
-	*so->so_acceptq_tail = nso;
-	so->so_acceptq_tail = &nso->so_acceptq_next;
-	so->so_acceptq_len++;
-
-	/* Wakeup a single consumer */
-	cv_signal(&so->so_acceptq_cv);
-
-	return (so->so_acceptq_len);
-}
-
-/*
- * int so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
- *
- * Enqueue an incoming connection on a listening socket.
- *
- * Arguments:
- *   so	  - listening socket
- *   nso  - new connection
- *
- * Returns:
- *   Number of queued connections, including the new connection
- */
-int
-so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
-{
-	int conns;
-
-	mutex_enter(&so->so_acceptq_lock);
-	conns = so_acceptq_enqueue_locked(so, nso);
-	mutex_exit(&so->so_acceptq_lock);
-
-	return (conns);
-}
-
 static int
 so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
     struct sonode **nsop)
@@ -107,7 +67,7 @@
 
 	*nsop = NULL;
 	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
-	while ((nso = so->so_acceptq_head) == NULL) {
+	while ((nso = list_remove_head(&so->so_acceptq_list)) == NULL) {
 		/*
 		 * No need to check so_error here, because it is not
 		 * possible for a listening socket to be reset or otherwise
@@ -126,15 +86,9 @@
 	}
 
 	ASSERT(nso != NULL);
-	so->so_acceptq_head = nso->so_acceptq_next;
-	nso->so_acceptq_next = NULL;
-
-	if (so->so_acceptq_head == NULL) {
-		ASSERT(so->so_acceptq_tail == &nso->so_acceptq_next);
-		so->so_acceptq_tail = &so->so_acceptq_head;
-	}
 	ASSERT(so->so_acceptq_len > 0);
-	--so->so_acceptq_len;
+	so->so_acceptq_len--;
+	nso->so_listener = NULL;
 
 	*nsop = nso;
 
@@ -174,8 +128,36 @@
 	return (error);
 }
 
+static void
+so_acceptq_flush_impl(struct sonode *so, list_t *list, boolean_t doclose)
+{
+	struct sonode *nso;
+
+	while ((nso = list_remove_head(list)) != NULL) {
+		nso->so_listener = NULL;
+		if (doclose) {
+			(void) socket_close(nso, 0, CRED());
+		} else {
+			/*
+			 * Only used for fallback - not possible when filters
+			 * are present.
+			 */
+			ASSERT(so->so_filter_active == 0);
+			/*
+			 * Since the socket is on the accept queue, there can
+			 * only be one reference. We drop the reference and
+			 * just blow off the socket.
+			 */
+			ASSERT(nso->so_count == 1);
+			nso->so_count--;
+			/* drop the proto ref */
+			VN_RELE(SOTOV(nso));
+		}
+		socket_destroy(nso);
+	}
+}
 /*
- * void so_acceptq_flush(struct sonode *so, boolean_t doclose)
+ * void so_acceptq_flush(struct sonode *so)
  *
  * Removes all pending connections from a listening socket, and
  * frees the associated resources.
@@ -183,7 +165,6 @@
  * Arguments
  *   so	     - listening socket
  *   doclose - make a close downcall for each socket on the accept queue
- *             (Note, only SCTP and SDP sockets rely on this)
  *
  * Return values:
  *   None.
@@ -197,28 +178,9 @@
 void
 so_acceptq_flush(struct sonode *so, boolean_t doclose)
 {
-	struct sonode *nso;
-
-	while ((nso = so->so_acceptq_head) != NULL) {
-		so->so_acceptq_head = nso->so_acceptq_next;
-		nso->so_acceptq_next = NULL;
+	so_acceptq_flush_impl(so, &so->so_acceptq_list, doclose);
+	so_acceptq_flush_impl(so, &so->so_acceptq_defer, doclose);
 
-		if (doclose) {
-			(void) socket_close(nso, 0, CRED());
-		} else {
-			/*
-			 * Since the socket is on the accept queue, there can
-			 * only be one reference. We drop the reference and
-			 * just blow off the socket.
-			 */
-			ASSERT(nso->so_count == 1);
-			nso->so_count--;
-		}
-		socket_destroy(nso);
-	}
-
-	so->so_acceptq_head = NULL;
-	so->so_acceptq_tail = &so->so_acceptq_head;
 	so->so_acceptq_len = 0;
 }
 
@@ -296,7 +258,7 @@
 	int error;
 
 	ASSERT(MUTEX_HELD(&so->so_lock));
-	while (so->so_snd_qfull) {
+	while (SO_SND_FLOWCTRLD(so)) {
 		if (so->so_state & SS_CANTSENDMORE)
 			return (EPIPE);
 		if (dontblock)
@@ -334,11 +296,9 @@
 	int error = 0;
 
 	mutex_enter(&so->so_lock);
-	if (so->so_snd_qfull) {
-		so->so_snd_wakeup = B_TRUE;
-		error = so_snd_wait_qnotfull_locked(so, dontblock);
-		so->so_snd_wakeup = B_FALSE;
-	}
+	so->so_snd_wakeup = B_TRUE;
+	error = so_snd_wait_qnotfull_locked(so, dontblock);
+	so->so_snd_wakeup = B_FALSE;
 	mutex_exit(&so->so_lock);
 
 	return (error);
@@ -601,8 +561,13 @@
 void
 so_process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
 {
+	if (so->so_filter_active > 0 &&
+	    (mp_head = sof_filter_data_in_proc(so, mp_head,
+	    &mp_last_head)) == NULL)
+		return;
+
 	ASSERT(mp_head->b_prev != NULL);
-	if (so->so_rcv_q_head  == NULL) {
+	if (so->so_rcv_q_head == NULL) {
 		so->so_rcv_q_head = mp_head;
 		so->so_rcv_q_last_head = mp_last_head;
 		ASSERT(so->so_rcv_q_last_head->b_prev != NULL);
@@ -650,13 +615,13 @@
  * Check flow control on a given sonode.  Must have so_lock held, and
  * this function will release the hold.
  */
-
-static void
+void
 so_check_flow_control(struct sonode *so)
 {
 	ASSERT(MUTEX_HELD(&so->so_lock));
 
-	if (so->so_flowctrld && so->so_rcv_queued < so->so_rcvlowat) {
+	if (so->so_flowctrld && (so->so_rcv_queued < so->so_rcvlowat &&
+	    !(so->so_state & SS_FIL_RCV_FLOWCTRL))) {
 		so->so_flowctrld = B_FALSE;
 		mutex_exit(&so->so_lock);
 		/*
@@ -668,6 +633,8 @@
 			(*so->so_downcalls->sd_clr_flowctrl)
 			    (so->so_proto_handle);
 		}
+		/* filters can start injecting data */
+		sof_sonode_notify_filters(so, SOF_EV_INJECT_DATA_IN_OK, 0);
 	} else {
 		mutex_exit(&so->so_lock);
 	}
@@ -1116,7 +1083,7 @@
 	}
 
 	/*
-	 * Free messages sitting in the send and recv queue
+	 * Free messages sitting in the recv queues
 	 */
 	while (so->so_rcv_q_head != NULL) {
 		mp = so->so_rcv_q_head;
@@ -1313,11 +1280,29 @@
 		so->so_pollev = pso->so_pollev & SO_POLLEV_ALWAYS;
 
 		mutex_exit(&pso->so_lock);
+
+		/*
+		 * If the parent has any filters, try to inherit them.
+		 */
+		if (pso->so_filter_active > 0 &&
+		    (error = sof_sonode_inherit_filters(so, pso)) != 0)
+			return (error);
+
 	} else {
 		struct sockparams *sp = so->so_sockparams;
 		sock_upcalls_t *upcalls_to_use;
 
 		/*
+		 * Attach automatic filters, if there are any.
+		 */
+		if (!list_is_empty(&sp->sp_auto_filters) &&
+		    (error = sof_sonode_autoattach_filters(so, cr)) != 0)
+			return (error);
+
+		/* OK to attach filters */
+		so->so_state |= SS_FILOP_OK;
+
+		/*
 		 * Based on the version number select the right upcalls to
 		 * pass down. Currently we only have one version so choose
 		 * default
@@ -1384,6 +1369,9 @@
 	if (uioasync.enabled)
 		sod_sock_init(so);
 
+	/* put an extra reference on the socket for the protocol */
+	VN_HOLD(SOTOV(so));
+
 	return (0);
 }
 
@@ -1812,6 +1800,22 @@
 		*optlenp = sizeof (struct so_snd_bufinfo);
 		return (0);
 	}
+	case SO_SND_COPYAVOID: {
+		sof_instance_t *inst;
+
+		/*
+		 * Avoid zero-copy if there is a filter with a data_out
+		 * callback. We could let the operation succeed, but then
+		 * the filter would have to copy the data anyway.
+		 */
+		for (inst = so->so_filter_top; inst != NULL;
+		    inst = inst->sofi_next) {
+			if (SOF_INTERESTED(inst, data_out))
+				return (EOPNOTSUPP);
+		}
+		break;
+	}
+
 	default:
 		break;
 	}
@@ -1982,15 +1986,19 @@
  * We do not need to hold so_lock, since there can be only one thread
  * operating on the sonode.
  */
-static void
-so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q,
-    struct T_capability_ack *tcap, struct sockaddr *laddr, socklen_t laddrlen,
+static mblk_t *
+so_quiesced_cb(sock_upper_handle_t sock_handle, sock_quiesce_arg_t *arg,
+    struct T_capability_ack *tcap,
+    struct sockaddr *laddr, socklen_t laddrlen,
     struct sockaddr *faddr, socklen_t faddrlen, short opts)
 {
 	struct sonode *so = (struct sonode *)sock_handle;
 	boolean_t atmark;
+	mblk_t *retmp = NULL, **tailmpp = &retmp;
 
-	sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen, opts);
+	if (tcap != NULL)
+		sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen,
+		    opts);
 
 	/*
 	 * Some protocols do not quiece the data path during fallback. Once
@@ -2038,9 +2046,9 @@
 		 */
 		if (atmark) {
 			struct T_exdata_ind *tei;
-			mblk_t *mp1 = SOTOTPI(so)->sti_exdata_mp;
+			mblk_t *mp1 = arg->soqa_exdata_mp;
 
-			SOTOTPI(so)->sti_exdata_mp = NULL;
+			arg->soqa_exdata_mp = NULL;
 			ASSERT(mp1 != NULL);
 			mp1->b_datap->db_type = M_PROTO;
 			tei = (struct T_exdata_ind *)mp1->b_rptr;
@@ -2101,7 +2109,8 @@
 		 * Queue data on the STREAM head.
 		 */
 		so->so_rcv_queued -= mlen;
-		putnext(q, mp);
+		*tailmpp = mp;
+		tailmpp = &mp->b_next;
 	}
 	so->so_rcv_head = NULL;
 	so->so_rcv_last_head = NULL;
@@ -2121,8 +2130,8 @@
 		if (atmark && so->so_oobmsg != NULL) {
 			struct T_exdata_ind *tei;
 
-			mp = SOTOTPI(so)->sti_exdata_mp;
-			SOTOTPI(so)->sti_exdata_mp = NULL;
+			mp = arg->soqa_exdata_mp;
+			arg->soqa_exdata_mp = NULL;
 			ASSERT(mp != NULL);
 			mp->b_datap->db_type = M_PROTO;
 			tei = (struct T_exdata_ind *)mp->b_rptr;
@@ -2133,38 +2142,32 @@
 			mp->b_cont = so->so_oobmsg;
 			so->so_oobmsg = NULL;
 
-			putnext(q, mp);
+			*tailmpp = mp;
+			tailmpp = &mp->b_next;
 		} else {
 			/* Send up the signal */
-			mp = SOTOTPI(so)->sti_exdata_mp;
-			SOTOTPI(so)->sti_exdata_mp = NULL;
+			mp = arg->soqa_exdata_mp;
+			arg->soqa_exdata_mp = NULL;
 			ASSERT(mp != NULL);
 			DB_TYPE(mp) = M_PCSIG;
 			*mp->b_wptr++ = (uchar_t)SIGURG;
-			putnext(q, mp);
+			*tailmpp = mp;
+			tailmpp = &mp->b_next;
 
 			/* Send up the mark indicator */
-			mp = SOTOTPI(so)->sti_urgmark_mp;
-			SOTOTPI(so)->sti_urgmark_mp = NULL;
+			mp = arg->soqa_urgmark_mp;
+			arg->soqa_urgmark_mp = NULL;
 			mp->b_flag = atmark ? MSGMARKNEXT : MSGNOTMARKNEXT;
-			putnext(q, mp);
+			*tailmpp = mp;
+			tailmpp = &mp->b_next;
 
 			so->so_oobmark = 0;
 		}
 	}
-
-	if (SOTOTPI(so)->sti_exdata_mp != NULL) {
-		freeb(SOTOTPI(so)->sti_exdata_mp);
-		SOTOTPI(so)->sti_exdata_mp = NULL;
-	}
-
-	if (SOTOTPI(so)->sti_urgmark_mp != NULL) {
-		freeb(SOTOTPI(so)->sti_urgmark_mp);
-		SOTOTPI(so)->sti_urgmark_mp = NULL;
-	}
-
 	ASSERT(so->so_oobmark == 0);
 	ASSERT(so->so_rcv_queued == 0);
+
+	return (retmp);
 }
 
 #ifdef DEBUG
@@ -2203,7 +2206,8 @@
 	VERIFY(cur->so_version == orig->so_version);
 	/* New conns might have arrived, but none should have been lost */
 	VERIFY(cur->so_acceptq_len >= orig->so_acceptq_len);
-	VERIFY(cur->so_acceptq_head == orig->so_acceptq_head);
+	VERIFY(list_head(&cur->so_acceptq_list) ==
+	    list_head(&orig->so_acceptq_list));
 	VERIFY(cur->so_backlog == orig->so_backlog);
 	/* New OOB migth have arrived, but mark should not have been lost */
 	VERIFY(cur->so_oobmark >= orig->so_oobmark);
@@ -2243,8 +2247,10 @@
 	struct sockparams *sp;
 	struct sockparams *newsp = NULL;
 	so_proto_fallback_func_t fbfunc;
+	const char *devpath;
 	boolean_t direct;
 	struct sonode *nso;
+	sock_quiesce_arg_t arg = { NULL, NULL };
 #ifdef DEBUG
 	struct sonode origso;
 #endif
@@ -2253,10 +2259,27 @@
 	fbfunc = sp->sp_smod_info->smod_proto_fallback_func;
 
 	/*
-	 * Fallback can only happen if there is a device associated
-	 * with the sonode, and the socket module has a fallback function.
+	 * Cannot fallback if the socket has active filters
 	 */
-	if (!SOCKPARAMS_HAS_DEVICE(sp) || fbfunc == NULL)
+	if (so->so_filter_active > 0)
+		return (EINVAL);
+
+	switch (so->so_family) {
+	case AF_INET:
+		devpath = sp->sp_smod_info->smod_fallback_devpath_v4;
+		break;
+	case AF_INET6:
+		devpath = sp->sp_smod_info->smod_fallback_devpath_v6;
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	/*
+	 * Fallback can only happen if the socket module has a TPI device
+	 * and fallback function.
+	 */
+	if (devpath == NULL || fbfunc == NULL)
 		return (EINVAL);
 
 	/*
@@ -2276,8 +2299,7 @@
 	sp->sp_stats.sps_nfallback.value.ui64++;
 
 	newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
-	    so->so_protocol, so->so_sockparams->sp_sdev_info.sd_devpath,
-	    KM_SLEEP, &error);
+	    so->so_protocol, devpath, KM_SLEEP, &error);
 	if (error != 0)
 		goto out;
 
@@ -2295,14 +2317,30 @@
 	error = sotpi_convert_sonode(so, newsp, &direct, &q, cr);
 	if (error != 0)
 		goto out;
-
+	/*
+	 * When it comes to urgent data we have two cases to deal with;
+	 * (1) The oob byte has already arrived, or (2) the protocol has
+	 * notified that oob data is pending, but it has not yet arrived.
+	 *
+	 * For (1) all we need to do is send a T_EXDATA_IND to indicate where
+	 * in the byte stream the oob byte is. For (2) we have to send a
+	 * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether
+	 * the oob byte will be the next byte from the protocol.
+	 *
+	 * So in the worst case we need two mblks, one for the signal, another
+	 * for mark indication. In that case we use the exdata_mp for the sig.
+	 */
+	arg.soqa_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind),
+	    BPRI_MED, STR_NOSIG, NULL);
+	arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
 
 	/*
 	 * Now tell the protocol to start using TPI. so_quiesced_cb be
 	 * called once it's safe to synchronize state.
 	 */
 	DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
-	error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb);
+	error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb,
+	    &arg);
 	DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);
 
 	if (error != 0) {
@@ -2315,19 +2353,40 @@
 	 * Walk the accept queue and notify the proto that they should
 	 * fall back to TPI. The protocol will send up the T_CONN_IND.
 	 */
-	nso = so->so_acceptq_head;
+	nso = list_head(&so->so_acceptq_list);
 	while (nso != NULL) {
 		int rval;
+		struct sonode *next;
+
+		if (arg.soqa_exdata_mp == NULL) {
+			arg.soqa_exdata_mp =
+			    allocb_wait(sizeof (struct T_exdata_ind),
+			    BPRI_MED, STR_NOSIG, NULL);
+		}
+		if (arg.soqa_urgmark_mp == NULL) {
+			arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED,
+			    STR_NOSIG, NULL);
+		}
 
 		DTRACE_PROBE1(proto__fallback__begin, struct sonode *, nso);
-		rval = (*fbfunc)(nso->so_proto_handle, NULL, direct, NULL);
+		rval = (*fbfunc)(nso->so_proto_handle, NULL, direct,
+		    so_quiesced_cb, &arg);
 		DTRACE_PROBE1(proto__fallback__end, struct sonode *, nso);
 		if (rval != 0) {
+			/* Abort the connection */
 			zcmn_err(getzoneid(), CE_WARN,
 			    "Failed to convert socket in accept queue to TPI. "
 			    "Pid = %d\n", curproc->p_pid);
+			next = list_next(&so->so_acceptq_list, nso);
+			list_remove(&so->so_acceptq_list, nso);
+			so->so_acceptq_len--;
+
+			(void) socket_close(nso, 0, CRED());
+			socket_destroy(nso);
+			nso = next;
+		} else {
+			nso = list_next(&so->so_acceptq_list, nso);
 		}
-		nso = nso->so_acceptq_next;
 	}
 
 	/*
@@ -2352,6 +2411,14 @@
 	 * the STREAMS head).
 	 */
 	pollwakeup(&so->so_poll_list, POLLERR);
+
+	/*
+	 * When this non-STREAM socket was created we placed an extra ref on
+	 * the associated vnode to support asynchronous close. Drop that ref
+	 * here.
+	 */
+	ASSERT(SOTOV(so)->v_count >= 2);
+	VN_RELE(SOTOV(so));
 out:
 	so_end_fallback(so);
 
@@ -2365,6 +2432,10 @@
 		if (newsp != NULL)
 			SOCKPARAMS_DEC_REF(newsp);
 	}
+	if (arg.soqa_exdata_mp != NULL)
+		freemsg(arg.soqa_exdata_mp);
+	if (arg.soqa_urgmark_mp != NULL)
+		freemsg(arg.soqa_urgmark_mp);
 
 	return (error);
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/sockfs/sockfilter.c	Thu Jun 17 17:22:09 2010 -0700
@@ -0,0 +1,1770 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/disp.h>
+#include <sys/list.h>
+#include <sys/mutex.h>
+#include <sys/note.h>
+#include <sys/rwlock.h>
+#include <sys/stropts.h>
+#include <sys/taskq.h>
+#include <sys/socketvar.h>
+#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/sockfilter_impl.h>
+
+/*
+ * Socket Filter Framework
+ *
+ * Socket filter entry (sof_entry_t):
+ *
+ *   There exists one entry for each configured filter (done via soconfig(1M)),
+ *   and they are all in sof_entry_list. In addition to the global list, each
+ *   sockparams entry maintains a list of filters that is interested in that
+ *   particular socket type. So the filter entry may be referenced by multiple
+ *   sockparams. The set of sockparams referencing a filter may change as
+ *   socket types are added and/or removed from the system. Both sof_entry_list
+ *   and the sockparams list is protected by sockconf_lock.
+ *
+ *   Each filter entry has a ref count which is incremented whenever a filter
+ *   is attached to a socket. An entry is marked SOFEF_CONDEMED when it is
+ *   unconfigured, which will result in the entry being freed when its ref
+ *   count reaches zero.
+ *
+ * Socket filter module (sof_module_t):
+ *
+ *   Modules are created by sof_register() and placed in sof_module_list,
+ *   which is protected by sof_module_lock. Each module has a reference count
+ *   that is incremented when a filter entry is using the module. A module
+ *   can be destroyed by sof_unregister() only when its ref count is zero.
+ *
+ * Socket filter instance (sof_instance_t):
+ *
+ *   Whenever a filter is attached to a socket (sonode), a new instance is
+ *   created. The socket is guaranteed to be single threaded when filters are
+ *   being attached/detached. The instance uses the sonode's so_lock for
+ *   protection.
+ *
+ *   The lifetime of an instance is the same as the socket it's attached to.
+ *
+ * How things link together:
+ *
+ *      sockparams.sp_{auto,prog}_filters -> sp_filter_t -> sp_filter_t
+ *      ^                                    |              |
+ *      |                                    |              |
+ *   sonode.so_filter_top -> sof_instance_t  |              |
+ *                                     |     |              |
+ *                                     v     v              v
+ *    sof_entry_list -> sof_entry_t -> sof_entry -> ... -> sof_entry_t
+ *                                     |
+ *                                     v
+ *           sof_module_list -> sof_module_t -> ... -> sof_module_t
+ */
+
+static list_t 	sof_entry_list;		/* list of configured filters */
+
+static list_t	sof_module_list;	/* list of loaded filter modules */
+static kmutex_t	sof_module_lock;	/* protect the module list */
+
+static sof_kstat_t	sof_stat;
+static kstat_t 		*sof_stat_ksp;
+
+#ifdef DEBUG
+static int socket_filter_debug = 0;
+#endif
+
+/*
+ * A connection that has been deferred for more than `sof_defer_drop_time'
+ * ticks can be dropped to make room for new connections. A connection that
+ * is to be dropped is moved over to `sof_close_deferred_list' where it will
+ * be closed by sof_close_deferred() (which is running on a taskq). Connections
+ * will not be moved over to the close list if it grows larger than
+ * `sof_close_deferred_max_backlog'.
+ */
+clock_t		sof_defer_drop_time = 3000;
+uint_t		sof_close_deferred_max_backlog = 1000;
+
+taskq_t		*sof_close_deferred_taskq;
+boolean_t	sof_close_deferred_running;
+uint_t		sof_close_deferred_backlog;
+list_t		sof_close_deferred_list;
+kmutex_t	sof_close_deferred_lock;
+
+static void	sof_close_deferred(void *);
+
+static void		sof_module_rele(sof_module_t *);
+static sof_module_t 	*sof_module_hold_by_name(const char *, const char *);
+
+static int		sof_entry_load_module(sof_entry_t *);
+static void 		sof_entry_hold(sof_entry_t *);
+static void 		sof_entry_rele(sof_entry_t *);
+static int 		sof_entry_kstat_create(sof_entry_t *);
+static void 		sof_entry_kstat_destroy(sof_entry_t *);
+
+static sof_instance_t 	*sof_instance_create(sof_entry_t *, struct sonode *);
+static void		sof_instance_destroy(sof_instance_t *);
+
+static int
+sof_kstat_update(kstat_t *ksp, int rw)
+{
+	_NOTE(ARGUNUSED(ksp));
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	sof_stat.sofks_defer_close_backlog.value.ui64 =
+	    sof_close_deferred_backlog;
+
+	return (0);
+}
+
+void
+sof_init(void)
+{
+	list_create(&sof_entry_list, sizeof (sof_entry_t),
+	    offsetof(sof_entry_t, sofe_node));
+	list_create(&sof_module_list, sizeof (sof_module_t),
+	    offsetof(sof_module_t, sofm_node));
+	list_create(&sof_close_deferred_list, sizeof (struct sonode),
+	    offsetof(struct sonode, so_acceptq_node));
+
+	sof_close_deferred_taskq = taskq_create("sof_close_deferred_taskq",
+	    1, minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE);
+	sof_close_deferred_running = B_FALSE;
+	sof_close_deferred_backlog = 0;
+
+	mutex_init(&sof_close_deferred_lock, NULL, MUTEX_DEFAULT, 0);
+	mutex_init(&sof_module_lock, NULL, MUTEX_DEFAULT, 0);
+
+	sof_stat_ksp = kstat_create("sockfs", 0, "sockfilter", "misc",
+	    KSTAT_TYPE_NAMED, sizeof (sof_kstat_t) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+
+	if (sof_stat_ksp == NULL)
+		return;
+
+	kstat_named_init(&sof_stat.sofks_defer_closed, "defer_closed",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&sof_stat.sofks_defer_close_backlog,
+	    "defer_close_backlog", KSTAT_DATA_UINT64);
+	kstat_named_init(&sof_stat.sofks_defer_close_failed_backlog_too_big,
+	    "defer_close_failed_backlog_too_big", KSTAT_DATA_UINT64);
+
+	sof_stat_ksp->ks_data = &sof_stat;
+	sof_stat_ksp->ks_update = sof_kstat_update;
+	kstat_install(sof_stat_ksp);
+}
+
+/*
+ * Process filter options.
+ */
+static int
+sof_setsockopt_impl(struct sonode *so, int option_name,
+    const void *optval, socklen_t optlen, struct cred *cr)
+{
+	struct sockparams *sp = so->so_sockparams;
+	sof_entry_t *ent = NULL;
+	sp_filter_t *fil;
+	sof_instance_t *inst;
+	sof_rval_t rval;
+	int error;
+
+	_NOTE(ARGUNUSED(optlen));
+
+	/*
+	 * Is the filter in a state where filters can be attached?
+	 */
+	if (!(so->so_state & SS_FILOP_OK))
+		return (EINVAL);
+
+	if (option_name == FIL_ATTACH) {
+		/*
+		 * Make sure there isn't already another instance of the
+		 * same filter attached to the socket.
+		 */
+		for (inst = so->so_filter_top; inst != NULL;
+		    inst = inst->sofi_next) {
+			if (strncmp(inst->sofi_filter->sofe_name,
+			    (const char *)optval, SOF_MAXNAMELEN) == 0)
+				return (EEXIST);
+		}
+		/* Look up the filter. */
+		rw_enter(&sockconf_lock, RW_READER);
+		for (fil = list_head(&sp->sp_prog_filters); fil != NULL;
+		    fil = list_next(&sp->sp_prog_filters, fil)) {
+			ent = fil->spf_filter;
+			ASSERT(ent->sofe_flags & SOFEF_PROG);
+
+			if (strncmp(ent->sofe_name, (const char *)optval,
+			    SOF_MAXNAMELEN) == 0)
+				break;
+		}
+		/* No such filter */
+		if (fil == NULL) {
+			rw_exit(&sockconf_lock);
+			return (ENOENT);
+		}
+		inst = sof_instance_create(ent, so);
+		rw_exit(&sockconf_lock);
+
+		/* Failed to create an instance; must be out of memory */
+		if (inst == NULL)
+			return (ENOMEM);
+
+		/*
+		 * This might be the first time the filter is being used,
+		 * so try to load the module if it's not already registered.
+		 */
+		if (ent->sofe_mod == NULL &&
+		    (error = sof_entry_load_module(ent)) != 0) {
+			sof_instance_destroy(inst);
+			return (error);
+		}
+
+		/* Module loaded OK, so there must be an ops vector */
+		ASSERT(ent->sofe_mod != NULL);
+		inst->sofi_ops = &ent->sofe_mod->sofm_ops;
+
+		SOF_STAT_ADD(inst, tot_active_attach, 1);
+		if (inst->sofi_ops->sofop_attach_active != NULL) {
+			rval = inst->sofi_ops->sofop_attach_active(
+			    (sof_handle_t)inst, so->so_family, so->so_type,
+			    so->so_protocol, cr, &inst->sofi_cookie);
+			if (rval != SOF_RVAL_CONTINUE) {
+				sof_instance_destroy(inst);
+				switch (rval) {
+				case SOF_RVAL_DETACH:
+					/*
+					 * Filter does not want to attach.
+					 * An error is returned so the user
+					 * knows the request did not go
+					 * through.
+					 */
+					error = EINVAL;
+					break;
+				default:
+					SOF_STAT_ADD(inst, attach_failures, 1);
+					/* Not a valid rval for active attach */
+					ASSERT(rval != SOF_RVAL_DEFER);
+					error = sof_rval2errno(rval);
+					break;
+				}
+				return (error);
+			}
+		}
+		return (0);
+	} else if (option_name == FIL_DETACH) {
+		for (inst = so->so_filter_top; inst != NULL;
+		    inst = inst->sofi_next) {
+
+			ent = inst->sofi_filter;
+			if (strncmp(ent->sofe_name, (const char *)optval,
+			    SOF_MAXNAMELEN) == 0)
+				break;
+		}
+		if (inst == NULL)
+			return (ENXIO);
+
+		/* automatic filters cannot be detached */
+		if (inst->sofi_filter->sofe_flags & SOFEF_AUTO)
+			return (EINVAL);
+
+		if (inst->sofi_ops->sofop_detach != NULL)
+			inst->sofi_ops->sofop_detach((sof_handle_t)inst,
+			    inst->sofi_cookie, cr);
+		sof_instance_destroy(inst);
+
+		return (0);
+	} else {
+		return (EINVAL);
+	}
+}
+
+int
+sof_setsockopt(struct sonode *so, int option_name,
+    const void *optval, socklen_t optlen, struct cred *cr)
+{
+	int error;
+
+	/*
+	 * By grabbing the lock as a writer we ensure that no other socket
+	 * operations can start while the filter stack is being manipulated.
+	 *
+	 * We do a tryenter so that in case there is an active thread we
+	 * ask the caller to try again instead of blocking here until the
+	 * other thread is done (which could be indefinitely in case of recv).
+	 */
+	if (!rw_tryenter(&so->so_fallback_rwlock, RW_WRITER)) {
+		return (EAGAIN);
+	}
+
+	/* Bail out if a fallback has taken place */
+	if (so->so_state & SS_FALLBACK_COMP)
+		error = EINVAL;
+	else
+		error = sof_setsockopt_impl(so, option_name, optval,
+		    optlen, cr);
+	rw_exit(&so->so_fallback_rwlock);
+
+	return (error);
+}
+
+/*
+ * Get filter socket options.
+ */
+static int
+sof_getsockopt_impl(struct sonode *so, int option_name,
+    void *optval, socklen_t *optlenp, struct cred *cr)
+{
+	sof_instance_t *inst;
+	struct fil_info *fi;
+	socklen_t maxsz = *optlenp;
+	int i;
+	uint_t cnt;
+
+	_NOTE(ARGUNUSED(cr));
+
+	if (option_name == FIL_LIST) {
+		fi = (struct fil_info *)optval;
+
+		if (maxsz < sizeof (*fi))
+			return (EINVAL);
+
+		for (inst = so->so_filter_top, cnt = 0; inst != NULL;
+		    inst = inst->sofi_next)
+			cnt++;
+		for (inst = so->so_filter_top, i = 0;
+		    inst != NULL && (i+1) * sizeof (*fi) <= maxsz;
+		    inst = inst->sofi_next, i++) {
+			fi[i].fi_flags =
+			    (inst->sofi_filter->sofe_flags & SOFEF_AUTO) ?
+			    FILF_AUTO : FILF_PROG;
+			if (inst->sofi_flags & SOFIF_BYPASS)
+				fi[i].fi_flags |= FILF_BYPASS;
+			(void) strncpy(fi[i].fi_name,
+			    inst->sofi_filter->sofe_name, FILNAME_MAX);
+			ASSERT(cnt > 0);
+			fi[i].fi_pos = --cnt;
+		}
+		*optlenp = i * sizeof (*fi);
+		return (0);
+	} else {
+		return (EINVAL);
+	}
+}
+
+int
+sof_getsockopt(struct sonode *so, int option_name,
+    void *optval, socklen_t *optlenp, struct cred *cr)
+{
+	int error;
+
+	/*
+	 * The fallback lock is used here to serialize set and get
+	 * filter operations.
+	 */
+	rw_enter(&so->so_fallback_rwlock, RW_READER);
+	if (so->so_state & SS_FALLBACK_COMP)
+		error = EINVAL;
+	else
+		error = sof_getsockopt_impl(so, option_name, optval, optlenp,
+		    cr);
+	rw_exit(&so->so_fallback_rwlock);
+
+	return (error);
+}
+
+/*
+ * The socket `so' wants to inherit the filter stack from `pso'.
+ * Returns 0 if all went well or an errno otherwise.
+ */
+int
+sof_sonode_inherit_filters(struct sonode *so, struct sonode *pso)
+{
+	sof_instance_t *inst, *pinst;
+	sof_rval_t rval;
+	int error;
+	struct sockaddr_in6 laddrbuf, faddrbuf;
+	struct sockaddr_in6 *laddr, *faddr;
+	socklen_t laddrlen, faddrlen;
+
+	/*
+	 * Make sure there is enough room to retrieve the addresses
+	 */
+	if (so->so_proto_props.sopp_maxaddrlen > sizeof (laddrbuf)) {
+		laddr = kmem_zalloc(so->so_proto_props.sopp_maxaddrlen,
+		    KM_NOSLEEP);
+		if (laddr == NULL)
+			return (ENOMEM);
+		faddr = kmem_zalloc(so->so_proto_props.sopp_maxaddrlen,
+		    KM_NOSLEEP);
+		if (faddr == NULL) {
+			kmem_free(laddr, so->so_proto_props.sopp_maxaddrlen);
+			return (ENOMEM);
+		}
+		laddrlen = faddrlen = so->so_proto_props.sopp_maxaddrlen;
+	} else {
+		laddrlen = faddrlen = sizeof (laddrbuf);
+		laddr = &laddrbuf;
+		faddr = &faddrbuf;
+	}
+
+	error = (*so->so_downcalls->sd_getpeername)
+	    (so->so_proto_handle, (struct sockaddr *)faddr, &faddrlen, kcred);
+	if (error != 0)
+		goto out;
+	error = (*so->so_downcalls->sd_getsockname)
+	    (so->so_proto_handle, (struct sockaddr *)laddr, &laddrlen, kcred);
+	if (error != 0)
+		goto out;
+
+	/*
+	 * The stack is built bottom up. Filters are allowed to modify the
+	 * foreign and local addresses during attach.
+	 */
+	for (pinst = pso->so_filter_bottom;
+	    pinst != NULL && !(pinst->sofi_flags & SOFIF_BYPASS);
+	    pinst = pinst->sofi_prev) {
+		inst = sof_instance_create(pinst->sofi_filter, so);
+		if (inst == NULL) {
+			error = ENOMEM;
+			goto out;
+		}
+		/*
+		 * The filter module must be loaded since it's already
+		 * attached to the listener.
+		 */
+		ASSERT(pinst->sofi_ops != NULL);
+		inst->sofi_ops = pinst->sofi_ops;
+
+		SOF_STAT_ADD(inst, tot_passive_attach, 1);
+		if (inst->sofi_ops->sofop_attach_passive != NULL) {
+			rval = inst->sofi_ops->sofop_attach_passive(
+			    (sof_handle_t)inst,
+			    (sof_handle_t)pinst, pinst->sofi_cookie,
+			    (struct sockaddr *)laddr, laddrlen,
+			    (struct sockaddr *)faddr, faddrlen,
+			    &inst->sofi_cookie);
+			if (rval != SOF_RVAL_CONTINUE) {
+				if (rval == SOF_RVAL_DEFER) {
+					mutex_enter(&so->so_lock);
+					inst->sofi_flags |= SOFIF_DEFER;
+					so->so_state |= SS_FIL_DEFER;
+					mutex_exit(&so->so_lock);
+					so->so_filter_defertime =
+					    ddi_get_lbolt();
+					SOF_STAT_ADD(inst, ndeferred, 1);
+				} else if (rval == SOF_RVAL_DETACH) {
+					sof_instance_destroy(inst);
+				} else {
+					SOF_STAT_ADD(inst, attach_failures, 1);
+					error = sof_rval2errno(rval);
+					/*
+					 * Filters that called attach will be
+					 * destroyed when the socket goes away,
+					 * after detach is called.
+					 */
+					goto out;
+				}
+			}
+		}
+	}
+
+out:
+	if (laddr != &laddrbuf) {
+		kmem_free(laddr, so->so_proto_props.sopp_maxaddrlen);
+		kmem_free(faddr, so->so_proto_props.sopp_maxaddrlen);
+	}
+	return (error);
+}
+
+/*
+ * Attach any automatic filters to sonode `so'. Returns 0 if all went well
+ * and an errno otherwise.
+ */
+int
+sof_sonode_autoattach_filters(struct sonode *so, cred_t *cr)
+{
+	struct sockparams *sp = so->so_sockparams;
+	sp_filter_t *fil;
+	sof_instance_t *inst;
+	sof_rval_t rval;
+	int error;
+
+	/*
+	 * A created instance is added to the top of the sonode's filter
+	 * stack, so traverse the config list in reverse order.
+	 */
+	rw_enter(&sockconf_lock, RW_READER);
+	for (fil = list_tail(&sp->sp_auto_filters);
+	    fil != NULL; fil = list_prev(&sp->sp_auto_filters, fil)) {
+		ASSERT(fil->spf_filter->sofe_flags & SOFEF_AUTO);
+		if (!sof_instance_create(fil->spf_filter, so)) {
+			rw_exit(&sockconf_lock);
+			error = ENOMEM; /* must have run out of memory */
+			goto free_all;
+		}
+	}
+	rw_exit(&sockconf_lock);
+
+	/*
+	 * Notify each filter that it's being attached.
+	 */
+	inst = so->so_filter_top;
+	while (inst != NULL) {
+		sof_entry_t *ent = inst->sofi_filter;
+		sof_instance_t *ninst = inst->sofi_next;
+
+		/*
+		 * This might be the first time the filter is being used,
+		 * so try to load the module if it's not already registered.
+		 */
+		if (ent->sofe_mod == NULL &&
+		    (error = sof_entry_load_module(ent)) != 0)
+			goto free_detached;
+
+		/* Module loaded OK, so there must be an ops vector */
+		ASSERT(ent->sofe_mod != NULL);
+		inst->sofi_ops = &ent->sofe_mod->sofm_ops;
+
+		SOF_STAT_ADD(inst, tot_active_attach, 1);
+		if (inst->sofi_ops->sofop_attach_active != NULL) {
+			rval = inst->sofi_ops->sofop_attach_active(
+			    (sof_handle_t)inst, so->so_family, so->so_type,
+			    so->so_protocol, cr, &inst->sofi_cookie);
+			if (rval != SOF_RVAL_CONTINUE) {
+				switch (rval) {
+				case SOF_RVAL_DETACH:
+					/* filter does not want to attach */
+					sof_instance_destroy(inst);
+					break;
+				default:
+					SOF_STAT_ADD(inst, attach_failures, 1);
+					/* Not a valid rval for active attach */
+					ASSERT(rval != SOF_RVAL_DEFER);
+					error = sof_rval2errno(rval);
+					goto free_detached;
+				}
+			}
+		}
+		inst = ninst;
+	}
+	return (0);
+
+free_all:
+	inst = so->so_filter_top;
+free_detached:
+	ASSERT(inst != NULL);
+	/*
+	 * Destroy all filters for which attach was not called. The other
+	 * filters will be destroyed (and detach called) when the socket
+	 * is freed.
+	 */
+	do {
+		sof_instance_t *t = inst->sofi_next;
+		sof_instance_destroy(inst);
+		inst = t;
+	} while (inst != NULL);
+
+	return (error);
+}
+
+/*
+ * Detaches and frees all filters attached to sonode `so'.
+ */
+void
+sof_sonode_cleanup(struct sonode *so)
+{
+	sof_instance_t *inst;
+
+	while ((inst = so->so_filter_top) != NULL) {
+		(inst->sofi_ops->sofop_detach)((sof_handle_t)inst,
+		    inst->sofi_cookie, kcred);
+		sof_instance_destroy(inst);
+	}
+}
+
+/*
+ * Notifies all active filters attached to `so' about the `event' and
+ * where `arg' is an event specific argument.
+ */
+void
+sof_sonode_notify_filters(struct sonode *so, sof_event_t event, uintptr_t arg)
+{
+	sof_instance_t *inst;
+
+	for (inst = so->so_filter_bottom; inst != NULL;
+	    inst = inst->sofi_prev) {
+		if (SOF_INTERESTED(inst, notify))
+			(inst->sofi_ops->sofop_notify)((sof_handle_t)inst,
+			    inst->sofi_cookie, event, arg);
+	}
+}
+
+/*
+ * The socket `so' is closing. Notify filters and make sure that there
+ * are no pending tx operations.
+ */
+void
+sof_sonode_closing(struct sonode *so)
+{
+	/*
+	 * Notify filters that the socket is being closed. It's OK for
+	 * filters to inject data.
+	 */
+	sof_sonode_notify_filters(so, SOF_EV_CLOSING, (uintptr_t)B_TRUE);
+
+	/* wait for filters that are sending out data */
+	mutex_enter(&so->so_lock);
+	while (so->so_filter_tx > 0)
+		cv_wait(&so->so_closing_cv, &so->so_lock);
+	mutex_exit(&so->so_lock);
+}
+
+/*
+ * Called when socket `so' wants to get rid of a deferred connection.
+ * Returns TRUE if a connection was dropped.
+ */
+boolean_t
+sof_sonode_drop_deferred(struct sonode *so)
+{
+	struct sonode *def;
+	clock_t now = ddi_get_lbolt();
+
+	if (sof_close_deferred_backlog > sof_close_deferred_max_backlog) {
+		SOF_GLOBAL_STAT_BUMP(defer_close_failed_backlog_too_big);
+		return (B_FALSE);
+	}
+	mutex_enter(&so->so_acceptq_lock);
+	if ((def = list_head(&so->so_acceptq_defer)) != NULL &&
+	    (now - def->so_filter_defertime) > sof_defer_drop_time) {
+		list_remove(&so->so_acceptq_defer, def);
+		so->so_acceptq_len--;
+		mutex_exit(&so->so_acceptq_lock);
+		def->so_listener = NULL;
+	} else {
+		mutex_exit(&so->so_acceptq_lock);
+		return (B_FALSE);
+	}
+
+	mutex_enter(&sof_close_deferred_lock);
+	list_insert_tail(&sof_close_deferred_list, def);
+	sof_close_deferred_backlog++;
+	if (!sof_close_deferred_running) {
+		mutex_exit(&sof_close_deferred_lock);
+		(void) taskq_dispatch(sof_close_deferred_taskq,
+		    sof_close_deferred, NULL, TQ_NOSLEEP);
+	} else {
+		mutex_exit(&sof_close_deferred_lock);
+	}
+	return (B_TRUE);
+}
+
+/*
+ * Called from a taskq to close connections that have been deferred for
+ * too long.
+ */
+void
+sof_close_deferred(void *unused)
+{
+	struct sonode *drop;
+
+	_NOTE(ARGUNUSED(unused));
+
+	mutex_enter(&sof_close_deferred_lock);
+	if (!sof_close_deferred_running) {
+		sof_close_deferred_running = B_TRUE;
+		while ((drop =
+		    list_remove_head(&sof_close_deferred_list)) != NULL) {
+			sof_close_deferred_backlog--;
+			mutex_exit(&sof_close_deferred_lock);
+
+			SOF_GLOBAL_STAT_BUMP(defer_closed);
+			(void) socket_close(drop, 0, kcred);
+			socket_destroy(drop);
+
+			mutex_enter(&sof_close_deferred_lock);
+		}
+		sof_close_deferred_running = B_FALSE;
+		ASSERT(sof_close_deferred_backlog == 0);
+	}
+	mutex_exit(&sof_close_deferred_lock);
+}
+
+/*
+ * Creates a new filter instance from the entry `ent' and attaches
+ * it to the sonode `so'. On success, return a pointer to the created
+ * instance.
+ *
+ * The new instance will be placed on the top of the filter stack.
+ *
+ * The caller is responsible for assigning the instance's ops vector and
+ * calling the filter's attach callback.
+ *
+ * No locks are held while manipulating the sonode fields because we are
+ * guaranteed that this operation is serialized.
+ *
+ * We can be sure that the entry `ent' will not disappear, because the
+ * caller is either holding sockconf_lock (in case of an active open), or is
+ * already holding a reference (in case of a passive open, the listener has
+ * one).
+ */
+static sof_instance_t *
+sof_instance_create(sof_entry_t *ent, struct sonode *so)
+{
+	sof_instance_t *inst;
+
+	inst = kmem_zalloc(sizeof (sof_instance_t), KM_NOSLEEP);
+	if (inst == NULL)
+		return (NULL);
+	sof_entry_hold(ent);
+	inst->sofi_filter = ent;
+	inst->sofi_sonode = so;
+
+	inst->sofi_next = so->so_filter_top;
+	if (so->so_filter_top != NULL)
+		so->so_filter_top->sofi_prev = inst;
+	else
+		so->so_filter_bottom = inst;
+	so->so_filter_top = inst;
+	so->so_filter_active++;
+
+	return (inst);
+}
+/*
+ * Destroys the filter instance `inst' and unlinks it from the sonode.
+ *
+ * Any filter private state must be destroyed (via the detach callback)
+ * before the instance is destroyed.
+ */
+static void
+sof_instance_destroy(sof_instance_t *inst)
+{
+	struct sonode *so = inst->sofi_sonode;
+
+	ASSERT(inst->sofi_sonode != NULL);
+	ASSERT(inst->sofi_filter != NULL);
+	ASSERT(inst->sofi_prev != NULL || so->so_filter_top == inst);
+	ASSERT(inst->sofi_next != NULL || so->so_filter_bottom == inst);
+
+	if (inst->sofi_prev != NULL)
+		inst->sofi_prev->sofi_next = inst->sofi_next;
+	else
+		so->so_filter_top = inst->sofi_next;
+
+	if (inst->sofi_next != NULL)
+		inst->sofi_next->sofi_prev = inst->sofi_prev;
+	else
+		so->so_filter_bottom = inst->sofi_prev;
+
+	if (!(inst->sofi_flags & SOFIF_BYPASS)) {
+		ASSERT(so->so_filter_active > 0);
+		so->so_filter_active--;
+	}
+	if (inst->sofi_flags & SOFIF_DEFER)
+		SOF_STAT_ADD(inst, ndeferred, -1);
+	sof_entry_rele(inst->sofi_filter);
+	kmem_free(inst, sizeof (sof_instance_t));
+}
+
+static sof_entry_t *
+sof_entry_find(const char *name)
+{
+	sof_entry_t *ent;
+
+	for (ent = list_head(&sof_entry_list); ent != NULL;
+	    ent = list_next(&sof_entry_list, ent)) {
+		if (strncmp(ent->sofe_name, name, SOF_MAXNAMELEN) == 0)
+			return (ent);
+	}
+	return (NULL);
+}
+
+void
+sof_entry_free(sof_entry_t *ent)
+{
+	ASSERT(ent->sofe_refcnt == 0);
+	ASSERT(!list_link_active(&ent->sofe_node));
+
+	if (ent->sofe_hintarg != NULL) {
+		ASSERT(ent->sofe_hint == SOF_HINT_BEFORE ||
+		    ent->sofe_hint == SOF_HINT_AFTER);
+		kmem_free(ent->sofe_hintarg, strlen(ent->sofe_hintarg) + 1);
+		ent->sofe_hintarg = NULL;
+	}
+	if (ent->sofe_socktuple_cnt > 0) {
+		ASSERT(ent->sofe_socktuple != NULL);
+		kmem_free(ent->sofe_socktuple,
+		    sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt);
+		ent->sofe_socktuple = NULL;
+		ent->sofe_socktuple_cnt = 0;
+	}
+	sof_entry_kstat_destroy(ent);
+
+	mutex_destroy(&ent->sofe_lock);
+	kmem_free(ent, sizeof (sof_entry_t));
+}
+
+static int
+sof_entry_kstat_update(kstat_t *ksp, int rw)
+{
+	sof_entry_t *ent = ksp->ks_private;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	ent->sofe_kstat.sofek_nactive.value.ui64 = ent->sofe_refcnt;
+
+	return (0);
+}
+
+/*
+ * Create the kstat for filter entry `ent'.
+ */
+static int
+sof_entry_kstat_create(sof_entry_t *ent)
+{
+	char name[SOF_MAXNAMELEN + 7];
+
+	(void) snprintf(name, sizeof (name), "filter_%s", ent->sofe_name);
+	ent->sofe_ksp = kstat_create("sockfs", 0, name, "misc",
+	    KSTAT_TYPE_NAMED,
+	    sizeof (sof_entry_kstat_t) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+
+	if (ent->sofe_ksp == NULL)
+		return (ENOMEM);
+
+	kstat_named_init(&ent->sofe_kstat.sofek_nactive, "nactive",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ent->sofe_kstat.sofek_tot_active_attach,
+	    "tot_active_attach", KSTAT_DATA_UINT64);
+	kstat_named_init(&ent->sofe_kstat.sofek_tot_passive_attach,
+	    "tot_passive_attach", KSTAT_DATA_UINT64);
+	kstat_named_init(&ent->sofe_kstat.sofek_ndeferred, "ndeferred",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&ent->sofe_kstat.sofek_attach_failures,
+	    "attach_failures", KSTAT_DATA_UINT64);
+
+	ent->sofe_ksp->ks_data = &ent->sofe_kstat;
+	ent->sofe_ksp->ks_update = sof_entry_kstat_update;
+	ent->sofe_ksp->ks_private = ent;
+	kstat_install(ent->sofe_ksp);
+
+	return (0);
+}
+
+/*
+ * Destroys the kstat for filter entry `ent'.
+ */
+static void
+sof_entry_kstat_destroy(sof_entry_t *ent)
+{
+	if (ent->sofe_ksp != NULL) {
+		kstat_delete(ent->sofe_ksp);
+		ent->sofe_ksp = NULL;
+	}
+}
+
+static void
+sof_entry_hold(sof_entry_t *ent)
+{
+	mutex_enter(&ent->sofe_lock);
+	ent->sofe_refcnt++;
+	mutex_exit(&ent->sofe_lock);
+}
+
+/*
+ * Decrement the reference count for `ent'. The entry will
+ * drop its reference on the filter module whenever its
+ * ref count reaches zero.
+ */
+static void
+sof_entry_rele(sof_entry_t *ent)
+{
+	mutex_enter(&ent->sofe_lock);
+	if (--ent->sofe_refcnt == 0) {
+		sof_module_t *mod = ent->sofe_mod;
+		ent->sofe_mod = NULL;
+		if (ent->sofe_flags & SOFEF_CONDEMED) {
+			mutex_exit(&ent->sofe_lock);
+			sof_entry_free(ent);
+		} else {
+			mutex_exit(&ent->sofe_lock);
+		}
+		if (mod != NULL)
+			sof_module_rele(mod);
+	} else {
+		mutex_exit(&ent->sofe_lock);
+	}
+}
+
+/*
+ * Loads the module used by `ent'
+ */
+static int
+sof_entry_load_module(sof_entry_t *ent)
+{
+	sof_module_t *mod = sof_module_hold_by_name(ent->sofe_name,
+	    ent->sofe_modname);
+
+	if (mod == NULL)
+		return (EINVAL);
+
+	mutex_enter(&ent->sofe_lock);
+	/* Another thread might have already loaded the module */
+	ASSERT(ent->sofe_mod == mod || ent->sofe_mod == NULL);
+	if (ent->sofe_mod != NULL) {
+		mutex_exit(&ent->sofe_lock);
+		sof_module_rele(mod);
+	} else {
+		ent->sofe_mod = mod;
+		mutex_exit(&ent->sofe_lock);
+	}
+
+	return (0);
+}
+
+/*
+ * Add filter entry `ent' to the global list and attach it to all sockparam
+ * entries which the filter is interested in. Upon successful return the filter
+ * will be available for applications to use.
+ */
+int
+sof_entry_add(sof_entry_t *ent)
+{
+	int error;
+
+	/*
+	 * We hold sockconf_lock as a WRITER for the whole operation,
+	 * so all operations must be non-blocking.
+	 */
+	rw_enter(&sockconf_lock, RW_WRITER);
+	if (sof_entry_find(ent->sofe_name) != NULL) {
+		rw_exit(&sockconf_lock);
+		return (EEXIST);
+	}
+
+	/* The entry is unique; create the kstats */
+	if (sof_entry_kstat_create(ent) != 0) {
+		rw_exit(&sockconf_lock);
+		return (ENOMEM);
+	}
+
+	/*
+	 * Attach the filter to sockparams of interest.
+	 */
+	if ((error = sockparams_new_filter(ent)) != 0) {
+		sof_entry_kstat_destroy(ent);
+		rw_exit(&sockconf_lock);
+		return (error);
+	}
+	/*
+	 * Everything is OK; insert in global list.
+	 */
+	list_insert_tail(&sof_entry_list, ent);
+	rw_exit(&sockconf_lock);
+
+	return (0);
+}
+
+/*
+ * Removes the filter entry `ent' from global list and all sockparams.
+ */
+sof_entry_t *
+sof_entry_remove_by_name(const char *name)
+{
+	sof_entry_t *ent;
+
+	rw_enter(&sockconf_lock, RW_WRITER);
+	if ((ent = sof_entry_find(name)) == NULL) {
+		rw_exit(&sockconf_lock);
+		return (NULL);
+	}
+	list_remove(&sof_entry_list, ent);
+	sockparams_filter_cleanup(ent);
+	sof_entry_kstat_destroy(ent);
+	rw_exit(&sockconf_lock);
+
+	return (ent);
+}
+
+/*
+ * Filter entry `ent' will process sockparams entry `sp' to determine whether
+ * it should be attached to the sockparams. It should be called whenever a new
+ * filter or sockparams is being added. Returns zero either if the filter is
+ * not interested in the sockparams or if it successfully attached to the
+ * sockparams. On failure an errno is returned.
+ */
+int
+sof_entry_proc_sockparams(sof_entry_t *ent, struct sockparams *sp)
+{
+	uint_t i;
+	sof_socktuple_t *t = ent->sofe_socktuple;
+	sp_filter_t *new, *fil;
+
+	/* Only interested in non-TPI sockets */
+	if (strcmp(sp->sp_smod_name, SOTPI_SMOD_NAME) == 0)
+		return (0);
+
+	for (i = 0; i < ent->sofe_socktuple_cnt; i++) {
+		if (t[i].sofst_family == sp->sp_family &&
+		    t[i].sofst_type == sp->sp_type &&
+		    t[i].sofst_protocol == sp->sp_protocol)
+			break;
+	}
+	/* This filter is not interested in the sockparams entry */
+	if (i == ent->sofe_socktuple_cnt)
+		return (0);
+
+	new = kmem_zalloc(sizeof (sp_filter_t), KM_NOSLEEP);
+	if (new == NULL)
+		return (ENOMEM);
+
+	new->spf_filter = ent;
+	if (ent->sofe_flags & SOFEF_PROG) {
+		/* placement is irrelevant for programmatic filters */
+		list_insert_head(&sp->sp_prog_filters, new);
+		return (0);
+	} else {
+		ASSERT(ent->sofe_flags & SOFEF_AUTO);
+		/*
+		 * If the filter specifies a placement hint, then make sure
+		 * it can be satisfied.
+		 */
+		switch (ent->sofe_hint) {
+		case SOF_HINT_TOP:
+			if ((fil = list_head(&sp->sp_auto_filters)) != NULL &&
+			    fil->spf_filter->sofe_hint == SOF_HINT_TOP)
+				break;
+			list_insert_head(&sp->sp_auto_filters, new);
+			return (0);
+		case SOF_HINT_BOTTOM:
+			if ((fil = list_tail(&sp->sp_auto_filters)) != NULL &&
+			    fil->spf_filter->sofe_hint == SOF_HINT_BOTTOM)
+				break;
+			list_insert_tail(&sp->sp_auto_filters, new);
+			return (0);
+		case SOF_HINT_BEFORE:
+		case SOF_HINT_AFTER:
+			for (fil = list_head(&sp->sp_auto_filters);
+			    fil != NULL;
+			    fil = list_next(&sp->sp_auto_filters, fil)) {
+				if (strncmp(ent->sofe_hintarg,
+				    fil->spf_filter->sofe_name,
+				    SOF_MAXNAMELEN) == 0)
+				break;
+			}
+
+			if (fil != NULL) {
+				if (ent->sofe_hint == SOF_HINT_BEFORE) {
+					if (fil->spf_filter->sofe_hint ==
+					    SOF_HINT_TOP)
+						break;
+					list_insert_before(&sp->sp_auto_filters,
+					    fil, new);
+				} else {
+					if (fil->spf_filter->sofe_hint ==
+					    SOF_HINT_BOTTOM)
+						break;
+					list_insert_after(&sp->sp_auto_filters,
+					    fil, new);
+				}
+				return (0);
+			}
+			/*FALLTHRU*/
+		case SOF_HINT_NONE:
+			/*
+			 * Insert the new filter at the beginning as long as it
+			 * does not violate a TOP hint, otherwise insert in the
+			 * next suitable location.
+			 */
+			if ((fil = list_head(&sp->sp_auto_filters)) != NULL &&
+			    fil->spf_filter->sofe_hint == SOF_HINT_TOP) {
+				list_insert_after(&sp->sp_auto_filters, fil,
+				    new);
+			} else {
+				list_insert_head(&sp->sp_auto_filters, new);
+			}
+			return (0);
+		}
+		/* Failed to insert the filter */
+		kmem_free(new, sizeof (sp_filter_t));
+		return (ENOSPC);
+	}
+}
+
+/*
+ * Remove all filter entries attached to the sockparams entry `sp'.
+ */
+void
+sof_sockparams_fini(struct sockparams *sp)
+{
+	sp_filter_t *fil;
+
+	ASSERT(!list_link_active(&sp->sp_node));
+
+	while ((fil = list_remove_head(&sp->sp_auto_filters)) != NULL)
+		kmem_free(fil, sizeof (sp_filter_t));
+	while ((fil = list_remove_head(&sp->sp_prog_filters)) != NULL)
+		kmem_free(fil, sizeof (sp_filter_t));
+}
+
+/*
+ * A new sockparams is being added. Walk all filters and attach those that
+ * are interested in the entry.
+ *
+ * It should be called when the sockparams entry is about to be made available
+ * for use and while holding the sockconf_lock.
+ */
+int
+sof_sockparams_init(struct sockparams *sp)
+{
+	sof_entry_t *ent;
+
+	ASSERT(RW_WRITE_HELD(&sockconf_lock));
+
+	for (ent = list_head(&sof_entry_list); ent != NULL;
+	    ent = list_next(&sof_entry_list, ent)) {
+		if (sof_entry_proc_sockparams(ent, sp) != 0) {
+			sof_sockparams_fini(sp);
+			return (ENOMEM);
+		}
+	}
+	return (0);
+}
+
+static sof_module_t *
+sof_module_find(const char *name)
+{
+	sof_module_t *ent;
+
+	ASSERT(MUTEX_HELD(&sof_module_lock));
+
+	for (ent = list_head(&sof_module_list); ent != NULL;
+	    ent = list_next(&sof_module_list, ent))
+		if (strcmp(ent->sofm_name, name) == 0)
+			return (ent);
+	return (NULL);
+}
+
+/*
+ * Returns a pointer to a module identified by `name' with its ref count
+ * bumped. An attempt to load the module is done if it's not found in the
+ * global list.
+ */
+sof_module_t *
+sof_module_hold_by_name(const char *name, const char *modname)
+{
+	ddi_modhandle_t handle = NULL;
+	sof_module_t *mod = NULL;
+	char *modpath;
+	int error;
+
+	/*
+	 * We'll go through the loop at most two times, which will only
+	 * happen if the module needs to be loaded.
+	 */
+	for (;;) {
+		mutex_enter(&sof_module_lock);
+		mod = sof_module_find(name);
+		if (mod != NULL || handle != NULL)
+			break;
+		mutex_exit(&sof_module_lock);
+
+		modpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+		(void) snprintf(modpath, MAXPATHLEN, "%s/%s", SOF_MODPATH,
+		    modname);
+		handle = ddi_modopen(modpath, KRTLD_MODE_FIRST, &error);
+		kmem_free(modpath, MAXPATHLEN);
+		/* Failed to load, then bail */
+		if (handle == NULL) {
+			cmn_err(CE_WARN,
+			    "Failed to load socket filter module: %s (err %d)",
+			    modname, error);
+			return (NULL);
+		}
+	}
+	if (mod != NULL)
+		mod->sofm_refcnt++;
+	mutex_exit(&sof_module_lock);
+
+	if (handle != NULL) {
+		(void) ddi_modclose(handle);
+		/*
+		 * The module was loaded, but the filter module could not be
+		 * found. It's likely a misconfigured filter.
+		 */
+		if (mod == NULL) {
+			cmn_err(CE_WARN,
+			    "Socket filter module %s was loaded, but did not "
+			    "register. Filter %s is likely misconfigured.",
+			    modname, name);
+		}
+	}
+
+	return (mod);
+}
+
+void
+sof_module_rele(sof_module_t *mod)
+{
+	mutex_enter(&sof_module_lock);
+	mod->sofm_refcnt--;
+	mutex_exit(&sof_module_lock);
+}
+
+int
+sof_rval2errno(sof_rval_t rval)
+{
+	if (rval > SOF_RVAL_CONTINUE) {
+		return ((int)rval);
+	} else {
+#ifdef DEBUG
+		if (socket_filter_debug)
+			printf("sof_rval2errno: invalid rval '%d'\n", rval);
+#endif
+		return (EINVAL);
+	}
+}
+
+/*
+ * Walk through all the filters attached to `so' and allow each filter
+ * to process the data using its data_out callback. `mp' is a b_cont chain.
+ *
+ * Returns the processed mblk, or NULL if mblk was consumed. The mblk might
+ * have been consumed as a result of an error, in which case `errp' is set to
+ * the appropriate errno.
+ */
+mblk_t *
+sof_filter_data_out_from(struct sonode *so, sof_instance_t *start,
+    mblk_t *mp, struct nmsghdr *msg, cred_t *cr, int *errp)
+{
+	sof_instance_t *inst;
+	sof_rval_t rval;
+
+	_NOTE(ARGUNUSED(so));
+
+	for (inst = start; inst != NULL; inst = inst->sofi_next) {
+		if (!SOF_INTERESTED(inst, data_out))
+			continue;
+		mp = (inst->sofi_ops->sofop_data_out)((sof_handle_t)inst,
+		    inst->sofi_cookie, mp, msg, cr, &rval);
+		DTRACE_PROBE2(filter__data, (sof_instance_t), inst,
+		    (mblk_t *), mp);
+		if (mp == NULL) {
+			*errp = sof_rval2errno(rval);
+			break;
+		}
+	}
+	return (mp);
+}
+
+/*
+ * Walk through all the filters attached to `so' and allow each filter
+ * to process the data using its data_in_proc callback. `mp' is the start of
+ * a possible b_next chain, and `lastmp' points to the last mblk in the chain.
+ *
+ * Returns the processed mblk, or NULL if all mblks in the chain were
+ * consumed. `lastmp' is updated to point to the last mblk in the processed
+ * chain.
+ */
+mblk_t *
+sof_filter_data_in_proc(struct sonode *so, mblk_t *mp, mblk_t **lastmp)
+{
+	sof_instance_t *inst;
+	size_t len = 0, orig = 0;
+	ssize_t diff = 0;
+	mblk_t *retmp = NULL, *tailmp, *nextmp;
+
+	*lastmp = NULL;
+	do {
+		nextmp = mp->b_next;
+		mp->b_next = mp->b_prev = NULL;
+		len = orig = msgdsize(mp);
+		for (inst = so->so_filter_bottom; inst != NULL;
+		    inst = inst->sofi_prev) {
+			if (!SOF_INTERESTED(inst, data_in_proc))
+				continue;
+			mp = (inst->sofi_ops->sofop_data_in_proc)(
+			    (sof_handle_t)inst, inst->sofi_cookie, mp,
+			    kcred, &len);
+			if (mp == NULL)
+				break;
+		}
+		DTRACE_PROBE2(filter__data, (sof_instance_t), inst,
+		    (mblk_t *), mp);
+		diff += len - orig;
+		if (mp == NULL)
+			continue;
+
+		for (tailmp = mp; tailmp->b_cont != NULL;
+		    tailmp = tailmp->b_cont)
+			;
+		mp->b_prev = tailmp;
+
+		if (*lastmp == NULL)
+			retmp = mp;
+		else
+			(*lastmp)->b_next = mp;
+		*lastmp = mp;
+	} while ((mp = nextmp) != NULL);
+
+	/*
+	 * The size of the chain has changed; make sure the rcv queue
+	 * stays consistent and check if the flow control state should
+	 * change.
+	 */
+	if (diff != 0) {
+		DTRACE_PROBE2(filter__data__adjust__qlen,
+		    (struct sonode *), so, (size_t), diff);
+		mutex_enter(&so->so_lock);
+		so->so_rcv_queued += diff;
+		/* so_check_flow_control drops so_lock */
+		so_check_flow_control(so);
+	}
+
+	return (retmp);
+}
+
+int
+sof_filter_bind(struct sonode *so, struct sockaddr *addr,
+    socklen_t *addrlen, cred_t *cr)
+{
+	__SOF_FILTER_OP(so, bind, cr, addr, addrlen)
+}
+
+int
+sof_filter_listen(struct sonode *so, int *backlogp, cred_t *cr)
+{
+	__SOF_FILTER_OP(so, listen, cr, backlogp)
+}
+
+int
+sof_filter_connect(struct sonode *so, struct sockaddr *addr,
+    socklen_t *addrlen, cred_t *cr)
+{
+	__SOF_FILTER_OP(so, connect, cr, addr, addrlen)
+}
+
+int
+sof_filter_accept(struct sonode *so, cred_t *cr)
+{
+	sof_instance_t *inst;
+	sof_rval_t rval;
+
+	for (inst = so->so_filter_top; inst != NULL; inst = inst->sofi_next) {
+		if (!SOF_INTERESTED(inst, accept))
+			continue;
+		rval = (inst->sofi_ops->sofop_accept)((sof_handle_t)inst,
+		    inst->sofi_cookie, cr);
+		DTRACE_PROBE2(filter__action, (sof_instance_t), inst,
+		    (sof_rval_t), rval);
+		if (rval != SOF_RVAL_CONTINUE) {
+			ASSERT(rval != SOF_RVAL_RETURN);
+			return (sof_rval2errno(rval));
+		}
+	}
+	return (-1);
+}
+
+int
+sof_filter_shutdown(struct sonode *so, int *howp, cred_t *cr)
+{
+	__SOF_FILTER_OP(so, shutdown, cr, howp)
+}
+
+int
+sof_filter_getsockname(struct sonode *so, struct sockaddr *addr,
+    socklen_t *addrlenp, cred_t *cr)
+{
+	__SOF_FILTER_OP(so, getsockname, cr, addr, addrlenp)
+}
+
+int
+sof_filter_getpeername(struct sonode *so, struct sockaddr *addr,
+    socklen_t *addrlenp, cred_t *cr)
+{
+	__SOF_FILTER_OP(so, getpeername, cr, addr, addrlenp)
+}
+
+int
+sof_filter_setsockopt(struct sonode *so, int level, int option_name,
+    void *optval, socklen_t *optlenp, cred_t *cr)
+{
+	__SOF_FILTER_OP(so, setsockopt, cr, level, option_name,
+	    optval, optlenp)
+}
+
+int
+sof_filter_getsockopt(struct sonode *so, int level, int option_name,
+    void *optval, socklen_t *optlenp, cred_t *cr)
+{
+	__SOF_FILTER_OP(so, getsockopt, cr, level, option_name,
+	    optval, optlenp)
+}
+
+int
+sof_filter_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
+    int32_t *rvalp, cred_t *cr)
+{
+	__SOF_FILTER_OP(so, ioctl, cr, cmd, arg, mode, rvalp)
+}
+
+/*
+ * sof_register(version, name, ops, flags)
+ *
+ * Register a socket filter identified by name `name' and which should use
+ * the ops vector `ops' for event notification. `flags' should be set to 0.
+ * On success 0 is returned, otherwise an errno is returned.
+ */
+int
+sof_register(int version, const char *name, const sof_ops_t *ops, int flags)
+{
+	sof_module_t *mod;
+
+	_NOTE(ARGUNUSED(flags));
+
+	if (version != SOF_VERSION)
+		return (EINVAL);
+
+	mod = kmem_zalloc(sizeof (sof_module_t), KM_SLEEP);
+	mod->sofm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
+	(void) strcpy(mod->sofm_name, name);
+	mod->sofm_ops = *ops;
+
+	mutex_enter(&sof_module_lock);
+	if (sof_module_find(name) != NULL) {
+		mutex_exit(&sof_module_lock);
+		kmem_free(mod->sofm_name, strlen(mod->sofm_name) + 1);
+		kmem_free(mod, sizeof (sof_module_t));
+		return (EEXIST);
+	}
+	list_insert_tail(&sof_module_list, mod);
+	mutex_exit(&sof_module_lock);
+
+	return (0);
+}
+
+/*
+ * sof_unregister(name)
+ *
+ * Try to unregister the socket filter identified by `name'. If the filter
+ * is successfully unregistered, then 0 is returned, otherwise an errno is
+ * returned.
+ */
+int
+sof_unregister(const char *name)
+{
+	sof_module_t *mod;
+
+	mutex_enter(&sof_module_lock);
+	mod = sof_module_find(name);
+	if (mod != NULL) {
+		if (mod->sofm_refcnt == 0) {
+			list_remove(&sof_module_list, mod);
+			mutex_exit(&sof_module_lock);
+
+			kmem_free(mod->sofm_name, strlen(mod->sofm_name) + 1);
+			kmem_free(mod, sizeof (sof_module_t));
+			return (0);
+		} else {
+			mutex_exit(&sof_module_lock);
+			return (EBUSY);
+		}
+	}
+	mutex_exit(&sof_module_lock);
+
+	return (ENXIO);
+}
+
+/*
+ * sof_newconn_ready(handle)
+ *
+ * The filter `handle' no longer wants to defer the socket it is attached
+ * to. A newconn notification will be generated if there is no other filter
+ * that wants the socket deferred.
+ */
+void
+sof_newconn_ready(sof_handle_t handle)
+{
+	sof_instance_t *inst = (sof_instance_t *)handle;
+	struct sonode *so = inst->sofi_sonode;
+	struct sonode *pso = so->so_listener;
+
+	mutex_enter(&so->so_lock);
+	if (!(inst->sofi_flags & SOFIF_DEFER)) {
+		mutex_exit(&so->so_lock);
+		return;
+	}
+	ASSERT(so->so_state & SS_FIL_DEFER);
+	inst->sofi_flags &= ~SOFIF_DEFER;
+	SOF_STAT_ADD(inst, ndeferred, -1);
+
+	/*
+	 * Check if any other filter has deferred the socket. The last
+	 * filter to remove its DEFER flag will be the one generating the
+	 * wakeup.
+	 */
+	for (inst = so->so_filter_top; inst != NULL; inst = inst->sofi_next) {
+		/* Still deferred; nothing to do */
+		if (inst->sofi_flags & SOFIF_DEFER) {
+			mutex_exit(&so->so_lock);
+			return;
+		}
+	}
+	so->so_state &= ~SS_FIL_DEFER;
+	mutex_exit(&so->so_lock);
+
+	/*
+	 * The socket is no longer deferred; move it over to the regular
+	 * accept list and notify the user. However, it is possible that
+	 * the socket is being dropped by sof_sonode_drop_deferred(), so
+	 * first make sure the socket is on the deferred list.
+	 */
+	mutex_enter(&pso->so_acceptq_lock);
+	if (!list_link_active(&so->so_acceptq_node)) {
+		mutex_exit(&pso->so_acceptq_lock);
+		return;
+	}
+	list_remove(&pso->so_acceptq_defer, so);
+	list_insert_tail(&pso->so_acceptq_list, so);
+	cv_signal(&pso->so_acceptq_cv);
+	mutex_exit(&pso->so_acceptq_lock);
+
+	mutex_enter(&pso->so_lock);
+	so_notify_newconn(pso);		/* so_notify_newconn drops the lock */
+}
+
+/*
+ * sof_bypass(handle)
+ *
+ * Stop generating callbacks for `handle'.
+ */
+void
+sof_bypass(sof_handle_t handle)
+{
+	sof_instance_t *inst = (sof_instance_t *)handle;
+	struct sonode *so = inst->sofi_sonode;
+
+	mutex_enter(&so->so_lock);
+	if (!(inst->sofi_flags & SOFIF_BYPASS)) {
+		inst->sofi_flags |= SOFIF_BYPASS;
+		ASSERT(so->so_filter_active > 0);
+		so->so_filter_active--;
+	}
+	mutex_exit(&so->so_lock);
+}
+
+/*
+ * sof_rcv_flowctrl(handle, enable)
+ *
+ * If `enable' is TRUE, then recv side flow control will be asserted for
+ * the socket associated with `handle'. When `enable' is FALSE the filter
+ * indicates that it no longer wants to assert flow control, however, the
+ * condition will not be removed until there are no other filters asserting
+ * flow control and there is space available in the receive buffer.
+ */
+void
+sof_rcv_flowctrl(sof_handle_t handle, boolean_t enable)
+{
+	sof_instance_t *inst = (sof_instance_t *)handle;
+	struct sonode *so = inst->sofi_sonode;
+
+	mutex_enter(&so->so_lock);
+	if (enable) {
+		inst->sofi_flags |= SOFIF_RCV_FLOWCTRL;
+		so->so_flowctrld = B_TRUE;
+		so->so_state |= SS_FIL_RCV_FLOWCTRL;
+		mutex_exit(&so->so_lock);
+	} else {
+		inst->sofi_flags &= ~SOFIF_RCV_FLOWCTRL;
+		for (inst = so->so_filter_top; inst != NULL;
+		    inst = inst->sofi_next) {
+			/* another filter is asserting flow control */
+			if (inst->sofi_flags & SOFIF_RCV_FLOWCTRL) {
+				mutex_exit(&so->so_lock);
+				return;
+			}
+		}
+		so->so_state &= ~SS_FIL_RCV_FLOWCTRL;
+		/* so_check_flow_control drops so_lock */
+		so_check_flow_control(so);
+	}
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * sof_snd_flowctrl(handle, enable)
+ *
+ * If `enable' is TRUE, then send side flow control will be asserted for
+ * the socket associated with `handle'. When `enable' is FALSE the filter
+ * indicates that it no longer wants to assert flow control, however, the
+ * condition will not be removed until there are no other filters asserting
+ * flow control and there are tx buffers available.
+ */
+void
+sof_snd_flowctrl(sof_handle_t handle, boolean_t enable)
+{
+	sof_instance_t *inst = (sof_instance_t *)handle;
+	struct sonode *so = inst->sofi_sonode;
+
+	mutex_enter(&so->so_lock);
+	if (enable) {
+		inst->sofi_flags |= SOFIF_SND_FLOWCTRL;
+		so->so_state |= SS_FIL_SND_FLOWCTRL;
+	} else {
+		inst->sofi_flags &= ~SOFIF_SND_FLOWCTRL;
+		for (inst = so->so_filter_top; inst != NULL;
+		    inst = inst->sofi_next) {
+			if (inst->sofi_flags & SOFIF_SND_FLOWCTRL) {
+				mutex_exit(&so->so_lock);
+				return;
+			}
+		}
+		so->so_state &= ~SS_FIL_SND_FLOWCTRL;
+		/*
+		 * Wake up writer if the socket is no longer flow controlled.
+		 */
+		if (!SO_SND_FLOWCTRLD(so)) {
+			/* so_notify_writable drops so_lock */
+			so_notify_writable(so);
+			return;
+		}
+	}
+	mutex_exit(&so->so_lock);
+}
+
+/*
+ * sof_get_cookie(handle)
+ *
+ * Returns the cookie used by `handle'.
+ */
+void *
+sof_get_cookie(sof_handle_t handle)
+{
+	return (((sof_instance_t *)handle)->sofi_cookie);
+}
+
+/*
+ * sof_cas_cookie(handle, old, new)
+ *
+ * Compare-and-swap the cookie used by `handle'.
+ */
+void *
+sof_cas_cookie(sof_handle_t handle, void *old, void *new)
+{
+	sof_instance_t *inst = (sof_instance_t *)handle;
+
+	return (atomic_cas_ptr(&inst->sofi_cookie, old, new));
+}
+
+/*
+ * sof_inject_data_out(handle, mp, msg, flowctrld)
+ *
+ * Submit `mp' for transmission. `msg' cannot be NULL, and may contain
+ * ancillary data and destination address. Returns 0 when successful
+ * in which case `flowctrld' is updated. If flow controlled, no new data
+ * should be injected until a SOF_EV_INJECT_DATA_OUT_OK event is observed.
+ * In case of failure, an errno is returned.
+ *
+ * Filters that are lower in the stack than `handle' will see the data
+ * before it is transmitted and may end up modifying or freeing the data.
+ */
+int
+sof_inject_data_out(sof_handle_t handle, mblk_t *mp, struct nmsghdr *msg,
+    boolean_t *flowctrld)
+{
+	sof_instance_t *inst = (sof_instance_t *)handle;
+	struct sonode *so = inst->sofi_sonode;
+	int error;
+
+	/*
+	 * Data cannot be sent down to the protocol once the socket has
+	 * started the process of closing.
+	 */
+	mutex_enter(&so->so_lock);
+	if (so->so_state & SS_CLOSING) {
+		mutex_exit(&so->so_lock);
+		freemsg(mp);
+		return (EPIPE);
+	}
+	so->so_filter_tx++;
+	mutex_exit(&so->so_lock);
+
+	error = so_sendmblk_impl(inst->sofi_sonode, msg, FNONBLOCK,
+	    kcred, &mp, inst->sofi_next, B_TRUE);
+
+	mutex_enter(&so->so_lock);
+	ASSERT(so->so_filter_tx > 0);
+	so->so_filter_tx--;
+	if (so->so_state & SS_CLOSING)
+		cv_signal(&so->so_closing_cv);
+	mutex_exit(&so->so_lock);
+
+	if (mp != NULL)
+		freemsg(mp);
+
+	if (error == ENOSPC) {
+		*flowctrld = B_TRUE;
+		error = 0;
+	} else {
+		*flowctrld = B_FALSE;
+	}
+
+	return (error);
+}
+
+/*
+ * sof_inject_data_in(handle, mp, len, flag, flowctrld)
+ *
+ * Enqueue `mp' which contains `len' bytes of M_DATA onto the socket
+ * associated with `handle'. `flags' should be set to 0. Returns 0 when
+ * successful in which case `flowctrld' is updated. If flow controlled,
+ * no new data should be injected until a SOF_EV_INJECT_DATA_IN_OK event
+ * is observed.  In case of failure, an errno is returned.
+ *
+ * Filters that are higher in the stack than `handle' will see the data
+ * before it is enqueued on the receive queue and may end up modifying or
+ * freeing the data.
+ */
+int
+sof_inject_data_in(sof_handle_t handle, mblk_t *mp, size_t len, int flags,
+    boolean_t *flowctrld)
+{
+	sof_instance_t *inst = (sof_instance_t *)handle;
+	ssize_t avail;
+	int error = 0;
+
+	ASSERT(flags == 0);
+	avail = so_queue_msg_impl(inst->sofi_sonode, mp, len, flags, &error,
+	    NULL, inst->sofi_prev);
+	/* fallback should never happen when there is an active filter */
+	ASSERT(error != EOPNOTSUPP);
+
+	*flowctrld = (avail > 0) ? B_FALSE : B_TRUE;
+	return (error);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h	Thu Jun 17 17:22:09 2010 -0700
@@ -0,0 +1,213 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	_SOCKFS_SOCKFILTER_H
+#define	_SOCKFS_SOCKFILTER_H
+
+#include <sys/kstat.h>
+#include <sys/list.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sockfilter.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct sonode;
+struct sockparams;
+
+typedef struct sof_module	sof_module_t;
+typedef struct sof_entry_kstat	sof_entry_kstat_t;
+typedef struct sof_entry	sof_entry_t;
+typedef struct sof_instance	sof_instance_t;
+typedef struct sof_kstat	sof_kstat_t;
+
+#define	SOF_MAXNAMELEN		FILNAME_MAX
+#define	SOF_MAXSOCKTUPLECNT	32
+#define	SOF_MODPATH		SOCKMOD_PATH
+
+struct sof_module {
+	char		*sofm_name;
+	sof_ops_t	sofm_ops;
+	uint_t		sofm_refcnt;
+	list_node_t	sofm_node;
+};
+
+struct sof_kstat {
+	kstat_named_t	sofks_defer_closed;
+	kstat_named_t	sofks_defer_close_backlog;
+	kstat_named_t	sofks_defer_close_failed_backlog_too_big;
+};
+
+#define	SOF_GLOBAL_STAT_BUMP(s) \
+	atomic_add_64(&sof_stat.sofks_##s.value.ui64, 1)
+
+/*
+ * Per filter statistics.
+ */
+struct sof_entry_kstat {
+	kstat_named_t	sofek_nactive;		/* # of consumers */
+	kstat_named_t	sofek_tot_active_attach;
+	kstat_named_t	sofek_tot_passive_attach;
+	kstat_named_t	sofek_ndeferred; 	/* # of deferred conns */
+	kstat_named_t	sofek_attach_failures;
+};
+
+/*
+ * Socket filter entry - one for each configured filter (added and
+ * removed by soconfig(1M)).
+ *
+ * sofe_flags, sofe_refcnt and sofe_mod are protected by sofe_lock, and all
+ * other fields are write once.
+ */
+struct sof_entry {
+	char		sofe_name[SOF_MAXNAMELEN];	/* filter name */
+	char		sofe_modname[MODMAXNAMELEN];	/* filter module */
+	sof_hint_t	sofe_hint;			/* order hint */
+	char		*sofe_hintarg;			/* hint argument */
+	list_node_t	sofe_node;			/* global list node */
+	uint_t		sofe_socktuple_cnt;		/* # of socket tuples */
+	sof_socktuple_t	*sofe_socktuple;		/* socket tuple list */
+
+	sof_entry_kstat_t sofe_kstat;			/* filter stats */
+	kstat_t		*sofe_ksp;
+
+	kmutex_t	sofe_lock;
+	char		sofe_flags;			/* SOFEF_* flags */
+	uint_t		sofe_refcnt;			/* # of instances */
+	sof_module_t	*sofe_mod;			/* filter module */
+};
+
+/* Filter entry flags */
+#define	SOFEF_AUTO	0x1	/* automatic filter */
+#define	SOFEF_PROG	0x2	/* programmatic filter */
+#define	SOFEF_CONDEMED	0x4	/* removed by soconfig(1M) */
+
+/*
+ * Socket filter instance - one for each socket using a sof_entry_t
+ */
+struct sof_instance {
+	sof_ops_t	*sofi_ops;	/* filter ops */
+	void		*sofi_cookie;	/* filter cookie (from attach) */
+	char		sofi_flags;	/* instance flags (SOFIF_*) */
+	sof_instance_t	*sofi_prev;	/* up the stack */
+	sof_instance_t	*sofi_next;	/* down the stack */
+	struct sonode	*sofi_sonode;	/* socket instance is attached to */
+	sof_entry_t	*sofi_filter;	/* filter this is an instance of */
+};
+
+/* Filter instance flags */
+#define	SOFIF_BYPASS		0x1	/* filter does not want any callbacks */
+#define	SOFIF_DEFER		0x2	/* defer notification of socket */
+#define	SOFIF_RCV_FLOWCTRL	0x4	/* flow control recv path */
+#define	SOFIF_SND_FLOWCTRL	0x8	/* flow control send path */
+
+#define	SOF_STAT_ADD(i, s, v) \
+	atomic_add_64(&(i)->sofi_filter->sofe_kstat.sofek_##s.value.ui64, (v))
+
+extern void	sof_init(void);
+
+extern void 	sof_entry_free(sof_entry_t *);
+extern int	sof_entry_add(sof_entry_t *);
+extern sof_entry_t *sof_entry_remove_by_name(const char *);
+extern int 	sof_entry_proc_sockparams(sof_entry_t *, struct sockparams *);
+
+extern int	sof_sockparams_init(struct sockparams *);
+extern void	sof_sockparams_fini(struct sockparams *);
+
+extern int	sof_sonode_autoattach_filters(struct sonode *, cred_t *);
+extern int	sof_sonode_inherit_filters(struct sonode *, struct sonode *);
+extern void	sof_sonode_closing(struct sonode *);
+extern void	sof_sonode_cleanup(struct sonode *);
+extern void	sof_sonode_notify_filters(struct sonode *, sof_event_t,
+    uintptr_t);
+extern boolean_t sof_sonode_drop_deferred(struct sonode *);
+
+extern int 	sof_setsockopt(struct sonode *, int, const void *, socklen_t,
+    struct cred *);
+extern int 	sof_getsockopt(struct sonode *, int, void *, socklen_t *,
+    struct cred *);
+
+extern int	sof_rval2errno(sof_rval_t);
+
+#define	SOF_INTERESTED(inst, op)			\
+	(!((inst)->sofi_flags & SOFIF_BYPASS) &&	\
+	(inst)->sofi_ops->sofop_##op != NULL)
+
+/*
+ * SOF_FILTER_OP traverses the filter stack for sonode `so' top-down,
+ * calling `op' for each filter with the supplied `args'. A non-negative
+ * return value indicates that a filter action was taken.
+ */
+#define	__SOF_FILTER_OP(so, op, cr, ...) 		\
+	sof_instance_t *__inst;					\
+	sof_rval_t __rval;					\
+								\
+	for (__inst = (so)->so_filter_top; __inst != NULL;	\
+	    __inst = __inst->sofi_next) {			\
+		if (!SOF_INTERESTED(__inst, op))		\
+			continue;				\
+		__rval = (__inst->sofi_ops->sofop_##op)((sof_handle_t)__inst,\
+		    __inst->sofi_cookie, __VA_ARGS__, cr);	\
+		DTRACE_PROBE2(filter__action, (sof_instance_t), __inst,\
+		    (sof_rval_t), __rval);			\
+		if (__rval != SOF_RVAL_CONTINUE) 		\
+			return (sof_rval2errno(__rval));	\
+	}							\
+	return (-1);
+
+extern mblk_t	*sof_filter_data_out_from(struct sonode *so,
+    sof_instance_t *, mblk_t *, struct nmsghdr *, cred_t *, int *);
+extern mblk_t	*sof_filter_data_in_proc(struct sonode *so,
+    mblk_t *, mblk_t **);
+extern int	sof_filter_bind(struct sonode *, struct sockaddr *,
+    socklen_t *, cred_t *);
+extern int	sof_filter_listen(struct sonode *, int *, cred_t *);
+extern int	sof_filter_connect(struct sonode *, struct sockaddr *,
+    socklen_t *, cred_t *);
+extern int	sof_filter_accept(struct sonode *, cred_t *);
+extern int	sof_filter_shutdown(struct sonode *, int *, cred_t *);
+extern int 	sof_filter_getsockname(struct sonode *, struct sockaddr *,
+    socklen_t *, cred_t *);
+extern int 	sof_filter_getpeername(struct sonode *, struct sockaddr *,
+    socklen_t *, cred_t *);
+extern int	sof_filter_setsockopt(struct sonode *, int, int, void *,
+    socklen_t *, cred_t *);
+extern int	sof_filter_getsockopt(struct sonode *, int, int, void *,
+    socklen_t *, cred_t *);
+extern int	sof_filter_ioctl(struct sonode *, int, intptr_t, int,
+    int32_t *, cred_t *);
+
+#define	SOF_FILTER_DATA_OUT(so, mp, msg, cr, errp) \
+	sof_filter_data_out_from(so, (so)->so_filter_top, mp, msg, cr, errp)
+#define	SOF_FILTER_DATA_OUT_FROM(so, inst, mp, msg, cr, errp) \
+	sof_filter_data_out_from(so, inst, mp, msg, cr, errp)
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SOCKFS_SOCKFILTER_H */
--- a/usr/src/uts/common/fs/sockfs/socknotify.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/fs/sockfs/socknotify.c	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -33,6 +32,7 @@
 #include <io/ksocket/ksocket_impl.h>
 #include <fs/sockfs/sockcommon.h>
 #include <fs/sockfs/sodirect.h>
+#include <fs/sockfs/sockfilter_impl.h>
 
 /*
  * There can only be a single thread waiting for data (enforced by
@@ -78,6 +78,7 @@
 		mutex_exit(&so->so_lock);
 		pollwakeup(&so->so_poll_list, POLLOUT);
 	}
+	sof_sonode_notify_filters(so, SOF_EV_CONNECTED, 0);
 
 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 }
@@ -93,18 +94,19 @@
 	int sigev = 0;
 
 	ASSERT(MUTEX_HELD(&so->so_lock));
+	(void) i_so_notify_last_tx(so, &pollev, &sigev);
 
 	if (IS_KERNEL_SOCKET(so)) {
-		SO_WAKEUP_WRITER(so);
 		KSOCKET_CALLBACK(so, cantsendmore, 0);
 		mutex_exit(&so->so_lock);
-	} else if (i_so_notify_last_tx(so, &pollev, &sigev)) {
-		socket_sendsig(so, sigev);
+	} else {
+		if (sigev != 0)
+			socket_sendsig(so, sigev);
 		mutex_exit(&so->so_lock);
-		pollwakeup(&so->so_poll_list, pollev);
-	} else {
-		mutex_exit(&so->so_lock);
+		if (pollev != 0)
+			pollwakeup(&so->so_poll_list, pollev);
 	}
+	sof_sonode_notify_filters(so, SOF_EV_CANTSENDMORE, 0);
 
 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 }
@@ -114,7 +116,7 @@
  * Wake up anyone that is waiting to send or receive data.
  */
 void
-so_notify_disconnected(struct sonode *so, int error)
+so_notify_disconnected(struct sonode *so, boolean_t connfailed, int error)
 {
 	int pollev = 0;
 	int sigev = 0;
@@ -125,7 +127,11 @@
 	(void) i_so_notify_last_rx(so, &pollev, &sigev);
 
 	if (IS_KERNEL_SOCKET(so)) {
-		KSOCKET_CALLBACK(so, disconnected, error);
+		if (connfailed) {
+			KSOCKET_CALLBACK(so, connectfailed, error);
+		} else {
+			KSOCKET_CALLBACK(so, disconnected, error);
+		}
 		mutex_exit(&so->so_lock);
 	} else {
 		if (sigev != 0)
@@ -134,6 +140,8 @@
 		if (pollev != 0)
 			pollwakeup(&so->so_poll_list, pollev);
 	}
+	sof_sonode_notify_filters(so, (connfailed) ? SOF_EV_CONNECTFAILED :
+	    SOF_EV_DISCONNECTED, error);
 
 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 }
@@ -158,6 +166,10 @@
 	}
 
 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+
+	/* filters can start injecting data */
+	if (so->so_filter_active > 0)
+		sof_sonode_notify_filters(so, SOF_EV_INJECT_DATA_OUT_OK, 0);
 }
 
 /*
@@ -270,7 +282,6 @@
 	(void) i_so_notify_last_rx(so, &pollev, &sigev);
 
 	if (IS_KERNEL_SOCKET(so)) {
-		SO_WAKEUP_READER(so);
 		KSOCKET_CALLBACK(so, cantrecvmore, 0);
 		mutex_exit(&so->so_lock);
 	} else {
@@ -281,6 +292,7 @@
 			pollwakeup(&so->so_poll_list, pollev);
 
 	}
+	sof_sonode_notify_filters(so, SOF_EV_CANTRECVMORE, 0);
 
 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 }
@@ -294,7 +306,7 @@
 	ASSERT(MUTEX_HELD(&so->so_lock));
 
 	if (IS_KERNEL_SOCKET(so)) {
-		KSOCKET_CALLBACK(so, newconn, so->so_rcv_queued);
+		KSOCKET_CALLBACK(so, newconn, 0);
 		mutex_exit(&so->so_lock);
 	} else {
 		socket_sendsig(so, SOCKETSIG_READ);
--- a/usr/src/uts/common/fs/sockfs/sockparams.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/fs/sockfs/sockparams.c	Thu Jun 17 17:22:09 2010 -0700
@@ -36,6 +36,7 @@
 #include <sys/socketvar.h>
 
 #include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/sockfilter_impl.h>
 #include <fs/sockfs/socktpi.h>
 
 /*
@@ -53,12 +54,9 @@
  * supplied device path, or when a socket is falling back to TPI.
  *
  * Lock order:
- *   The lock order is splist_lock -> sp_lock.
- *   The lock order is sp_ephem_lock -> sp_lock.
+ *   The lock order is sockconf_lock -> sp_lock.
  */
 extern int 	kobj_path_exists(char *, int);
-extern void	nl7c_init(void);
-extern int	sockfs_defer_nl7c_init;
 
 static int 	sockparams_sdev_init(struct sockparams *, char *, int);
 static void 	sockparams_sdev_fini(struct sockparams *);
@@ -67,13 +65,11 @@
  * Global sockparams list (populated via soconfig(1M)).
  */
 static list_t sphead;
-static krwlock_t splist_lock;
 
 /*
  * List of ephemeral sockparams.
  */
 static list_t sp_ephem_list;
-static krwlock_t sp_ephem_lock;
 
 /* Global kstats for sockparams */
 typedef struct sockparams_g_stats {
@@ -93,9 +89,6 @@
 	list_create(&sp_ephem_list, sizeof (struct sockparams),
 	    offsetof(struct sockparams, sp_node));
 
-	rw_init(&splist_lock, NULL, RW_DEFAULT, NULL);
-	rw_init(&sp_ephem_lock, NULL, RW_DEFAULT, NULL);
-
 	kstat_named_init(&sp_g_stats.spgs_ephem_nalloc, "ephemeral_nalloc",
 	    KSTAT_DATA_UINT64);
 	kstat_named_init(&sp_g_stats.spgs_ephem_nreuse, "ephemeral_nreuse",
@@ -170,9 +163,8 @@
  *   modname: Name of the module associated with the socket type. The
  *            module can be NULL if a device path is given, in which
  *            case the TPI module is used.
- *   devpath: Path to the STREAMS device. May be NULL for non-STREAMS
- *            based transports, or those transports that do not provide
- *            the capability to fallback to STREAMS.
+ *   devpath: Path to the STREAMS device. Must be NULL for non-STREAMS
+ *            based transports.
  *   devpathlen: Length of the devpath string. The argument can be 0,
  *            indicating that devpath was allocated statically, and should
  *            not be freed when the sockparams entry is destroyed.
@@ -202,7 +194,7 @@
 		goto error;
 	}
 
-	/* either a module or device must be given */
+	/* either a module or device must be given, but not both */
 	if (modname == NULL && devpath == NULL) {
 		*errorp = EINVAL;
 		goto error;
@@ -219,6 +211,11 @@
 	sp->sp_refcnt = 0;
 	sp->sp_flags = flags;
 
+	list_create(&sp->sp_auto_filters, sizeof (sp_filter_t),
+	    offsetof(sp_filter_t, spf_node));
+	list_create(&sp->sp_prog_filters, sizeof (sp_filter_t),
+	    offsetof(sp_filter_t, spf_node));
+
 	kstat_named_init(&sp->sp_stats.sps_nfallback, "nfallback",
 	    KSTAT_DATA_UINT64);
 	kstat_named_init(&sp->sp_stats.sps_nactive, "nactive",
@@ -322,6 +319,10 @@
 	mutex_destroy(&sp->sp_lock);
 	sockparams_kstat_fini(sp);
 
+	sof_sockparams_fini(sp);
+	list_destroy(&sp->sp_auto_filters);
+	list_destroy(&sp->sp_prog_filters);
+
 	kmem_free(sp, sizeof (*sp));
 }
 
@@ -404,12 +405,12 @@
 	/*
 	 * First look for an existing entry
 	 */
-	rw_enter(&sp_ephem_lock, RW_READER);
+	rw_enter(&sockconf_lock, RW_READER);
 	sp = sockparams_find(&sp_ephem_list, family, type, protocol,
 	    by_devpath, name);
 	if (sp != NULL) {
 		SOCKPARAMS_INC_REF(sp);
-		rw_exit(&sp_ephem_lock);
+		rw_exit(&sockconf_lock);
 		sp_g_stats.spgs_ephem_nreuse.value.ui64++;
 
 		return (sp);
@@ -418,7 +419,7 @@
 		char *namebuf = NULL;
 		int namelen = 0;
 
-		rw_exit(&sp_ephem_lock);
+		rw_exit(&sockconf_lock);
 
 		namelen = strlen(name) + 1;
 		namebuf = kmem_alloc(namelen, kmflag);
@@ -460,7 +461,7 @@
 		 * The sockparams entry was created, now try to add it
 		 * to the list. We need to hold the lock as a WRITER.
 		 */
-		rw_enter(&sp_ephem_lock, RW_WRITER);
+		rw_enter(&sockconf_lock, RW_WRITER);
 		sp = sockparams_find(&sp_ephem_list, family, type, protocol,
 		    by_devpath, name);
 		if (sp != NULL) {
@@ -469,13 +470,19 @@
 			 * place a hold on it and release the entry we alloc'ed.
 			 */
 			SOCKPARAMS_INC_REF(sp);
-			rw_exit(&sp_ephem_lock);
+			rw_exit(&sockconf_lock);
 
 			sockparams_destroy(newsp);
 		} else {
+			*errorp = sof_sockparams_init(newsp);
+			if (*errorp != 0) {
+				rw_exit(&sockconf_lock);
+				sockparams_destroy(newsp);
+				return (NULL);
+			}
 			SOCKPARAMS_INC_REF(newsp);
 			list_insert_tail(&sp_ephem_list, newsp);
-			rw_exit(&sp_ephem_lock);
+			rw_exit(&sockconf_lock);
 
 			sp = newsp;
 		}
@@ -514,18 +521,18 @@
 	ASSERT(sp->sp_flags & SOCKPARAMS_EPHEMERAL);
 	ASSERT(MUTEX_NOT_HELD(&sp->sp_lock));
 
-	rw_enter(&sp_ephem_lock, RW_WRITER);
+	rw_enter(&sockconf_lock, RW_WRITER);
 	mutex_enter(&sp->sp_lock);
 
 	if (--sp->sp_refcnt == 0) {
 		list_remove(&sp_ephem_list, sp);
 		mutex_exit(&sp->sp_lock);
-		rw_exit(&sp_ephem_lock);
+		rw_exit(&sockconf_lock);
 
 		sockparams_destroy(sp);
 	} else {
 		mutex_exit(&sp->sp_lock);
-		rw_exit(&sp_ephem_lock);
+		rw_exit(&sockconf_lock);
 	}
 }
 
@@ -542,21 +549,37 @@
  *   is returned.
  *
  * Locking:
- *   The caller can not be holding splist_lock.
+ *   The caller can not be holding sockconf_lock.
  */
-static int
+int
 sockparams_add(struct sockparams *sp)
 {
+	int error;
+
 	ASSERT(!(sp->sp_flags & SOCKPARAMS_EPHEMERAL));
 
-	rw_enter(&splist_lock, RW_WRITER);
+	rw_enter(&sockconf_lock, RW_WRITER);
 	if (sockparams_find(&sphead, sp->sp_family, sp->sp_type,
 	    sp->sp_protocol, B_TRUE, NULL) != 0) {
-		rw_exit(&splist_lock);
+		rw_exit(&sockconf_lock);
 		return (EEXIST);
 	} else {
+		/*
+		 * Unique sockparams entry, so init the kstats.
+		 */
+		sockparams_kstat_init(sp);
+
+		/*
+		 * Before making the socket type available we must make
+		 * sure that interested socket filters are aware of it.
+		 */
+		error = sof_sockparams_init(sp);
+		if (error != 0) {
+			rw_exit(&sockconf_lock);
+			return (error);
+		}
 		list_insert_tail(&sphead, sp);
-		rw_exit(&splist_lock);
+		rw_exit(&sockconf_lock);
 		return (0);
 	}
 }
@@ -575,15 +598,15 @@
  *   On success 0, otherwise ENXIO.
  *
  * Locking:
- *   Caller can not be holding splist_lock or the sp_lock of
+ *   Caller can not be holding sockconf_lock or the sp_lock of
  *   any sockparams entry.
  */
-static int
+int
 sockparams_delete(int family, int type, int protocol)
 {
 	struct sockparams *sp;
 
-	rw_enter(&splist_lock, RW_WRITER);
+	rw_enter(&sockconf_lock, RW_WRITER);
 	sp = sockparams_find(&sphead, family, type, protocol, B_TRUE, NULL);
 
 	if (sp != NULL) {
@@ -595,97 +618,22 @@
 		mutex_enter(&sp->sp_lock);
 		if (sp->sp_refcnt != 0) {
 			mutex_exit(&sp->sp_lock);
-			rw_exit(&splist_lock);
+			rw_exit(&sockconf_lock);
 			return (EBUSY);
 		}
 		mutex_exit(&sp->sp_lock);
 		/* Delete the sockparams entry. */
 		list_remove(&sphead, sp);
-		rw_exit(&splist_lock);
+		rw_exit(&sockconf_lock);
 
 		sockparams_destroy(sp);
 		return (0);
 	} else {
-		rw_exit(&splist_lock);
+		rw_exit(&sockconf_lock);
 		return (ENXIO);
 	}
 }
 
-/*
- * soconfig(int family, int type, int protocol,
- *     char *devpath, int devpathlen, char *module)
- *
- * Add or delete an entry to the sockparams table.
- * When devpath and module both are NULL, it will delete an entry.
- *
- * Arguments:
- *   family, type, protocol: the tuple in question
- *   devpath: STREAMS device path. Can be NULL for module based sockets.
- *   module : Name of the socket module. Can be NULL for STREAMS
- *            based sockets.
- *   devpathlen: length of the devpath string, or 0 if devpath
- *            was statically allocated.
- *
- * Note:
- *   This routine assumes that the caller has kmem_alloced
- *   devpath (if devpathlen > 0) and module for this routine to
- *   consume.
- */
-int
-soconfig(int family, int type, int protocol,
-    char *devpath, int devpathlen, char *module)
-{
-	struct sockparams *sp;
-	int error = 0;
-
-	dprint(0, ("soconfig(%d,%d,%d,%s,%d,%s)\n",
-	    family, type, protocol, devpath, devpathlen,
-	    module == NULL ? "NULL" : module));
-
-	if (sockfs_defer_nl7c_init) {
-		nl7c_init();
-		sockfs_defer_nl7c_init = 0;
-	}
-
-	if (devpath == NULL && module == NULL) {
-		/*
-		 * Delete existing entry,
-		 * both socket module and STEAMS device.
-		 */
-		ASSERT(module == NULL);
-		error = sockparams_delete(family, type, protocol);
-	} else {
-		/*
-		 * Adding an entry
-		 * sockparams_create frees mod name and devpath upon failure.
-		 */
-		sp = sockparams_create(family, type, protocol, module,
-		    devpath, devpathlen, 0, KM_SLEEP, &error);
-
-		if (sp != NULL) {
-			/*
-			 * The sockparams entry becomes globally visible once
-			 * we call sockparams_add(). So we add a reference so
-			 * we do not have to worry about the entry being
-			 * immediately deleted.
-			 */
-			SOCKPARAMS_INC_REF(sp);
-			error = sockparams_add(sp);
-			if (error != 0) {
-				SOCKPARAMS_DEC_REF(sp);
-				sockparams_destroy(sp);
-			} else {
-				/*
-				 * Unique sockparams entry, so init the kstats.
-				 */
-				sockparams_kstat_init(sp);
-				SOCKPARAMS_DEC_REF(sp);
-			}
-		}
-	}
-
-	return (error);
-}
 
 /*
  * solookup(int family, int type, int protocol, struct sockparams **spp)
@@ -716,7 +664,7 @@
 	int error = 0;
 
 	*spp = NULL;
-	rw_enter(&splist_lock, RW_READER);
+	rw_enter(&sockconf_lock, RW_READER);
 
 	/*
 	 * Search the sockparams list for an appropiate entry.
@@ -740,7 +688,7 @@
 			    sp->sp_protocol == protocol && found < 2)
 				found = 2;
 		}
-		rw_exit(&splist_lock);
+		rw_exit(&sockconf_lock);
 		switch (found) {
 		case 0:
 			error = EAFNOSUPPORT;
@@ -760,13 +708,13 @@
 	 *
 	 * We put a hold on the entry early on, so if the
 	 * sockmod is not loaded, and we have to exit
-	 * splist_lock to call modload(), we know that the
+	 * sockconf_lock to call modload(), we know that the
 	 * sockparams entry wont go away. That way we don't
 	 * have to look up the entry once we come back from
 	 * modload().
 	 */
 	SOCKPARAMS_INC_REF(sp);
-	rw_exit(&splist_lock);
+	rw_exit(&sockconf_lock);
 
 	if (sp->sp_smod_info == NULL) {
 		smod_info_t *smod = smod_lookup_byname(sp->sp_smod_name);
@@ -807,3 +755,73 @@
 	*spp = sp;
 	return (0);
 }
+
+/*
+ * Called when filter entry `ent' is going away. All sockparams remove
+ * their references to `ent'.
+ */
+static void
+sockparams_filter_cleanup_impl(sof_entry_t *ent, list_t *list)
+{
+	struct sockparams *sp;
+	sp_filter_t *fil;
+	list_t *flist;
+
+	ASSERT(RW_WRITE_HELD(&sockconf_lock));
+
+	for (sp = list_head(list); sp != NULL;
+	    sp = list_next(list, sp)) {
+		flist = (ent->sofe_flags & SOFEF_AUTO) ?
+		    &sp->sp_auto_filters : &sp->sp_prog_filters;
+		for (fil = list_head(flist); fil != NULL;
+		    fil = list_next(flist, fil)) {
+			if (fil->spf_filter == ent) {
+				list_remove(flist, fil);
+				kmem_free(fil, sizeof (sp_filter_t));
+				break;
+			}
+		}
+	}
+}
+
+void
+sockparams_filter_cleanup(sof_entry_t *ent)
+{
+	sockparams_filter_cleanup_impl(ent, &sphead);
+	sockparams_filter_cleanup_impl(ent, &sp_ephem_list);
+}
+
+/*
+ * New filter is being added; walk the list of sockparams to see if
+ * the filter is interested in any of the sockparams.
+ */
+static int
+sockparams_new_filter_impl(sof_entry_t *ent, list_t *list)
+{
+	struct sockparams *sp;
+	int err;
+
+	ASSERT(RW_WRITE_HELD(&sockconf_lock));
+
+	for (sp = list_head(list); sp != NULL;
+	    sp = list_next(list, sp)) {
+		if ((err = sof_entry_proc_sockparams(ent, sp)) != 0) {
+			sockparams_filter_cleanup(ent);
+			return (err);
+		}
+	}
+	return (0);
+}
+
+int
+sockparams_new_filter(sof_entry_t *ent)
+{
+	int error;
+
+	if ((error = sockparams_new_filter_impl(ent, &sphead)) != 0)
+		return (error);
+
+	if ((error = sockparams_new_filter_impl(ent, &sp_ephem_list)) != 0)
+		sockparams_filter_cleanup_impl(ent, &sphead);
+	return (error);
+}
--- a/usr/src/uts/common/fs/sockfs/socksubr.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/fs/sockfs/socksubr.c	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -72,6 +71,7 @@
 
 #include <fs/sockfs/nl7c.h>
 #include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/sockfilter_impl.h>
 #include <fs/sockfs/socktpi.h>
 #include <fs/sockfs/socktpi_impl.h>
 #include <fs/sockfs/sodirect.h>
@@ -97,6 +97,12 @@
 
 struct kmem_cache *socket_cache;
 
+/*
+ * sockconf_lock protects the socket configuration (socket types and
+ * socket filters) which is changed via the sockconfig system call.
+ */
+krwlock_t sockconf_lock;
+
 static int sockfs_update(kstat_t *, int);
 static int sockfs_snapshot(kstat_t *, void *, int);
 extern smod_info_t *sotpi_smod_create(void);
@@ -239,6 +245,8 @@
 	    sizeof (struct sonode), 0, sonode_constructor,
 	    sonode_destructor, NULL, NULL, NULL, 0);
 
+	rw_init(&sockconf_lock, NULL, RW_DEFAULT, NULL);
+
 	error = socktpi_init();
 	if (error != 0) {
 		err_str = NULL;
@@ -288,6 +296,9 @@
 		nl7c_init();
 	}
 
+	/* Initialize socket filters */
+	sof_init();
+
 	return (0);
 
 failure:
--- a/usr/src/uts/common/fs/sockfs/socksyscalls.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/fs/sockfs/socksyscalls.c	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -67,6 +66,7 @@
 
 #include <fs/sockfs/nl7c.h>
 #include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/sockfilter_impl.h>
 #include <fs/sockfs/socktpi.h>
 
 #ifdef SOCK_TEST
@@ -75,7 +75,10 @@
 #define	do_useracc	1
 #endif /* SOCK_TEST */
 
-extern int xnet_truncate_print;
+extern int 	xnet_truncate_print;
+
+extern void	nl7c_init(void);
+extern int	sockfs_defer_nl7c_init;
 
 /*
  * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c"
@@ -1519,143 +1522,291 @@
 	return (0);
 }
 
+static int
+sockconf_add_sock(int family, int type, int protocol, char *name)
+{
+	int error = 0;
+	char *kdevpath = NULL;
+	char *kmodule = NULL;
+	char *buf = NULL;
+	size_t pathlen = 0;
+	struct sockparams *sp;
+
+	if (name == NULL)
+		return (EINVAL);
+	/*
+	 * Copyin the name.
+	 * This also makes it possible to check for too long pathnames.
+	 * Compress the space needed for the name before passing it
+	 * to soconfig - soconfig will store the string until
+	 * the configuration is removed.
+	 */
+	buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+	if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
+		kmem_free(buf, MAXPATHLEN);
+		return (error);
+	}
+	if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
+		/* For device */
+
+		/*
+		 * Special handling for NCA:
+		 *
+		 * DEV_NCA is never opened even if an application
+		 * requests for AF_NCA. The device opened is instead a
+		 * predefined AF_INET transport (NCA_INET_DEV).
+		 *
+		 * Prior to Volo (PSARC/2007/587) NCA would determine
+		 * the device using a lookup, which worked then because
+		 * all protocols were based on TPI. Since TPI is no
+		 * longer the default, we have to explicitly state
+		 * which device to use.
+		 */
+		if (strcmp(buf, NCA_DEV) == 0) {
+			/* only support entry <28, 2, 0> */
+			if (family != AF_NCA || type != SOCK_STREAM ||
+			    protocol != 0) {
+				kmem_free(buf, MAXPATHLEN);
+				return (EINVAL);
+			}
+
+			pathlen = strlen(NCA_INET_DEV) + 1;
+			kdevpath = kmem_alloc(pathlen, KM_SLEEP);
+			bcopy(NCA_INET_DEV, kdevpath, pathlen);
+			kdevpath[pathlen - 1] = '\0';
+		} else {
+			kdevpath = kmem_alloc(pathlen, KM_SLEEP);
+			bcopy(buf, kdevpath, pathlen);
+			kdevpath[pathlen - 1] = '\0';
+		}
+	} else {
+		/* For socket module */
+		kmodule = kmem_alloc(pathlen, KM_SLEEP);
+		bcopy(buf, kmodule, pathlen);
+		kmodule[pathlen - 1] = '\0';
+		pathlen = 0;
+	}
+	kmem_free(buf, MAXPATHLEN);
+
+	/* sockparams_create frees mod name and devpath upon failure */
+	sp = sockparams_create(family, type, protocol, kmodule,
+	    kdevpath, pathlen, 0, KM_SLEEP, &error);
+	if (sp != NULL) {
+		error = sockparams_add(sp);
+		if (error != 0)
+			sockparams_destroy(sp);
+	}
+
+	return (error);
+}
+
+static int
+sockconf_remove_sock(int family, int type, int protocol)
+{
+	return (sockparams_delete(family, type, protocol));
+}
+
+static int
+sockconfig_remove_filter(const char *uname)
+{
+	char kname[SOF_MAXNAMELEN];
+	size_t len;
+	int error;
+	sof_entry_t *ent;
+
+	if ((error = copyinstr(uname, kname, SOF_MAXNAMELEN, &len)) != 0)
+		return (error);
+
+	ent = sof_entry_remove_by_name(kname);
+	if (ent == NULL)
+		return (ENXIO);
+
+	mutex_enter(&ent->sofe_lock);
+	ASSERT(!(ent->sofe_flags & SOFEF_CONDEMED));
+	if (ent->sofe_refcnt == 0) {
+		mutex_exit(&ent->sofe_lock);
+		sof_entry_free(ent);
+	} else {
+		/* let the last socket free the filter */
+		ent->sofe_flags |= SOFEF_CONDEMED;
+		mutex_exit(&ent->sofe_lock);
+	}
+
+	return (0);
+}
+
+static int
+sockconfig_add_filter(const char *uname, void *ufilpropp)
+{
+	struct sockconfig_filter_props filprop;
+	sof_entry_t *ent;
+	int error;
+	size_t tuplesz, len;
+	char hintbuf[SOF_MAXNAMELEN];
+
+	ent = kmem_zalloc(sizeof (sof_entry_t), KM_SLEEP);
+	mutex_init(&ent->sofe_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	if ((error = copyinstr(uname, ent->sofe_name, SOF_MAXNAMELEN,
+	    &len)) != 0) {
+		sof_entry_free(ent);
+		return (error);
+	}
+
+	if (get_udatamodel() == DATAMODEL_NATIVE) {
+		if (copyin(ufilpropp, &filprop, sizeof (filprop)) != 0) {
+			sof_entry_free(ent);
+			return (EFAULT);
+		}
+	}
+#ifdef	_SYSCALL32_IMPL
+	else {
+		struct sockconfig_filter_props32 filprop32;
+
+		if (copyin(ufilpropp, &filprop32, sizeof (filprop32)) != 0) {
+			sof_entry_free(ent);
+			return (EFAULT);
+		}
+		filprop.sfp_modname = (char *)(uintptr_t)filprop32.sfp_modname;
+		filprop.sfp_autoattach = filprop32.sfp_autoattach;
+		filprop.sfp_hint = filprop32.sfp_hint;
+		filprop.sfp_hintarg = (char *)(uintptr_t)filprop32.sfp_hintarg;
+		filprop.sfp_socktuple_cnt = filprop32.sfp_socktuple_cnt;
+		filprop.sfp_socktuple =
+		    (sof_socktuple_t *)(uintptr_t)filprop32.sfp_socktuple;
+	}
+#endif	/* _SYSCALL32_IMPL */
+
+	if ((error = copyinstr(filprop.sfp_modname, ent->sofe_modname,
+	    sizeof (ent->sofe_modname), &len)) != 0) {
+		sof_entry_free(ent);
+		return (error);
+	}
+
+	/*
+	 * A filter must specify at least one socket tuple.
+	 */
+	if (filprop.sfp_socktuple_cnt == 0 ||
+	    filprop.sfp_socktuple_cnt > SOF_MAXSOCKTUPLECNT) {
+		sof_entry_free(ent);
+		return (EINVAL);
+	}
+	ent->sofe_flags = filprop.sfp_autoattach ? SOFEF_AUTO : SOFEF_PROG;
+	ent->sofe_hint = filprop.sfp_hint;
+
+	/*
+	 * Verify the hint, and copy in the hint argument, if necessary.
+	 */
+	switch (ent->sofe_hint) {
+	case SOF_HINT_BEFORE:
+	case SOF_HINT_AFTER:
+		if ((error = copyinstr(filprop.sfp_hintarg, hintbuf,
+		    sizeof (hintbuf), &len)) != 0) {
+			sof_entry_free(ent);
+			return (error);
+		}
+		ent->sofe_hintarg = kmem_alloc(len, KM_SLEEP);
+		bcopy(hintbuf, ent->sofe_hintarg, len);
+		/* FALLTHRU */
+	case SOF_HINT_TOP:
+	case SOF_HINT_BOTTOM:
+		/* hints cannot be used with programmatic filters */
+		if (ent->sofe_flags & SOFEF_PROG) {
+			sof_entry_free(ent);
+			return (EINVAL);
+		}
+		break;
+	case SOF_HINT_NONE:
+		break;
+	default:
+		/* bad hint value */
+		sof_entry_free(ent);
+		return (EINVAL);
+	}
+
+	ent->sofe_socktuple_cnt = filprop.sfp_socktuple_cnt;
+	tuplesz = sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt;
+	ent->sofe_socktuple = kmem_alloc(tuplesz, KM_SLEEP);
+
+	if (get_udatamodel() == DATAMODEL_NATIVE) {
+		if (copyin(filprop.sfp_socktuple, ent->sofe_socktuple,
+		    tuplesz)) {
+			sof_entry_free(ent);
+			return (EFAULT);
+		}
+	}
+#ifdef	_SYSCALL32_IMPL
+	else {
+		int i;
+		caddr_t data = (caddr_t)filprop.sfp_socktuple;
+		sof_socktuple_t	*tup = ent->sofe_socktuple;
+		sof_socktuple32_t tup32;
+
+		/* tup already points at the first kernel tuple slot */
+		for (i = 0; i < ent->sofe_socktuple_cnt; i++, tup++) {
+			ASSERT(tup < ent->sofe_socktuple +
+			    ent->sofe_socktuple_cnt);
+
+			if (copyin(data, &tup32, sizeof (tup32)) != 0) {
+				sof_entry_free(ent);
+				return (EFAULT);
+			}
+			tup->sofst_family = tup32.sofst_family;
+			tup->sofst_type = tup32.sofst_type;
+			tup->sofst_protocol = tup32.sofst_protocol;
+
+			data += sizeof (tup32);
+		}
+	}
+#endif	/* _SYSCALL32_IMPL */
+
+	/* Sockets can start using the filter as soon as the filter is added */
+	if ((error = sof_entry_add(ent)) != 0)
+		sof_entry_free(ent);
+
+	return (error);
+}
+
 /*
- * Add config info when name is non-NULL; delete info when name is NULL.
- * name could be a device name or a module name and are user address.
+ * Socket configuration system call. It is used to add and remove
+ * socket types.
  */
 int
-sockconfig(int family, int type, int protocol, char *name)
+sockconfig(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
 {
-	char *kdevpath = NULL;		/* Copied in devpath string */
-	char *kmodule = NULL;
-	size_t pathlen = 0;
 	int error = 0;
 
-	dprint(1, ("sockconfig(%d, %d, %d, %p)\n",
-	    family, type, protocol, (void *)name));
-
 	if (secpolicy_net_config(CRED(), B_FALSE) != 0)
 		return (set_errno(EPERM));
 
-	/*
-	 * By default set the kdevpath and kmodule to NULL to delete an entry.
-	 * Otherwise when name is not NULL, set the kdevpath or kmodule
-	 * value to add an entry.
-	 */
-	if (name != NULL) {
-		/*
-		 * Adding an entry.
-		 * Copyin the name.
-		 * This also makes it possible to check for too long pathnames.
-		 * Compress the space needed for the name before passing it
-		 * to soconfig - soconfig will store the string until
-		 * the configuration is removed.
-		 */
-		char *buf;
-		buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-		if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
-			kmem_free(buf, MAXPATHLEN);
-			goto done;
-		}
-		if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
-			/* For device */
-
-			/*
-			 * Special handling for NCA:
-			 *
-			 * DEV_NCA is never opened even if an application
-			 * requests for AF_NCA. The device opened is instead a
-			 * predefined AF_INET transport (NCA_INET_DEV).
-			 *
-			 * Prior to Volo (PSARC/2007/587) NCA would determine
-			 * the device using a lookup, which worked then because
-			 * all protocols were based on TPI. Since TPI is no
-			 * longer the default, we have to explicitly state
-			 * which device to use.
-			 */
-			if (strcmp(buf, NCA_DEV) == 0) {
-				/* only support entry <28, 2, 0> */
-				if (family != AF_NCA || type != SOCK_STREAM ||
-				    protocol != 0) {
-					kmem_free(buf, MAXPATHLEN);
-					error = EINVAL;
-					goto done;
-				}
-
-				pathlen = strlen(NCA_INET_DEV) + 1;
-				kdevpath = kmem_alloc(pathlen, KM_SLEEP);
-				bcopy(NCA_INET_DEV, kdevpath, pathlen);
-				kdevpath[pathlen - 1] = '\0';
-			} else {
-				kdevpath = kmem_alloc(pathlen, KM_SLEEP);
-				bcopy(buf, kdevpath, pathlen);
-				kdevpath[pathlen - 1] = '\0';
-			}
-		} else {
-			/* For socket module */
-			kmodule = kmem_alloc(pathlen, KM_SLEEP);
-			bcopy(buf, kmodule, pathlen);
-			kmodule[pathlen - 1] = '\0';
+	if (sockfs_defer_nl7c_init) {
+		nl7c_init();
+		sockfs_defer_nl7c_init = 0;
+	}
 
-			pathlen = 0;
-			if (strcmp(kmodule, "tcp") == 0) {
-				/* Get the tcp device name for fallback */
-				if (family == 2) {
-					pathlen = strlen("/dev/tcp") + 1;
-					kdevpath = kmem_alloc(pathlen,
-					    KM_SLEEP);
-					bcopy("/dev/tcp", kdevpath,
-					    pathlen);
-					kdevpath[pathlen - 1] = '\0';
-				} else {
-					ASSERT(family == 26);
-					pathlen = strlen("/dev/tcp6") + 1;
-					kdevpath = kmem_alloc(pathlen,
-					    KM_SLEEP);
-					bcopy("/dev/tcp6", kdevpath, pathlen);
-					kdevpath[pathlen - 1] = '\0';
-				}
-			} else if (strcmp(kmodule, "udp") == 0) {
-				/* Get the udp device name for fallback */
-				if (family == 2) {
-					pathlen = strlen("/dev/udp") + 1;
-					kdevpath = kmem_alloc(pathlen,
-					    KM_SLEEP);
-					bcopy("/dev/udp", kdevpath, pathlen);
-					kdevpath[pathlen - 1] = '\0';
-				} else {
-					ASSERT(family == 26);
-					pathlen = strlen("/dev/udp6") + 1;
-					kdevpath = kmem_alloc(pathlen,
-					    KM_SLEEP);
-					bcopy("/dev/udp6", kdevpath, pathlen);
-					kdevpath[pathlen - 1] = '\0';
-				}
-			} else if (strcmp(kmodule, "icmp") == 0) {
-				/* Get the icmp device name for fallback */
-				if (family == 2) {
-					pathlen = strlen("/dev/rawip") + 1;
-					kdevpath = kmem_alloc(pathlen,
-					    KM_SLEEP);
-					bcopy("/dev/rawip", kdevpath, pathlen);
-					kdevpath[pathlen - 1] = '\0';
-				} else {
-					ASSERT(family == 26);
-					pathlen = strlen("/dev/rawip6") + 1;
-					kdevpath = kmem_alloc(pathlen,
-					    KM_SLEEP);
-					bcopy("/dev/rawip6", kdevpath, pathlen);
-					kdevpath[pathlen - 1] = '\0';
-				}
-			}
-		}
+	switch (cmd) {
+	case SOCKCONFIG_ADD_SOCK:
+		error = sockconf_add_sock((int)(uintptr_t)arg1,
+		    (int)(uintptr_t)arg2, (int)(uintptr_t)arg3, arg4);
+		break;
+	case SOCKCONFIG_REMOVE_SOCK:
+		error = sockconf_remove_sock((int)(uintptr_t)arg1,
+		    (int)(uintptr_t)arg2, (int)(uintptr_t)arg3);
+		break;
+	case SOCKCONFIG_ADD_FILTER:
+		error = sockconfig_add_filter((const char *)arg1, arg2);
+		break;
+	case SOCKCONFIG_REMOVE_FILTER:
+		error = sockconfig_remove_filter((const char *)arg1);
+		break;
+	default:
+#ifdef	DEBUG
+		cmn_err(CE_NOTE, "sockconfig: unknown subcommand %d", cmd);
+#endif
+		error = EINVAL;
+		break;
+	}
 
-		kmem_free(buf, MAXPATHLEN);
-	}
-	error = soconfig(family, type, protocol, kdevpath, (int)pathlen,
-	    kmodule);
-done:
-	if (error) {
+	if (error != 0) {
 		eprintline(error);
 		return (set_errno(error));
 	}
@@ -1943,9 +2094,15 @@
 		 * For sockets acting as an SSL proxy, we
 		 * need to adjust the size to the maximum
 		 * SSL record size set in the stream head.
+		 *
+		 * Socket filters can limit the mblk size,
+		 * so limit reads to maxblk if there are
+		 * filters present.
 		 */
-		if (vp->v_type == VSOCK && !SOCK_IS_NONSTR(so) &&
-		    SOTOTPI(so)->sti_kssl_ctx != NULL)
+		if (vp->v_type == VSOCK &&
+		    (!SOCK_IS_NONSTR(so) &&
+		    SOTOTPI(so)->sti_kssl_ctx != NULL) ||
+		    (so->so_filter_active > 0 && maxblk != INFPSZ))
 			iosize = (int)MIN(iosize, maxblk);
 
 		if (is_system_labeled()) {
@@ -2550,9 +2707,14 @@
 		 * For sockets acting as an SSL proxy, we
 		 * need to adjust the size to the maximum
 		 * SSL record size set in the stream head.
+		 *
+		 * Socket filters can limit the mblk size,
+		 * so limit reads to maxblk if there are filters present.
 		 */
-		if (vp->v_type == VSOCK && !SOCK_IS_NONSTR(so) &&
-		    SOTOTPI(so)->sti_kssl_ctx != NULL)
+		if (vp->v_type == VSOCK &&
+		    (!SOCK_IS_NONSTR(so) &&
+		    SOTOTPI(so)->sti_kssl_ctx != NULL) ||
+		    (so->so_filter_active > 0 && maxblk != INFPSZ))
 			iosize = (int)MIN(iosize, maxblk);
 
 		if (is_system_labeled()) {
@@ -2804,7 +2966,7 @@
 }
 
 int
-soconnect(struct sonode *so, const struct sockaddr *name, socklen_t namelen,
+soconnect(struct sonode *so, struct sockaddr *name, socklen_t namelen,
     int fflag, int flags)
 {
 	return (socket_connect(so, name, namelen, fflag, flags, CRED()));
--- a/usr/src/uts/common/fs/sockfs/socktpi.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/fs/sockfs/socktpi.c	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -214,7 +213,7 @@
 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
 		    int, struct cred *);
 static int	sotpi_listen(struct sonode *, int, struct cred *);
-static int	sotpi_connect(struct sonode *, const struct sockaddr *,
+static int	sotpi_connect(struct sonode *, struct sockaddr *,
 		    socklen_t, int, int, struct cred *);
 extern int	sotpi_recvmsg(struct sonode *, struct nmsghdr *,
 		    struct uio *, struct cred *);
@@ -2231,7 +2230,7 @@
  */
 int
 sotpi_connect(struct sonode *so,
-	const struct sockaddr *name,
+	struct sockaddr *name,
 	socklen_t namelen,
 	int fflag,
 	int flags,
@@ -6484,23 +6483,6 @@
 		*direct = B_TRUE;
 
 	/*
-	 * When it comes to urgent data we have two cases to deal with;
-	 * (1) The oob byte has already arrived, or (2) the protocol has
-	 * notified that oob data is pending, but it has not yet arrived.
-	 *
-	 * For (1) all we need to do is send a T_EXDATA_IND to indicate were
-	 * in the byte stream the oob byte is. For (2) we have to send a
-	 * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether
-	 * the oob byte will be the next byte from the protocol.
-	 *
-	 * So in the worst case we need two mblks, one for the signal, another
-	 * for mark indication. In that case we use the exdata_mp for the sig.
-	 */
-	sti->sti_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind), BPRI_MED,
-	    STR_NOSIG, NULL);
-	sti->sti_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
-
-	/*
 	 * Keep the original sp around so we can properly dispose of the
 	 * sonode when the socket is being closed.
 	 */
@@ -6560,16 +6542,6 @@
 	ASSERT(!SOCK_IS_NONSTR(so));
 	ASSERT(vp->v_stream != NULL);
 
-	if (SOTOTPI(so)->sti_exdata_mp != NULL) {
-		freeb(SOTOTPI(so)->sti_exdata_mp);
-		SOTOTPI(so)->sti_exdata_mp = NULL;
-	}
-
-	if (SOTOTPI(so)->sti_urgmark_mp != NULL) {
-		freeb(SOTOTPI(so)->sti_urgmark_mp);
-		SOTOTPI(so)->sti_urgmark_mp = NULL;
-	}
-
 	strclean(vp);
 	(void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
 
@@ -6677,9 +6649,6 @@
 	sti->sti_nl7c_uri	= NULL;
 	sti->sti_nl7c_rcv_mp	= NULL;
 
-	sti->sti_exdata_mp	= NULL;
-	sti->sti_urgmark_mp	= NULL;
-
 	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
 
@@ -6705,9 +6674,6 @@
 	ASSERT(sti->sti_nl7c_uri == NULL);
 	ASSERT(sti->sti_nl7c_rcv_mp == NULL);
 
-	ASSERT(sti->sti_exdata_mp == NULL);
-	ASSERT(sti->sti_urgmark_mp == NULL);
-
 	mutex_destroy(&sti->sti_plumb_lock);
 	cv_destroy(&sti->sti_ack_cv);
 }
--- a/usr/src/uts/common/fs/sockfs/socktpi.h	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/fs/sockfs/socktpi.h	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SOCKFS_SOCKTPI_H
@@ -251,12 +250,6 @@
 	kssl_endpt_type_t	sti_kssl_type;	/* is proxy/is proxied/none */
 	kssl_ent_t		sti_kssl_ent;	/* SSL config entry */
 	kssl_ctx_t		sti_kssl_ctx;	/* SSL session context */
-
-	/*
-	 * The mblks below are only allocated and used during fallback.
-	 */
-	mblk_t	*sti_exdata_mp;		/* T_EXDATA_IND or SIGURG */
-	mblk_t	*sti_urgmark_mp;	/* mark indication */
 } sotpi_info_t;
 
 struct T_capability_ack;
--- a/usr/src/uts/common/fs/sockfs/sodirect.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/fs/sockfs/sodirect.c	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -78,7 +77,7 @@
 	if (uiop->uio_resid >= uioasync.mincnt &&
 	    sodp != NULL && sodp->sod_enabled &&
 	    uioasync.enabled && !(flags & MSG_PEEK) &&
-	    !so->so_proto_props.sopp_loopback &&
+	    !so->so_proto_props.sopp_loopback && so->so_filter_active == 0 &&
 	    !(so->so_state & SS_CANTRCVMORE)) {
 		/*
 		 * Big enough I/O for uioa min setup and an sodirect socket
--- a/usr/src/uts/common/inet/inetddi.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/inet/inetddi.c	Thu Jun 17 17:22:09 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -88,6 +87,12 @@
 #elif	defined(INET_SOCKDESC) && !defined(INET_SOCK_PROTO_CREATE_FUNC)
 #error  inetddi.c: INET_SOCKDESC is defined but INET_SOCK_PROTO_CREATE_FUNC \
 is not!
+#elif	defined(INET_SOCK_PROTO_FB_FUNC) && !defined(INET_SOCK_FALLBACK_DEV_V4)
+#error	inetddi.c: INET_SOCK_PROTO_FB_FUNC is defined but \
+INET_SOCK_FALLBACK_DEV_V4 is not!
+#elif	defined(INET_SOCK_PROTO_FB_FUNC) && !defined(INET_SOCK_FALLBACK_DEV_V6)
+#error	inetddi.c: INET_SOCK_PROTO_FB_FUNC is defined but \
+INET_SOCK_FALLBACK_DEV_V6 is not!
 #endif
 
 #ifdef	INET_DEVDESC
@@ -216,7 +221,9 @@
 static __smod_priv_t smodpriv = {
 	NULL,
 	NULL,
-	INET_SOCK_PROTO_FB_FUNC
+	INET_SOCK_PROTO_FB_FUNC,
+	INET_SOCK_FALLBACK_DEV_V4,
+	INET_SOCK_FALLBACK_DEV_V6
 };
 #endif	/* INET_SOCK_PROTO_FB_FUNC */
 
--- a/usr/src/uts/common/inet/ip/icmp.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/inet/ip/icmp.c	Thu Jun 17 17:22:09 2010 -0700
@@ -5215,7 +5215,8 @@
 /* ARGSUSED2 */
 int
 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
-    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
+    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
+    sock_quiesce_arg_t *arg)
 {
 	conn_t  *connp = (conn_t *)proto_handle;
 	icmp_t	*icmp;
@@ -5224,7 +5225,7 @@
 	socklen_t laddrlen, faddrlen;
 	short opts;
 	struct stroptions *stropt;
-	mblk_t *stropt_mp;
+	mblk_t *mp, *stropt_mp;
 	int error;
 
 	icmp = connp->conn_icmp;
@@ -5276,7 +5277,7 @@
 	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
 		opts |= SO_DONTROUTE;
 
-	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
+	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
 	    (struct sockaddr *)&laddr, laddrlen,
 	    (struct sockaddr *)&faddr, faddrlen, opts);
 
@@ -5285,9 +5286,11 @@
 	 * queued in icmp_t. Now we push up any queued packets.
 	 */
 	mutex_enter(&icmp->icmp_recv_lock);
+	if (mp != NULL) {
+		mp->b_next = icmp->icmp_fallback_queue_head;
+		icmp->icmp_fallback_queue_head = mp;
+	}
 	while (icmp->icmp_fallback_queue_head != NULL) {
-		mblk_t	*mp;
-
 		mp = icmp->icmp_fallback_queue_head;
 		icmp->icmp_fallback_queue_head = mp->b_next;
 		mp->b_next = NULL;
--- a/usr/src/uts/common/inet/ip/icmpddi.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/inet/ip/icmpddi.c	Thu Jun 17 17:22:09 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 /* Copyright (c) 1990 Mentat Inc. */
 
@@ -40,8 +39,10 @@
 #define	INET_DEVSTRTAB	icmpinfov4
 #define	INET_MODSTRTAB	dummymodinfo
 #define	INET_SOCKDESC	"Rawip socket module"
-#define	INET_SOCK_PROTO_CREATE_FUNC (*rawip_create)
-#define	INET_SOCK_PROTO_FB_FUNC (*rawip_fallback)
+#define	INET_SOCK_PROTO_CREATE_FUNC	(*rawip_create)
+#define	INET_SOCK_PROTO_FB_FUNC		(*rawip_fallback)
+#define	INET_SOCK_FALLBACK_DEV_V4	"/dev/icmp"
+#define	INET_SOCK_FALLBACK_DEV_V6	"/dev/icmp6"
 #define	INET_DEVMTFLAGS	D_MP
 #define	INET_MODMTFLAGS D_MP
 
--- a/usr/src/uts/common/inet/rawip_impl.h	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/inet/rawip_impl.h	Thu Jun 17 17:22:09 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 /* Copyright (c) 1990 Mentat Inc. */
 
@@ -107,7 +106,7 @@
 extern sock_lower_handle_t rawip_create(int, int, int, sock_downcalls_t **,
     uint_t *, int *, int, cred_t *);
 extern int rawip_fallback(sock_lower_handle_t, queue_t *, boolean_t,
-    so_proto_quiesced_cb_t);
+    so_proto_quiesced_cb_t, sock_quiesce_arg_t *);
 
 extern sock_downcalls_t sock_rawip_downcalls;
 
--- a/usr/src/uts/common/inet/sockmods/socksctp.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/inet/sockmods/socksctp.c	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -63,7 +62,7 @@
 static int sosctp_bind(struct sonode *, struct sockaddr *, socklen_t, int,
     struct cred *);
 static int sosctp_listen(struct sonode *, int, struct cred *);
-static int sosctp_connect(struct sonode *, const struct sockaddr *, socklen_t,
+static int sosctp_connect(struct sonode *, struct sockaddr *, socklen_t,
     int, int, struct cred *);
 static int sosctp_recvmsg(struct sonode *, struct nmsghdr *, struct uio *,
     struct cred *);
@@ -86,7 +85,7 @@
 /*
  * SCTP sockfs sonode operations, 1-N socket
  */
-static int sosctp_seq_connect(struct sonode *, const struct sockaddr *,
+static int sosctp_seq_connect(struct sonode *, struct sockaddr *,
     socklen_t, int, int, struct cred *);
 static int sosctp_seq_sendmsg(struct sonode *, struct nmsghdr *, struct uio *,
     struct cred *);
@@ -352,7 +351,7 @@
  */
 /*ARGSUSED*/
 static int
-sosctp_connect(struct sonode *so, const struct sockaddr *name,
+sosctp_connect(struct sonode *so, struct sockaddr *name,
     socklen_t namelen, int fflag, int flags, struct cred *cr)
 {
 	int error = 0;
@@ -433,7 +432,7 @@
  * make it so.
  */
 static int
-sosctp_seq_connect(struct sonode *so, const struct sockaddr *name,
+sosctp_seq_connect(struct sonode *so, struct sockaddr *name,
     socklen_t namelen, int fflag, int flags, struct cred *cr)
 {
 	struct sctp_soassoc *ssa;
--- a/usr/src/uts/common/inet/sockmods/socksdp.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/inet/sockmods/socksdp.c	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -63,7 +62,7 @@
 static int sosdp_bind(struct sonode *, struct sockaddr *, socklen_t, int,
     struct cred *);
 static int sosdp_listen(struct sonode *, int, struct cred *);
-static int sosdp_connect(struct sonode *, const struct sockaddr *, socklen_t,
+static int sosdp_connect(struct sonode *, struct sockaddr *, socklen_t,
     int, int, struct cred *);
 static int sosdp_recvmsg(struct sonode *, struct nmsghdr *, struct uio *,
     struct cred *);
@@ -325,7 +324,7 @@
  */
 /*ARGSUSED*/
 static int
-sosdp_connect(struct sonode *so, const struct sockaddr *name,
+sosdp_connect(struct sonode *so, struct sockaddr *name,
     socklen_t namelen, int fflag, int flags, struct cred *cr)
 {
 	int error = 0;
@@ -1120,7 +1119,7 @@
 		*reventsp |= (POLLIN|POLLRDNORM) & events;
 	}
 
-	if ((so_state & SS_CANTRCVMORE) || (so->so_acceptq_head != NULL)) {
+	if ((so_state & SS_CANTRCVMORE) || (so->so_acceptq_len > 0)) {
 		*reventsp |= (POLLIN|POLLRDNORM) & events;
 	}
 
@@ -1158,7 +1157,7 @@
 
 	mutex_enter(&so->so_lock);
 	so_unlock_single(so, SOLOCKED);
-	so_notify_disconnected(so, error);
+	so_notify_disconnected(so, B_FALSE, error);
 
 	return (error);
 }
@@ -1266,7 +1265,7 @@
 	ASSERT(so->so_proto_handle != NULL); /* closed conn */
 
 	soisdisconnected(so, error);
-	so_notify_disconnected(so, error);
+	so_notify_disconnected(so, B_FALSE, error);
 }
 
 /*
--- a/usr/src/uts/common/inet/tcp/tcp.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/inet/tcp/tcp.c	Thu Jun 17 17:22:09 2010 -0700
@@ -952,6 +952,18 @@
 		}
 	}
 
+	/*
+	 * ESTABLISHED non-STREAMS eagers are not 'detached' because
+	 * an upper handle is obtained when the SYN-ACK comes in. So it
+	 * should receive the 'disconnected' upcall, but tcp_reinit should
+	 * not be called since this is an eager.
+	 */
+	if (tcp->tcp_listener != NULL && IPCL_IS_NONSTR(connp)) {
+		tcp_closei_local(tcp);
+		tcp->tcp_state = TCPS_BOUND;
+		return (0);
+	}
+
 	tcp_reinit(tcp);
 	if (IPCL_IS_NONSTR(connp))
 		(void) tcp_do_unbind(connp);
@@ -1014,15 +1026,23 @@
 		CONN_DEC_REF(connp);
 	}
 finish:
-	/* Signal closing thread that it can complete close */
-	mutex_enter(&tcp->tcp_closelock);
 	tcp->tcp_detached = B_TRUE;
 	connp->conn_rq = NULL;
 	connp->conn_wq = NULL;
 
+	/* Signal closing thread that it can complete close */
+	mutex_enter(&tcp->tcp_closelock);
 	tcp->tcp_closed = 1;
 	cv_signal(&tcp->tcp_closecv);
 	mutex_exit(&tcp->tcp_closelock);
+
+	/* If we have an upper handle (socket), release it */
+	if (IPCL_IS_NONSTR(connp)) {
+		ASSERT(connp->conn_upper_handle != NULL);
+		(*connp->conn_upcalls->su_closed)(connp->conn_upper_handle);
+		connp->conn_upper_handle = NULL;
+		connp->conn_upcalls = NULL;
+	}
 }
 
 void
@@ -1088,6 +1108,15 @@
 	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp,
 	    NULL, tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);
 
+	/*
+	 * For non-STREAMS sockets, the normal case is that the conn makes
+	 * an upcall when it's finally closed, so there is no need to wait
+	 * in the protocol. But in case of SO_LINGER the thread sleeps here
+	 * so it can properly deal with the thread being interrupted.
+	 */
+	if (IPCL_IS_NONSTR(connp) && connp->conn_linger == 0)
+		goto nowait;
+
 	mutex_enter(&tcp->tcp_closelock);
 	while (!tcp->tcp_closed) {
 		if (!cv_wait_sig(&tcp->tcp_closecv, &tcp->tcp_closelock)) {
@@ -1129,8 +1158,12 @@
 	 * conn_wq of the eagers point to our queues. By waiting for the
 	 * refcnt to drop to 1, we are sure that the eagers have cleaned
 	 * up their queue pointers and also dropped their references to us.
+	 *
+	 * For non-STREAMS sockets we do not have to wait here; the
+	 * listener will instead make a su_closed upcall when the last
+	 * reference is dropped.
 	 */
-	if (tcp->tcp_wait_for_eagers) {
+	if (tcp->tcp_wait_for_eagers && !IPCL_IS_NONSTR(connp)) {
 		mutex_enter(&connp->conn_lock);
 		while (connp->conn_ref != 1) {
 			cv_wait(&connp->conn_cv, &connp->conn_lock);
@@ -1138,6 +1171,7 @@
 		mutex_exit(&connp->conn_lock);
 	}
 
+nowait:
 	connp->conn_cpid = NOPID;
 }
 
@@ -1410,6 +1444,22 @@
 	 * the following code is enough.
 	 */
 	tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
+
+	/*
+	 * If this is a non-STREAM socket still holding on to an upper
+	 * handle, release it. As a result of fallback we might also see
+	 * STREAMS based conns with upper handles, in which case there is
+	 * nothing to do other than clearing the field.
+	 */
+	if (connp->conn_upper_handle != NULL) {
+		if (IPCL_IS_NONSTR(connp)) {
+			(*connp->conn_upcalls->su_closed)(
+			    connp->conn_upper_handle);
+			tcp->tcp_detached = B_TRUE;
+		}
+		connp->conn_upper_handle = NULL;
+		connp->conn_upcalls = NULL;
+	}
 }
 
 /*
@@ -3092,103 +3142,19 @@
 }
 
 /*
- * This runs at the tail end of accept processing on the squeue of the
- * new connection.
+ * Collect protocol properties to send to the upper handle.
  */
-/* ARGSUSED */
 void
-tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
+tcp_get_proto_props(tcp_t *tcp, struct sock_proto_props *sopp)
 {
-	conn_t			*connp = (conn_t *)arg;
-	tcp_t			*tcp = connp->conn_tcp;
-	queue_t			*q = connp->conn_rq;
-	tcp_stack_t		*tcps = tcp->tcp_tcps;
-	/* socket options */
-	struct sock_proto_props	sopp;
-
-	/* We should just receive a single mblk that fits a T_discon_ind */
-	ASSERT(mp->b_cont == NULL);
-
-	/*
-	 * Drop the eager's ref on the listener, that was placed when
-	 * this eager began life in tcp_input_listener.
-	 */
-	CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
-	if (IPCL_IS_NONSTR(connp)) {
-		/* Safe to free conn_ind message */
-		freemsg(tcp->tcp_conn.tcp_eager_conn_ind);
-		tcp->tcp_conn.tcp_eager_conn_ind = NULL;
-	}
-
-	tcp->tcp_detached = B_FALSE;
-
-	if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) {
-		/*
-		 * Someone blewoff the eager before we could finish
-		 * the accept.
-		 *
-		 * The only reason eager exists it because we put in
-		 * a ref on it when conn ind went up. We need to send
-		 * a disconnect indication up while the last reference
-		 * on the eager will be dropped by the squeue when we
-		 * return.
-		 */
-		ASSERT(tcp->tcp_listener == NULL);
-		if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) {
-			if (IPCL_IS_NONSTR(connp)) {
-				ASSERT(tcp->tcp_issocket);
-				(*connp->conn_upcalls->su_disconnected)(
-				    connp->conn_upper_handle, tcp->tcp_connid,
-				    ECONNREFUSED);
-				freemsg(mp);
-			} else {
-				struct	T_discon_ind	*tdi;
-
-				(void) putnextctl1(q, M_FLUSH, FLUSHRW);
-				/*
-				 * Let us reuse the incoming mblk to avoid
-				 * memory allocation failure problems. We know
-				 * that the size of the incoming mblk i.e.
-				 * stroptions is greater than sizeof
-				 * T_discon_ind.
-				 */
-				ASSERT(DB_REF(mp) == 1);
-				ASSERT(MBLKSIZE(mp) >=
-				    sizeof (struct T_discon_ind));
-
-				DB_TYPE(mp) = M_PROTO;
-				((union T_primitives *)mp->b_rptr)->type =
-				    T_DISCON_IND;
-				tdi = (struct T_discon_ind *)mp->b_rptr;
-				if (tcp->tcp_issocket) {
-					tdi->DISCON_reason = ECONNREFUSED;
-					tdi->SEQ_number = 0;
-				} else {
-					tdi->DISCON_reason = ENOPROTOOPT;
-					tdi->SEQ_number =
-					    tcp->tcp_conn_req_seqnum;
-				}
-				mp->b_wptr = mp->b_rptr +
-				    sizeof (struct T_discon_ind);
-				putnext(q, mp);
-			}
-		}
-		tcp->tcp_hard_binding = B_FALSE;
-		return;
-	}
-
-	/*
-	 * This is the first time we run on the correct
-	 * queue after tcp_accept. So fix all the q parameters
-	 * here.
-	 */
-	sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF;
-	sopp.sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
-
-	sopp.sopp_rxhiwat = tcp->tcp_fused ?
+	conn_t *connp = tcp->tcp_connp;
+
+	sopp->sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF;
+	sopp->sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
+
+	sopp->sopp_rxhiwat = tcp->tcp_fused ?
 	    tcp_fuse_set_rcv_hiwat(tcp, connp->conn_rcvbuf) :
 	    connp->conn_rcvbuf;
-
 	/*
 	 * Determine what write offset value to use depending on SACK and
 	 * whether the endpoint is fused or not.
@@ -3203,18 +3169,18 @@
 		 * since it would reduce the amount of work done by kmem.
 		 * Non-fused tcp loopback case is handled separately below.
 		 */
-		sopp.sopp_wroff = 0;
+		sopp->sopp_wroff = 0;
 		/*
 		 * Update the peer's transmit parameters according to
 		 * our recently calculated high water mark value.
 		 */
 		(void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE);
 	} else if (tcp->tcp_snd_sack_ok) {
-		sopp.sopp_wroff = connp->conn_ht_iphc_allocated +
-		    (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra);
+		sopp->sopp_wroff = connp->conn_ht_iphc_allocated +
+		    (tcp->tcp_loopback ? 0 : tcp->tcp_tcps->tcps_wroff_xtra);
 	} else {
-		sopp.sopp_wroff = connp->conn_ht_iphc_len +
-		    (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra);
+		sopp->sopp_wroff = connp->conn_ht_iphc_len +
+		    (tcp->tcp_loopback ? 0 : tcp->tcp_tcps->tcps_wroff_xtra);
 	}
 
 	/*
@@ -3239,297 +3205,10 @@
 
 		sopp.sopp_maxblk = SSL3_MAX_RECORD_LEN;
 	}
-
-	/* Send the options up */
-	if (IPCL_IS_NONSTR(connp)) {
-		if (sopp.sopp_flags & SOCKOPT_TAIL) {
-			ASSERT(tcp->tcp_kssl_ctx != NULL);
-			ASSERT(sopp.sopp_flags & SOCKOPT_ZCOPY);
-		}
-		if (tcp->tcp_loopback) {
-			sopp.sopp_flags |= SOCKOPT_LOOPBACK;
-			sopp.sopp_loopback = B_TRUE;
-		}
-		(*connp->conn_upcalls->su_set_proto_props)
-		    (connp->conn_upper_handle, &sopp);
-		freemsg(mp);
-	} else {
-		/*
-		 * Let us reuse the incoming mblk to avoid
-		 * memory allocation failure problems. We know
-		 * that the size of the incoming mblk is at least
-		 * stroptions
-		 */
-		struct stroptions *stropt;
-
-		ASSERT(DB_REF(mp) == 1);
-		ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions));
-
-		DB_TYPE(mp) = M_SETOPTS;
-		stropt = (struct stroptions *)mp->b_rptr;
-		mp->b_wptr = mp->b_rptr + sizeof (struct stroptions);
-		stropt = (struct stroptions *)mp->b_rptr;
-		stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
-		stropt->so_hiwat = sopp.sopp_rxhiwat;
-		stropt->so_wroff = sopp.sopp_wroff;
-		stropt->so_maxblk = sopp.sopp_maxblk;
-
-		if (sopp.sopp_flags & SOCKOPT_TAIL) {
-			ASSERT(tcp->tcp_kssl_ctx != NULL);
-
-			stropt->so_flags |= SO_TAIL | SO_COPYOPT;
-			stropt->so_tail = sopp.sopp_tail;
-			stropt->so_copyopt = sopp.sopp_zcopyflag;
-		}
-
-		/* Send the options up */
-		putnext(q, mp);
-	}
-
-	/*
-	 * Pass up any data and/or a fin that has been received.
-	 *
-	 * Adjust receive window in case it had decreased
-	 * (because there is data <=> tcp_rcv_list != NULL)
-	 * while the connection was detached. Note that
-	 * in case the eager was flow-controlled, w/o this
-	 * code, the rwnd may never open up again!
-	 */
-	if (tcp->tcp_rcv_list != NULL) {
-		if (IPCL_IS_NONSTR(connp)) {
-			mblk_t *mp;
-			int space_left;
-			int error;
-			boolean_t push = B_TRUE;
-
-			if (!tcp->tcp_fused && (*connp->conn_upcalls->su_recv)
-			    (connp->conn_upper_handle, NULL, 0, 0, &error,
-			    &push) >= 0) {
-				tcp->tcp_rwnd = connp->conn_rcvbuf;
-				if (tcp->tcp_state >= TCPS_ESTABLISHED &&
-				    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
-					tcp_xmit_ctl(NULL,
-					    tcp, (tcp->tcp_swnd == 0) ?
-					    tcp->tcp_suna : tcp->tcp_snxt,
-					    tcp->tcp_rnxt, TH_ACK);
-				}
-			}
-			while ((mp = tcp->tcp_rcv_list) != NULL) {
-				push = B_TRUE;
-				tcp->tcp_rcv_list = mp->b_next;
-				mp->b_next = NULL;
-				space_left = (*connp->conn_upcalls->su_recv)
-				    (connp->conn_upper_handle, mp, msgdsize(mp),
-				    0, &error, &push);
-				if (space_left < 0) {
-					/*
-					 * We should never be in middle of a
-					 * fallback, the squeue guarantees that.
-					 */
-					ASSERT(error != EOPNOTSUPP);
-				}
-			}
-			tcp->tcp_rcv_last_head = NULL;
-			tcp->tcp_rcv_last_tail = NULL;
-			tcp->tcp_rcv_cnt = 0;
-		} else {
-			/* We drain directly in case of fused tcp loopback */
-
-			if (!tcp->tcp_fused && canputnext(q)) {
-				tcp->tcp_rwnd = connp->conn_rcvbuf;
-				if (tcp->tcp_state >= TCPS_ESTABLISHED &&
-				    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
-					tcp_xmit_ctl(NULL,
-					    tcp, (tcp->tcp_swnd == 0) ?
-					    tcp->tcp_suna : tcp->tcp_snxt,
-					    tcp->tcp_rnxt, TH_ACK);
-				}
-			}
-
-			(void) tcp_rcv_drain(tcp);
-		}
-
-		/*
-		 * For fused tcp loopback, back-enable peer endpoint
-		 * if it's currently flow-controlled.
-		 */
-		if (tcp->tcp_fused) {
-			tcp_t *peer_tcp = tcp->tcp_loopback_peer;
-
-			ASSERT(peer_tcp != NULL);
-			ASSERT(peer_tcp->tcp_fused);
-
-			mutex_enter(&peer_tcp->tcp_non_sq_lock);
-			if (peer_tcp->tcp_flow_stopped) {
-				tcp_clrqfull(peer_tcp);
-				TCP_STAT(tcps, tcp_fusion_backenabled);
-			}
-			mutex_exit(&peer_tcp->tcp_non_sq_lock);
-		}
-	}
-	ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
-	if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
-		tcp->tcp_ordrel_done = B_TRUE;
-		if (IPCL_IS_NONSTR(connp)) {
-			ASSERT(tcp->tcp_ordrel_mp == NULL);
-			(*connp->conn_upcalls->su_opctl)(
-			    connp->conn_upper_handle,
-			    SOCK_OPCTL_SHUT_RECV, 0);
-		} else {
-			mp = tcp->tcp_ordrel_mp;
-			tcp->tcp_ordrel_mp = NULL;
-			putnext(q, mp);
-		}
+	if (tcp->tcp_loopback) {
+		sopp->sopp_flags |= SOCKOPT_LOOPBACK;
+		sopp->sopp_loopback = B_TRUE;
 	}
-	tcp->tcp_hard_binding = B_FALSE;
-
-	if (connp->conn_keepalive) {
-		tcp->tcp_ka_last_intrvl = 0;
-		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
-		    tcp->tcp_ka_interval);
-	}
-
-	/*
-	 * At this point, eager is fully established and will
-	 * have the following references -
-	 *
-	 * 2 references for connection to exist (1 for TCP and 1 for IP).
-	 * 1 reference for the squeue which will be dropped by the squeue as
-	 *	soon as this function returns.
-	 * There will be 1 additonal reference for being in classifier
-	 *	hash list provided something bad hasn't happened.
-	 */
-	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
-	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
-}
-
-/*
- * Common to TPI and sockfs accept code.
- */
-/* ARGSUSED2 */
-int
-tcp_accept_common(conn_t *lconnp, conn_t *econnp, cred_t *cr)
-{
-	tcp_t *listener, *eager;
-	mblk_t *discon_mp;
-
-	listener = lconnp->conn_tcp;
-	ASSERT(listener->tcp_state == TCPS_LISTEN);
-	eager = econnp->conn_tcp;
-	ASSERT(eager->tcp_listener != NULL);
-
-	/*
-	 * Pre allocate the discon_ind mblk also. tcp_accept_finish will
-	 * use it if something failed.
-	 */
-	discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
-	    sizeof (struct stroptions)), BPRI_HI);
-
-	if (discon_mp == NULL) {
-		return (-TPROTO);
-	}
-	eager->tcp_issocket = B_TRUE;
-
-	econnp->conn_zoneid = listener->tcp_connp->conn_zoneid;
-	econnp->conn_allzones = listener->tcp_connp->conn_allzones;
-	ASSERT(econnp->conn_netstack ==
-	    listener->tcp_connp->conn_netstack);
-	ASSERT(eager->tcp_tcps == listener->tcp_tcps);
-
-	/* Put the ref for IP */
-	CONN_INC_REF(econnp);
-
-	/*
-	 * We should have minimum of 3 references on the conn
-	 * at this point. One each for TCP and IP and one for
-	 * the T_conn_ind that was sent up when the 3-way handshake
-	 * completed. In the normal case we would also have another
-	 * reference (making a total of 4) for the conn being in the
-	 * classifier hash list. However the eager could have received
-	 * an RST subsequently and tcp_closei_local could have removed
-	 * the eager from the classifier hash list, hence we can't
-	 * assert that reference.
-	 */
-	ASSERT(econnp->conn_ref >= 3);
-
-	mutex_enter(&listener->tcp_eager_lock);
-	if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
-
-		tcp_t *tail;
-		tcp_t *tcp;
-		mblk_t *mp1;
-
-		tcp = listener->tcp_eager_prev_q0;
-		/*
-		 * listener->tcp_eager_prev_q0 points to the TAIL of the
-		 * deferred T_conn_ind queue. We need to get to the head
-		 * of the queue in order to send up T_conn_ind the same
-		 * order as how the 3WHS is completed.
-		 */
-		while (tcp != listener) {
-			if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 &&
-			    !tcp->tcp_kssl_pending)
-				break;
-			else
-				tcp = tcp->tcp_eager_prev_q0;
-		}
-		/* None of the pending eagers can be sent up now */
-		if (tcp == listener)
-			goto no_more_eagers;
-
-		mp1 = tcp->tcp_conn.tcp_eager_conn_ind;
-		tcp->tcp_conn.tcp_eager_conn_ind = NULL;
-		/* Move from q0 to q */
-		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
-		listener->tcp_conn_req_cnt_q0--;
-		listener->tcp_conn_req_cnt_q++;
-		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
-		    tcp->tcp_eager_prev_q0;
-		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
-		    tcp->tcp_eager_next_q0;
-		tcp->tcp_eager_prev_q0 = NULL;
-		tcp->tcp_eager_next_q0 = NULL;
-		tcp->tcp_conn_def_q0 = B_FALSE;
-
-		/* Make sure the tcp isn't in the list of droppables */
-		ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
-		    tcp->tcp_eager_prev_drop_q0 == NULL);
-
-		/*
-		 * Insert at end of the queue because sockfs sends
-		 * down T_CONN_RES in chronological order. Leaving
-		 * the older conn indications at front of the queue
-		 * helps reducing search time.
-		 */
-		tail = listener->tcp_eager_last_q;
-		if (tail != NULL) {
-			tail->tcp_eager_next_q = tcp;
-		} else {
-			listener->tcp_eager_next_q = tcp;
-		}
-		listener->tcp_eager_last_q = tcp;
-		tcp->tcp_eager_next_q = NULL;
-
-		/* Need to get inside the listener perimeter */
-		CONN_INC_REF(listener->tcp_connp);
-		SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1,
-		    tcp_send_pending, listener->tcp_connp, NULL, SQ_FILL,
-		    SQTAG_TCP_SEND_PENDING);
-	}
-no_more_eagers:
-	tcp_eager_unlink(eager);
-	mutex_exit(&listener->tcp_eager_lock);
-
-	/*
-	 * At this point, the eager is detached from the listener
-	 * but we still have an extra refs on eager (apart from the
-	 * usual tcp references). The ref was placed in tcp_input_data
-	 * before sending the conn_ind in tcp_send_conn_ind.
-	 * The ref will be dropped in tcp_accept_finish().
-	 */
-	SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish,
-	    econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);
-	return (0);
 }
 
 /*
--- a/usr/src/uts/common/inet/tcp/tcp_fusion.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c	Thu Jun 17 17:22:09 2010 -0700
@@ -233,8 +233,9 @@
 			mp->b_wptr += sizeof (*stropt);
 
 			stropt = (struct stroptions *)mp->b_rptr;
-			stropt->so_flags = SO_WROFF;
+			stropt->so_flags = SO_WROFF | SO_MAXBLK;
 			stropt->so_wroff = 0;
+			stropt->so_maxblk = INFPSZ;
 
 			/* Send the options up */
 			putnext(peer_rq, mp);
@@ -244,8 +245,9 @@
 			/* The peer is a non-STREAMS end point */
 			ASSERT(IPCL_IS_TCP(peer_connp));
 
-			sopp.sopp_flags = SOCKOPT_WROFF;
+			sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_MAXBLK;
 			sopp.sopp_wroff = 0;
+			sopp.sopp_maxblk = INFPSZ;
 			(*peer_connp->conn_upcalls->su_set_proto_props)
 			    (peer_connp->conn_upper_handle, &sopp);
 		}
--- a/usr/src/uts/common/inet/tcp/tcp_input.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/inet/tcp/tcp_input.c	Thu Jun 17 17:22:09 2010 -0700
@@ -1542,14 +1542,14 @@
 		eager->tcp_kssl_pending = B_TRUE;
 	}
 
+	ASSERT(eager->tcp_ordrel_mp == NULL);
+
 	/* Inherit the listener's non-STREAMS flag */
 	if (IPCL_IS_NONSTR(lconnp)) {
 		econnp->conn_flags |= IPCL_NONSTR;
-	}
-
-	ASSERT(eager->tcp_ordrel_mp == NULL);
-
-	if (!IPCL_IS_NONSTR(econnp)) {
+		/* All non-STREAMS tcp_ts are sockets */
+		eager->tcp_issocket = B_TRUE;
+	} else {
 		/*
 		 * Pre-allocate the T_ordrel_ind mblk for TPI socket so that
 		 * at close time, we will always have that to send up.
@@ -1632,7 +1632,7 @@
 	/*
 	 * Since we will clear tcp_listener before we clear tcp_detached
 	 * in the accept code we need tcp_hard_binding aka tcp_accept_inprogress
-	 * so we can tell a TCP_DETACHED_NONEAGER apart.
+	 * so we can tell a TCP_IS_DETACHED_NONEAGER apart.
 	 */
 	eager->tcp_hard_binding = B_TRUE;
 
@@ -2003,8 +2003,6 @@
 	 * some work.
 	 */
 	if ((tcp->tcp_fused || tcp->tcp_fused_sigurg)) {
-		ASSERT(IPCL_IS_NONSTR(tcp->tcp_connp) ||
-		    tcp->tcp_fused_sigurg_mp != NULL);
 		if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? NULL :
 		    &tcp->tcp_fused_sigurg_mp))
 			return (ret);
@@ -3588,14 +3586,79 @@
 	if (bytes_acked > 0)
 		tcp->tcp_ip_forward_progress = B_TRUE;
 	if (tcp->tcp_state == TCPS_SYN_RCVD) {
-		if ((tcp->tcp_conn.tcp_eager_conn_ind != NULL) &&
-		    ((tcp->tcp_kssl_ent == NULL) || !tcp->tcp_kssl_pending)) {
-			/* 3-way handshake complete - pass up the T_CONN_IND */
+		/*
+		 * tcp_sendmsg() checks tcp_state without entering
+		 * the squeue so tcp_state should be updated before
+		 * sending up a connection confirmation or a new
+		 * connection indication.
+		 */
+		tcp->tcp_state = TCPS_ESTABLISHED;
+
+		/*
+		 * We are seeing the final ack in the three way
+		 * hand shake of a active open'ed connection
+		 * so we must send up a T_CONN_CON
+		 */
+		if (tcp->tcp_active_open) {
+			if (!tcp_conn_con(tcp, iphdr, mp, NULL, ira)) {
+				freemsg(mp);
+				tcp->tcp_state = TCPS_SYN_RCVD;
+				return;
+			}
+			/*
+			 * Don't fuse the loopback endpoints for
+			 * simultaneous active opens.
+			 */
+			if (tcp->tcp_loopback) {
+				TCP_STAT(tcps, tcp_fusion_unfusable);
+				tcp->tcp_unfusable = B_TRUE;
+			}
+			/*
+			 * For simultaneous active open, trace receipt of final
+			 * ACK as tcp:::connect-established.
+			 */
+			DTRACE_TCP5(connect__established, mblk_t *, NULL,
+			    ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
+			    iphdr, tcp_t *, tcp, tcph_t *, tcpha);
+		} else if (IPCL_IS_NONSTR(connp)) {
+			/*
+			 * 3-way handshake has completed, so notify socket
+			 * of the new connection.
+			 *
+			 * We are here means eager is fine but it can
+			 * get a TH_RST at any point between now and till
+			 * accept completes and disappear. We need to
+			 * ensure that reference to eager is valid after
+			 * we get out of eager's perimeter. So we do
+			 * an extra refhold.
+			 */
+			CONN_INC_REF(connp);
+
+			if (!tcp_newconn_notify(tcp, ira)) {
+				freemsg(mp);
+				/* notification did not go up, so drop ref */
+				CONN_DEC_REF(connp);
+				return;
+			}
+			/*
+			 * For passive open, trace receipt of final ACK as
+			 * tcp:::accept-established.
+			 */
+			DTRACE_TCP5(accept__established, mblk_t *, NULL,
+			    ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
+			    iphdr, tcp_t *, tcp, tcph_t *, tcpha);
+		} else if (((tcp->tcp_kssl_ent == NULL) ||
+		    !tcp->tcp_kssl_pending)) {
+			/*
+			 * 3-way handshake complete - this is a STREAMS based
+			 * socket, so pass up the T_CONN_IND.
+			 */
 			tcp_t	*listener = tcp->tcp_listener;
 			mblk_t	*mp = tcp->tcp_conn.tcp_eager_conn_ind;
 
 			tcp->tcp_tconnind_started = B_TRUE;
 			tcp->tcp_conn.tcp_eager_conn_ind = NULL;
+			ASSERT(mp != NULL);
 			/*
 			 * We are here means eager is fine but it can
 			 * get a TH_RST at any point between now and till
@@ -3638,43 +3701,6 @@
 				    listener->tcp_connp, NULL, SQ_NODRAIN,
 				    SQTAG_TCP_CONN_IND);
 			}
-		}
-
-		/*
-		 * We are seeing the final ack in the three way
-		 * hand shake of a active open'ed connection
-		 * so we must send up a T_CONN_CON
-		 *
-		 * tcp_sendmsg() checks tcp_state without entering
-		 * the squeue so tcp_state should be updated before
-		 * sending up connection confirmation.  Probe the state
-		 * change below when we are sure sending of the confirmation
-		 * has succeeded.
-		 */
-		tcp->tcp_state = TCPS_ESTABLISHED;
-
-		if (tcp->tcp_active_open) {
-			if (!tcp_conn_con(tcp, iphdr, mp, NULL, ira)) {
-				freemsg(mp);
-				tcp->tcp_state = TCPS_SYN_RCVD;
-				return;
-			}
-			/*
-			 * Don't fuse the loopback endpoints for
-			 * simultaneous active opens.
-			 */
-			if (tcp->tcp_loopback) {
-				TCP_STAT(tcps, tcp_fusion_unfusable);
-				tcp->tcp_unfusable = B_TRUE;
-			}
-			/*
-			 * For simultaneous active open, trace receipt of final
-			 * ACK as tcp:::connect-established.
-			 */
-			DTRACE_TCP5(connect__established, mblk_t *, NULL,
-			    ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
-			    iphdr, tcp_t *, tcp, tcph_t *, tcpha);
-		} else {
 			/*
 			 * For passive open, trace receipt of final ACK as
 			 * tcp:::accept-established.
@@ -4454,13 +4480,14 @@
 			tcpha->tha_ack = htonl(tcp->tcp_rnxt);
 
 			/*
-			 * Generate the ordrel_ind at the end unless we
-			 * are an eager guy.
-			 * In the eager case tcp_rsrv will do this when run
-			 * after tcp_accept is done.
+			 * Generate the ordrel_ind at the end unless the
+			 * conn is detached or it is a STREAMS based eager.
+			 * In the eager case we defer the notification until
+			 * tcp_accept_finish has run.
 			 */
-			if (tcp->tcp_listener == NULL &&
-			    !TCP_IS_DETACHED(tcp) && !tcp->tcp_hard_binding)
+			if (!TCP_IS_DETACHED(tcp) && (IPCL_IS_NONSTR(connp) ||
+			    (tcp->tcp_listener == NULL &&
+			    !tcp->tcp_hard_binding)))
 				flags |= TH_ORDREL_NEEDED;
 			switch (tcp->tcp_state) {
 			case TCPS_SYN_RCVD:
@@ -4599,25 +4626,7 @@
 			return;
 	}
 
-	if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) {
-		/*
-		 * Side queue inbound data until the accept happens.
-		 * tcp_accept/tcp_rput drains this when the accept happens.
-		 * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or
-		 * T_EXDATA_IND) it is queued on b_next.
-		 * XXX Make urgent data use this. Requires:
-		 *	Removing tcp_listener check for TH_URG
-		 *	Making M_PCPROTO and MARK messages skip the eager case
-		 */
-
-		if (tcp->tcp_kssl_pending) {
-			DTRACE_PROBE1(kssl_mblk__ksslinput_pending,
-			    mblk_t *, mp);
-			tcp_kssl_input(tcp, mp, ira->ira_cred);
-		} else {
-			tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred);
-		}
-	} else if (IPCL_IS_NONSTR(connp)) {
+	if (IPCL_IS_NONSTR(connp)) {
 		/*
 		 * Non-STREAMS socket
 		 *
@@ -4641,8 +4650,26 @@
 			/* PUSH bit set and sockfs is not flow controlled */
 			flags |= tcp_rwnd_reopen(tcp);
 		}
+	} else if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) {
+		/*
+		 * Side queue inbound data until the accept happens.
+		 * tcp_accept/tcp_rput drains this when the accept happens.
+		 * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or
+		 * T_EXDATA_IND) it is queued on b_next.
+		 * XXX Make urgent data use this. Requires:
+		 *	Removing tcp_listener check for TH_URG
+		 *	Making M_PCPROTO and MARK messages skip the eager case
+		 */
+
+		if (tcp->tcp_kssl_pending) {
+			DTRACE_PROBE1(kssl_mblk__ksslinput_pending,
+			    mblk_t *, mp);
+			tcp_kssl_input(tcp, mp, ira->ira_cred);
+		} else {
+			tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred);
+		}
 	} else {
-		/* STREAMS socket */
+		/* Active STREAMS socket */
 		if (mp->b_datap->db_type != M_DATA ||
 		    (flags & TH_MARKNEXT_NEEDED)) {
 			if (tcp->tcp_rcv_list != NULL) {
@@ -4858,11 +4885,14 @@
 	}
 	if (flags & TH_ORDREL_NEEDED) {
 		/*
-		 * Send up the ordrel_ind unless we are an eager guy.
-		 * In the eager case tcp_rsrv will do this when run
-		 * after tcp_accept is done.
+		 * Notify upper layer about an orderly release. If this is
+		 * a non-STREAMS socket, then just make an upcall. For STREAMS
+		 * we send up an ordrel_ind, unless this is an eager, in which
+		 * case the ordrel will be sent when tcp_accept_finish runs.
+		 * Note that for non-STREAMS we make an upcall even if it is an
+		 * eager, because we have an upper handle to send it to.
 		 */
-		ASSERT(tcp->tcp_listener == NULL);
+		ASSERT(IPCL_IS_NONSTR(connp) || tcp->tcp_listener == NULL);
 		ASSERT(!tcp->tcp_detached);
 
 		if (IPCL_IS_NONSTR(connp)) {
--- a/usr/src/uts/common/inet/tcp/tcp_output.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/inet/tcp/tcp_output.c	Thu Jun 17 17:22:09 2010 -0700
@@ -1465,13 +1465,24 @@
 	clock_t	delta = 0;
 	tcp_stack_t	*tcps = tcp->tcp_tcps;
 
-	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
-	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
+	/*
+	 * When a non-STREAMS socket is being closed, it does not always
+	 * stick around waiting for tcp_close_output to run and can therefore
+	 * have dropped a reference already. So adjust the asserts accordingly.
+	 */
+	ASSERT((connp->conn_fanout != NULL &&
+	    connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 3 : 4)) ||
+	    (connp->conn_fanout == NULL &&
+	    connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 2 : 3)));
 
 	mutex_enter(&tcp->tcp_eager_lock);
 	if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
-		/* Cleanup for listener */
-		tcp_eager_cleanup(tcp, 0);
+		/*
+		 * Cleanup for listener. For non-STREAM sockets sockfs will
+		 * close all the eagers on 'q', so in that case only deal
+		 * with 'q0'.
+		 */
+		tcp_eager_cleanup(tcp, IPCL_IS_NONSTR(connp) ? 1 : 0);
 		tcp->tcp_wait_for_eagers = 1;
 	}
 	mutex_exit(&tcp->tcp_eager_lock);
@@ -1516,14 +1527,37 @@
 			msg = "tcp_close, unread data";
 			break;
 		}
+
 		/*
-		 * We have done a qwait() above which could have possibly
-		 * drained more messages in turn causing transition to a
-		 * different state. Check whether we have to do the rest
-		 * of the processing or not.
+		 * Abort connection if it is being closed without first
+		 * being accepted. This can happen if a listening non-STREAM
+		 * socket wants to get rid of the socket, for example, if the
+		 * listener is closing.
 		 */
-		if (tcp->tcp_state <= TCPS_LISTEN)
+		if (tcp->tcp_listener != NULL) {
+			ASSERT(IPCL_IS_NONSTR(connp));
+			msg = "tcp_close, close before accept";
+
+			/*
+			 * Unlink from the listener and drop the reference
+			 * put on it by the eager. tcp_closei_local will not
+			 * do it because tcp_tconnind_started is TRUE.
+			 */
+			mutex_enter(&tcp->tcp_saved_listener->tcp_eager_lock);
+			tcp_eager_unlink(tcp);
+			mutex_exit(&tcp->tcp_saved_listener->tcp_eager_lock);
+			CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
+
+			/*
+			 * If the conn has received a RST, the only thing
+			 * left to do is to drop the ref.
+			 */
+			if (tcp->tcp_state <= TCPS_BOUND) {
+				CONN_DEC_REF(tcp->tcp_connp);
+				return;
+			}
 			break;
+		}
 
 		/*
 		 * Transmit the FIN before detaching the tcp_t.
@@ -1593,7 +1627,8 @@
 		if (tcp->tcp_state == TCPS_TIME_WAIT) {
 			tcp_time_wait_append(tcp);
 			TCP_DBGSTAT(tcps, tcp_detach_time_wait);
-			ASSERT(connp->conn_ref >= 3);
+			ASSERT(connp->conn_ref >=
+			    (IPCL_IS_NONSTR(connp) ? 2 : 3));
 			goto finish;
 		}
 
@@ -1606,7 +1641,7 @@
 			tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
 			    delta ? delta : 1);
 
-		ASSERT(connp->conn_ref >= 3);
+		ASSERT(connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 2 : 3));
 		goto finish;
 	}
 
@@ -1623,22 +1658,35 @@
 
 	tcp_closei_local(tcp);
 	CONN_DEC_REF(connp);
-	ASSERT(connp->conn_ref >= 2);
+	ASSERT(connp->conn_ref >= (IPCL_IS_NONSTR(connp) ? 1 : 2));
 
 finish:
-	mutex_enter(&tcp->tcp_closelock);
 	/*
 	 * Don't change the queues in the case of a listener that has
 	 * eagers in its q or q0. It could surprise the eagers.
 	 * Instead wait for the eagers outside the squeue.
+	 *
+	 * For non-STREAMS sockets tcp_wait_for_eagers implies that
+	 * we should delay the su_closed upcall until all eagers have
+	 * dropped their references.
 	 */
 	if (!tcp->tcp_wait_for_eagers) {
 		tcp->tcp_detached = B_TRUE;
 		connp->conn_rq = NULL;
 		connp->conn_wq = NULL;
+
+		/* non-STREAM socket, release the upper handle */
+		if (IPCL_IS_NONSTR(connp)) {
+			ASSERT(connp->conn_upper_handle != NULL);
+			(*connp->conn_upcalls->su_closed)
+			    (connp->conn_upper_handle);
+			connp->conn_upper_handle = NULL;
+			connp->conn_upcalls = NULL;
+		}
 	}
 
 	/* Signal tcp_close() to finish closing. */
+	mutex_enter(&tcp->tcp_closelock);
 	tcp->tcp_closed = 1;
 	cv_signal(&tcp->tcp_closecv);
 	mutex_exit(&tcp->tcp_closelock);
--- a/usr/src/uts/common/inet/tcp/tcp_socket.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/inet/tcp/tcp_socket.c	Thu Jun 17 17:22:09 2010 -0700
@@ -33,6 +33,7 @@
 #include <sys/strsun.h>
 #include <sys/squeue_impl.h>
 #include <sys/squeue.h>
+#define	_SUN_TPI_VERSION 2
 #include <sys/tihdr.h>
 #include <sys/timod.h>
 #include <sys/tpicommon.h>
@@ -121,6 +122,7 @@
 	(*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
 }
 
+/*ARGSUSED*/
 static int
 tcp_accept(sock_lower_handle_t lproto_handle,
     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
@@ -135,18 +137,59 @@
 	econnp = (conn_t *)eproto_handle;
 	eager = econnp->conn_tcp;
 	ASSERT(eager->tcp_listener != NULL);
+	ASSERT(IPCL_IS_NONSTR(econnp));
+	ASSERT(lconnp->conn_upper_handle != NULL);
+
+	/*
+	 * It is possible for the accept thread to race with the thread that
+	 * made the su_newconn upcall in tcp_newconn_notify. Both
+	 * tcp_newconn_notify and tcp_accept require that conn_upper_handle
+	 * and conn_upcalls be set before returning, so they both write to
+	 * them. However, we're guaranteed that the value written is the same
+	 * for both threads.
+	 */
+	ASSERT(econnp->conn_upper_handle == NULL ||
+	    econnp->conn_upper_handle == sock_handle);
+	ASSERT(econnp->conn_upcalls == NULL ||
+	    econnp->conn_upcalls == lconnp->conn_upcalls);
+	econnp->conn_upper_handle = sock_handle;
+	econnp->conn_upcalls = lconnp->conn_upcalls;
+
+	ASSERT(econnp->conn_netstack ==
+	    listener->tcp_connp->conn_netstack);
+	ASSERT(eager->tcp_tcps == listener->tcp_tcps);
 
 	/*
-	 * It is OK to manipulate these fields outside the eager's squeue
-	 * because they will not start being used until tcp_accept_finish
-	 * has been called.
+	 * We should have a minimum of 2 references on the conn at this
+	 * point. One for TCP and one for the newconn notification
+	 * (which is now taken over by IP). In the normal case we would
+	 * also have another reference (making a total of 3) for the conn
+	 * being in the classifier hash list. However the eager could have
+	 * received an RST subsequently and tcp_closei_local could have
+	 * removed the eager from the classifier hash list, hence we can't
+	 * assert that reference.
 	 */
-	ASSERT(lconnp->conn_upper_handle != NULL);
-	ASSERT(econnp->conn_upper_handle == NULL);
-	econnp->conn_upper_handle = sock_handle;
-	econnp->conn_upcalls = lconnp->conn_upcalls;
-	ASSERT(IPCL_IS_NONSTR(econnp));
-	return (tcp_accept_common(lconnp, econnp, cr));
+	ASSERT(econnp->conn_ref >= 2);
+
+	/*
+	 * An error is returned if this conn has been reset, which will
+	 * cause the socket to be closed immediately. The eager will be
+	 * unlinked from the listener during close.
+	 */
+	if (eager->tcp_state < TCPS_ESTABLISHED)
+		return (ECONNABORTED);
+
+	mutex_enter(&listener->tcp_eager_lock);
+	/*
+	 * Non-STREAMS listeners never defer the notification of new
+	 * connections.
+	 */
+	ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
+	tcp_eager_unlink(eager);
+	mutex_exit(&listener->tcp_eager_lock);
+	CONN_DEC_REF(listener->tcp_connp);
+
+	return (0);
 }
 
 static int
@@ -188,14 +231,12 @@
 	return (error);
 }
 
-/*
- * SOP_LISTEN() calls into tcp_listen().
- */
 /* ARGSUSED */
 static int
 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
 {
 	conn_t	*connp = (conn_t *)proto_handle;
+	tcp_t	*tcp = connp->conn_tcp;
 	int 	error;
 
 	ASSERT(connp->conn_upper_handle != NULL);
@@ -211,8 +252,14 @@
 
 	error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
 	if (error == 0) {
+		/*
+		 * sockfs needs to know what's the maximum number of socket
+		 * that can be queued on the listener.
+		 */
 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
-		    SOCK_OPCTL_ENAB_ACCEPT, (uintptr_t)backlog);
+		    SOCK_OPCTL_ENAB_ACCEPT,
+		    (uintptr_t)(tcp->tcp_conn_req_max +
+		    tcp->tcp_tcps->tcps_conn_req_max_q0));
 	} else if (error < 0) {
 		if (error == -TOUTSTATE)
 			error = EINVAL;
@@ -296,7 +343,6 @@
 	conn_t	*connp = (conn_t *)proto_handle;
 	tcp_t	*tcp = connp->conn_tcp;
 
-	ASSERT(connp->conn_upper_handle != NULL);
 	/* All Solaris components should pass a cred for this operation. */
 	ASSERT(cr != NULL);
 
@@ -317,7 +363,6 @@
 	/* All Solaris components should pass a cred for this operation. */
 	ASSERT(cr != NULL);
 
-	ASSERT(connp->conn_upper_handle != NULL);
 	return (conn_getsockname(connp, addr, addrlenp));
 }
 
@@ -694,7 +739,12 @@
 	 * packets in squeue for the timewait state.
 	 */
 	CONN_DEC_REF(connp);
-	return (0);
+
+	/*
+	 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
+	 * freeing the socket.
+	 */
+	return (EINPROGRESS);
 }
 
 /* ARGSUSED */
@@ -737,9 +787,206 @@
 	return ((sock_lower_handle_t)connp);
 }
 
+/*
+ * tcp_fallback
+ *
+ * A direct socket is falling back to using STREAMS. The queue
+ * that is being passed down was created using tcp_open() with
+ * the SO_FALLBACK flag set. As a result, the queue is not
+ * associated with a conn, and the q_ptrs instead contain the
+ * dev and minor area that should be used.
+ *
+ * The 'issocket' flag indicates whether the FireEngine
+ * optimizations should be used. The common case would be that
+ * optimizations are enabled, and they might be subsequently
+ * disabled using the _SIOCSOCKFALLBACK ioctl.
+ */
+
+/*
+ * An active connection is falling back to TPI. Gather all the information
+ * required by the STREAM head and TPI sonode and send it up.
+ */
+static void
+tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
+    boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
+    sock_quiesce_arg_t *arg)
+{
+	conn_t			*connp = tcp->tcp_connp;
+	struct stroptions	*stropt;
+	struct T_capability_ack tca;
+	struct sockaddr_in6	laddr, faddr;
+	socklen_t 		laddrlen, faddrlen;
+	short			opts;
+	int			error;
+	mblk_t			*mp, *mpnext;
+
+	connp->conn_dev = (dev_t)RD(q)->q_ptr;
+	connp->conn_minor_arena = WR(q)->q_ptr;
+
+	RD(q)->q_ptr = WR(q)->q_ptr = connp;
+
+	connp->conn_rq = RD(q);
+	connp->conn_wq = WR(q);
+
+	WR(q)->q_qinfo = &tcp_sock_winit;
+
+	if (!issocket)
+		tcp_use_pure_tpi(tcp);
+
+	/*
+	 * free the helper stream
+	 */
+	ip_free_helper_stream(connp);
+
+	/*
+	 * Notify the STREAM head about options
+	 */
+	DB_TYPE(stropt_mp) = M_SETOPTS;
+	stropt = (struct stroptions *)stropt_mp->b_rptr;
+	stropt_mp->b_wptr += sizeof (struct stroptions);
+	stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
+
+	stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
+	    tcp->tcp_tcps->tcps_wroff_xtra);
+	if (tcp->tcp_snd_sack_ok)
+		stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
+	stropt->so_hiwat = connp->conn_rcvbuf;
+	stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
+
+	putnext(RD(q), stropt_mp);
+
+	/*
+	 * Collect the information needed to sync with the sonode
+	 */
+	tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
+
+	laddrlen = faddrlen = sizeof (sin6_t);
+	(void) tcp_getsockname((sock_lower_handle_t)connp,
+	    (struct sockaddr *)&laddr, &laddrlen, CRED());
+	error = tcp_getpeername((sock_lower_handle_t)connp,
+	    (struct sockaddr *)&faddr, &faddrlen, CRED());
+	if (error != 0)
+		faddrlen = 0;
+
+	opts = 0;
+	if (connp->conn_oobinline)
+		opts |= SO_OOBINLINE;
+	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
+		opts |= SO_DONTROUTE;
+
+	/*
+	 * Notify the socket that the protocol is now quiescent,
+	 * and it's therefore safe to move data from the socket
+	 * to the stream head.
+	 */
+	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
+	    (struct sockaddr *)&laddr, laddrlen,
+	    (struct sockaddr *)&faddr, faddrlen, opts);
+
+	while (mp != NULL) {
+		mpnext = mp->b_next;
+		tcp->tcp_rcv_list = mp->b_next;
+		mp->b_next = NULL;
+		putnext(q, mp);
+		mp = mpnext;
+	}
+	ASSERT(tcp->tcp_rcv_last_head == NULL);
+	ASSERT(tcp->tcp_rcv_last_tail == NULL);
+	ASSERT(tcp->tcp_rcv_cnt == 0);
+
+	/*
+	 * All eagers in q0 are marked as being non-STREAM, so they will
+	 * make su_newconn upcalls when the handshake completes, which
+	 * will fail (resulting in the conn being closed). So we just blow
+	 * off everything in q0 instead of waiting for the inevitable.
+	 */
+	if (tcp->tcp_conn_req_cnt_q0 != 0)
+		tcp_eager_cleanup(tcp, B_TRUE);
+}
+
+/*
+ * An eager is falling back to TPI. All we have to do is send
+ * up a T_CONN_IND.
+ */
+static void
+tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
+    so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
+{
+	conn_t *connp = eager->tcp_connp;
+	tcp_t *listener = eager->tcp_listener;
+	mblk_t *mp;
+
+	ASSERT(listener != NULL);
+
+	/*
+	 * Notify the socket that the protocol is now quiescent,
+	 * and it's therefore safe to move data from the socket
+	 * to tcp's rcv queue.
+	 */
+	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
+	    NULL, 0, 0);
+
+	if (mp != NULL) {
+		ASSERT(eager->tcp_rcv_cnt == 0);
+
+		eager->tcp_rcv_list = mp;
+		eager->tcp_rcv_cnt = msgdsize(mp);
+		while (mp->b_next != NULL) {
+			mp = mp->b_next;
+			eager->tcp_rcv_cnt += msgdsize(mp);
+		}
+		eager->tcp_rcv_last_head = mp;
+		while (mp->b_cont)
+			mp = mp->b_cont;
+		eager->tcp_rcv_last_tail = mp;
+		if (eager->tcp_rcv_cnt > eager->tcp_rwnd)
+			eager->tcp_rwnd = 0;
+		else
+			eager->tcp_rwnd -= eager->tcp_rcv_cnt;
+	}
+
+	if (!issocket)
+		eager->tcp_issocket = B_FALSE;
+	/*
+	 * The stream for this eager does not yet exist, so mark it as
+	 * being detached.
+	 */
+	eager->tcp_detached = B_TRUE;
+	eager->tcp_hard_binding = B_TRUE;
+	connp->conn_rq = listener->tcp_connp->conn_rq;
+	connp->conn_wq = listener->tcp_connp->conn_wq;
+
+	/* Send up the connection indication */
+	mp = eager->tcp_conn.tcp_eager_conn_ind;
+	ASSERT(mp != NULL);
+	eager->tcp_conn.tcp_eager_conn_ind = NULL;
+
+	/*
+	 * TLI/XTI applications will get confused by
+	 * sending eager as an option since it violates
+	 * the option semantics. So remove the eager as
+	 * option since TLI/XTI app doesn't need it anyway.
+	 */
+	if (!issocket) {
+		struct T_conn_ind *conn_ind;
+
+		conn_ind = (struct T_conn_ind *)mp->b_rptr;
+		conn_ind->OPT_length = 0;
+		conn_ind->OPT_offset = 0;
+	}
+
+	/*
+	 * Sockfs guarantees that the listener will not be closed
+	 * during fallback. So we can safely use the listener's queue.
+	 */
+	putnext(listener->tcp_connp->conn_rq, mp);
+}
+
+
 int
 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
-    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
+    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
+    sock_quiesce_arg_t *arg)
 {
 	tcp_t			*tcp;
 	conn_t 			*connp = (conn_t *)proto_handle;
@@ -768,14 +1015,6 @@
 		/* failed to enter, free all the pre-allocated messages. */
 		freeb(stropt_mp);
 		freeb(ordrel_mp);
-		/*
-		 * We cannot process the eager, so at least send out a
-		 * RST so the peer can reconnect.
-		 */
-		if (tcp->tcp_listener != NULL) {
-			(void) tcp_eager_blowoff(tcp->tcp_listener,
-			    tcp->tcp_conn_req_seqnum);
-		}
 		return (ENOMEM);
 	}
 
@@ -787,21 +1026,24 @@
 	if (tcp->tcp_fused)
 		tcp_unfuse(tcp);
 
+	if (tcp->tcp_listener != NULL) {
+		/* The eager will deal with opts when accept() is called */
+		freeb(stropt_mp);
+		tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
+	} else {
+		tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
+		    quiesced_cb, arg);
+	}
+
 	/*
 	 * No longer a direct socket
+	 *
+	 * Note that we intentionally leave the upper_handle and upcalls
+	 * intact, since eagers may still be using them.
 	 */
 	connp->conn_flags &= ~IPCL_NONSTR;
 	tcp->tcp_ordrel_mp = ordrel_mp;
 
-	if (tcp->tcp_listener != NULL) {
-		/* The eager will deal with opts when accept() is called */
-		freeb(stropt_mp);
-		tcp_fallback_eager(tcp, direct_sockfs);
-	} else {
-		tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
-		    quiesced_cb);
-	}
-
 	/*
 	 * There should be atleast two ref's (IP + TCP)
 	 */
@@ -810,3 +1052,141 @@
 
 	return (0);
 }
+
+/*
+ * Notifies a non-STREAMS based listener about a new connection. This
+ * function is executed on the *eager*'s squeue once the 3 way handshake
+ * has completed. Note that the behavior differs from STREAMS, where the
+ * T_CONN_IND is sent up by tcp_send_conn_ind while on the *listener*'s
+ * squeue.
+ *
+ * Returns B_TRUE if the notification succeeded, in which case `tcp' will
+ * be moved over to the ESTABLISHED list (q) of the listener. Otherwise,
+ * B_FALSE is returned and `tcp' is killed.
+ */
+boolean_t
+tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
+{
+	tcp_t *listener = tcp->tcp_listener;
+	conn_t *lconnp = listener->tcp_connp;
+	conn_t *econnp = tcp->tcp_connp;
+	tcp_t *tail;
+	ipaddr_t *addr_cache;
+	sock_upper_handle_t upper;
+	struct sock_proto_props sopp;
+	mblk_t *mp;
+
+	mutex_enter(&listener->tcp_eager_lock);
+	/*
+	 * Take the eager out, if it is in the list of droppable eagers
+	 * as we are here because the 3W handshake is over.
+	 */
+	MAKE_UNDROPPABLE(tcp);
+	/*
+	 * The eager already has an extra ref put in tcp_input_data
+	 * so that it stays till accept comes back even though it
+	 * might get into TCPS_CLOSED as a result of a TH_RST etc.
+	 */
+	ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
+	listener->tcp_conn_req_cnt_q0--;
+	listener->tcp_conn_req_cnt_q++;
+
+	/* Move from SYN_RCVD to ESTABLISHED list  */
+	tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
+	tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
+	tcp->tcp_eager_prev_q0 = NULL;
+	tcp->tcp_eager_next_q0 = NULL;
+
+	/*
+	 * Insert at end of the queue because connections are accepted
+	 * in chronological order. Leaving the older connections at front
+	 * of the queue helps reducing search time.
+	 */
+	tail = listener->tcp_eager_last_q;
+	if (tail != NULL)
+		tail->tcp_eager_next_q = tcp;
+	else
+		listener->tcp_eager_next_q = tcp;
+	listener->tcp_eager_last_q = tcp;
+	tcp->tcp_eager_next_q = NULL;
+
+	/* we have timed out before */
+	if (tcp->tcp_syn_rcvd_timeout != 0) {
+		tcp->tcp_syn_rcvd_timeout = 0;
+		listener->tcp_syn_rcvd_timeout--;
+		if (listener->tcp_syn_defense &&
+		    listener->tcp_syn_rcvd_timeout <=
+		    (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
+		    10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
+		    listener->tcp_last_rcv_lbolt)) {
+			/*
+			 * Turn off the defense mode if we
+			 * believe the SYN attack is over.
+			 */
+			listener->tcp_syn_defense = B_FALSE;
+			if (listener->tcp_ip_addr_cache) {
+				kmem_free((void *)listener->tcp_ip_addr_cache,
+				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
+				listener->tcp_ip_addr_cache = NULL;
+			}
+		}
+	}
+	addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
+	if (addr_cache != NULL) {
+		/*
+		 * We have finished a 3-way handshake with this
+		 * remote host. This proves the IP addr is good.
+		 * Cache it!
+		 */
+		addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
+		    tcp->tcp_connp->conn_faddr_v4;
+	}
+	mutex_exit(&listener->tcp_eager_lock);
+
+	/*
+	 * Notify the ULP about the newconn. It is guaranteed that no
+	 * tcp_accept() call will be made for the eager if the
+	 * notification fails.
+	 */
+	if ((upper = (*lconnp->conn_upcalls->su_newconn)
+	    (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
+	    &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
+	    &econnp->conn_upcalls)) == NULL) {
+		/*
+		 * Normally this should not happen, but the listener might
+		 * have done a fallback to TPI followed by a close(), in
+		 * which case tcp_closemp for this conn might have been
+		 * used by tcp_eager_cleanup().
+		 */
+		mutex_enter(&listener->tcp_eager_lock);
+		if (tcp->tcp_closemp_used) {
+			mutex_exit(&listener->tcp_eager_lock);
+			return (B_FALSE);
+		}
+		tcp->tcp_closemp_used = B_TRUE;
+		TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
+		mp = &tcp->tcp_closemp;
+		mutex_exit(&listener->tcp_eager_lock);
+		tcp_eager_kill(econnp, mp, NULL, NULL);
+		return (B_FALSE);
+	}
+	econnp->conn_upper_handle = upper;
+
+	tcp->tcp_detached = B_FALSE;
+	tcp->tcp_hard_binding = B_FALSE;
+	tcp->tcp_tconnind_started = B_TRUE;
+
+	if (econnp->conn_keepalive) {
+		tcp->tcp_ka_last_intrvl = 0;
+		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
+		    tcp->tcp_ka_interval);
+	}
+
+	/* Update the necessary parameters */
+	tcp_get_proto_props(tcp, &sopp);
+
+	(*econnp->conn_upcalls->su_set_proto_props)
+	    (econnp->conn_upper_handle, &sopp);
+
+	return (B_TRUE);
+}
--- a/usr/src/uts/common/inet/tcp/tcp_tpi.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/inet/tcp/tcp_tpi.c	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /* This files contains all TCP TLI/TPI related functions */
@@ -47,7 +46,6 @@
 
 static void	tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *);
 static int	tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *);
-static void	tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *);
 
 void
 tcp_use_pure_tpi(tcp_t *tcp)
@@ -823,7 +821,7 @@
 	/* TODO: Default ETSDU is 1.  Is that correct for tcp? */
 }
 
-static void
+void
 tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap,
     t_uscalar_t cap_bits1)
 {
@@ -950,148 +948,6 @@
 }
 
 /*
- * tcp_fallback
- *
- * A direct socket is falling back to using STREAMS. The queue
- * that is being passed down was created using tcp_open() with
- * the SO_FALLBACK flag set. As a result, the queue is not
- * associated with a conn, and the q_ptrs instead contain the
- * dev and minor area that should be used.
- *
- * The 'issocket' flag indicates whether the FireEngine
- * optimizations should be used. The common case would be that
- * optimizations are enabled, and they might be subsequently
- * disabled using the _SIOCSOCKFALLBACK ioctl.
- */
-
-/*
- * An active connection is falling back to TPI. Gather all the information
- * required by the STREAM head and TPI sonode and send it up.
- */
-void
-tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
-    boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb)
-{
-	conn_t			*connp = tcp->tcp_connp;
-	struct stroptions	*stropt;
-	struct T_capability_ack tca;
-	struct sockaddr_in6	laddr, faddr;
-	socklen_t 		laddrlen, faddrlen;
-	short			opts;
-	int			error;
-	mblk_t			*mp;
-
-	connp->conn_dev = (dev_t)RD(q)->q_ptr;
-	connp->conn_minor_arena = WR(q)->q_ptr;
-
-	RD(q)->q_ptr = WR(q)->q_ptr = connp;
-
-	connp->conn_rq = RD(q);
-	connp->conn_wq = WR(q);
-
-	WR(q)->q_qinfo = &tcp_sock_winit;
-
-	if (!issocket)
-		tcp_use_pure_tpi(tcp);
-
-	/*
-	 * free the helper stream
-	 */
-	ip_free_helper_stream(connp);
-
-	/*
-	 * Notify the STREAM head about options
-	 */
-	DB_TYPE(stropt_mp) = M_SETOPTS;
-	stropt = (struct stroptions *)stropt_mp->b_rptr;
-	stropt_mp->b_wptr += sizeof (struct stroptions);
-	stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
-
-	stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
-	    tcp->tcp_tcps->tcps_wroff_xtra);
-	if (tcp->tcp_snd_sack_ok)
-		stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
-	stropt->so_hiwat = connp->conn_rcvbuf;
-	stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
-
-	putnext(RD(q), stropt_mp);
-
-	/*
-	 * Collect the information needed to sync with the sonode
-	 */
-	tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
-
-	laddrlen = faddrlen = sizeof (sin6_t);
-	(void) tcp_getsockname((sock_lower_handle_t)connp,
-	    (struct sockaddr *)&laddr, &laddrlen, CRED());
-	error = tcp_getpeername((sock_lower_handle_t)connp,
-	    (struct sockaddr *)&faddr, &faddrlen, CRED());
-	if (error != 0)
-		faddrlen = 0;
-
-	opts = 0;
-	if (connp->conn_oobinline)
-		opts |= SO_OOBINLINE;
-	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
-		opts |= SO_DONTROUTE;
-
-	/*
-	 * Notify the socket that the protocol is now quiescent,
-	 * and it's therefore safe move data from the socket
-	 * to the stream head.
-	 */
-	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
-	    (struct sockaddr *)&laddr, laddrlen,
-	    (struct sockaddr *)&faddr, faddrlen, opts);
-
-	while ((mp = tcp->tcp_rcv_list) != NULL) {
-		tcp->tcp_rcv_list = mp->b_next;
-		mp->b_next = NULL;
-		/* We never do fallback for kernel RPC */
-		putnext(q, mp);
-	}
-	tcp->tcp_rcv_last_head = NULL;
-	tcp->tcp_rcv_last_tail = NULL;
-	tcp->tcp_rcv_cnt = 0;
-}
-
-/*
- * An eager is falling back to TPI. All we have to do is send
- * up a T_CONN_IND.
- */
-void
-tcp_fallback_eager(tcp_t *eager, boolean_t direct_sockfs)
-{
-	tcp_t *listener = eager->tcp_listener;
-	mblk_t *mp = eager->tcp_conn.tcp_eager_conn_ind;
-
-	ASSERT(listener != NULL);
-	ASSERT(mp != NULL);
-
-	eager->tcp_conn.tcp_eager_conn_ind = NULL;
-
-	/*
-	 * TLI/XTI applications will get confused by
-	 * sending eager as an option since it violates
-	 * the option semantics. So remove the eager as
-	 * option since TLI/XTI app doesn't need it anyway.
-	 */
-	if (!direct_sockfs) {
-		struct T_conn_ind *conn_ind;
-
-		conn_ind = (struct T_conn_ind *)mp->b_rptr;
-		conn_ind->OPT_length = 0;
-		conn_ind->OPT_offset = 0;
-	}
-
-	/*
-	 * Sockfs guarantees that the listener will not be closed
-	 * during fallback. So we can safely use the listener's queue.
-	 */
-	putnext(listener->tcp_connp->conn_rq, mp);
-}
-
-/*
  * Swap information between the eager and acceptor for a TLI/XTI client.
  * The sockfs accept is done on the acceptor stream and control goes
  * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
@@ -1185,6 +1041,191 @@
 }
 
 /*
+ * This runs at the tail end of accept processing on the squeue of the
+ * new connection.
+ */
+/* ARGSUSED */
+static void
+tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
+{
+	conn_t			*connp = (conn_t *)arg;
+	tcp_t			*tcp = connp->conn_tcp;
+	queue_t			*q = connp->conn_rq;
+	tcp_stack_t		*tcps = tcp->tcp_tcps;
+	struct stroptions 	*stropt;
+	struct sock_proto_props sopp;
+
+	/* Should never be called for non-STREAMS sockets */
+	ASSERT(!IPCL_IS_NONSTR(connp));
+
+	/* We should just receive a single mblk that fits a T_discon_ind */
+	ASSERT(mp->b_cont == NULL);
+
+	/*
+	 * Drop the eager's ref on the listener, that was placed when
+	 * this eager began life in tcp_input_listener.
+	 */
+	CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
+
+	tcp->tcp_detached = B_FALSE;
+
+	if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) {
+		/*
+		 * Someone blewoff the eager before we could finish
+		 * the accept.
+		 *
+		 * The only reason eager exists it because we put in
+		 * a ref on it when conn ind went up. We need to send
+		 * a disconnect indication up while the last reference
+		 * on the eager will be dropped by the squeue when we
+		 * return.
+		 */
+		ASSERT(tcp->tcp_listener == NULL);
+		if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) {
+			struct	T_discon_ind	*tdi;
+
+			(void) putnextctl1(q, M_FLUSH, FLUSHRW);
+			/*
+			 * Let us reuse the incoming mblk to avoid
+			 * memory allocation failure problems. We know
+			 * that the size of the incoming mblk i.e.
+			 * stroptions is greater than sizeof
+			 * T_discon_ind.
+			 */
+			ASSERT(DB_REF(mp) == 1);
+			ASSERT(MBLKSIZE(mp) >=
+			    sizeof (struct T_discon_ind));
+
+			DB_TYPE(mp) = M_PROTO;
+			((union T_primitives *)mp->b_rptr)->type =
+			    T_DISCON_IND;
+			tdi = (struct T_discon_ind *)mp->b_rptr;
+			if (tcp->tcp_issocket) {
+				tdi->DISCON_reason = ECONNREFUSED;
+				tdi->SEQ_number = 0;
+			} else {
+				tdi->DISCON_reason = ENOPROTOOPT;
+				tdi->SEQ_number =
+				    tcp->tcp_conn_req_seqnum;
+			}
+			mp->b_wptr = mp->b_rptr +
+			    sizeof (struct T_discon_ind);
+			putnext(q, mp);
+		}
+		tcp->tcp_hard_binding = B_FALSE;
+		return;
+	}
+
+	/*
+	 * This is the first time we run on the correct
+	 * queue after tcp_accept. So fix all the q parameters
+	 * here.
+	 *
+	 * Let us reuse the incoming mblk to avoid
+	 * memory allocation failure problems. We know
+	 * that the size of the incoming mblk is at least
+	 * stroptions
+	 */
+	tcp_get_proto_props(tcp, &sopp);
+
+	ASSERT(DB_REF(mp) == 1);
+	ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions));
+
+	DB_TYPE(mp) = M_SETOPTS;
+	stropt = (struct stroptions *)mp->b_rptr;
+	mp->b_wptr = mp->b_rptr + sizeof (struct stroptions);
+	stropt = (struct stroptions *)mp->b_rptr;
+	ASSERT(sopp.sopp_flags & (SO_HIWAT|SO_WROFF|SO_MAXBLK));
+	stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
+	stropt->so_hiwat = sopp.sopp_rxhiwat;
+	stropt->so_wroff = sopp.sopp_wroff;
+	stropt->so_maxblk = sopp.sopp_maxblk;
+
+	if (sopp.sopp_flags & SOCKOPT_TAIL) {
+		ASSERT(tcp->tcp_kssl_ctx != NULL);
+
+		stropt->so_flags |= SO_TAIL | SO_COPYOPT;
+		stropt->so_tail = sopp.sopp_tail;
+		stropt->so_copyopt = sopp.sopp_zcopyflag;
+	}
+
+	/* Send the options up */
+	putnext(q, mp);
+
+	/*
+	 * Pass up any data and/or a fin that has been received.
+	 *
+	 * Adjust receive window in case it had decreased
+	 * (because there is data <=> tcp_rcv_list != NULL)
+	 * while the connection was detached. Note that
+	 * in case the eager was flow-controlled, w/o this
+	 * code, the rwnd may never open up again!
+	 */
+	if (tcp->tcp_rcv_list != NULL) {
+		/* We drain directly in case of fused tcp loopback */
+
+		if (!tcp->tcp_fused && canputnext(q)) {
+			tcp->tcp_rwnd = connp->conn_rcvbuf;
+			if (tcp->tcp_state >= TCPS_ESTABLISHED &&
+			    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
+				tcp_xmit_ctl(NULL,
+				    tcp, (tcp->tcp_swnd == 0) ?
+				    tcp->tcp_suna : tcp->tcp_snxt,
+				    tcp->tcp_rnxt, TH_ACK);
+			}
+		}
+
+		(void) tcp_rcv_drain(tcp);
+
+		/*
+		 * For fused tcp loopback, back-enable peer endpoint
+		 * if it's currently flow-controlled.
+		 */
+		if (tcp->tcp_fused) {
+			tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+			ASSERT(peer_tcp != NULL);
+			ASSERT(peer_tcp->tcp_fused);
+
+			mutex_enter(&peer_tcp->tcp_non_sq_lock);
+			if (peer_tcp->tcp_flow_stopped) {
+				tcp_clrqfull(peer_tcp);
+				TCP_STAT(tcps, tcp_fusion_backenabled);
+			}
+			mutex_exit(&peer_tcp->tcp_non_sq_lock);
+		}
+	}
+	ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
+	if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
+		tcp->tcp_ordrel_done = B_TRUE;
+		mp = tcp->tcp_ordrel_mp;
+		tcp->tcp_ordrel_mp = NULL;
+		putnext(q, mp);
+	}
+	tcp->tcp_hard_binding = B_FALSE;
+
+	if (connp->conn_keepalive) {
+		tcp->tcp_ka_last_intrvl = 0;
+		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
+		    tcp->tcp_ka_interval);
+	}
+
+	/*
+	 * At this point, eager is fully established and will
+	 * have the following references -
+	 *
+	 * 2 references for connection to exist (1 for TCP and 1 for IP).
+	 * 1 reference for the squeue which will be dropped by the squeue as
+	 *	soon as this function returns.
+	 * There will be 1 additional reference for being in classifier
+	 *	hash list provided something bad hasn't happened.
+	 */
+	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
+	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
+}
+
+
+/*
  * Reply to a clients T_CONN_RES TPI message. This function
  * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
  * on the acceptor STREAM and processed in tcp_accept_common().
@@ -1643,6 +1684,7 @@
 	tcp_t *listener;
 	struct T_ok_ack *ok;
 	t_scalar_t PRIM_type;
+	mblk_t *discon_mp;
 	conn_t *econnp;
 	cred_t *cr;
 
@@ -1703,14 +1745,120 @@
 		q->q_qinfo = &tcp_winit;
 		listener = eager->tcp_listener;
 
-		if (tcp_accept_common(listener->tcp_connp,
-		    econnp, cr) < 0) {
+		/*
+		 * Pre allocate the discon_ind mblk also. tcp_accept_finish will
+		 * use it if something failed.
+		 */
+		discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
+		    sizeof (struct stroptions)), BPRI_HI);
+
+		if (discon_mp == NULL) {
 			mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
 			if (mp != NULL)
 				putnext(rq, mp);
 			return;
 		}
 
+		eager->tcp_issocket = B_TRUE;
+
+		ASSERT(econnp->conn_netstack ==
+		    listener->tcp_connp->conn_netstack);
+		ASSERT(eager->tcp_tcps == listener->tcp_tcps);
+
+		/* Put the ref for IP */
+		CONN_INC_REF(econnp);
+
+		/*
+		 * We should have minimum of 3 references on the conn
+		 * at this point. One each for TCP and IP and one for
+		 * the T_conn_ind that was sent up when the 3-way handshake
+		 * completed. In the normal case we would also have another
+		 * reference (making a total of 4) for the conn being in the
+		 * classifier hash list. However the eager could have received
+		 * an RST subsequently and tcp_closei_local could have removed
+		 * the eager from the classifier hash list, hence we can't
+		 * assert that reference.
+		 */
+		ASSERT(econnp->conn_ref >= 3);
+
+		mutex_enter(&listener->tcp_eager_lock);
+		if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
+
+			tcp_t *tail;
+			tcp_t *tcp;
+			mblk_t *mp1;
+
+			tcp = listener->tcp_eager_prev_q0;
+			/*
+			 * listener->tcp_eager_prev_q0 points to the TAIL of the
+			 * deferred T_conn_ind queue. We need to get to the head
+			 * of the queue in order to send up T_conn_ind the same
+			 * order as how the 3WHS is completed.
+			 */
+			while (tcp != listener) {
+				if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 &&
+				    !tcp->tcp_kssl_pending)
+					break;
+				else
+					tcp = tcp->tcp_eager_prev_q0;
+			}
+			/* None of the pending eagers can be sent up now */
+			if (tcp == listener)
+				goto no_more_eagers;
+
+			mp1 = tcp->tcp_conn.tcp_eager_conn_ind;
+			tcp->tcp_conn.tcp_eager_conn_ind = NULL;
+			/* Move from q0 to q */
+			ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
+			listener->tcp_conn_req_cnt_q0--;
+			listener->tcp_conn_req_cnt_q++;
+			tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
+			    tcp->tcp_eager_prev_q0;
+			tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
+			    tcp->tcp_eager_next_q0;
+			tcp->tcp_eager_prev_q0 = NULL;
+			tcp->tcp_eager_next_q0 = NULL;
+			tcp->tcp_conn_def_q0 = B_FALSE;
+
+			/* Make sure the tcp isn't in the list of droppables */
+			ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
+			    tcp->tcp_eager_prev_drop_q0 == NULL);
+
+			/*
+			 * Insert at end of the queue because sockfs sends
+			 * down T_CONN_RES in chronological order. Leaving
+			 * the older conn indications at front of the queue
+			 * helps reducing search time.
+			 */
+			tail = listener->tcp_eager_last_q;
+			if (tail != NULL) {
+				tail->tcp_eager_next_q = tcp;
+			} else {
+				listener->tcp_eager_next_q = tcp;
+			}
+			listener->tcp_eager_last_q = tcp;
+			tcp->tcp_eager_next_q = NULL;
+
+			/* Need to get inside the listener perimeter */
+			CONN_INC_REF(listener->tcp_connp);
+			SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1,
+			    tcp_send_pending, listener->tcp_connp, NULL,
+			    SQ_FILL, SQTAG_TCP_SEND_PENDING);
+		}
+no_more_eagers:
+		tcp_eager_unlink(eager);
+		mutex_exit(&listener->tcp_eager_lock);
+
+		/*
+		 * At this point, the eager is detached from the listener
+		 * but we still have an extra refs on eager (apart from the
+		 * usual tcp references). The ref was placed in tcp_input_data
+		 * before sending the conn_ind in tcp_send_conn_ind.
+		 * The ref will be dropped in tcp_accept_finish().
+		 */
+		SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish,
+		    econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);
+
 		/*
 		 * Send the new local address also up to sockfs. There
 		 * should already be enough space in the mp that came
@@ -1761,50 +1909,6 @@
 }
 
 /*
- * Send the newconn notification to ulp. The eager is blown off if the
- * notification fails.
- */
-static void
-tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp)
-{
-	if (IPCL_IS_NONSTR(lconnp)) {
-		cred_t	*cr;
-		pid_t	cpid = NOPID;
-
-		ASSERT(econnp->conn_tcp->tcp_listener == lconnp->conn_tcp);
-		ASSERT(econnp->conn_tcp->tcp_saved_listener ==
-		    lconnp->conn_tcp);
-
-		cr = msg_getcred(mp, &cpid);
-
-		/* Keep the message around in case of a fallback to TPI */
-		econnp->conn_tcp->tcp_conn.tcp_eager_conn_ind = mp;
-		/*
-		 * Notify the ULP about the newconn. It is guaranteed that no
-		 * tcp_accept() call will be made for the eager if the
-		 * notification fails, so it's safe to blow it off in that
-		 * case.
-		 *
-		 * The upper handle will be assigned when tcp_accept() is
-		 * called.
-		 */
-		if ((*lconnp->conn_upcalls->su_newconn)
-		    (lconnp->conn_upper_handle,
-		    (sock_lower_handle_t)econnp,
-		    &sock_tcp_downcalls, cr, cpid,
-		    &econnp->conn_upcalls) == NULL) {
-			/* Failed to allocate a socket */
-			TCPS_BUMP_MIB(lconnp->conn_tcp->tcp_tcps,
-			    tcpEstabResets);
-			(void) tcp_eager_blowoff(lconnp->conn_tcp,
-			    econnp->conn_tcp->tcp_conn_req_seqnum);
-		}
-	} else {
-		putnext(lconnp->conn_rq, mp);
-	}
-}
-
-/*
  * The function called through squeue to get behind listener's perimeter to
  * send a deferred conn_ind.
  */
@@ -1831,7 +1935,7 @@
 		return;
 	}
 
-	tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp);
+	putnext(lconnp->conn_rq, mp);
 }
 
 /*
@@ -1989,5 +2093,5 @@
 	}
 	mutex_exit(&listener->tcp_eager_lock);
 	if (need_send_conn_ind)
-		tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp);
+		putnext(lconnp->conn_rq, mp);
 }
--- a/usr/src/uts/common/inet/tcp/tcpddi.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/inet/tcp/tcpddi.c	Thu Jun 17 17:22:09 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 /* Copyright (c) 1990 Mentat Inc. */
 
@@ -41,6 +40,8 @@
 #define	INET_SOCKDESC	"TCP socket module"
 #define	INET_SOCK_PROTO_CREATE_FUNC	(*tcp_create)
 #define	INET_SOCK_PROTO_FB_FUNC		(*tcp_fallback)
+#define	INET_SOCK_FALLBACK_DEV_V4	"/dev/tcp"
+#define	INET_SOCK_FALLBACK_DEV_V6	"/dev/tcp6"
 #define	INET_DEVMINOR	0
 #define	INET_MODMTFLAGS	D_MP
 #define	INET_DEVMTFLAGS	(D_MP|_D_DIRECT)
--- a/usr/src/uts/common/inet/tcp_impl.h	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/inet/tcp_impl.h	Thu Jun 17 17:22:09 2010 -0700
@@ -515,8 +515,6 @@
 /*
  * Functions in tcp.c.
  */
-extern int	tcp_accept_common(conn_t *, conn_t *, cred_t *);
-extern void	tcp_accept_finish(void *, mblk_t *, void *, ip_recv_attr_t *);
 extern void	tcp_acceptor_hash_insert(t_uscalar_t, tcp_t *);
 extern tcp_t	*tcp_acceptor_hash_lookup(t_uscalar_t, tcp_stack_t *);
 extern void	tcp_acceptor_hash_remove(tcp_t *);
@@ -565,6 +563,7 @@
 extern mblk_t	*tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t);
 extern boolean_t	tcp_zcopy_check(tcp_t *);
 extern void	tcp_zcopy_notify(tcp_t *);
+extern void	tcp_get_proto_props(tcp_t *, struct sock_proto_props *);
 
 /*
  * Bind related functions in tcp_bind.c
@@ -630,8 +629,9 @@
 /*
  * Kernel socket related functions in tcp_socket.c.
  */
-extern int tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
-    so_proto_quiesced_cb_t);
+extern int	tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
+		    so_proto_quiesced_cb_t, sock_quiesce_arg_t *);
+extern boolean_t tcp_newconn_notify(tcp_t *, ip_recv_attr_t *);
 
 /*
  * Timer related functions in tcp_timers.c.
@@ -657,9 +657,6 @@
 			    mblk_t **, ip_recv_attr_t *);
 extern void	tcp_err_ack(tcp_t *, mblk_t *, int, int);
 extern void	tcp_err_ack_prim(tcp_t *, mblk_t *, int, int, int);
-extern void	tcp_fallback_eager(tcp_t *, boolean_t);
-extern void	tcp_fallback_noneager(tcp_t *, mblk_t *, queue_t *,
-		    boolean_t, so_proto_quiesced_cb_t);
 extern void	tcp_info_req(tcp_t *, mblk_t *);
 extern void	tcp_send_conn_ind(void *, mblk_t *, void *);
 extern void	tcp_send_pending(void *, mblk_t *, void *, ip_recv_attr_t *);
@@ -674,6 +671,8 @@
 extern void	tcp_tpi_unbind(tcp_t *, mblk_t *);
 extern void	tcp_tli_accept(tcp_t *, mblk_t *);
 extern void	tcp_use_pure_tpi(tcp_t *);
+extern void	tcp_do_capability_ack(tcp_t *, struct T_capability_ack *,
+		    t_uscalar_t);
 
 /*
  * TCP option processing related functions in tcp_opt_data.c
--- a/usr/src/uts/common/inet/udp/udp.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/inet/udp/udp.c	Thu Jun 17 17:22:09 2010 -0700
@@ -6498,7 +6498,8 @@
 
 int
 udp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
-    boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb)
+    boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
+    sock_quiesce_arg_t *arg)
 {
 	conn_t 	*connp = (conn_t *)proto_handle;
 	udp_t	*udp;
@@ -6507,7 +6508,7 @@
 	socklen_t laddrlen, faddrlen;
 	short opts;
 	struct stroptions *stropt;
-	mblk_t *stropt_mp;
+	mblk_t *mp, *stropt_mp;
 	int error;
 
 	udp = connp->conn_udp;
@@ -6563,17 +6564,21 @@
 	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
 		opts |= SO_DONTROUTE;
 
-	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
+	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
 	    (struct sockaddr *)&laddr, laddrlen,
 	    (struct sockaddr *)&faddr, faddrlen, opts);
 
 	mutex_enter(&udp->udp_recv_lock);
 	/*
 	 * Attempts to send data up during fallback will result in it being
-	 * queued in udp_t. Now we push up any queued packets.
+	 * queued in udp_t. First push up the datagrams obtained from the
+	 * socket, then any packets queued in udp_t.
 	 */
+	if (mp != NULL) {
+		mp->b_next = udp->udp_fallback_queue_head;
+		udp->udp_fallback_queue_head = mp;
+	}
 	while (udp->udp_fallback_queue_head != NULL) {
-		mblk_t *mp;
 		mp = udp->udp_fallback_queue_head;
 		udp->udp_fallback_queue_head = mp->b_next;
 		mutex_exit(&udp->udp_recv_lock);
@@ -6598,7 +6603,7 @@
 
 /* ARGSUSED3 */
 int
-udp_getpeername(sock_lower_handle_t  proto_handle, struct sockaddr *sa,
+udp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
     socklen_t *salenp, cred_t *cr)
 {
 	conn_t	*connp = (conn_t *)proto_handle;
--- a/usr/src/uts/common/inet/udp/udpddi.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/inet/udp/udpddi.c	Thu Jun 17 17:22:09 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 /* Copyright (c) 1990 Mentat Inc. */
 
@@ -43,6 +42,8 @@
 #define	INET_SOCKDESC	"UDP socket module"
 #define	INET_SOCK_PROTO_CREATE_FUNC	(*udp_create)
 #define	INET_SOCK_PROTO_FB_FUNC		(*udp_fallback)
+#define	INET_SOCK_FALLBACK_DEV_V4	"/dev/udp"
+#define	INET_SOCK_FALLBACK_DEV_V6	"/dev/udp6"
 #define	INET_DEVMTFLAGS (D_MP|_D_DIRECT)
 
 #include "../inetddi.c"
--- a/usr/src/uts/common/inet/udp_impl.h	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/inet/udp_impl.h	Thu Jun 17 17:22:09 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_UDP_IMPL_H
@@ -227,7 +226,7 @@
 extern sock_lower_handle_t udp_create(int, int, int, sock_downcalls_t **,
     uint_t *, int *, int, cred_t *);
 extern int udp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
-    so_proto_quiesced_cb_t);
+    so_proto_quiesced_cb_t, sock_quiesce_arg_t *);
 
 extern sock_downcalls_t sock_udp_downcalls;
 
--- a/usr/src/uts/common/io/ksocket/ksocket.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/io/ksocket/ksocket.c	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/file.h>
@@ -166,7 +165,7 @@
 }
 
 int
-ksocket_connect(ksocket_t ks, const struct sockaddr *addr, socklen_t addrlen,
+ksocket_connect(ksocket_t ks, struct sockaddr *addr, socklen_t addrlen,
     struct cred *cr)
 {
 	/* All Solaris components should pass a cred for this operation. */
--- a/usr/src/uts/common/io/sock_conf.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/io/sock_conf.c	Thu Jun 17 17:22:09 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/sysmacros.h>
@@ -127,6 +126,10 @@
 		if (reg->__smod_priv != NULL) {
 			smodp->smod_proto_fallback_func =
 			    reg->__smod_priv->smodp_proto_fallback_func;
+			smodp->smod_fallback_devpath_v4 =
+			    reg->__smod_priv->smodp_fallback_devpath_v4;
+			smodp->smod_fallback_devpath_v6 =
+			    reg->__smod_priv->smodp_fallback_devpath_v6;
 		}
 	}
 	smod_add(smodp);
--- a/usr/src/uts/common/os/sysent.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/os/sysent.c	Thu Jun 17 17:22:09 2010 -0700
@@ -21,8 +21,7 @@
 
 /* ONC_PLUS EXTRACT START */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -731,7 +730,7 @@
 	/* 244 */ SYSENT_CI("getsockname",	getsockname,	4),
 	/* 245 */ SYSENT_CI("getsockopt",	getsockopt,	6),
 	/* 246 */ SYSENT_CI("setsockopt",	setsockopt,	6),
-	/* 247 */ SYSENT_CI("sockconfig",	sockconfig,	4),
+	/* 247 */ SYSENT_CI("sockconfig",	sockconfig,	5),
 	/* 248 */ SYSENT_CI("ntp_gettime",	ntp_gettime,	1),
 	/* 249 */ SYSENT_CI("ntp_adjtime",	ntp_adjtime,	1),
 	/* 250 */ SYSENT_CI("lwp_mutex_unlock",	lwp_mutex_unlock,	1),
@@ -1057,7 +1056,7 @@
 	/* 244 */ SYSENT_CI("getsockname",	getsockname,	4),
 	/* 245 */ SYSENT_CI("getsockopt",	getsockopt,	6),
 	/* 246 */ SYSENT_CI("setsockopt",	setsockopt,	6),
-	/* 247 */ SYSENT_CI("sockconfig",	sockconfig,	4),
+	/* 247 */ SYSENT_CI("sockconfig",	sockconfig,	5),
 	/* 248 */ SYSENT_CI("ntp_gettime",	ntp_gettime,	1),
 	/* 249 */ SYSENT_CI("ntp_adjtime",	ntp_adjtime,	1),
 	/* 250 */ SYSENT_CI("lwp_mutex_unlock",	lwp_mutex_unlock,	1),
--- a/usr/src/uts/common/sys/Makefile	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/sys/Makefile	Thu Jun 17 17:22:09 2010 -0700
@@ -506,6 +506,7 @@
 	socket_impl.h		\
 	socket_proto.h		\
 	socketvar.h		\
+	sockfilter.h		\
 	sockio.h		\
 	soundcard.h		\
 	squeue.h		\
--- a/usr/src/uts/common/sys/ksocket.h	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/sys/ksocket.h	Thu Jun 17 17:22:09 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_KSOCKET_H_
@@ -88,7 +87,7 @@
 extern int 	ksocket_listen(ksocket_t, int, struct cred *);
 extern int 	ksocket_accept(ksocket_t, struct sockaddr *, socklen_t *,
 		    ksocket_t *, struct cred *);
-extern int 	ksocket_connect(ksocket_t, const struct sockaddr *, socklen_t,
+extern int 	ksocket_connect(ksocket_t, struct sockaddr *, socklen_t,
 		    struct cred *);
 extern int 	ksocket_send(ksocket_t, void *, size_t, int, size_t *,
 		    struct cred *);
--- a/usr/src/uts/common/sys/socket.h	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/sys/socket.h	Thu Jun 17 17:22:09 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
@@ -185,6 +184,27 @@
 #define	SO_UNIX_CLOSE	0x2003		/* Internal: AF_UNIX peer closed */
 #endif	/* _KERNEL */
 
+/*
+ * Socket filter options
+ */
+#define	FIL_ATTACH	0x1		/* attach filter */
+#define	FIL_DETACH	0x2		/* detach filter */
+#define	FIL_LIST	0x3		/* list attached filters */
+
+#define	FILNAME_MAX	32
+/*
+ * Structure returned by FIL_LIST
+ */
+struct fil_info {
+	int	fi_flags;		/* see below (FILF_*) */
+	int	fi_pos;			/* position (0 is bottom) */
+	char	fi_name[FILNAME_MAX];	/* filter name */
+};
+
+#define	FILF_PROG	0x1		/* programmatic attach */
+#define	FILF_AUTO	0x2		/* automatic attach */
+#define	FILF_BYPASS	0x4		/* filter is not active */
+
 #ifdef	_KERNEL
 /*
  * new socket open flags to identify socket and acceptor streams
@@ -199,13 +219,6 @@
 #define	SOCKET_SLEEP	KM_SLEEP
 #define	SOCKET_NOSLEEP	KM_NOSLEEP
 
-
-/*
- * flags used by sockfs when falling back to tpi socket
- */
-#define	SO_FB_START	0x1
-#define	SO_FB_FINISH	0x2
-
 #endif	/* _KERNEL */
 
 /*
@@ -224,6 +237,7 @@
 #define	SOL_ROUTE	0xfffe		/* options for routing socket level */
 #endif
 #define	SOL_PACKET	0xfffd		/* options for packet level */
+#define	SOL_FILTER	0xfffc		/* options for socket filter level */
 
 /*
  * Address families.
--- a/usr/src/uts/common/sys/socket_proto.h	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/sys/socket_proto.h	Thu Jun 17 17:22:09 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_SOCKET_PROTO_H_
@@ -128,11 +127,15 @@
 typedef sock_lower_handle_t (*so_proto_create_func_t)(int, int, int,
     sock_downcalls_t **, uint_t *, int *, int, cred_t *);
 
-typedef void (*so_proto_quiesced_cb_t)(sock_upper_handle_t, queue_t *,
-    struct T_capability_ack *, struct sockaddr *, socklen_t,
-    struct sockaddr *, socklen_t, short);
+typedef struct sock_quiesce_arg {
+	mblk_t *soqa_exdata_mp;
+	mblk_t *soqa_urgmark_mp;
+} sock_quiesce_arg_t;
+typedef mblk_t *(*so_proto_quiesced_cb_t)(sock_upper_handle_t,
+    sock_quiesce_arg_t *, struct T_capability_ack *, struct sockaddr *,
+    socklen_t, struct sockaddr *, socklen_t, short);
 typedef int (*so_proto_fallback_func_t)(sock_lower_handle_t, queue_t *,
-    boolean_t, so_proto_quiesced_cb_t);
+    boolean_t, so_proto_quiesced_cb_t, sock_quiesce_arg_t *);
 
 /*
  * These functions return EOPNOTSUPP and are intended for the sockfs
@@ -196,6 +199,7 @@
 	void	(*su_signal_oob)(sock_upper_handle_t, ssize_t);
 	void	(*su_zcopy_notify)(sock_upper_handle_t);
 	void	(*su_set_error)(sock_upper_handle_t, int);
+	void	(*su_closed)(sock_upper_handle_t);
 };
 
 #define	SOCK_UC_VERSION		sizeof (sock_upcalls_t)
--- a/usr/src/uts/common/sys/socketvar.h	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/sys/socketvar.h	Thu Jun 17 17:22:09 2010 -0700
@@ -162,12 +162,13 @@
 
 	/* Accept queue */
 	kmutex_t	so_acceptq_lock;	/* protects accept queue */
-	struct sonode	*so_acceptq_next;	/* acceptq list node */
-	struct sonode 	*so_acceptq_head;
-	struct sonode	**so_acceptq_tail;
-	unsigned int	so_acceptq_len;
+	list_t		so_acceptq_list;	/* pending conns */
+	list_t		so_acceptq_defer;	/* deferred conns */
+	list_node_t	so_acceptq_node;	/* acceptq list node */
+	unsigned int	so_acceptq_len;		/* # of conns (both lists) */
 	unsigned int	so_backlog;		/* Listen backlog */
 	kcondvar_t	so_acceptq_cv;		/* wait for new conn. */
+	struct sonode	*so_listener;		/* parent socket */
 
 	/* Options */
 	short	so_options;		/* From socket call, see socket.h */
@@ -233,6 +234,13 @@
 
 	/* != NULL for sodirect enabled socket */
 	struct sodirect_s	*so_direct;
+
+	/* socket filters */
+	uint_t			so_filter_active;	/* # of active fil */
+	uint_t			so_filter_tx;		/* pending tx ops */
+	struct sof_instance	*so_filter_top;		/* top of stack */
+	struct sof_instance	*so_filter_bottom;	/* bottom of stack */
+	clock_t			so_filter_defertime;	/* time when deferred */
 };
 
 #define	SO_HAVE_DATA(so)						\
@@ -288,10 +296,10 @@
 #define	SS_HADOOBDATA		0x00008000 /* OOB data consumed */
 #define	SS_CLOSING		0x00010000 /* in process of closing */
 
-/*	unused			0x00020000 */	/* was SS_FADDR_NOXLATE */
-/*	unused			0x00040000 */	/* was SS_HASDATA */
-/*	unused 			0x00080000 */	/* was SS_DONEREAD */
-/*	unused 			0x00100000 */	/* was SS_MOREDATA */
+#define	SS_FIL_DEFER		0x00020000 /* filter deferred notification */
+#define	SS_FILOP_OK		0x00040000 /* socket can attach filters */
+#define	SS_FIL_RCV_FLOWCTRL	0x00080000 /* filter asserted rcv flow ctrl */
+#define	SS_FIL_SND_FLOWCTRL	0x00100000 /* filter asserted snd flow ctrl */
 /*	unused 			0x00200000 */	/* was SS_DIRECT */
 
 #define	SS_SODIRECT		0x00400000 /* transport supports sodirect */
@@ -312,19 +320,27 @@
  * Sockets that can fall back to TPI must ensure that fall back is not
  * initiated while a thread is using a socket.
  */
-#define	SO_BLOCK_FALLBACK(so, fn) {			\
-	ASSERT(MUTEX_NOT_HELD(&(so)->so_lock));		\
-	rw_enter(&(so)->so_fallback_rwlock, RW_READER);	\
-	if ((so)->so_state & SS_FALLBACK_COMP) {	\
-		rw_exit(&(so)->so_fallback_rwlock);	\
-		return (fn);				\
-	}						\
-}
+#define	SO_BLOCK_FALLBACK(so, fn)				\
+	ASSERT(MUTEX_NOT_HELD(&(so)->so_lock));			\
+	rw_enter(&(so)->so_fallback_rwlock, RW_READER);		\
+	if ((so)->so_state & (SS_FALLBACK_COMP|SS_FILOP_OK)) {	\
+		if ((so)->so_state & SS_FALLBACK_COMP) {	\
+			rw_exit(&(so)->so_fallback_rwlock);	\
+			return (fn);				\
+		} else {					\
+			mutex_enter(&(so)->so_lock);		\
+			(so)->so_state &= ~SS_FILOP_OK;		\
+			mutex_exit(&(so)->so_lock);		\
+		}						\
+	}
 
 #define	SO_UNBLOCK_FALLBACK(so)	{			\
 	rw_exit(&(so)->so_fallback_rwlock);		\
 }
 
+#define	SO_SND_FLOWCTRLD(so)	\
+	((so)->so_snd_qfull || (so)->so_state & SS_FIL_SND_FLOWCTRL)
+
 /* Poll events */
 #define	SO_POLLEV_IN		0x1	/* POLLIN wakeup needed */
 #define	SO_POLLEV_ALWAYS	0x2	/* wakeups */
@@ -375,7 +391,9 @@
 	vnode_t	*sd_vnode;
 } sdev_info_t;
 
-#define	SOCKMOD_VERSION		1
+#define	SOCKMOD_VERSION_1	1
+#define	SOCKMOD_VERSION		2
+
 /* name of the TPI pseudo socket module */
 #define	SOTPI_SMOD_NAME		"socktpi"
 
@@ -383,6 +401,8 @@
 	so_create_func_t	smodp_sock_create_func;
 	so_destroy_func_t	smodp_sock_destroy_func;
 	so_proto_fallback_func_t smodp_proto_fallback_func;
+	const char		*smodp_fallback_devpath_v4;
+	const char		*smodp_fallback_devpath_v6;
 } __smod_priv_t;
 
 /*
@@ -410,6 +430,8 @@
 	size_t		smod_dc_version;	/* down call version */
 	so_proto_create_func_t	smod_proto_create_func;
 	so_proto_fallback_func_t smod_proto_fallback_func;
+	const char		*smod_fallback_devpath_v4;
+	const char		*smod_fallback_devpath_v6;
 	so_create_func_t	smod_sock_create_func;
 	so_destroy_func_t	smod_sock_destroy_func;
 	list_node_t	smod_node;
@@ -448,12 +470,22 @@
 
 	/*
 	 * The entries below are only modified while holding
-	 * splist_lock as a writer.
+	 * sockconf_lock as a writer.
 	 */
 	int		sp_flags;	/* see below */
 	list_node_t	sp_node;
+
+	list_t		sp_auto_filters; /* list of automatic filters */
+	list_t		sp_prog_filters; /* list of programmatic filters */
 };
 
+struct sof_entry;
+
+typedef struct sp_filter {
+	struct sof_entry *spf_filter;
+	list_node_t	spf_node;
+} sp_filter_t;
+
 
 /*
  * sockparams flags
@@ -467,6 +499,14 @@
     const char *, int, int *);
 extern void sockparams_ephemeral_drop_last_ref(struct sockparams *);
 
+extern struct sockparams *sockparams_create(int, int, int, char *, char *, int,
+    int, int, int *);
+extern void 	sockparams_destroy(struct sockparams *);
+extern int 	sockparams_add(struct sockparams *);
+extern int	sockparams_delete(int, int, int);
+extern int	sockparams_new_filter(struct sof_entry *);
+extern void	sockparams_filter_cleanup(struct sof_entry *);
+
 extern void smod_init(void);
 extern void smod_add(smod_info_t *);
 extern int smod_register(const smod_reg_t *);
@@ -614,7 +654,7 @@
 	int	(*sop_bind)(struct sonode *, struct sockaddr *, socklen_t,
 		    int, cred_t *);
 	int	(*sop_listen)(struct sonode *, int, cred_t *);
-	int	(*sop_connect)(struct sonode *, const struct sockaddr *,
+	int	(*sop_connect)(struct sonode *, struct sockaddr *,
 		    socklen_t, int, int, cred_t *);
 	int	(*sop_recvmsg)(struct sonode *, struct msghdr *,
 		    struct uio *, cred_t *);
@@ -833,6 +873,8 @@
 
 extern dev_t				sockdev;
 
+extern krwlock_t			sockconf_lock;
+
 /*
  * sockfs functions
  */
@@ -842,7 +884,6 @@
 			uchar_t, int, int);
 extern int	sogetvp(char *, vnode_t **, int);
 extern int	sockinit(int, char *);
-extern int	soconfig(int, int, int,	char *, int, char *);
 extern int	solookup(int, int, int, struct sockparams **);
 extern void	so_lock_single(struct sonode *);
 extern void	so_unlock_single(struct sonode *, int);
@@ -885,7 +926,7 @@
 extern int	sobind(struct sonode *, struct sockaddr *, socklen_t,
 		    int, int);
 extern int	solisten(struct sonode *, int);
-extern int	soconnect(struct sonode *, const struct sockaddr *, socklen_t,
+extern int	soconnect(struct sonode *, struct sockaddr *, socklen_t,
 		    int, int);
 extern int	sorecvmsg(struct sonode *, struct nmsghdr *, struct uio *);
 extern int	sosendmsg(struct sonode *, struct nmsghdr *, struct uio *);
@@ -927,6 +968,70 @@
 	zoneid_t	si_szoneid;
 };
 
+/*
+ * Subcodes for sockconf() system call
+ */
+#define	SOCKCONFIG_ADD_SOCK		0
+#define	SOCKCONFIG_REMOVE_SOCK		1
+#define	SOCKCONFIG_ADD_FILTER		2
+#define	SOCKCONFIG_REMOVE_FILTER	3
+
+/*
+ * Data structures for configuring socket filters.
+ */
+
+/*
+ * Placement hint for automatic filters
+ */
+typedef enum {
+	SOF_HINT_NONE,
+	SOF_HINT_TOP,
+	SOF_HINT_BOTTOM,
+	SOF_HINT_BEFORE,
+	SOF_HINT_AFTER
+} sof_hint_t;
+
+/*
+ * Socket tuple. Used by sockconfig_filter_props to list socket
+ * types of interest.
+ */
+typedef struct sof_socktuple {
+	int	sofst_family;
+	int	sofst_type;
+	int	sofst_protocol;
+} sof_socktuple_t;
+
+/*
+ * Socket filter properties used by sockconfig() system call.
+ */
+struct sockconfig_filter_props {
+	char		*sfp_modname;
+	boolean_t	sfp_autoattach;
+	sof_hint_t	sfp_hint;
+	char		*sfp_hintarg;
+	uint_t		sfp_socktuple_cnt;
+	sof_socktuple_t	*sfp_socktuple;
+};
+
+#ifdef	_SYSCALL32
+
+typedef struct sof_socktuple32 {
+	int32_t	sofst_family;
+	int32_t	sofst_type;
+	int32_t	sofst_protocol;
+} sof_socktuple32_t;
+
+struct sockconfig_filter_props32 {
+	caddr32_t	sfp_modname;
+	boolean_t	sfp_autoattach;
+	sof_hint_t	sfp_hint;
+	caddr32_t	sfp_hintarg;
+	uint32_t	sfp_socktuple_cnt;
+	caddr32_t	sfp_socktuple;
+};
+
+#endif	/* _SYSCALL32 */
+
 #define	SOCKMOD_PATH	"socketmod"	/* dir where sockmods are stored */
 
 #ifdef	__cplusplus
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/sys/sockfilter.h	Thu Jun 17 17:22:09 2010 -0700
@@ -0,0 +1,151 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	_SYS_SOCKFILTER_H
+#define	_SYS_SOCKFILTER_H
+
+#include <sys/cred.h>
+#include <sys/errno.h>
+#include <sys/socket.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Opaque socket filter handle
+ */
+typedef struct __sof_handle	*sof_handle_t;
+
+/*
+ * Return values for callback functions.
+ *
+ * A - Attach (passive/active) only
+ * P - Passive attach only
+ */
+typedef enum {
+	SOF_RVAL_DEFER = -3,		/* defer notification (P) */
+	SOF_RVAL_DETACH = -2,		/* detach filter, continue proc. (A) */
+	SOF_RVAL_CONTINUE = -1,		/* continue processing */
+	SOF_RVAL_RETURN = 0,		/* stop proc, does not return error */
+	SOF_RVAL_EINVAL = EINVAL,	/* stop proc., returns error */
+	SOF_RVAL_EACCES = EACCES,	/* stop proc., returns error */
+	SOF_RVAL_ENOMEM = ENOMEM,	/* stop proc., returns error */
+	SOF_RVAL_ECONNABORTED = ECONNABORTED /* stop proc, returns error */
+} sof_rval_t;
+
+/*
+ * Events generated by the sofop_notify callback.
+ */
+typedef enum {				/* socket ... */
+	SOF_EV_CLOSING,			/* ... is closing */
+	SOF_EV_CONNECTED,		/* ... is connected */
+	SOF_EV_CONNECTFAILED,		/* ... failed to connect */
+	SOF_EV_DISCONNECTED,		/* ... was disconnected */
+	SOF_EV_CANTRECVMORE,		/* ... cannot receive any more data */
+	SOF_EV_CANTSENDMORE,		/* ... cannot send any more data */
+	SOF_EV_INJECT_DATA_IN_OK,	/* ... has cleared rcv flow ctrl */
+	SOF_EV_INJECT_DATA_OUT_OK,	/* ... has cleared snd flow ctrl */
+} sof_event_t;
+
+/* Filter callbacks */
+typedef sof_rval_t 	(*sof_attach_active_fn_t)(sof_handle_t, int, int, int,
+    cred_t *, void **);
+typedef sof_rval_t 	(*sof_attach_passive_fn_t)(sof_handle_t, sof_handle_t,
+    void *, struct sockaddr *, socklen_t, struct sockaddr *, socklen_t,
+    void **);
+typedef void 		(*sof_detach_fn_t)(sof_handle_t, void *, cred_t *);
+typedef mblk_t 		*(*sof_data_in_fn_t)(sof_handle_t, void *, mblk_t *,
+    int, size_t *);
+typedef mblk_t		*(*sof_data_in_proc_fn_t)(sof_handle_t, void *,
+    mblk_t *, cred_t *, size_t *);
+typedef mblk_t		*(*sof_data_out_fn_t)(sof_handle_t, void *, mblk_t *,
+    struct nmsghdr *, cred_t *, sof_rval_t *);
+typedef sof_rval_t	(*sof_bind_fn_t)(sof_handle_t, void *,
+    struct sockaddr *, socklen_t *, cred_t *);
+typedef sof_rval_t	(*sof_listen_fn_t)(sof_handle_t, void *, int *,
+    cred_t *);
+typedef sof_rval_t	(*sof_accept_fn_t)(sof_handle_t, void *, cred_t *);
+typedef sof_rval_t	(*sof_connect_fn_t)(sof_handle_t, void *,
+    struct sockaddr *, socklen_t *, cred_t *);
+typedef sof_rval_t	(*sof_shutdown_fn_t)(sof_handle_t, void *, int *,
+    cred_t *);
+typedef sof_rval_t	(*sof_getsockname_fn_t)(sof_handle_t, void *,
+    struct sockaddr *, socklen_t *, cred_t *);
+typedef sof_rval_t	(*sof_getpeername_fn_t)(sof_handle_t, void *,
+    struct sockaddr *, socklen_t *, cred_t *);
+typedef sof_rval_t 		(*sof_setsockopt_fn_t)(sof_handle_t, void *,
+    int, int, void *, socklen_t *, cred_t *);
+typedef sof_rval_t	(*sof_getsockopt_fn_t)(sof_handle_t, void *,
+    int, int, void *, socklen_t *, cred_t *);
+typedef sof_rval_t	(*sof_ioctl_fn_t)(sof_handle_t, void *, int, intptr_t,
+    int, int32_t *, cred_t *);
+typedef void		(*sof_mblk_prop_fn_t)(sof_handle_t, void *, ssize_t *,
+    ushort_t *, ushort_t *);
+typedef void		(*sof_notify_fn_t)(sof_handle_t, void *, sof_event_t,
+    uintptr_t);
+
+typedef struct sof_ops {
+	sof_attach_active_fn_t	sofop_attach_active;
+	sof_attach_passive_fn_t	sofop_attach_passive;
+	sof_detach_fn_t		sofop_detach;
+	sof_data_in_fn_t	sofop_data_in;
+	sof_data_in_proc_fn_t	sofop_data_in_proc;
+	sof_data_out_fn_t	sofop_data_out;
+	sof_bind_fn_t		sofop_bind;
+	sof_listen_fn_t		sofop_listen;
+	sof_connect_fn_t	sofop_connect;
+	sof_accept_fn_t		sofop_accept;
+	sof_shutdown_fn_t	sofop_shutdown;
+	sof_getsockname_fn_t	sofop_getsockname;
+	sof_getpeername_fn_t	sofop_getpeername;
+	sof_setsockopt_fn_t	sofop_setsockopt;
+	sof_getsockopt_fn_t	sofop_getsockopt;
+	sof_ioctl_fn_t		sofop_ioctl;
+	sof_mblk_prop_fn_t	sofop_mblk_prop;
+	sof_notify_fn_t		sofop_notify;
+} sof_ops_t;
+
+#define	SOF_VERSION	1
+
+extern int	sof_register(int, const char *, const sof_ops_t *, int);
+extern int	sof_unregister(const char *);
+
+extern void	sof_newconn_ready(sof_handle_t);
+extern void	sof_bypass(sof_handle_t);
+extern void	*sof_get_cookie(sof_handle_t);
+extern void 	*sof_cas_cookie(sof_handle_t, void *, void *);
+extern int	sof_inject_data_out(sof_handle_t, mblk_t *, struct nmsghdr *,
+    boolean_t *);
+extern int	sof_inject_data_in(sof_handle_t, mblk_t *, size_t, int,
+    boolean_t *);
+extern void 	sof_rcv_flowctrl(sof_handle_t, boolean_t);
+extern void 	sof_snd_flowctrl(sof_handle_t, boolean_t);
+extern boolean_t sof_newconn_move(sof_handle_t, sof_handle_t);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_SOCKFILTER_H */
--- a/usr/src/uts/common/syscall/sendfile.c	Thu Jun 17 16:29:23 2010 -0700
+++ b/usr/src/uts/common/syscall/sendfile.c	Thu Jun 17 17:22:09 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -781,8 +780,16 @@
 					size_t iov_len;
 
 					iov_len = sfv_len;
-					if (!SOCK_IS_NONSTR(so) &&
-					    SOTOTPI(so)->sti_kssl_ctx != NULL)
+					/*
+					 * Socket filters can limit the mblk
+					 * size, so limit reads to maxblk if
+					 * there are filters present.
+					 */
+					if ((!SOCK_IS_NONSTR(so) &&
+					    _SOTOTPI(so)->sti_kssl_ctx
+					    != NULL) ||
+					    (so->so_filter_active > 0 &&
+					    maxblk != INFPSZ))
 						iov_len = MIN(iov_len, maxblk);
 
 					aiov.iov_len = iov_len;
@@ -928,13 +935,16 @@
 
 				copyflag = stp != NULL ? stp->sd_copyflag :
 				    so->so_proto_props.sopp_zcopyflag;
+
 				/*
-				 * For sockets acting as an SSL proxy, we
-				 * need to adjust the size to the maximum
-				 * SSL record size set in the stream head.
+				 * Socket filters can limit the mblk size,
+				 * so limit reads to maxblk if there are
+				 * filters present.
 				 */
-				if (!SOCK_IS_NONSTR(so) &&
-				    _SOTOTPI(so)->sti_kssl_ctx != NULL)
+				if ((!SOCK_IS_NONSTR(so) &&
+				    _SOTOTPI(so)->sti_kssl_ctx != NULL) ||
+				    (so->so_filter_active > 0 &&
+				    maxblk != INFPSZ))
 					size = MIN(size, maxblk);
 
 				if (vn_has_flocks(readvp) ||