PSARC 2005/082 Yosemite: UDP Performance Enhancement
author	masputra
date	Sat, 22 Oct 2005 22:50:14 -0700
changeset 741 40027a3621ac
parent 740 70e4862c9a1a
child 742 588610e3e562
PSARC 2005/082 Yosemite: UDP Performance Enhancement
4796051 Solaris needs a more complete HW checksumming support
4905227 duplicate macros in ipclassifier.h and ip.h
4915681 need hardware checksum offload for the case of IP/UDP reassembly
6201076 outbound flow-control dysfunctional, ip to ce using mdt
6223331 ipv6 flow control may corrupt UDP packets
6223809 16-bit aligned IP header should be allowed for all x86 platforms
6275398 Galaxy hangs when running lmbench
6281836 Yosemite project integration into Solaris
6281885 xge needs to support IPv6 checksum offload
6282776 IPv6 NCE fast path is not created for incoming solicitation
6304890 IP transmit-side checksum logic needs to be tightened
6304902 IP6_IN_NOCKSUM is obsolete and should be torched
6304904 UDP should reject TI_GETPEERNAME for non-connected endpoint
6306768 IP and UDP device and module definitions need to be centralized
usr/src/cmd/cmd-inet/usr.bin/pppd/sys-solaris.c
usr/src/cmd/cmd-inet/usr.lib/ncaconfd/ncaconfd.c
usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
usr/src/cmd/mdb/common/modules/genunix/genunix.c
usr/src/cmd/mdb/common/modules/genunix/net.c
usr/src/cmd/mdb/common/modules/genunix/net.h
usr/src/cmd/rcm_daemon/common/ip_rcm.c
usr/src/pkgdefs/etc/exception_list_i386
usr/src/pkgdefs/etc/exception_list_sparc
usr/src/tools/scripts/bfu.sh
usr/src/uts/common/Makefile.files
usr/src/uts/common/fs/sockfs/sockstr.c
usr/src/uts/common/fs/sockfs/socktpi.c
usr/src/uts/common/fs/sockfs/sockvnops.c
usr/src/uts/common/inet/Makefile
usr/src/uts/common/inet/arp/arp.c
usr/src/uts/common/inet/common.h
usr/src/uts/common/inet/inet_common.c
usr/src/uts/common/inet/ip.h
usr/src/uts/common/inet/ip/igmp.c
usr/src/uts/common/inet/ip/ip.c
usr/src/uts/common/inet/ip/ip6.c
usr/src/uts/common/inet/ip/ip_if.c
usr/src/uts/common/inet/ip/ip_multi.c
usr/src/uts/common/inet/ip/ip_ndp.c
usr/src/uts/common/inet/ip/ipclassifier.c
usr/src/uts/common/inet/ip/tun.c
usr/src/uts/common/inet/ip6.h
usr/src/uts/common/inet/ip_impl.h
usr/src/uts/common/inet/ipclassifier.h
usr/src/uts/common/inet/ipp_common.h
usr/src/uts/common/inet/led.h
usr/src/uts/common/inet/optcom.c
usr/src/uts/common/inet/optcom.h
usr/src/uts/common/inet/snmpcom.c
usr/src/uts/common/inet/squeue.c
usr/src/uts/common/inet/tcp.h
usr/src/uts/common/inet/tcp/tcp.c
usr/src/uts/common/inet/tcp/tcp6ddi.c
usr/src/uts/common/inet/tcp/tcp_fusion.c
usr/src/uts/common/inet/tcp/tcpddi.c
usr/src/uts/common/inet/tcp_impl.h
usr/src/uts/common/inet/udp/udp.c
usr/src/uts/common/inet/udp/udp6ddi.c
usr/src/uts/common/inet/udp/udpddi.c
usr/src/uts/common/inet/udp_impl.h
usr/src/uts/common/io/gld.c
usr/src/uts/common/io/stream.c
usr/src/uts/common/io/strsun.c
usr/src/uts/common/os/streamio.c
usr/src/uts/common/os/strsubr.c
usr/src/uts/common/sys/conf.h
usr/src/uts/common/sys/dlpi.h
usr/src/uts/common/sys/gld.h
usr/src/uts/common/sys/multidata.h
usr/src/uts/common/sys/multidata_impl.h
usr/src/uts/common/sys/socketvar.h
usr/src/uts/common/sys/sockio.h
usr/src/uts/common/sys/stream.h
usr/src/uts/common/sys/strsubr.h
usr/src/uts/common/syscall/sendfile.c
usr/src/uts/intel/ia32/ml/modstubs.s
usr/src/uts/sparc/ml/modstubs.s
--- a/usr/src/cmd/cmd-inet/usr.bin/pppd/sys-solaris.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/cmd/cmd-inet/usr.bin/pppd/sys-solaris.c	Sat Oct 22 22:50:14 2005 -0700
@@ -76,6 +76,7 @@
 #include <netinet/in.h>
 #include <sys/tihdr.h>
 #include <inet/mib2.h>
+#include <inet/ip.h>
 #include <sys/ethernet.h>
 #include <sys/ser_sync.h>
 
@@ -92,27 +93,6 @@
 static const char rcsid[] = RCSID;
 #endif
 
-/* Need to use UDP for ifconfig compatibility */
-#if !defined(UDP_DEV_NAME)
-#define	UDP_DEV_NAME		"/dev/udp"
-#endif /* UDP_DEV_NAME */
-
-#if !defined(IP_DEV_NAME)
-#define	IP_DEV_NAME		"/dev/ip"
-#endif /* IP_DEV_NAME */
-
-#if !defined(UDP6_DEV_NAME)
-#define	UDP6_DEV_NAME		"/dev/udp6"
-#endif /* UDP6_DEV_NAME */
-
-#if !defined(IP6_DEV_NAME)
-#define	IP6_DEV_NAME		"/dev/ip6"
-#endif /* IP6_DEV_NAME */
-
-#if !defined(IP_MOD_NAME)
-#define	IP_MOD_NAME		"ip"
-#endif /* IP_MOD_NAME */
-
 #define	PPPSTRTIMOUT	1	/* Timeout in seconds for ioctl */
 #define	MAX_POLLFDS	32
 #define	NMODULES	32
--- a/usr/src/cmd/cmd-inet/usr.lib/ncaconfd/ncaconfd.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/cmd/cmd-inet/usr.lib/ncaconfd/ncaconfd.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -55,18 +55,6 @@
 #include "ncaconf.h"
 
 /* NCA does not support IPv6... */
-#ifndef	IP_DEV_NAME
-#define	IP_DEV_NAME	"/dev/ip"
-#endif
-
-#ifndef	IP_MOD_NAME
-#define	IP_MOD_NAME	"ip"
-#endif
-
-#ifndef	UDP_DEV_NAME
-#define	UDP_DEV_NAME	"/dev/udp"
-#endif
-
 #ifndef	NCA_MOD_NAME
 #define	NCA_MOD_NAME	"nca"
 #endif
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c	Sat Oct 22 22:50:14 2005 -0700
@@ -18,6 +18,8 @@
 #include <sys/dlpi.h>
 #include <libdlpi.h>
 
+#include <inet/ip.h>
+
 #define	LOOPBACK_IF	"lo0"
 
 #define	NONE_STR	"none"
@@ -26,26 +28,6 @@
 #define	ARP_MOD_NAME	"arp"
 #endif
 
-#ifndef	IP_DEV_NAME
-#define	IP_DEV_NAME	"/dev/ip"
-#endif
-
-#ifndef	IP_MOD_NAME
-#define	IP_MOD_NAME	"ip"
-#endif
-
-#ifndef	IP6_DEV_NAME
-#define	IP6_DEV_NAME	"/dev/ip6"
-#endif
-
-#ifndef	UDP_DEV_NAME
-#define	UDP_DEV_NAME	"/dev/udp"
-#endif
-
-#ifndef	UDP6_DEV_NAME
-#define	UDP6_DEV_NAME	"/dev/udp6"
-#endif
-
 #define	ADDRBITS_V4	32	/* number of bits in IPv4 address */
 #define	ADDRBITS_V6	128	/* number of bits in IPv6 address */
 
--- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c	Sat Oct 22 22:50:14 2005 -0700
@@ -3883,9 +3883,6 @@
 		mi_walk_init, mi_walk_step, mi_walk_fini, NULL },
 	{ "sonode", "given a sonode, walk its children",
 		sonode_walk_init, sonode_walk_step, sonode_walk_fini, NULL },
-	{ "udp", "walk UDP connections using MI",
-		mi_payload_walk_init, mi_payload_walk_step,
-		mi_payload_walk_fini, &mi_udp_arg },
 
 	/* from nvpair.c */
 	{ NVPAIR_WALKER_NAME, NVPAIR_WALKER_DESCR,
--- a/usr/src/cmd/mdb/common/modules/genunix/net.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/cmd/mdb/common/modules/genunix/net.c	Sat Oct 22 22:50:14 2005 -0700
@@ -107,7 +107,8 @@
 static int
 net_udp_active(const udp_t *udp)
 {
-	return ((udp->udp_state != TS_UNBND) && (udp->udp_state != TS_IDLE));
+	return ((udp->udp_state == TS_IDLE) ||
+	    (udp->udp_state == TS_DATA_XFER));
 }
 
 static int
@@ -355,11 +356,6 @@
 	delete_mi_payload_walk_data(wsp->walk_data, arg->mi_pwa_size);
 }
 
-const mi_payload_walk_arg_t mi_udp_arg = {
-	"udp", "udp_g_head", sizeof (udp_t),
-	MI_PAYLOAD_DEVICE | MI_PAYLOAD_MODULE
-};
-
 const mi_payload_walk_arg_t mi_ar_arg = {
 	"arp", "ar_g_head", sizeof (ar_t),
 	MI_PAYLOAD_DEVICE | MI_PAYLOAD_MODULE
@@ -595,7 +591,7 @@
 	tcp = (tcp_t *)((uintptr_t)connp + (tcp_kaddr - kaddr));
 
 	if ((uintptr_t)tcp < (uintptr_t)connp ||
-	    (uintptr_t)&tcp->tcp_connp > (uintptr_t)connp + itc_size ||
+	    (uintptr_t)(tcp + 1) > (uintptr_t)connp + itc_size ||
 	    (uintptr_t)tcp->tcp_connp != kaddr) {
 		mdb_warn("conn_tcp %p is invalid", tcp_kaddr);
 		return (WALK_NEXT);
@@ -603,7 +599,7 @@
 	connp->conn_tcp = tcp;
 	tcp->tcp_connp = connp;
 
-	if (!(opts & NETSTAT_ALL || net_tcp_active(tcp)) ||
+	if (!((opts & NETSTAT_ALL) || net_tcp_active(tcp)) ||
 	    (af == AF_INET && !net_tcp_ipv4(tcp)) ||
 	    (af == AF_INET6 && !net_tcp_ipv6(tcp))) {
 		return (WALK_NEXT);
@@ -639,45 +635,57 @@
 	return (netstat_tcp_cb(kaddr, walk_data, cb_data, AF_INET6));
 }
 
+/*ARGSUSED*/
 static int
-netstat_udpv4_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
+netstat_udp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data, int af)
 {
-	const udp_t *udp = walk_data;
 	const uintptr_t opts = (uintptr_t)cb_data;
+	udp_t udp;
+	conn_t connp;
+
+	if (mdb_vread(&udp, sizeof (udp_t), kaddr) == -1) {
+		mdb_warn("failed to read udp at %p", kaddr);
+		return (WALK_ERR);
+	}
 
-	if (!((opts & NETSTAT_ALL || net_udp_active(udp)) && net_udp_ipv4(udp)))
-		return (WALK_NEXT);
+	if (mdb_vread(&connp, sizeof (conn_t),
+	    (uintptr_t)udp.udp_connp) == -1) {
+		mdb_warn("failed to read udp_connp at %p",
+		    (uintptr_t)udp.udp_connp);
+		return (WALK_ERR);
+	}
 
-	mdb_printf("%0?p %2i ", kaddr, udp->udp_state);
-	net_ipv4addrport_pr(&udp->udp_v6src, udp->udp_port);
-	mdb_printf(" ");
-	net_ipv4addrport_pr(&udp->udp_v6dst, udp->udp_dstport);
-	mdb_printf(" %4i\n", udp->udp_zoneid);
+	if (!((opts & NETSTAT_ALL) || net_udp_active(&udp)) ||
+	    (af == AF_INET && !net_udp_ipv4(&udp)) ||
+	    (af == AF_INET6 && !net_udp_ipv6(&udp))) {
+		return (WALK_NEXT);
+	}
+
+	mdb_printf("%0?p %2i ", kaddr, udp.udp_state);
+	if (af == AF_INET) {
+		net_ipv4addrport_pr(&udp.udp_v6src, udp.udp_port);
+		mdb_printf(" ");
+		net_ipv4addrport_pr(&udp.udp_v6dst, udp.udp_dstport);
+	} else if (af == AF_INET6) {
+		net_ipv6addrport_pr(&udp.udp_v6src, udp.udp_port);
+		mdb_printf(" ");
+		net_ipv6addrport_pr(&udp.udp_v6dst, udp.udp_dstport);
+	}
+	mdb_printf(" %4i\n", connp.conn_zoneid);
 
 	return (WALK_NEXT);
 }
 
 static int
+netstat_udpv4_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
+{
+	return (netstat_udp_cb(kaddr, walk_data, cb_data, AF_INET));
+}
+
+static int
 netstat_udpv6_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
 {
-	const udp_t *udp = walk_data;
-	const uintptr_t opts = (uintptr_t)cb_data;
-
-	if (!((opts & NETSTAT_ALL || net_udp_active(udp)) && net_udp_ipv6(udp)))
-		return (WALK_NEXT);
-
-	mdb_printf("%0?p %2i ", kaddr, udp->udp_state);
-	net_ipv6addrport_pr(&udp->udp_v6src, udp->udp_port);
-	mdb_printf(" ");
-
-	/* Remote */
-	if (udp->udp_state == TS_DATA_XFER)
-		net_ipv6addrport_pr(&udp->udp_v6dst, udp->udp_dstport);
-	else
-		mdb_printf("%*s.0    ", ADDR_V6_WIDTH, "0:0:0:0:0:0:0:0");
-	mdb_printf(" %4i\n", udp->udp_zoneid);
-
-	return (WALK_NEXT);
+	return (netstat_udp_cb(kaddr, walk_data, cb_data, AF_INET6));
 }
 
 /*
@@ -855,7 +863,7 @@
 			    "UDPv4", ADDR_V4_WIDTH, "Local Address",
 			    ADDR_V4_WIDTH, "Remote Address", "Zone");
 
-			if (mdb_walk("genunix`udp", netstat_udpv4_cb,
+			if (mdb_walk("udp_cache", netstat_udpv4_cb,
 			    (void *)(uintptr_t)opts) == -1) {
 				mdb_warn("failed to walk genunix`udp");
 				return (DCMD_ERR);
@@ -870,12 +878,11 @@
 			    "UDPv6", ADDR_V6_WIDTH, "Local Address",
 			    ADDR_V6_WIDTH, "Remote Address", "Zone");
 
-			if (mdb_walk("genunix`udp", netstat_udpv6_cb,
+			if (mdb_walk("udp_cache", netstat_udpv6_cb,
 			    (void *)(uintptr_t)opts) == -1) {
 				mdb_warn("failed to walk genunix`udp");
 				return (DCMD_ERR);
 			}
-
 		}
 	}
 
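
With the MI-based "udp" walker gone (see the genunix.c and net.h hunks),
UDP endpoints are enumerated through the udp_cache kmem cache, which is
what the mdb_walk("udp_cache", ...) calls above rely on.  A hedged usage
sketch, assuming mdb's automatically generated walker for that kmem
cache:

    # echo '::walk udp_cache | ::print udp_t udp_state udp_port' | mdb -k
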
--- a/usr/src/cmd/mdb/common/modules/genunix/net.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/cmd/mdb/common/modules/genunix/net.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2000, 2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -33,7 +33,6 @@
 extern "C" {
 #endif
 
-extern struct mi_payload_walk_arg_s mi_udp_arg;
 extern struct mi_payload_walk_arg_s mi_ar_arg;
 extern struct mi_payload_walk_arg_s mi_icmp_arg;
 extern struct mi_payload_walk_arg_s mi_ill_arg;
--- a/usr/src/cmd/rcm_daemon/common/ip_rcm.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/cmd/rcm_daemon/common/ip_rcm.c	Sat Oct 22 22:50:14 2005 -0700
@@ -54,6 +54,7 @@
 #include <libdevinfo.h>
 #include <sys/systeminfo.h>
 #include <netdb.h>
+#include <inet/ip.h>
 
 #include <ipmp_mpathd.h>
 #include "rcm_module.h"
@@ -70,12 +71,7 @@
 /* Some generic well-knowns and defaults used in this module */
 #define	SLASH_DEV		"/dev"		/* /dev directory */
 
-#define	IP_DEV_NAME		"/dev/ip"	/* IPV4 ip device */
-#define	IP6_DEV_NAME		"/dev/ip6"	/* IPV6 ip device */
-#define	IP_MOD_NAME		"ip"		/* ip module */
 #define	ARP_MOD_NAME		"arp"		/* arp module */
-#define	UDP_DEV_NAME		"/dev/udp"	/* IPv4 udp device */
-#define	UDP6_DEV_NAME		"/dev/udp6"	/* IPv6 udp device */
 #define	IP_MAX_MODS		9		/* max modules pushed on intr */
 #define	MAX_RECONFIG_SIZE	1024		/* Max. reconfig string size */
 
--- a/usr/src/pkgdefs/etc/exception_list_i386	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/pkgdefs/etc/exception_list_i386	Sat Oct 22 22:50:14 2005 -0700
@@ -347,6 +347,8 @@
 usr/include/inet/arp_impl.h	i386
 usr/include/inet/rawip_impl.h	i386
 usr/include/inet/udp_impl.h	i386
+usr/include/inet/tcp_impl.h	i386
+usr/include/inet/ip_impl.h	i386
 usr/include/inet/ip_ndp.h	i386
 usr/include/inet/ipdrop.h	i386
 usr/include/inet/tun.h		i386
--- a/usr/src/pkgdefs/etc/exception_list_sparc	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/pkgdefs/etc/exception_list_sparc	Sat Oct 22 22:50:14 2005 -0700
@@ -336,6 +336,8 @@
 usr/include/inet/arp_impl.h	sparc
 usr/include/inet/rawip_impl.h	sparc
 usr/include/inet/udp_impl.h	sparc
+usr/include/inet/tcp_impl.h	sparc
+usr/include/inet/ip_impl.h	sparc
 usr/include/inet/ip_ndp.h	sparc
 usr/include/inet/ipdrop.h	sparc
 usr/include/inet/tun.h		sparc
--- a/usr/src/tools/scripts/bfu.sh	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/tools/scripts/bfu.sh	Sat Oct 22 22:50:14 2005 -0700
@@ -2002,11 +2002,10 @@
 fi
 
 update_script="/ws/onnv-gate/public/bin/update_ce"
-if [ "$plat" = "SUNW,Sun-Fire-15000" ] && ifconfig -a | egrep '^ce' \
-	    >/dev/null 2>/dev/null; then
-	# Sun Fire 12K/15K/20K/25K requires CE version 1.146 or later.
+if ifconfig -a | egrep '^ce' >/dev/null 2>/dev/null; then
+	# CE version 1.148 or later is required
 	cever=`modinfo | grep 'CE Ethernet' | sed 's/.*v1\.//' | tr -d ')' | \
-	    nawk '{ if ($1 < 146) print "BAD"; else print $1 }'`
+	    nawk '{ if ($1 < 148) print "BAD"; else print $1 }'`
 	if [ "$cever" = "BAD" ]; then
 		fail "You must run $update_script to upgrade your ce driver."
 	fi
--- a/usr/src/uts/common/Makefile.files	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/Makefile.files	Sat Oct 22 22:50:14 2005 -0700
@@ -416,13 +416,9 @@
 
 RTS_OBJS +=	rtsddi.o rts.o rts_opt_data.o
 
-IP_OBJS +=	igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \
-		ip_cksum.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \
-		ip_multi.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \
-		ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \
-		spd.o ipclassifier.o inet_common.o ip_squeue.o tcp.o \
-		tcp_trace.o tcp_opt_data.o tcp_sack.o squeue.o ip_sadb.o \
-		sctp_crc32.o sctp.o sctp_opt_data.o sctp_output.o \
+IP_TCP_OBJS =	tcp.o tcp_trace.o tcp_opt_data.o tcp_sack.o tcp_fusion.o
+IP_UDP_OBJS =	udp.o udp_opt_data.o
+IP_SCTP_OBJS =	sctp_crc32.o sctp.o sctp_opt_data.o sctp_output.o \
 		sctp_init.o sctp_input.o sctp_cookie.o \
 		sctp_conn.o sctp_error.o sctp_snmp.o \
 		sctp_param.o sctp_shutdown.o sctp_common.o \
@@ -430,6 +426,16 @@
 		sctp_ioc.o sctp_bind.o sctp_notify.o sctp_asconf.o \
 		sctp_addr.o
 
+IP_OBJS +=	igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \
+		ip_cksum.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \
+		ip_multi.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \
+		ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \
+		spd.o ipclassifier.o inet_common.o ip_squeue.o squeue.o \
+		ip_sadb.o \
+		$(IP_TCP_OBJS) \
+		$(IP_UDP_OBJS) \
+		$(IP_SCTP_OBJS)
+
 IP6_OBJS +=	ip6ddi.o
 
 KEYSOCK_OBJS +=	keysockddi.o keysock.o keysock_opt_data.o
@@ -467,7 +473,7 @@
 
 6TO4TUN_OBJS +=	6to4tun.o
 
-UDP_OBJS +=	udpddi.o udp.o udp_opt_data.o
+UDP_OBJS +=	udpddi.o
 
 UDP6_OBJS +=	udp6ddi.o
 
--- a/usr/src/uts/common/fs/sockfs/sockstr.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/fs/sockfs/sockstr.c	Sat Oct 22 22:50:14 2005 -0700
@@ -137,21 +137,23 @@
 
 	ASSERT(so->so_version != SOV_STREAM);
 
-	/* tell the transport below that sockmod is being popped */
-	if ((so->so_state & SS_TCP_FAST_ACCEPT) != 0) {
-		int	rval;
-		mblk_t	**mpp;
+	if (so->so_state & SS_DIRECT) {
+		mblk_t **mpp;
+		int rval;
 
+		/*
+		 * Tell the transport below that sockmod is being popped
+		 */
 		mutex_exit(&so->so_lock);
-		error = strioctl(vp, SIOCPOPSOCKFS, NULL, 0, K_TO_K, CRED(),
+		error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K, CRED(),
 		    &rval);
 		mutex_enter(&so->so_lock);
 		if (error != 0) {
-			dprintso(so, 0,
-			    ("so_sock2stream(%p): SIOCPOPSOCKFS failed\n", so));
+			dprintso(so, 0, ("so_sock2stream(%p): "
+			    "_SIOCSOCKFALLBACK failed\n", so));
 			goto exit;
 		}
-		so->so_state &= ~SS_TCP_FAST_ACCEPT;
+		so->so_state &= ~SS_DIRECT;
 
 		for (mpp = &so->so_conn_ind_head; (mp = *mpp) != NULL;
 		    mpp = &mp->b_next) {
@@ -412,7 +414,7 @@
 
 		/* the following do_tcapability may update so->so_mode */
 		if ((tso->so_serv_type != T_CLTS) &&
-		    ((so->so_state & SS_TCP_FAST_ACCEPT) == 0)) {
+		    !(so->so_state & SS_DIRECT)) {
 			error = do_tcapability(so, TC1_ACCEPTOR_ID);
 			if (error)
 				return (error);
--- a/usr/src/uts/common/fs/sockfs/socktpi.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/fs/sockfs/socktpi.c	Sat Oct 22 22:50:14 2005 -0700
@@ -57,6 +57,7 @@
 
 #include <sys/socket.h>
 #include <sys/socketvar.h>
+#include <sys/sockio.h>
 #include <netinet/in.h>
 #include <sys/un.h>
 #include <sys/strsun.h>
@@ -72,6 +73,7 @@
 #include <inet/ip.h>
 #include <inet/ip6.h>
 #include <inet/tcp.h>
+#include <inet/udp_impl.h>
 
 #include <fs/sockfs/nl7c.h>
 #include <sys/zone.h>
@@ -185,6 +187,10 @@
 		    struct uio *);
 static int	sotpi_shutdown(struct sonode *, int);
 static int	sotpi_getsockname(struct sonode *);
+static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
+		    struct uio *, void *, t_uscalar_t, int);
+static int	sodgram_direct(struct sonode *, struct sockaddr *,
+		    socklen_t, struct uio *, int);
 
 sonodeops_t sotpi_sonodeops = {
 	sotpi_accept,		/* sop_accept		*/
@@ -222,16 +228,40 @@
 	so = VTOSO(vp);
 
 	flags = FREAD|FWRITE;
-	if (tso != NULL) {
-		if ((tso->so_state & (SS_TCP_FAST_ACCEPT)) != 0) {
-			flags |= SO_ACCEPTOR|SO_SOCKSTR;
-			so->so_state |= SS_TCP_FAST_ACCEPT;
-		}
-	} else {
-		if ((so->so_type == SOCK_STREAM) &&
-		    (so->so_family == AF_INET || so->so_family == AF_INET6)) {
-			flags |= SO_SOCKSTR;
-			so->so_state |= SS_TCP_FAST_ACCEPT;
+
+	if ((type == SOCK_STREAM || type == SOCK_DGRAM) &&
+	    (domain == AF_INET || domain == AF_INET6) &&
+	    (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP ||
+	    protocol == IPPROTO_IP)) {
+		/* Tell tcp or udp that it's talking to sockets */
+		flags |= SO_SOCKSTR;
+
+		/*
+		 * Here we indicate to socktpi_open() our attempt to
+		 * make direct calls between sockfs and transport.
+		 * The final decision is left to socktpi_open().
+		 */
+		so->so_state |= SS_DIRECT;
+
+		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
+		if (so->so_type == SOCK_STREAM && tso != NULL) {
+			if (tso->so_state & SS_DIRECT) {
+				/*
+				 * Inherit SS_DIRECT from listener and pass
+				 * SO_ACCEPTOR open flag to tcp, indicating
+				 * that this is an accept fast-path instance.
+				 */
+				flags |= SO_ACCEPTOR;
+			} else {
+				/*
+				 * SS_DIRECT is not set on listener, meaning
+				 * that the listener has been converted from
+				 * a socket to a stream.  Ensure that the
+				 * acceptor inherits these settings.
+				 */
+				so->so_state &= ~SS_DIRECT;
+				flags &= ~SO_SOCKSTR;
+			}
 		}
 	}
 
@@ -1052,7 +1082,7 @@
 }
 
 /* bind the socket */
-int
+static int
 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
     int flags)
 {
@@ -1372,7 +1402,7 @@
 	case AF_INET:
 	case AF_INET6:
 		if ((optlen == sizeof (intptr_t)) &&
-		    ((so->so_state & SS_TCP_FAST_ACCEPT) != 0)) {
+		    ((so->so_state & SS_DIRECT) != 0)) {
 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
 			    &opt, conn_ind->OPT_length);
 		} else {
@@ -1385,7 +1415,19 @@
 			 * problems when sockfs sends a normal T_CONN_RES
 			 * message down the new stream.
 			 */
-			so->so_state &= ~SS_TCP_FAST_ACCEPT;
+			if (so->so_state & SS_DIRECT) {
+				int rval;
+				/*
+				 * For consistency we inform tcp to disable
+				 * direct interface on the listener, though
+				 * we can certainly live without doing this
+				 * because no data will ever travel upstream
+				 * on the listening socket.
+				 */
+				so->so_state &= ~SS_DIRECT;
+				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
+				    0, 0, K_TO_K, CRED(), &rval);
+			}
 			opt = NULL;
 			optlen = 0;
 		}
@@ -1554,9 +1596,10 @@
 	if (nso->so_options & SO_LINGER)
 		nso->so_linger = so->so_linger;
 
-	if ((so->so_state & SS_TCP_FAST_ACCEPT) != 0) {
+	if ((so->so_state & SS_DIRECT) != 0) {
 		mblk_t *ack_mp;
 
+		ASSERT(nso->so_state & SS_DIRECT);
 		ASSERT(opt != NULL);
 
 		conn_res->OPT_length = optlen;
@@ -3308,13 +3351,8 @@
  * Assumes caller has verified that SS_ISBOUND etc. are set.
  */
 static int
-sosend_dgramcmsg(struct sonode *so,
-		struct sockaddr *name,
-		t_uscalar_t namelen,
-		struct uio *uiop,
-		void *control,
-		t_uscalar_t controllen,
-		int flags)
+sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
+    struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
 {
 	struct T_unitdata_req	tudr;
 	mblk_t			*mp;
@@ -3636,11 +3674,8 @@
  * name and the source address is passed as an option.
  */
 int
-sosend_dgram(struct sonode	*so,
-		struct sockaddr	*name,
-		socklen_t	namelen,
-		struct uio	*uiop,
-		int		flags)
+sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
+    struct uio *uiop, int flags)
 {
 	struct T_unitdata_req	tudr;
 	mblk_t			*mp;
@@ -3651,7 +3686,7 @@
 	socklen_t		srclen;
 	ssize_t			len;
 
-	ASSERT(name && namelen);
+	ASSERT(name != NULL && namelen != 0);
 
 	len = uiop->uio_resid;
 	if (len > so->so_tidu_size) {
@@ -3659,14 +3694,14 @@
 		goto done;
 	}
 
-	/*
-	 * Length and family checks.
-	 */
+	/* Length and family checks */
 	error = so_addr_verify(so, name, namelen);
-	if (error) {
-		eprintsoline(so, error);
+	if (error != 0)
 		goto done;
-	}
+
+	if (so->so_state & SS_DIRECT)
+		return (sodgram_direct(so, name, namelen, uiop, flags));
+
 	if (so->so_family == AF_UNIX) {
 		if (so->so_state & SS_FADDR_NOXLATE) {
 			/*
@@ -4061,8 +4096,7 @@
 	if (msg->msg_controllen != 0) {
 		if (!(so_mode & SM_CONNREQUIRED)) {
 			error = sosend_dgramcmsg(so, name, namelen, uiop,
-				msg->msg_control, msg->msg_controllen,
-				flags);
+			    msg->msg_control, msg->msg_controllen, flags);
 		} else {
 			if (flags & MSG_OOB) {
 				/* Can't generate T_EXDATA_REQ with options */
@@ -4080,7 +4114,7 @@
 	if (!(so_mode & SM_CONNREQUIRED)) {
 		/*
 		 * If there is no SO_DONTROUTE to turn off return immediately
-		 * from sosend_dgram. This can allow tail-call optimizations.
+		 * from send_dgram. This can allow tail-call optimizations.
 		 */
 		if (!dontroute) {
 			return (sosend_dgram(so, name, namelen, uiop, flags));
@@ -4104,13 +4138,16 @@
 
 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
 				/*
-				 * If there is no SO_DONTROUTE to turn off
-				 * return immediately from strwrite. This can
-				 * allow tail-call optimizations.
+				 * If there is no SO_DONTROUTE to turn off,
+				 * SS_DIRECT is on, and there is no flow
+				 * control, we can take the fast path.
 				 */
-				if (!dontroute)
-					return (strwrite(SOTOV(so), uiop,
-							CRED()));
+				if (!dontroute &&
+				    (so_state & SS_DIRECT) &&
+				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
+					return (sostream_direct(so, uiop,
+					    NULL, CRED()));
+				}
 				error = strwrite(SOTOV(so), uiop, CRED());
 				goto done;
 			}
@@ -4140,6 +4177,206 @@
 }
 
 /*
+ * Sending data on a datagram socket.
+ * Assumes caller has verified that SS_ISBOUND etc. are set.
+ */
+/* ARGSUSED */
+static int
+sodgram_direct(struct sonode *so, struct sockaddr *name,
+    socklen_t namelen, struct uio *uiop, int flags)
+{
+	struct T_unitdata_req	tudr;
+	mblk_t			*mp;
+	int			error = 0;
+	void			*addr;
+	socklen_t		addrlen;
+	ssize_t			len;
+	struct stdata		*stp = SOTOV(so)->v_stream;
+	int			so_state;
+	queue_t			*udp_wq;
+
+	ASSERT(name != NULL && namelen != 0);
+	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
+	ASSERT(!(so->so_mode & SM_EXDATA));
+	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
+	ASSERT(SOTOV(so)->v_type == VSOCK);
+
+	/* Caller checked for proper length */
+	len = uiop->uio_resid;
+	ASSERT(len <= so->so_tidu_size);
+
+	/* Length and family checks have been done by caller */
+	ASSERT(name->sa_family == so->so_family);
+	ASSERT(so->so_family == AF_INET ||
+	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
+	ASSERT(so->so_family == AF_INET6 ||
+	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
+
+	addr = name;
+	addrlen = namelen;
+
+	if (stp->sd_sidp != NULL &&
+	    (error = straccess(stp, JCWRITE)) != 0)
+		goto done;
+
+	so_state = so->so_state;
+
+	/*
+	 * For UDP we don't break up the copyin into smaller pieces
+	 * as in the TCP case.  That means if ENOMEM is returned by
+	 * mcopyinuio() then the uio vector has not been modified at
+	 * all and we fallback to either strwrite() or kstrputmsg()
+	 * below.  Note also that we never generate priority messages
+	 * from here.
+	 */
+	udp_wq = stp->sd_wrq->q_next;
+	if (canput(udp_wq) &&
+	    (mp = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
+		ASSERT(DB_TYPE(mp) == M_DATA);
+		ASSERT(uiop->uio_resid == 0);
+#ifdef C2_AUDIT
+		if (audit_active)
+			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
+#endif /* C2_AUDIT */
+		udp_wput_data(udp_wq, mp, addr, addrlen);
+		return (0);
+	}
+	if (error != 0 && error != ENOMEM)
+		return (error);
+
+	/*
+	 * For connected, let strwrite() handle the blocking case.
+	 * Otherwise we fall thru and use kstrputmsg().
+	 */
+	if (so_state & SS_ISCONNECTED)
+		return (strwrite(SOTOV(so), uiop, CRED()));
+
+	tudr.PRIM_type = T_UNITDATA_REQ;
+	tudr.DEST_length = addrlen;
+	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
+	tudr.OPT_length = 0;
+	tudr.OPT_offset = 0;
+
+	mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0, _ALLOC_INTR);
+	if (mp == NULL) {
+		/*
+		 * Caught a signal waiting for memory.
+		 * Let send* return EINTR.
+		 */
+		error = EINTR;
+		goto done;
+	}
+
+#ifdef C2_AUDIT
+	if (audit_active)
+		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
+#endif /* C2_AUDIT */
+
+	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
+done:
+#ifdef SOCK_DEBUG
+	if (error != 0) {
+		eprintsoline(so, error);
+	}
+#endif /* SOCK_DEBUG */
+	return (error);
+}
+
+int
+sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
+{
+	struct stdata *stp = SOTOV(so)->v_stream;
+	ssize_t iosize, rmax, maxblk;
+	queue_t *tcp_wq = stp->sd_wrq->q_next;
+	int error = 0, wflag = 0;
+
+	ASSERT(so->so_mode & SM_BYTESTREAM);
+	ASSERT(SOTOV(so)->v_type == VSOCK);
+
+	if (stp->sd_sidp != NULL &&
+	    (error = straccess(stp, JCWRITE)) != 0)
+		return (error);
+
+	if (uiop == NULL) {
+		/*
+		 * kstrwritemp() should have checked sd_flag and
+		 * flow-control before coming here.  If we end up
+		 * here it means that we can simply pass down the
+		 * data to tcp.
+		 */
+		ASSERT(mp != NULL);
+		tcp_wput(tcp_wq, mp);
+		return (0);
+	}
+
+	/* Fallback to strwrite() to do proper error handling */
+	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
+		return (strwrite(SOTOV(so), uiop, cr));
+
+	rmax = stp->sd_qn_maxpsz;
+	ASSERT(rmax >= 0 || rmax == INFPSZ);
+	if (rmax == 0 || uiop->uio_resid <= 0)
+		return (0);
+
+	if (rmax == INFPSZ)
+		rmax = uiop->uio_resid;
+
+	maxblk = stp->sd_maxblk;
+
+	for (;;) {
+		iosize = MIN(uiop->uio_resid, rmax);
+
+		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
+		if (mp == NULL) {
+			/*
+			 * Fallback to strwrite() for ENOMEM; if this
+			 * is our first time in this routine and the uio
+			 * vector has not been modified, we will end up
+			 * calling strwrite() without any flag set.
+			 */
+			if (error == ENOMEM)
+				goto slow_send;
+			else
+				return (error);
+		}
+		ASSERT(uiop->uio_resid >= 0);
+		/*
+		 * If mp is non-NULL and ENOMEM is set, it means that
+		 * mcopyinuio() was able to break down some of the user
+		 * data into one or more mblks.  Send the partial data
+		 * to tcp and let the rest be handled in strwrite().
+		 */
+		ASSERT(error == 0 || error == ENOMEM);
+		tcp_wput(tcp_wq, mp);
+
+		wflag |= NOINTR;
+
+		if (uiop->uio_resid == 0) {	/* No more data; we're done */
+			ASSERT(error == 0);
+			break;
+		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
+		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
+slow_send:
+			/*
+			 * We were able to send down partial data using
+			 * the direct call interface, but are now relying
+			 * on strwrite() to handle the non-fastpath cases.
+			 * If the socket is blocking we will sleep in
+			 * strwaitq() until write is permitted; otherwise,
+			 * we will need to return the number of bytes
+			 * written so far back to the app.  This is why
+			 * we pass the NOINTR flag to strwrite() for a
+			 * non-blocking socket: we don't want to return
+			 * EAGAIN when a portion of the user data has
+			 * already been sent down.
+			 */
+			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
+		}
+	}
+	return (0);
+}
+
+/*
  * Update so_faddr by asking the transport (unless AF_UNIX).
  */
 int
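
The send paths added above (sodgram_direct() and sostream_direct())
share one shape: while STREAMS flow control permits, build fully-formed
mblks and hand them straight to the transport's put routine, and fall
back to strwrite()/kstrputmsg() when allocation or flow control gets in
the way.  A minimal user-level model of that control flow follows;
every identifier in it is an illustrative stand-in, not a kernel
interface:

    #include <stdio.h>
    #include <string.h>

    static int budget = 3;	/* chunks the mock "queue" will accept */

    /* stand-in for canput(): does the transport queue have room? */
    static int can_put(void) { return (budget-- > 0); }

    /* stand-in for tcp_wput()/udp_wput_data(): the direct call */
    static void transport_put(const char *p, size_t n)
    {
            printf("direct: %zu bytes: %.*s\n", n, (int)n, p);
    }

    /* stand-in for strwrite_common(): the slow, fully general path */
    static int slow_send(const char *p, size_t n)
    {
            printf("fallback: %zu bytes via slow path\n", n);
            return (0);
    }

    int main(void)
    {
            const char *data = "0123456789abcdef";
            size_t resid = strlen(data), off = 0, chunk = 4;

            while (resid > 0) {
                    size_t n = resid < chunk ? resid : chunk;

                    if (!can_put())         /* flow controlled: bail */
                            return (slow_send(data + off, resid));
                    transport_put(data + off, n);
                    off += n;
                    resid -= n;
            }
            return (0);
    }

The point of the NOINTR bookkeeping in the real code is visible even in
this toy: once any chunk has gone down the fast path, the fallback must
not report EAGAIN for bytes that were already sent.
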
--- a/usr/src/uts/common/fs/sockfs/sockvnops.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/fs/sockfs/sockvnops.c	Sat Oct 22 22:50:14 2005 -0700
@@ -53,6 +53,7 @@
 #include <sys/stropts.h>
 #include <sys/stream.h>
 #include <sys/strsubr.h>
+#include <sys/strsun.h>
 #include <sys/suntpi.h>
 #include <sys/ioctl.h>
 #include <sys/sockio.h>
@@ -87,6 +88,9 @@
 
 #include <fs/sockfs/nl7c.h>
 
+#include <inet/udp_impl.h>
+#include <inet/tcp_impl.h>
+
 static int socktpi_close(struct vnode *, int, int, offset_t, struct cred *);
 static int socktpi_read(struct vnode *, struct uio *, int, struct cred *,
 	struct caller_context *);
@@ -140,6 +144,15 @@
 };
 
 /*
+ * Do direct function calls to the transport layer below; this also
+ * allows the transport to utilize the read-side synchronous stream
+ * interface if necessary.  This is an /etc/system tunable that must
+ * not be modified on a running system.  By default this is enabled
+ * for performance reasons and may be disabled for debugging purposes.
+ */
+boolean_t socktpi_direct = B_TRUE;
+
+/*
  * Open routine used by socket() call. Note that vn_open checks for
  * VSOCK and fails the open (and VOP_OPEN is fs_nosys). The VSOCK check is
  * needed since VSOCK type vnodes exist in various underlying filesystems as
@@ -205,6 +218,56 @@
 
 		ASSERT(stp->sd_wrq != NULL);
 		so->so_provinfo = tpi_findprov(stp->sd_wrq);
+
+		/*
+		 * If the caller wants the direct function call interface
+		 * to/from the transport module, probe the module directly
+		 * beneath the stream head to see if it qualifies.
+		 *
+		 * We turn off direct interface when qualifications fail;
+		 * note that we do these checks for everything other than
+		 * the tcp acceptor case, because the acceptor inherits
+		 * the capabilities of the listener and we've already done
+		 * the checks against the listening socket.
+		 */
+		if (!(flag & SO_ACCEPTOR) && (so->so_state & SS_DIRECT)) {
+			queue_t *tq = stp->sd_wrq->q_next;
+
+			/*
+			 * SS_DIRECT is currently supported and tested
+			 * only for tcp/udp; this is the main reason to
+			 * have the following assertions.
+			 */
+			ASSERT(so->so_family == AF_INET ||
+			    so->so_family == AF_INET6);
+			ASSERT(so->so_protocol == IPPROTO_UDP ||
+			    so->so_protocol == IPPROTO_TCP ||
+			    so->so_protocol == IPPROTO_IP);
+			ASSERT(so->so_type == SOCK_DGRAM ||
+			    so->so_type == SOCK_STREAM);
+
+			/*
+			 * Abort direct call interface if the module directly
+			 * underneath the stream head is not defined with the
+			 * _D_DIRECT flag.  This could happen in the tcp or
+			 * udp case, when some other module is autopushed
+			 * above it, or for some reason the expected module
+			 * isn't purely D_MP (which is the main requirement).
+			 */
+			if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
+			    !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
+				int rval;
+
+				/* Continue on without direct calls */
+				so->so_state &= ~SS_DIRECT;
+				if ((error = strioctl(vp, _SIOCSOCKFALLBACK,
+				    0, 0, K_TO_K, CRED(), &rval)) != 0) {
+					(void) socktpi_close(vp, flag, 1,
+					    (offset_t)0, cr);
+					return (error);
+				}
+			}
+		}
 	} else {
 		/*
 		 * While the same socket can not be reopened (unlike specfs)
@@ -436,6 +499,11 @@
 			/* Give NL7C some data */
 			nl7c_data(so, uiop);
 		}
+
+		if ((so_state & SS_DIRECT) &&
+		    canputnext(vp->v_stream->sd_wrq)) {
+			return (sostream_direct(so, uiop, NULL, cr));
+		}
 		return (strwrite(vp, uiop, cr));
 	} else {
 		/* Send T_DATA_REQ messages without MORE_flag set */
@@ -631,7 +699,7 @@
 	case I_SENDFD:
 	case I_RECVFD:
 	case I_ATMARK:
-	case SIOCPOPSOCKFS:
+	case _SIOCSOCKFALLBACK:
 		/*
 		 * These ioctls do not apply to sockets. I_FDINSERT can be
 		 * used to send M_PROTO messages without modifying the socket
@@ -639,8 +707,9 @@
 		 * descriptor passing since they assume a twisted stream.
 		 * SIOCATMARK must be used instead of I_ATMARK.
 		 *
-		 * SIOCPOPSOCKFS from an application should never be
-		 * processed. It is always generated in response to I_POP.
+		 * _SIOCSOCKFALLBACK from an application should never be
+		 * processed.  It is only generated by socktpi_open() or
+		 * in response to I_POP or I_PUSH.
 		 */
 #ifdef DEBUG
 		cmn_err(CE_WARN, "Unsupported STREAMS ioctl 0x%x on socket. "
@@ -724,6 +793,24 @@
 
 	switch (cmd) {
 	case I_PUSH:
+		if (so->so_state & SS_DIRECT) {
+			mutex_enter(&so->so_lock);
+			so_lock_single(so);
+			mutex_exit(&so->so_lock);
+
+			error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
+			    CRED(), rvalp);
+
+			mutex_enter(&so->so_lock);
+			if (error == 0)
+				so->so_state &= ~SS_DIRECT;
+			so_unlock_single(so, SOLOCKED);
+			mutex_exit(&so->so_lock);
+
+			if (error != 0)
+				return (error);
+		}
+
 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
 		if (error == 0)
 			so->so_pushcnt++;
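
The socktpi_direct variable introduced above is described as an
/etc/system tunable.  Since it lives in the sockfs module, disabling
it would use the standard module:symbol form (a sketch of the usual
/etc/system syntax, not text from this changeset):

    set sockfs:socktpi_direct = 0

and takes effect at the next boot; as the comment warns, it must not
be flipped on a running system.
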
--- a/usr/src/uts/common/inet/Makefile	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/Makefile	Sat Oct 22 22:50:14 2005 -0700
@@ -34,7 +34,7 @@
 	ipsec_info.h ip6_asp.h ip_if.h ip_ire.h ip_multi.h ip_ndp.h ip_rts.h \
 	ipsec_impl.h keysock.h led.h mi.h mib2.h nd.h optcom.h sadb.h \
 	sctp_itf.h snmpcom.h tcp.h tcp_sack.h tun.h udp_impl.h arp_impl.h \
-	rawip_impl.h ipp_common.h
+	rawip_impl.h ipp_common.h ip_impl.h tcp_impl.h
 
 ROOTDIRS= $(ROOT)/usr/include/inet
 
--- a/usr/src/uts/common/inet/arp/arp.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/arp/arp.c	Sat Oct 22 22:50:14 2005 -0700
@@ -194,7 +194,6 @@
 static int	ar_entry_delete(queue_t *q, mblk_t *mp);
 static int	ar_entry_query(queue_t *q, mblk_t *mp);
 static int	ar_entry_squery(queue_t *q, mblk_t *mp);
-static void	ar_freemsg(mblk_t *mp);
 static int	ar_interface_up(queue_t *q, mblk_t *mp);
 static int	ar_interface_down(queue_t *q, mblk_t *mp);
 static int	ar_interface_on(queue_t *q, mblk_t *mp);
@@ -1231,7 +1230,7 @@
 				ar_ip->ar_arl_ip_assoc = ar_arl;
 			}
 		}
-		ar_freemsg(mp);
+		inet_freemsg(mp);
 	}
 
 	/*
@@ -1745,19 +1744,6 @@
 	return (0);
 }
 
-/* Make sure b_next and b_prev are null and then free the message */
-static void
-ar_freemsg(mblk_t *mp)
-{
-	mblk_t *mp1;
-
-	for (mp1 = mp; mp1; mp1 = mp1->b_cont) {
-		mp1->b_prev = mp1->b_next = NULL;
-		mp1->b_queue = NULL;
-	}
-	freemsg(mp);
-}
-
 /* Process an interface down causing us to detach and unbind. */
 /* ARGSUSED */
 static int
@@ -1936,7 +1922,7 @@
 					BUMP_IRE_STATS(ire_stats_v4,
 					    ire_stats_freed);
 				}
-				ar_freemsg(mp);
+				inet_freemsg(mp);
 			} else {
 				prev = mp;
 			}
@@ -2587,7 +2573,7 @@
 			    *(uint32_t *)mp->b_rptr == AR_ENTRY_QUERY) {
 				BUMP_IRE_STATS(ire_stats_v4, ire_stats_freed);
 			}
-			ar_freemsg(mp);
+			inet_freemsg(mp);
 		} else {
 			mpp = &mp->b_next;
 		}
@@ -2657,7 +2643,7 @@
 		} else {
 			if (ret_val != 0) {
 				/* TODO: find some way to let the guy know? */
-				ar_freemsg(mp);
+				inet_freemsg(mp);
 				BUMP_IRE_STATS(ire_stats_v4, ire_stats_freed);
 				continue;
 			}
@@ -2849,7 +2835,7 @@
 			    "arp_rput_end: q %p (%S)", q, "proto");
 			return;
 		default:
-			ar_freemsg(mp);
+			inet_freemsg(mp);
 			return;
 		}
 		if ((mp->b_wptr - mp->b_rptr) < sizeof (dl_unitdata_ind_t) ||
--- a/usr/src/uts/common/inet/common.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/common.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 1992-2001, 2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -97,13 +97,13 @@
 #define	INET_MAXMINOR		MAXMIN	/* maximum device minor number */
 
 #ifdef _KERNEL
+#include <sys/stream.h>
 
-extern void inet_init(void);
-extern void inet_destroy(void);
 extern void *inet_minor_create(char *, dev_t, int);
 extern void inet_minor_destroy(void *);
 extern dev_t inet_minor_alloc(void *);
 extern void inet_minor_free(void *, dev_t);
+extern void inet_freemsg(mblk_t *);
 
 #endif	/* _KERNEL */
 
--- a/usr/src/uts/common/inet/inet_common.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/inet_common.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -103,3 +103,21 @@
 	ASSERT((dev != OPENFAIL) && (dev != 0) && (dev <= inet_maxminor));
 	vmem_free(((inet_arena_t *)a)->ineta_arena, (void *)dev, 1);
 }
+
+/*
+ * This function is used to free a message that has gone through
+ * mi_copyin processing which modifies the M_IOCTL mblk's b_next
+ * and b_prev pointers.  We use this function to set b_next/b_prev
+ * to NULL before freeing the message.
+ */
+void
+inet_freemsg(mblk_t *mp)
+{
+	mblk_t	*bp = mp;
+
+	for (; bp != NULL; bp = bp->b_cont) {
+		bp->b_prev = NULL;
+		bp->b_next = NULL;
+	}
+	freemsg(mp);
+}
--- a/usr/src/uts/common/inet/ip.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip.h	Sat Oct 22 22:50:14 2005 -0700
@@ -52,6 +52,7 @@
 #include <sys/vmem.h>
 #include <sys/squeue.h>
 #include <sys/systm.h>
+#include <sys/multidata.h>
 
 #ifdef DEBUG
 #define	ILL_DEBUG
@@ -67,7 +68,19 @@
  * of flags.
  */
 #define	IP_DEVMTFLAGS D_MP
-#endif
+#endif	/* _KERNEL */
+
+#define	IP_MOD_NAME	"ip"
+#define	IP_DEV_NAME	"/dev/ip"
+#define	IP6_DEV_NAME	"/dev/ip6"
+
+#define	UDP_MOD_NAME	"udp"
+#define	UDP_DEV_NAME	"/dev/udp"
+#define	UDP6_DEV_NAME	"/dev/udp6"
+
+#define	TCP_MOD_NAME	"tcp"
+#define	TCP_DEV_NAME	"/dev/tcp"
+#define	TCP6_DEV_NAME	"/dev/tcp6"
 
 /* Minor numbers */
 #define	IPV4_MINOR	0
@@ -101,8 +114,6 @@
 #define	ILL_FRAG_HASH_TBL_COUNT	((unsigned int)64)
 #define	ILL_FRAG_HASH_TBL_SIZE	(ILL_FRAG_HASH_TBL_COUNT * sizeof (ipfb_t))
 
-#define	IP_DEV_NAME			"/dev/ip"
-#define	IP_MOD_NAME			"ip"
 #define	IPV4_ADDR_LEN			4
 #define	IP_ADDR_LEN			IPV4_ADDR_LEN
 #define	IP_ARP_PROTO_TYPE		0x0800
@@ -236,6 +247,7 @@
 
 #define	Q_TO_CONN(q)	((conn_t *)(q)->q_ptr)
 #define	Q_TO_TCP(q)	(Q_TO_CONN((q))->conn_tcp)
+#define	Q_TO_UDP(q)	(Q_TO_CONN((q))->conn_udp)
 
 /*
  * The following two macros are used by IP to get the appropriate
@@ -244,13 +256,10 @@
  * from a conn directly if it knows that the conn is not TCP.
  */
 #define	CONNP_TO_WQ(connp)	\
-	(((connp)->conn_tcp == NULL) ? (connp)->conn_wq :	\
-	(connp)->conn_tcp->tcp_wq)
+	(IPCL_IS_TCP(connp) ? (connp)->conn_tcp->tcp_wq : (connp)->conn_wq)
 
 #define	CONNP_TO_RQ(connp)	RD(CONNP_TO_WQ(connp))
 
-#define	IS_TCP_CONN(connp)	(((connp)->conn_flags & IPCL_TCP) != 0)
-
 #define	GRAB_CONN_LOCK(q)	{				\
 	if (q != NULL && CONN_Q(q))				\
 		mutex_enter(&(Q_TO_CONN(q))->conn_lock);	\
@@ -302,9 +311,8 @@
  */
 #define	IP6_NO_IPPOLICY		0x800	/* Don't do IPQoS processing */
 #define	IP6_IN_LLMCAST		0x1000	/* Multicast */
-#define	IP6_IN_NOCKSUM		0x2000	/* Don't compute checksum */
-
-#define	IP_FF_LOOPBACK		0x4000	/* Loopback fanout */
+
+#define	IP_FF_LOOPBACK		0x2000	/* Loopback fanout */
 
 #ifndef	IRE_DB_TYPE
 #define	IRE_DB_TYPE	M_SIG
@@ -357,6 +365,8 @@
 	uint_t		ipf_prev_nexthdr_offset; /* Offset for nexthdr value */
 	uint8_t		ipf_ecn;	/* ECN info for the fragments */
 	uint8_t		ipf_num_dups;	/* Number of times dup frags recvd */
+	uint16_t	ipf_checksum_flags; /* Hardware checksum flags */
+	uint32_t	ipf_checksum;	/* Partial checksum of fragment data */
 } ipf_t;
 
 #define	ipf_src	V4_PART_OF_V6(ipf_v6src)
@@ -623,9 +633,10 @@
  * depends on the atomic 32 bit access to that field.
  */
 #define	CONN_CLOSING		0x01	/* ip_close waiting for ip_wsrv */
-#define	CONN_IPSEC_LOAD_WAIT	0x10	/* waiting for load */
-#define	CONN_CONDEMNED		0x40	/* conn is closing, no more refs */
-#define	CONN_INCIPIENT		0x80	/* conn not yet visible, no refs */
+#define	CONN_IPSEC_LOAD_WAIT	0x02	/* waiting for load */
+#define	CONN_CONDEMNED		0x04	/* conn is closing, no more refs */
+#define	CONN_INCIPIENT		0x08	/* conn not yet visible, no refs */
+#define	CONN_QUIESCED		0x10	/* conn is now quiescent */
 
 /*
  * Parameter to ip_output giving the identity of the caller.
@@ -2593,6 +2604,7 @@
 
 extern int ip_g_forward;
 extern int ipv6_forward;
+extern vmem_t *ip_minor_arena;
 
 #define	ip_respond_to_address_mask_broadcast ip_param_arr[0].ip_param_value
 #define	ip_g_send_redirects		ip_param_arr[5].ip_param_value
@@ -2697,18 +2709,11 @@
 #define	ip1dbg(a)	if (ip_debug > 2) printf a
 #define	ip2dbg(a)	if (ip_debug > 3) printf a
 #define	ip3dbg(a)	if (ip_debug > 4) printf a
-
-#define	ipcsumdbg(a, b) \
-	if (ip_debug == 1) \
-		prom_printf(a); \
-	else if (ip_debug > 1) \
-		{ prom_printf("%smp=%p\n", a, (void *)b); }
 #else
 #define	ip0dbg(a)	/* */
 #define	ip1dbg(a)	/* */
 #define	ip2dbg(a)	/* */
 #define	ip3dbg(a)	/* */
-#define	ipcsumdbg(a, b)	/* */
 #endif	/* IP_DEBUG */
 
 extern const char *dlpi_prim_str(int);
@@ -2717,7 +2722,6 @@
 extern ill_t	*ill_first(int, int, ill_walk_context_t *);
 extern ill_t	*ill_next(ill_walk_context_t *, ill_t *);
 extern void	ill_frag_timer_start(ill_t *);
-extern void	ip_ioctl_freemsg(mblk_t *);
 extern mblk_t	*ip_carve_mp(mblk_t **, ssize_t);
 extern mblk_t	*ip_dlpi_alloc(size_t, t_uscalar_t);
 extern char	*ip_dot_addr(ipaddr_t, char *);
@@ -2749,6 +2753,9 @@
 extern void	ip_rput_dlpi(queue_t *, mblk_t *);
 extern void	ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *);
 extern void	ip_rput_forward_multicast(ipaddr_t, mblk_t *, ipif_t *);
+
+extern int	ip_snmpmod_close(queue_t *);
+extern void	ip_snmpmod_wput(queue_t *, mblk_t *);
 extern void	ip_udp_input(queue_t *, mblk_t *, ipha_t *, ire_t *, ill_t *);
 extern void	ip_proto_input(queue_t *, mblk_t *, ipha_t *, ire_t *, ill_t *);
 extern void	ip_rput_other(ipsq_t *, queue_t *, mblk_t *, void *);
@@ -2821,6 +2828,7 @@
 extern int	ip_snmp_get(queue_t *q, mblk_t *mctl);
 extern int	ip_snmp_set(queue_t *q, int, int, uchar_t *, int);
 extern void	ip_process_ioctl(ipsq_t *, queue_t *, mblk_t *, void *);
+extern void	ip_quiesce_conn(conn_t *);
 extern  void    ip_reprocess_ioctl(ipsq_t *, queue_t *, mblk_t *, void *);
 extern void	ip_restart_optmgmt(ipsq_t *, queue_t *, mblk_t *, void *);
 extern void	ip_ioctl_finish(queue_t *, mblk_t *, int, int, ipif_t *,
@@ -2842,6 +2850,7 @@
 			uint32_t, uint32_t, uint32_t, uint32_t);
 extern boolean_t ip_md_zcopy_attr(struct multidata_s *, struct pdesc_s *,
 			uint_t);
+extern mblk_t	*ip_unbind(queue_t *, mblk_t *);
 
 /* Hooks for CGTP (multirt routes) filtering module */
 #define	CGTP_FILTER_REV_1	1
@@ -2925,17 +2934,6 @@
 	uint_t ill_mdt_span_limit; /* maximum payload span per packet */
 };
 
-/*
- * ioctl identifier and structure for Multidata Transmit update
- * private M_CTL communication from IP to ULP.
- */
-#define	MDT_IOC_INFO_UPDATE	(('M' << 8) + 1020)
-
-typedef struct ip_mdt_info_s {
-	uint_t	mdt_info_id;	/* MDT_IOC_INFO_UPDATE */
-	ill_mdt_capab_t	mdt_capab; /* ILL MDT capabilities */
-} ip_mdt_info_t;
-
 struct ill_hcksum_capab_s {
 	uint_t	ill_hcksum_version;	/* interface version */
 	uint_t	ill_hcksum_txflags;	/* capabilities on transmit */
@@ -2991,35 +2989,6 @@
 };
 
 /*
- * Macro that determines whether or not a given ILL is allowed for MDT.
- */
-#define	ILL_MDT_USABLE(ill)	\
-	((ill->ill_capabilities & ILL_CAPAB_MDT) != 0 &&		\
-	ill->ill_mdt_capab != NULL &&					\
-	ill->ill_mdt_capab->ill_mdt_version == MDT_VERSION_2 &&		\
-	ill->ill_mdt_capab->ill_mdt_on != 0)
-
-/*
- * Macro that determines whether or not a given CONN may be considered
- * for fast path prior to proceeding further with Multidata.
- */
-#define	CONN_IS_MD_FASTPATH(connp)	\
-	((connp)->conn_dontroute == 0 &&	/* SO_DONTROUTE */	\
-	(connp)->conn_nofailover_ill == NULL &&	/* IPIF_NOFAILOVER */	\
-	(connp)->conn_xmit_if_ill == NULL &&	/* IP_XMIT_IF */	\
-	(connp)->conn_outgoing_pill == NULL &&	/* IP{V6}_BOUND_PIF */	\
-	(connp)->conn_outgoing_ill == NULL)	/* IP{V6}_BOUND_IF */
-
-/*
- * Macro that determines whether or not a given IPC requires
- * outbound IPSEC processing.
- */
-#define	CONN_IPSEC_OUT_ENCAPSULATED(connp)	\
-	((connp)->conn_out_enforce_policy ||	\
-	((connp)->conn_latch != NULL &&		\
-	(connp)->conn_latch->ipl_out_policy != NULL))
-
-/*
  * IP squeues exports
  */
 extern int 		ip_squeue_profile;
@@ -3049,12 +3018,15 @@
 extern int ip_squeue_bind_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
 extern int ip_squeue_bind_get(queue_t *, mblk_t *, caddr_t, cred_t *);
 extern void ip_squeue_clean(void *, mblk_t *, void *);
-
-extern	void	ip_resume_tcp_bind(void *, mblk_t *mp, void *);
+extern void ip_resume_tcp_bind(void *, mblk_t *, void *);
+
+extern void tcp_wput(queue_t *, mblk_t *);
+
 extern int	ip_fill_mtuinfo(struct in6_addr *, in_port_t,
 	struct ip6_mtuinfo *);
-
-typedef	void	(*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *);
+extern	ipif_t *conn_get_held_ipif(conn_t *, ipif_t **, int *);
+
+typedef void    (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *);
 
 /*
  * Squeue tags. Tags only need to be unique when the callback function is the
@@ -3091,6 +3063,11 @@
 #define	SQTAG_TCP_WPUT_OTHER		28
 #define	SQTAG_TCP_CONN_REQ_UNBOUND	29
 #define	SQTAG_TCP_SEND_PENDING		30
+#define	SQTAG_BIND_RETRY		31
+#define	SQTAG_UDP_FANOUT		32
+#define	SQTAG_UDP_INPUT			33
+#define	SQTAG_UDP_WPUT			34
+#define	SQTAG_UDP_OUTPUT		35
 
 #endif	/* _KERNEL */
 
--- a/usr/src/uts/common/inet/ip/igmp.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip/igmp.c	Sat Oct 22 22:50:14 2005 -0700
@@ -1925,6 +1925,8 @@
 	igmpa->igmpa_group  = ilm->ilm_addr;
 	igmpa->igmpa_cksum  = 0;
 	igmpa->igmpa_cksum  = IP_CSUM(mp, hdrlen, 0);
+	if (igmpa->igmpa_cksum == 0)
+		igmpa->igmpa_cksum = 0xffff;
 
 	rtralert[0] = IPOPT_COPY & IPOPT_RTRALERT;
 	rtralert[1] = RTRALERT_LEN;
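
The guard added above relies on one's-complement arithmetic: 0x0000
and 0xffff are the same value (+0 and -0), so folding a computed
checksum of 0 to 0xffff changes nothing for a correct verifier, while
avoiding a 0 on the wire (which for UDP would mean "no checksum was
computed", and which some checksum-offload engines treat specially).
A minimal, runnable user-level sketch of the RFC 1071-style sum and
the folding (illustrative only, not the kernel's IP_CSUM()):

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    /* One's-complement checksum over 16-bit words (RFC 1071). */
    static uint16_t
    inet_cksum(const uint16_t *buf, size_t len)
    {
            uint32_t sum = 0;

            while (len > 1) {
                    sum += *buf++;
                    len -= 2;
            }
            if (len == 1)                   /* odd trailing byte */
                    sum += *(const uint8_t *)buf;
            while (sum >> 16)               /* fold carries back in */
                    sum = (sum & 0xffff) + (sum >> 16);
            return ((uint16_t)~sum);
    }

    int main(void)
    {
            /* This payload's complemented sum comes out to 0. */
            uint16_t words[] = { 0xffff, 0x0000 };
            uint16_t ck = inet_cksum(words, sizeof (words));

            printf("computed 0x%04x -> wire value 0x%04x\n",
                (unsigned)ck, (unsigned)(ck == 0 ? 0xffff : ck));
            return (0);
    }
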
--- a/usr/src/uts/common/inet/ip/ip.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip/ip.c	Sat Oct 22 22:50:14 2005 -0700
@@ -75,9 +75,11 @@
 #include <netinet/sctp.h>
 
 #include <inet/ip.h>
+#include <inet/ip_impl.h>
 #include <inet/ip6.h>
 #include <inet/ip6_asp.h>
 #include <inet/tcp.h>
+#include <inet/tcp_impl.h>
 #include <inet/ip_multi.h>
 #include <inet/ip_if.h>
 #include <inet/ip_ire.h>
@@ -110,6 +112,7 @@
 
 #include <inet/ipclassifier.h>
 #include <inet/sctp_ip.h>
+#include <inet/udp_impl.h>
 
 /*
  * Values for squeue switch:
@@ -122,7 +125,8 @@
 /*
  * IP statistics.
  */
-#define	IP_STAT(x)	(ip_statistics.x.value.ui64++)
+#define	IP_STAT(x)		(ip_statistics.x.value.ui64++)
+#define	IP_STAT_UPDATE(x, n)	(ip_statistics.x.value.ui64 += (n))
 
 typedef struct ip_stat {
 	kstat_named_t	ipsec_fanout_proto;
@@ -158,42 +162,68 @@
 	kstat_named_t   ip_ire_redirect_timer_expired;
 	kstat_named_t	ip_ire_pmtu_timer_expired;
 	kstat_named_t	ip_input_multi_squeue;
+	kstat_named_t	ip_tcp_in_full_hw_cksum_err;
+	kstat_named_t	ip_tcp_in_part_hw_cksum_err;
+	kstat_named_t	ip_tcp_in_sw_cksum_err;
+	kstat_named_t	ip_tcp_out_sw_cksum_bytes;
+	kstat_named_t	ip_udp_in_full_hw_cksum_err;
+	kstat_named_t	ip_udp_in_part_hw_cksum_err;
+	kstat_named_t	ip_udp_in_sw_cksum_err;
+	kstat_named_t	ip_udp_out_sw_cksum_bytes;
+	kstat_named_t	ip_frag_mdt_pkt_out;
+	kstat_named_t	ip_frag_mdt_discarded;
+	kstat_named_t	ip_frag_mdt_allocfail;
+	kstat_named_t	ip_frag_mdt_addpdescfail;
+	kstat_named_t	ip_frag_mdt_allocd;
 } ip_stat_t;
 
 static ip_stat_t ip_statistics = {
-	{ "ipsec_fanout_proto", 	KSTAT_DATA_UINT64 },
-	{ "ip_udp_fannorm", 		KSTAT_DATA_UINT64 },
-	{ "ip_udp_fanmb", 		KSTAT_DATA_UINT64 },
-	{ "ip_udp_fanothers", 		KSTAT_DATA_UINT64 },
-	{ "ip_udp_fast_path", 		KSTAT_DATA_UINT64 },
-	{ "ip_udp_slow_path", 		KSTAT_DATA_UINT64 },
-	{ "ip_udp_input_err", 		KSTAT_DATA_UINT64 },
-	{ "ip_tcppullup", 		KSTAT_DATA_UINT64 },
-	{ "ip_tcpoptions", 		KSTAT_DATA_UINT64 },
-	{ "ip_multipkttcp", 		KSTAT_DATA_UINT64 },
-	{ "ip_tcp_fast_path",		KSTAT_DATA_UINT64 },
-	{ "ip_tcp_slow_path",		KSTAT_DATA_UINT64 },
-	{ "ip_tcp_input_error",		KSTAT_DATA_UINT64 },
-	{ "ip_db_ref",			KSTAT_DATA_UINT64 },
-	{ "ip_notaligned1",		KSTAT_DATA_UINT64 },
-	{ "ip_notaligned2",		KSTAT_DATA_UINT64 },
-	{ "ip_multimblk3",		KSTAT_DATA_UINT64 },
-	{ "ip_multimblk4",		KSTAT_DATA_UINT64 },
-	{ "ip_ipoptions",		KSTAT_DATA_UINT64 },
-	{ "ip_classify_fail",		KSTAT_DATA_UINT64 },
-	{ "ip_opt",			KSTAT_DATA_UINT64 },
-	{ "ip_udp_rput_local",		KSTAT_DATA_UINT64 },
-	{ "ipsec_proto_ahesp",		KSTAT_DATA_UINT64 },
-	{ "ip_conn_flputbq",		KSTAT_DATA_UINT64 },
-	{ "ip_conn_walk_drain",		KSTAT_DATA_UINT64 },
-	{ "ip_out_sw_cksum",		KSTAT_DATA_UINT64 },
-	{ "ip_in_sw_cksum",		KSTAT_DATA_UINT64 },
-	{ "ip_trash_ire_reclaim_calls",	KSTAT_DATA_UINT64 },
+	{ "ipsec_fanout_proto",			KSTAT_DATA_UINT64 },
+	{ "ip_udp_fannorm",			KSTAT_DATA_UINT64 },
+	{ "ip_udp_fanmb",			KSTAT_DATA_UINT64 },
+	{ "ip_udp_fanothers",			KSTAT_DATA_UINT64 },
+	{ "ip_udp_fast_path",			KSTAT_DATA_UINT64 },
+	{ "ip_udp_slow_path",			KSTAT_DATA_UINT64 },
+	{ "ip_udp_input_err",			KSTAT_DATA_UINT64 },
+	{ "ip_tcppullup",			KSTAT_DATA_UINT64 },
+	{ "ip_tcpoptions",			KSTAT_DATA_UINT64 },
+	{ "ip_multipkttcp",			KSTAT_DATA_UINT64 },
+	{ "ip_tcp_fast_path",			KSTAT_DATA_UINT64 },
+	{ "ip_tcp_slow_path",			KSTAT_DATA_UINT64 },
+	{ "ip_tcp_input_error",			KSTAT_DATA_UINT64 },
+	{ "ip_db_ref",				KSTAT_DATA_UINT64 },
+	{ "ip_notaligned1",			KSTAT_DATA_UINT64 },
+	{ "ip_notaligned2",			KSTAT_DATA_UINT64 },
+	{ "ip_multimblk3",			KSTAT_DATA_UINT64 },
+	{ "ip_multimblk4",			KSTAT_DATA_UINT64 },
+	{ "ip_ipoptions",			KSTAT_DATA_UINT64 },
+	{ "ip_classify_fail",			KSTAT_DATA_UINT64 },
+	{ "ip_opt",				KSTAT_DATA_UINT64 },
+	{ "ip_udp_rput_local",			KSTAT_DATA_UINT64 },
+	{ "ipsec_proto_ahesp",			KSTAT_DATA_UINT64 },
+	{ "ip_conn_flputbq",			KSTAT_DATA_UINT64 },
+	{ "ip_conn_walk_drain",			KSTAT_DATA_UINT64 },
+	{ "ip_out_sw_cksum",			KSTAT_DATA_UINT64 },
+	{ "ip_in_sw_cksum",			KSTAT_DATA_UINT64 },
+	{ "ip_trash_ire_reclaim_calls",		KSTAT_DATA_UINT64 },
 	{ "ip_trash_ire_reclaim_success",	KSTAT_DATA_UINT64 },
-	{ "ip_ire_arp_timer_expired",	KSTAT_DATA_UINT64 },
+	{ "ip_ire_arp_timer_expired",		KSTAT_DATA_UINT64 },
 	{ "ip_ire_redirect_timer_expired",	KSTAT_DATA_UINT64 },
-	{ "ip_ire_pmtu_timer_expired",	KSTAT_DATA_UINT64 },
-	{ "ip_input_multi_squeue",	KSTAT_DATA_UINT64 },
+	{ "ip_ire_pmtu_timer_expired",		KSTAT_DATA_UINT64 },
+	{ "ip_input_multi_squeue",		KSTAT_DATA_UINT64 },
+	{ "ip_tcp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
+	{ "ip_tcp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
+	{ "ip_tcp_in_sw_cksum_err",		KSTAT_DATA_UINT64 },
+	{ "ip_tcp_out_sw_cksum_bytes",		KSTAT_DATA_UINT64 },
+	{ "ip_udp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
+	{ "ip_udp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
+	{ "ip_udp_in_sw_cksum_err",		KSTAT_DATA_UINT64 },
+	{ "ip_udp_out_sw_cksum_bytes",		KSTAT_DATA_UINT64 },
+	{ "ip_frag_mdt_pkt_out",		KSTAT_DATA_UINT64 },
+	{ "ip_frag_mdt_discarded",		KSTAT_DATA_UINT64 },
+	{ "ip_frag_mdt_allocfail",		KSTAT_DATA_UINT64 },
+	{ "ip_frag_mdt_addpdescfail",		KSTAT_DATA_UINT64 },
+	{ "ip_frag_mdt_allocd",			KSTAT_DATA_UINT64 },
 };
 
 static kstat_t *ip_kstat;
@@ -591,28 +621,12 @@
 /* RFC1122 Conformance */
 #define	IP_FORWARD_DEFAULT	IP_FORWARD_NEVER
 
-#ifdef	_BIG_ENDIAN
-#define	IP_HDR_CSUM_TTL_ADJUST	256
-#define	IP_TCP_CSUM_COMP	IPPROTO_TCP
-#define	IP_UDP_CSUM_COMP	IPPROTO_UDP
-#else
-#define	IP_HDR_CSUM_TTL_ADJUST	1
-#define	IP_TCP_CSUM_COMP	(IPPROTO_TCP << 8)
-#define	IP_UDP_CSUM_COMP	(IPPROTO_UDP << 8)
-#endif
-
-#define	TCP_CHECKSUM_OFFSET		16
-#define	UDP_CHECKSUM_OFFSET		6
-
 #define	ILL_MAX_NAMELEN			LIFNAMSIZ
 
-#define	UDPH_SIZE	8
-
 /* Leave room for ip_newroute to tack on the src and target addresses */
 #define	OK_RESOLVER_MP(mp)						\
 	((mp) && ((mp)->b_wptr - (mp)->b_rptr) >= (2 * IP_ADDR_LEN))
 
-static ipif_t	*conn_get_held_ipif(conn_t *, ipif_t **, int *);
 static int	conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *);
 
 static mblk_t	*ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t);
@@ -668,6 +682,8 @@
 static boolean_t	ip_rput_local_options(queue_t *, mblk_t *, ipha_t *,
 			    ire_t *);
 static int	ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *);
+static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *,
+		    uint16_t *);
 int		ip_snmp_get(queue_t *, mblk_t *);
 static mblk_t	*ip_snmp_get_mib2_ip(queue_t *, mblk_t *);
 static mblk_t	*ip_snmp_get_mib2_ip6(queue_t *, mblk_t *);
@@ -692,7 +708,6 @@
 static boolean_t	ip_source_routed(ipha_t *);
 static boolean_t	ip_source_route_included(ipha_t *);
 
-static void	ip_unbind(queue_t *, mblk_t *);
 static void	ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t);
 static mblk_t	*ip_wput_frag_copyhdr(uchar_t *, int, int);
 static void	ip_wput_local_options(ipha_t *);
@@ -767,6 +782,15 @@
 time_t	ip_g_frag_timeout = IP_FRAG_TIMEOUT;
 clock_t	ip_g_frag_timo_ms = IP_FRAG_TIMEOUT * 1000;
 
+/*
+ * Threshold which determines whether MDT should be used when
+ * generating IP fragments; payload size must be greater than
+ * this threshold for MDT to take place.
+ */
+#define	IP_WPUT_FRAG_MDT_MIN	32768
+
+int	ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN;
+
 /* Protected by ip_mi_lock */
 static void	*ip_g_head;		/* Instance Data List Head */
 kmutex_t	ip_mi_lock;		/* Lock for list of instances */
@@ -1431,7 +1455,7 @@
 };
 
 struct module_info ip_mod_info = {
-	5701, "ip", 1, INFPSZ, 65536, 1024
+	IP_MOD_ID, IP_MOD_NAME, 1, INFPSZ, 65536, 1024
 };
 
 static struct qinit rinit = {
@@ -1930,6 +1954,8 @@
 	/* Send out an ICMP packet */
 	icmph->icmph_checksum = 0;
 	icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0);
+	if (icmph->icmph_checksum == 0)
+		icmph->icmph_checksum = 0xFFFF;
 	if (broadcast || CLASSD(ipha->ipha_dst)) {
 		ipif_t	*ipif_chosen;
 		/*
@@ -3204,6 +3230,8 @@
 	bcopy(stuff, icmph, len);
 	icmph->icmph_checksum = 0;
 	icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
+	if (icmph->icmph_checksum == 0)
+		icmph->icmph_checksum = 0xFFFF;
 	BUMP_MIB(&icmp_mib, icmpOutMsgs);
 	put(q, ipsec_mp);
 }
@@ -3704,7 +3732,7 @@
 	ASSERT(!connp->conn_af_isv6);
 	connp->conn_pkt_isv6 = B_FALSE;
 
-	len = mp->b_wptr - mp->b_rptr;
+	len = MBLKL(mp);
 	if (len < (sizeof (*tbr) + 1)) {
 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 		    "ip_bind: bogus msg, len %ld", len);
@@ -3716,7 +3744,7 @@
 	protocol = *mp->b_wptr & 0xFF;
 	tbr = (struct T_bind_req *)mp->b_rptr;
 	/* Reset the message type in preparation for shipping it back. */
-	mp->b_datap->db_type = M_PCPROTO;
+	DB_TYPE(mp) = M_PCPROTO;
 
 	connp->conn_ulp = (uint8_t)protocol;
 
@@ -3762,8 +3790,8 @@
 	 */
 
 	mp1 = mp->b_cont;
-	ire_requested = (mp1 && mp1->b_datap->db_type == IRE_DB_REQ_TYPE);
-	ipsec_policy_set = (mp1 && mp1->b_datap->db_type == IPSEC_POLICY_SET);
+	ire_requested = (mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE);
+	ipsec_policy_set = (mp1 != NULL && DB_TYPE(mp1) == IPSEC_POLICY_SET);
 
 	switch (tbr->ADDR_length) {
 	default:
@@ -4169,7 +4197,7 @@
 	if (ip_multidata_outbound && !ipsec_policy_set && dst_ire != NULL &&
 	    !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) &&
 	    (md_ill = ire_to_ill(dst_ire), md_ill != NULL) &&
-	    (md_ill->ill_capabilities & ILL_CAPAB_MDT)) {
+	    ILL_MDT_CAPABLE(md_ill)) {
 		md_dst_ire = dst_ire;
 		IRE_REFHOLD(md_dst_ire);
 	}
@@ -4689,43 +4717,19 @@
 }
 
 /*
- * IP has been configured as _D_QNEXTLESS for the client side i.e the driver
- * instance. This implies that
- * 1. IP cannot access the read side q_next pointer directly - it must
- *    use routines like putnext and canputnext.
- * 2. ip_close must ensure that all sources of messages being putnext upstream
- *    are gone before qprocsoff is called.
- *
- * #2 is handled by having ip_close do the ipcl_hash_remove and wait for
- * conn_ref to drop to zero before calling qprocsoff.
- */
-
-/* ARGSUSED */
-int
-ip_close(queue_t *q, int flags)
-{
-	conn_t		*connp;
+ * This is called as part of close() for both IP and UDP
+ * in order to quiesce the conn.
+ */
+void
+ip_quiesce_conn(conn_t *connp)
+{
 	boolean_t	drain_cleanup_reqd = B_FALSE;
 	boolean_t	conn_ioctl_cleanup_reqd = B_FALSE;
 	boolean_t	ilg_cleanup_reqd = B_FALSE;
 
-	TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q);
-
-	/*
-	 * Call the appropriate delete routine depending on whether this is
-	 * a module or device.
-	 */
-	if (WR(q)->q_next != NULL) {
-		/* This is a module close */
-		return (ip_modclose((ill_t *)q->q_ptr));
-	}
-
-	connp = Q_TO_CONN(q);
-	ASSERT(connp->conn_tcp == NULL);
-
-	/*
-	 * We are being closed as /dev/ip or /dev/ip6.
-	 *
+	ASSERT(!IPCL_IS_TCP(connp));
+
+	/*
 	 * Mark the conn as closing; once marked, this conn must not be
 	 * inserted into any list in future.  E.g. conn_drain_insert()
 	 * won't insert this conn into the conn_drain_list.
@@ -4736,6 +4740,7 @@
 	 * cannot get set henceforth.
 	 */
 	mutex_enter(&connp->conn_lock);
+	ASSERT(!(connp->conn_state_flags & CONN_QUIESCED));
 	connp->conn_state_flags |= CONN_CLOSING;
 	if (connp->conn_idl != NULL)
 		drain_cleanup_reqd = B_TRUE;
@@ -4745,17 +4750,17 @@
 		ilg_cleanup_reqd = B_TRUE;
 	mutex_exit(&connp->conn_lock);
 
+	if (IPCL_IS_UDP(connp))
+		udp_quiesce_conn(connp);
+
 	if (conn_ioctl_cleanup_reqd)
 		conn_ioctl_cleanup(connp);
 
 	/*
 	 * Remove this conn from any fanout list it is on.
-	 * Then wait until the number of pending putnexts from
-	 * the fanout code drops to zero, before calling qprocsoff.
-	 * This is the guarantee a QNEXTLESS driver provides to
-	 * STREAMS, and is mentioned at the top of this function.
-	 */
-
+	 * Then wait for any threads currently operating
+	 * on this endpoint to finish.
+	 */
 	ipcl_hash_remove(connp);
 
 	/*
@@ -4776,7 +4781,6 @@
 
 	conn_delete_ire(connp, NULL);
 
-
 	/*
 	 * Now conn refcnt can increase only thru CONN_INC_REF_LOCKED.
 	 * callers from write side can't be there now because close
@@ -4787,7 +4791,29 @@
 	connp->conn_state_flags |= CONN_CONDEMNED;
 	while (connp->conn_ref != 1)
 		cv_wait(&connp->conn_cv, &connp->conn_lock);
+	connp->conn_state_flags |= CONN_QUIESCED;
 	mutex_exit(&connp->conn_lock);
+}
+
+/* ARGSUSED */
+int
+ip_close(queue_t *q, int flags)
+{
+	conn_t		*connp;
+
+	TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q);
+
+	/*
+	 * Call the appropriate delete routine depending on whether this is
+	 * a module or device.
+	 */
+	if (WR(q)->q_next != NULL) {
+		/* This is a module close */
+		return (ip_modclose((ill_t *)q->q_ptr));
+	}
+
+	connp = q->q_ptr;
+	ip_quiesce_conn(connp);
 
 	qprocsoff(q);
 
@@ -4801,6 +4827,15 @@
 	 * has completed, and service has completed or won't run in
 	 * future.
 	 */
+	ASSERT(connp->conn_ref == 1);
+
+	/*
+	 * A conn which was previously marked as IPCL_UDP cannot
+	 * retain the flag because it would have been cleared by
+	 * udp_close().
+	 */
+	ASSERT(!IPCL_IS_UDP(connp));
+
 	if (connp->conn_latch != NULL) {
 		IPLATCH_REFRELE(connp->conn_latch);
 		connp->conn_latch = NULL;
@@ -4827,6 +4862,83 @@
 	return (0);
 }
 
+int
+ip_snmpmod_close(queue_t *q)
+{
+	conn_t *connp = Q_TO_CONN(q);
+	ASSERT(connp->conn_flags & (IPCL_TCPMOD | IPCL_UDPMOD));
+
+	qprocsoff(q);
+
+	if (connp->conn_flags & IPCL_UDPMOD)
+		udp_close_free(connp);
+
+	if (connp->conn_cred != NULL) {
+		crfree(connp->conn_cred);
+		connp->conn_cred = NULL;
+	}
+	CONN_DEC_REF(connp);
+	q->q_ptr = WR(q)->q_ptr = NULL;
+	return (0);
+}
+
+/*
+ * Write side put procedure for TCP module or UDP module instance.  TCP/UDP
+ * as a module is only used for MIB browsers that push TCP/UDP over IP or ARP.
+ * The only supported primitives are T_SVR4_OPTMGMT_REQ and T_OPTMGMT_REQ.
+ * M_FLUSH messages and ioctls are only passed downstream; we don't flush our
+ * queues as we never enqueue messages there and we don't handle any ioctls.
+ * Everything else is freed.
+ */
+void
+ip_snmpmod_wput(queue_t *q, mblk_t *mp)
+{
+	conn_t	*connp = q->q_ptr;
+	pfi_t	setfn;
+	pfi_t	getfn;
+
+	ASSERT(connp->conn_flags & (IPCL_TCPMOD | IPCL_UDPMOD));
+
+	switch (DB_TYPE(mp)) {
+	case M_PROTO:
+	case M_PCPROTO:
+		if ((MBLKL(mp) >= sizeof (t_scalar_t)) &&
+		    ((((union T_primitives *)mp->b_rptr)->type ==
+			T_SVR4_OPTMGMT_REQ) ||
+		    (((union T_primitives *)mp->b_rptr)->type ==
+			T_OPTMGMT_REQ))) {
+			/*
+			 * These are the only TPI primitives supported.  Their
+			 * handling does not require tcp_t, but it does require
+			 * conn_t to check permissions.
+			 */
+			cred_t	*cr = DB_CREDDEF(mp, connp->conn_cred);
+
+			if (connp->conn_flags & IPCL_TCPMOD) {
+				setfn = tcp_snmp_set;
+				getfn = tcp_snmp_get;
+			} else {
+				setfn = udp_snmp_set;
+				getfn = udp_snmp_get;
+			}
+			if (!snmpcom_req(q, mp, setfn, getfn, cr)) {
+				freemsg(mp);
+				return;
+			}
+		} else if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, ENOTSUP))
+		    != NULL)
+			qreply(q, mp);
+		break;
+	case M_FLUSH:
+	case M_IOCTL:
+		putnext(q, mp);
+		break;
+	default:
+		freemsg(mp);
+		break;
+	}
+}
+
 /* Return the IP checksum for the IP header at "iph". */
 uint16_t
 ip_csum_hdr(ipha_t *ipha)
@@ -5081,7 +5193,7 @@
  * Send an ICMP error after patching up the packet appropriately.  Returns
  * non-zero if the appropriate MIB should be bumped; zero otherwise.
  */
-static int
+static boolean_t
 ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags,
     uint_t icmp_type, uint_t icmp_code, boolean_t mctl_present, zoneid_t zoneid)
 {
@@ -5103,8 +5215,8 @@
 		 * ipsec_check_global_policy() assumes M_DATA as clear
 		 * and M_CTL as secure.
 		 */
-		db_type = mp->b_datap->db_type;
-		mp->b_datap->db_type = M_DATA;
+		db_type = DB_TYPE(mp);
+		DB_TYPE(mp) = M_DATA;
 		secure = B_FALSE;
 	}
 	/*
@@ -5119,17 +5231,17 @@
 		first_mp = ipsec_check_global_policy(first_mp, NULL,
 		    ipha, NULL, mctl_present);
 		if (first_mp == NULL)
-			return (0);
+			return (B_FALSE);
 	}
 
 	if (!mctl_present)
-		mp->b_datap->db_type = db_type;
+		DB_TYPE(mp) = db_type;
 
 	if (flags & IP_FF_SEND_ICMP) {
 		if (flags & IP_FF_HDR_COMPLETE) {
 			if (ip_hdr_complete(ipha, zoneid)) {
 				freemsg(first_mp);
-				return (1);
+				return (B_TRUE);
 			}
 		}
 		if (flags & IP_FF_CKSUM) {
@@ -5152,10 +5264,10 @@
 		}
 	} else {
 		freemsg(first_mp);
-		return (0);
-	}
-
-	return (1);
+		return (B_FALSE);
+	}
+
+	return (B_TRUE);
 }
 
 #ifdef DEBUG
@@ -5592,7 +5704,7 @@
 			}
 
 			mp->b_datap->db_struioflag |= STRUIO_EAGER;
-			mp->b_datap->db_cksumstart = (intptr_t)sqp;
+			DB_CKSUMSTART(mp) = (intptr_t)sqp;
 			syn_present = B_TRUE;
 		}
 	}
@@ -5720,7 +5832,6 @@
     boolean_t secure, ipha_t *ipha, uint_t flags, ill_t *recv_ill,
     boolean_t ip_policy)
 {
-	queue_t		*rq = connp->conn_rq;
 	boolean_t	mctl_present = (first_mp != NULL);
 	uint32_t	in_flags = 0; /* set to IP_RECVSLLA and/or IP_RECVIF */
 	uint32_t	ill_index;
@@ -5730,7 +5841,7 @@
 	else
 		first_mp = mp;
 
-	if (!canputnext(rq)) {
+	if (CONN_UDP_FLOWCTLD(connp)) {
 		BUMP_MIB(&ip_mib, udpInOverflows);
 		freemsg(first_mp);
 		return;
@@ -5776,7 +5887,9 @@
 		mp = ip_add_info(mp, recv_ill, in_flags);
 	}
 	BUMP_MIB(&ip_mib, ipInDelivers);
-	putnext(rq, mp);
+
+	/* Send it upstream */
+	CONN_UDP_RECV(connp, mp);
 }
 
 /*
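(Note: the hunk above swaps the STREAMS pair canputnext()/putnext() for
the CONN_UDP_FLOWCTLD()/CONN_UDP_RECV() macros, letting a fused UDP
endpoint be checked and fed by direct function call instead of a queue
traversal.  The order of operations, in a hypothetical wrapper; the
macros come from this changeset, the wrapper function does not:

static void
udp_deliver_sketch(conn_t *connp, mblk_t *mp)
{
	if (CONN_UDP_FLOWCTLD(connp)) {		/* receiver backed up */
		BUMP_MIB(&ip_mib, udpInOverflows);
		freemsg(mp);
		return;
	}
	CONN_UDP_RECV(connp, mp);		/* deliver upstream */
})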
@@ -8454,7 +8567,6 @@
 		return (ip_modopen(q, devp, flag, sflag, credp));
 	}
 
-
 	/*
 	 * We are opening as a device. This is an IP client stream, and we
 	 * allocate an conn_t as the instance data.
@@ -8463,6 +8575,9 @@
 	connp->conn_upq = q;
 	q->q_ptr = WR(q)->q_ptr = connp;
 
+	if (flag & SO_SOCKSTR)
+		connp->conn_flags |= IPCL_SOCKET;
+
 	/* Minor tells us which /dev entry was opened */
 	if (geteminor(*devp) == IPV6_MINOR) {
 		connp->conn_flags |= IPCL_ISV6;
@@ -8474,9 +8589,7 @@
 		connp->conn_pkt_isv6 = B_FALSE;
 	}
 
-
-	if ((connp->conn_dev =
-	    inet_minor_alloc(ip_minor_arena)) == 0) {
+	if ((connp->conn_dev = inet_minor_alloc(ip_minor_arena)) == 0) {
 		q->q_ptr = WR(q)->q_ptr = NULL;
 		CONN_DEC_REF(connp);
 		return (EBUSY);
@@ -10734,381 +10847,455 @@
 }
 
 /*
- * Do fragmentation reassembly.
- * returns B_TRUE if successful else B_FALSE.
+ * Fragmentation reassembly.  Each ILL has a hash table for
+ * queuing packets undergoing reassembly for all IPIFs
+ * associated with the ILL.  The hash is based on the packet
+ * IP ident field.  The ILL frag hash table was allocated
+ * as a timer block at the time the ILL was created.  Whenever
+ * there is anything on the reassembly queue, the timer will
+ * be running.  Returns B_TRUE if successful, else B_FALSE;
  * frees mp on failure.
  */
 static boolean_t
-ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha)
+ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha,
+    uint32_t *cksum_val, uint16_t *cksum_flags)
 {
 	uint32_t	frag_offset_flags;
-	ill_t   *ill = (ill_t *)q->q_ptr;
-	mblk_t *mp = *mpp;
-	mblk_t *t_mp;
+	ill_t		*ill = (ill_t *)q->q_ptr;
+	mblk_t		*mp = *mpp;
+	mblk_t		*t_mp;
 	ipaddr_t	dst;
+	uint8_t		proto = ipha->ipha_protocol;
+	uint32_t	sum_val;
+	uint16_t	sum_flags;
+	ipf_t		*ipf;
+	ipf_t		**ipfp;
+	ipfb_t		*ipfb;
+	uint16_t	ident;
+	uint32_t	offset;
+	ipaddr_t	src;
+	uint_t		hdr_length;
+	uint32_t	end;
+	mblk_t		*mp1;
+	mblk_t		*tail_mp;
+	size_t		count;
+	size_t		msg_len;
+	uint8_t		ecn_info = 0;
+	uint32_t	packet_size;
+	boolean_t	pruned = B_FALSE;
+
+	if (cksum_val != NULL)
+		*cksum_val = 0;
+	if (cksum_flags != NULL)
+		*cksum_flags = 0;
 
 	/*
 	 * Drop the fragment as early as possible, if
 	 * we don't have the resources to reassemble it.
 	 */
-
 	if (ip_reass_queue_bytes == 0) {
 		freemsg(mp);
 		return (B_FALSE);
 	}
 
+	/* Check for fragmentation offset; return if there's none */
+	if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
+	    (IPH_MF | IPH_OFFSET)) == 0)
+		return (B_TRUE);
+
+	/*
+	 * We utilize hardware-computed checksum info only for UDP, since
+	 * IP fragmentation is a normal occurrence for the protocol.  In
+	 * addition, checksum offload support for IP fragments carrying
+	 * UDP payload is commonly implemented across network adapters.
+	 */
+	ASSERT(ill != NULL);
+	if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) &&
+	    (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
+		mblk_t *mp1 = mp->b_cont;
+		int32_t len;
+
+		/* Record checksum information from the packet */
+		sum_val = (uint32_t)DB_CKSUM16(mp);
+		sum_flags = DB_CKSUMFLAGS(mp);
+
+		/* IP payload offset from beginning of mblk */
+		offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr;
+
+		if ((sum_flags & HCK_PARTIALCKSUM) &&
+		    (mp1 == NULL || mp1->b_cont == NULL) &&
+		    offset >= DB_CKSUMSTART(mp) &&
+		    ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
+			uint32_t adj;
+			/*
+			 * Partial checksum has been calculated by hardware
+			 * and attached to the packet; in addition, any
+			 * prepended extraneous data is even byte aligned.
+			 * If any such data exists, we adjust the checksum;
+			 * this would also handle any postpended data.
+			 */
+			IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
+			    mp, mp1, len, adj);
+
+			/* One's complement subtract extraneous checksum */
+			if (adj >= sum_val)
+				sum_val = ~(adj - sum_val) & 0xFFFF;
+			else
+				sum_val -= adj;
+		}
+	} else {
+		sum_val = 0;
+		sum_flags = 0;
+	}
+
+	/* Clear hardware checksumming flag */
+	DB_CKSUMFLAGS(mp) = 0;
+
+	ident = ipha->ipha_ident;
+	offset = (frag_offset_flags << 3) & 0xFFFF;
+	src = ipha->ipha_src;
 	dst = ipha->ipha_dst;
-
-	/* Clear hardware checksumming flag if set */
-	mp->b_datap->db_struioun.cksum.flags = 0;
-
-	/* Check for fragmentation offset. */
-	frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
-	    (IPH_MF | IPH_OFFSET);
-	if (frag_offset_flags) {
-		ipf_t		*ipf;
-		ipf_t		**ipfp;
-		ipfb_t		*ipfb;
-		uint16_t	ident;
-		uint32_t	offset;
-		ipaddr_t	src;
-		uint_t		hdr_length;
-		uint32_t	end;
-		uint8_t		proto;
-		mblk_t		*mp1;
-		mblk_t		*tail_mp;
-		size_t		count;
-		size_t		msg_len;
-		uint8_t		ecn_info = 0;
-		uint32_t	packet_size;
-		boolean_t 	pruned = B_FALSE;
-
-		ident = ipha->ipha_ident;
-		offset = (frag_offset_flags << 3) & 0xFFFF;
-		src = ipha->ipha_src;
-		hdr_length = IPH_HDR_LENGTH(ipha);
-		end = ntohs(ipha->ipha_length) - hdr_length;
-
-		/*
-		 * if end == 0 then we have a packet with no data, so just
-		 * free it.
-		 */
-		if (end == 0) {
+	hdr_length = IPH_HDR_LENGTH(ipha);
+	end = ntohs(ipha->ipha_length) - hdr_length;
+
+	/* If end == 0 then we have a packet with no data, so just free it */
+	if (end == 0) {
+		freemsg(mp);
+		return (B_FALSE);
+	}
+
+	/* Record the ECN field info. */
+	ecn_info = (ipha->ipha_type_of_service & 0x3);
+	if (offset != 0) {
+		/*
+		 * If this isn't the first piece, strip the header, and
+		 * add the offset to the end value.
+		 */
+		mp->b_rptr += hdr_length;
+		end += offset;
+	}
+
+	msg_len = MBLKSIZE(mp);
+	tail_mp = mp;
+	while (tail_mp->b_cont != NULL) {
+		tail_mp = tail_mp->b_cont;
+		msg_len += MBLKSIZE(tail_mp);
+	}
+
+	/* If the reassembly list for this ILL will get too big, prune it */
+	if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
+	    ip_reass_queue_bytes) {
+		ill_frag_prune(ill,
+		    (ip_reass_queue_bytes < msg_len) ? 0 :
+		    (ip_reass_queue_bytes - msg_len));
+		pruned = B_TRUE;
+	}
+
+	ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
+	mutex_enter(&ipfb->ipfb_lock);
+
+	ipfp = &ipfb->ipfb_ipf;
+	/* Try to find an existing fragment queue for this packet. */
+	for (;;) {
+		ipf = ipfp[0];
+		if (ipf != NULL) {
+			/*
+			 * It has to match on ident and src/dst address.
+			 */
+			if (ipf->ipf_ident == ident &&
+			    ipf->ipf_src == src &&
+			    ipf->ipf_dst == dst &&
+			    ipf->ipf_protocol == proto) {
+				/*
+				 * If we have received too many
+				 * duplicate fragments for this packet,
+				 * free it.
+				 */
+				if (ipf->ipf_num_dups > ip_max_frag_dups) {
+					ill_frag_free_pkts(ill, ipfb, ipf, 1);
+					freemsg(mp);
+					mutex_exit(&ipfb->ipfb_lock);
+					return (B_FALSE);
+				}
+				/* Found it. */
+				break;
+			}
+			ipfp = &ipf->ipf_hash_next;
+			continue;
+		}
+
+		/*
+		 * If we pruned the list, do we want to store this new
+		 * fragment?  We apply an optimization here based on the
+		 * fact that most fragments will be received in order.
+		 * So if the offset of this incoming fragment is zero,
+		 * it is the first fragment of a new packet. We will
+		 * keep it.  Otherwise drop the fragment, as we have
+		 * probably pruned the packet already (since the
+		 * packet cannot be found).
+		 */
+		if (pruned && offset != 0) {
+			mutex_exit(&ipfb->ipfb_lock);
 			freemsg(mp);
 			return (B_FALSE);
 		}
-		proto = ipha->ipha_protocol;
-
-		/*
-		 * Fragmentation reassembly.  Each ILL has a hash table for
-		 * queuing packets undergoing reassembly for all IPIFs
-		 * associated with the ILL.  The hash is based on the packet
-		 * IP ident field.  The ILL frag hash table was allocated
-		 * as a timer block at the time the ILL was created.  Whenever
-		 * there is anything on the reassembly queue, the timer will
-		 * be running.
-		 */
-		ASSERT(ill != NULL);
-
-		/* Record the ECN field info. */
-		ecn_info = (ipha->ipha_type_of_service & 0x3);
-		if (offset != 0) {
-			/*
-			 * If this isn't the first piece, strip the header, and
-			 * add the offset to the end value.
-			 */
-			mp->b_rptr += hdr_length;
-			end += offset;
-		}
-
-		msg_len = mp->b_datap->db_lim - mp->b_datap->db_base;
-		tail_mp = mp;
-		while (tail_mp->b_cont != NULL) {
-			tail_mp = tail_mp->b_cont;
-			msg_len += tail_mp->b_datap->db_lim -
-			    tail_mp->b_datap->db_base;
-		}
-
-		/*
-		 * If the reassembly list for this ILL will get too big
-		 * prune it.
-		 */
-		if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
-		    ip_reass_queue_bytes) {
-			ill_frag_prune(ill,
-			    (ip_reass_queue_bytes < msg_len) ? 0 :
-			    (ip_reass_queue_bytes - msg_len));
-			pruned = B_TRUE;
-		}
-
-		ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
-		mutex_enter(&ipfb->ipfb_lock);
-
-		ipfp = &ipfb->ipfb_ipf;
-		/* Try to find an existing fragment queue for this packet. */
-		for (;;) {
-			ipf = ipfp[0];
-			if (ipf != NULL) {
-				/*
-				 * It has to match on ident and src/dst address.
-				 */
-				if (ipf->ipf_ident == ident &&
-				    ipf->ipf_src == src &&
-				    ipf->ipf_dst == dst &&
-				    ipf->ipf_protocol == proto) {
-					/*
-					 * If we have received too many
-					 * duplicate fragments for this packet
-					 * free it.
-					 */
-					if (ipf->ipf_num_dups >
-					    ip_max_frag_dups) {
-						ill_frag_free_pkts(ill, ipfb,
-						    ipf, 1);
-						freemsg(mp);
-						mutex_exit(&ipfb->ipfb_lock);
-						return (B_FALSE);
-					}
-					/* Found it. */
-					break;
-				}
-				ipfp = &ipf->ipf_hash_next;
-				continue;
-			}
-
-			/*
-			 * If we pruned the list, do we want to store this new
-			 * fragment?. We apply an optimization here based on the
-			 * fact that most fragments will be received in order.
-			 * So if the offset of this incoming fragment is zero,
-			 * it is the first fragment of a new packet. We will
-			 * keep it.  Otherwise drop the fragment, as we have
-			 * probably pruned the packet already (since the
-			 * packet cannot be found).
-			 */
-			if (pruned && offset != 0) {
-				mutex_exit(&ipfb->ipfb_lock);
-				freemsg(mp);
-				return (B_FALSE);
-			}
-
-			if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS)  {
-				/*
-				 * Too many fragmented packets in this hash
-				 * bucket. Free the oldest.
-				 */
-				ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
-				    1);
-			}
-
-			/* New guy.  Allocate a frag message. */
-			mp1 = allocb(sizeof (*ipf), BPRI_MED);
-			if (mp1 == NULL) {
-				BUMP_MIB(&ip_mib, ipInDiscards);
-				freemsg(mp);
+
+		if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS)  {
+			/*
+			 * Too many fragmented packets in this hash
+			 * bucket. Free the oldest.
+			 */
+			ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
+		}
+
+		/* New guy.  Allocate a frag message. */
+		mp1 = allocb(sizeof (*ipf), BPRI_MED);
+		if (mp1 == NULL) {
+			BUMP_MIB(&ip_mib, ipInDiscards);
+			freemsg(mp);
 reass_done:
-				mutex_exit(&ipfb->ipfb_lock);
-				return (B_FALSE);
-			}
-
-
-			BUMP_MIB(&ip_mib, ipReasmReqds);
-			mp1->b_cont = mp;
-
-			/* Initialize the fragment header. */
-			ipf = (ipf_t *)mp1->b_rptr;
-			ipf->ipf_mp = mp1;
-			ipf->ipf_ptphn = ipfp;
-			ipfp[0] = ipf;
-			ipf->ipf_hash_next = NULL;
-			ipf->ipf_ident = ident;
-			ipf->ipf_protocol = proto;
-			ipf->ipf_src = src;
-			ipf->ipf_dst = dst;
-			ipf->ipf_nf_hdr_len = 0;
-			/* Record reassembly start time. */
-			ipf->ipf_timestamp = gethrestime_sec();
-			/* Record ipf generation and account for frag header */
-			ipf->ipf_gen = ill->ill_ipf_gen++;
-			ipf->ipf_count = mp1->b_datap->db_lim -
-			    mp1->b_datap->db_base;
-			ipf->ipf_last_frag_seen = B_FALSE;
-			ipf->ipf_ecn = ecn_info;
-			ipf->ipf_num_dups = 0;
-			ipfb->ipfb_frag_pkts++;
-
-			/*
-			 * We handle reassembly two ways.  In the easy case,
-			 * where all the fragments show up in order, we do
-			 * minimal bookkeeping, and just clip new pieces on
-			 * the end.  If we ever see a hole, then we go off
-			 * to ip_reassemble which has to mark the pieces and
-			 * keep track of the number of holes, etc.  Obviously,
-			 * the point of having both mechanisms is so we can
-			 * handle the easy case as efficiently as possible.
-			 */
-			if (offset == 0) {
-				/* Easy case, in-order reassembly so far. */
-				ipf->ipf_count += msg_len;
-				ipf->ipf_tail_mp = tail_mp;
-				/*
-				 * Keep track of next expected offset in
-				 * ipf_end.
-				 */
-				ipf->ipf_end = end;
-				ipf->ipf_nf_hdr_len = hdr_length;
-			} else {
-				/* Hard case, hole at the beginning. */
-				ipf->ipf_tail_mp = NULL;
-				/*
-				 * ipf_end == 0 means that we have given up
-				 * on easy reassembly.
-				 */
-				ipf->ipf_end = 0;
-				/*
-				 * ipf_hole_cnt is set by ip_reassemble.
-				 * ipf_count is updated by ip_reassemble.
-				 * No need to check for return value here
-				 * as we don't expect reassembly to complete
-				 * or fail for the first fragment itself.
-				 */
-				(void) ip_reassemble(mp, ipf,
-				    (frag_offset_flags & IPH_OFFSET) << 3,
-				    (frag_offset_flags & IPH_MF), ill, msg_len);
-			}
-			/* Update per ipfb and ill byte counts */
-			ipfb->ipfb_count += ipf->ipf_count;
-			ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
-			ill->ill_frag_count += ipf->ipf_count;
-			ASSERT(ill->ill_frag_count > 0); /* Wraparound */
-			/* If the frag timer wasn't already going, start it. */
-			mutex_enter(&ill->ill_lock);
-			ill_frag_timer_start(ill);
-			mutex_exit(&ill->ill_lock);
-			goto reass_done;
-		}
-
-		/*
-		 * We have a new piece of a datagram which is already being
-		 * reassembled.  Update the ECN info if all IP fragments
-		 * are ECN capable.  If there is one which is not, clear
-		 * all the info.  If there is at least one which has CE
-		 * code point, IP needs to report that up to transport.
-		 */
-		if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
-			if (ecn_info == IPH_ECN_CE)
-				ipf->ipf_ecn = IPH_ECN_CE;
-		} else {
-			ipf->ipf_ecn = IPH_ECN_NECT;
-		}
-		if (offset && ipf->ipf_end == offset) {
-			/* The new fragment fits at the end */
-			ipf->ipf_tail_mp->b_cont = mp;
-			/* Update the byte count */
+			mutex_exit(&ipfb->ipfb_lock);
+			return (B_FALSE);
+		}
+
+		BUMP_MIB(&ip_mib, ipReasmReqds);
+		mp1->b_cont = mp;
+
+		/* Initialize the fragment header. */
+		ipf = (ipf_t *)mp1->b_rptr;
+		ipf->ipf_mp = mp1;
+		ipf->ipf_ptphn = ipfp;
+		ipfp[0] = ipf;
+		ipf->ipf_hash_next = NULL;
+		ipf->ipf_ident = ident;
+		ipf->ipf_protocol = proto;
+		ipf->ipf_src = src;
+		ipf->ipf_dst = dst;
+		ipf->ipf_nf_hdr_len = 0;
+		/* Record reassembly start time. */
+		ipf->ipf_timestamp = gethrestime_sec();
+		/* Record ipf generation and account for frag header */
+		ipf->ipf_gen = ill->ill_ipf_gen++;
+		ipf->ipf_count = MBLKSIZE(mp1);
+		ipf->ipf_last_frag_seen = B_FALSE;
+		ipf->ipf_ecn = ecn_info;
+		ipf->ipf_num_dups = 0;
+		ipfb->ipfb_frag_pkts++;
+		ipf->ipf_checksum = 0;
+		ipf->ipf_checksum_flags = 0;
+
+		/* Store checksum value in fragment header */
+		if (sum_flags != 0) {
+			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+			ipf->ipf_checksum = sum_val;
+			ipf->ipf_checksum_flags = sum_flags;
+		}
+
+		/*
+		 * We handle reassembly two ways.  In the easy case,
+		 * where all the fragments show up in order, we do
+		 * minimal bookkeeping, and just clip new pieces on
+		 * the end.  If we ever see a hole, then we go off
+		 * to ip_reassemble which has to mark the pieces and
+		 * keep track of the number of holes, etc.  Obviously,
+		 * the point of having both mechanisms is so we can
+		 * handle the easy case as efficiently as possible.
+		 */
+		if (offset == 0) {
+			/* Easy case, in-order reassembly so far. */
 			ipf->ipf_count += msg_len;
-			/* Update per ipfb and ill byte counts */
-			ipfb->ipfb_count += msg_len;
-			ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
-			ill->ill_frag_count += msg_len;
-			ASSERT(ill->ill_frag_count > 0); /* Wraparound */
-			if (frag_offset_flags & IPH_MF) {
-				/* More to come. */
-				ipf->ipf_end = end;
-				ipf->ipf_tail_mp = tail_mp;
-				goto reass_done;
-			}
-		} else {
-			/* Go do the hard cases. */
-			int ret;
-
-			if (offset == 0)
-				ipf->ipf_nf_hdr_len = hdr_length;
-
-			/* Save current byte count */
-			count = ipf->ipf_count;
-			ret = ip_reassemble(mp, ipf,
+			ipf->ipf_tail_mp = tail_mp;
+			/*
+			 * Keep track of next expected offset in
+			 * ipf_end.
+			 */
+			ipf->ipf_end = end;
+			ipf->ipf_nf_hdr_len = hdr_length;
+		} else {
+			/* Hard case, hole at the beginning. */
+			ipf->ipf_tail_mp = NULL;
+			/*
+			 * ipf_end == 0 means that we have given up
+			 * on easy reassembly.
+			 */
+			ipf->ipf_end = 0;
+
+			/* Forget checksum offload from now on */
+			ipf->ipf_checksum_flags = 0;
+
+			/*
+			 * ipf_hole_cnt is set by ip_reassemble.
+			 * ipf_count is updated by ip_reassemble.
+			 * No need to check the return value here
+			 * as we don't expect reassembly to complete
+			 * or fail for the first fragment itself.
+			 */
+			(void) ip_reassemble(mp, ipf,
 			    (frag_offset_flags & IPH_OFFSET) << 3,
 			    (frag_offset_flags & IPH_MF), ill, msg_len);
-			/* Count of bytes added and subtracted (freeb()ed) */
-			count = ipf->ipf_count - count;
-			if (count) {
-				/* Update per ipfb and ill byte counts */
-				ipfb->ipfb_count += count;
-				ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
-				ill->ill_frag_count += count;
-				ASSERT(ill->ill_frag_count > 0);
-			}
-			if (ret == IP_REASS_PARTIAL) {
-				goto reass_done;
-			} else if (ret == IP_REASS_FAILED) {
-				/* Reassembly failed. Free up all resources */
-				ill_frag_free_pkts(ill, ipfb, ipf, 1);
-				for (t_mp = mp; t_mp != NULL;
-				    t_mp = t_mp->b_cont) {
-					IP_REASS_SET_START(t_mp, 0);
-					IP_REASS_SET_END(t_mp, 0);
-				}
-				freemsg(mp);
-				goto reass_done;
-			}
-			/* We will reach here iff 'ret' is IP_REASS_COMPLETE */
-		}
-		/*
-		 * We have completed reassembly.  Unhook the frag header from
-		 * the reassembly list.
-		 *
-		 * Before we free the frag header, record the ECN info
-		 * to report back to the transport.
-		 */
-		ecn_info = ipf->ipf_ecn;
-		BUMP_MIB(&ip_mib, ipReasmOKs);
-		ipfp = ipf->ipf_ptphn;
-		mp1 = ipf->ipf_mp;
+		}
+		/* Update per ipfb and ill byte counts */
+		ipfb->ipfb_count += ipf->ipf_count;
+		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
+		ill->ill_frag_count += ipf->ipf_count;
+		ASSERT(ill->ill_frag_count > 0); /* Wraparound */
+		/* If the frag timer wasn't already going, start it. */
+		mutex_enter(&ill->ill_lock);
+		ill_frag_timer_start(ill);
+		mutex_exit(&ill->ill_lock);
+		goto reass_done;
+	}
+
+	/*
+	 * If the packet's checksum flags have changed (it could be
+	 * coming up from an interface different from the previous one,
+	 * and hence with different checksum capabilities), then forget
+	 * about any stored checksum state.  Otherwise add the value to
+	 * the existing one stored in the fragment header.
+	 */
+	if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
+		sum_val += ipf->ipf_checksum;
+		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+		ipf->ipf_checksum = sum_val;
+	} else if (ipf->ipf_checksum_flags != 0) {
+		/* Forget checksum offload from now on */
+		ipf->ipf_checksum_flags = 0;
+	}
+
+	/*
+	 * We have a new piece of a datagram which is already being
+	 * reassembled.  Update the ECN info if all IP fragments
+	 * are ECN capable.  If there is one which is not, clear
+	 * all the info.  If there is at least one which has the CE
+	 * code point, IP needs to report that up to the transport.
+	 */
+	if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
+		if (ecn_info == IPH_ECN_CE)
+			ipf->ipf_ecn = IPH_ECN_CE;
+	} else {
+		ipf->ipf_ecn = IPH_ECN_NECT;
+	}
+	if (offset && ipf->ipf_end == offset) {
+		/* The new fragment fits at the end */
+		ipf->ipf_tail_mp->b_cont = mp;
+		/* Update the byte count */
+		ipf->ipf_count += msg_len;
+		/* Update per ipfb and ill byte counts */
+		ipfb->ipfb_count += msg_len;
+		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
+		ill->ill_frag_count += msg_len;
+		ASSERT(ill->ill_frag_count > 0); /* Wraparound */
+		if (frag_offset_flags & IPH_MF) {
+			/* More to come. */
+			ipf->ipf_end = end;
+			ipf->ipf_tail_mp = tail_mp;
+			goto reass_done;
+		}
+	} else {
+		/* Go do the hard cases. */
+		int ret;
+
+		if (offset == 0)
+			ipf->ipf_nf_hdr_len = hdr_length;
+
+		/* Save current byte count */
 		count = ipf->ipf_count;
-		ipf = ipf->ipf_hash_next;
-		if (ipf)
-			ipf->ipf_ptphn = ipfp;
-		ipfp[0] = ipf;
-		ill->ill_frag_count -= count;
-		ASSERT(ipfb->ipfb_count >= count);
-		ipfb->ipfb_count -= count;
-		ipfb->ipfb_frag_pkts--;
-		mutex_exit(&ipfb->ipfb_lock);
-		/* Ditch the frag header. */
-		mp = mp1->b_cont;
-
-		freeb(mp1);
-
-		/* Restore original IP length in header. */
-		packet_size = (uint32_t)msgdsize(mp);
-		if (packet_size > IP_MAXPACKET) {
+		ret = ip_reassemble(mp, ipf,
+		    (frag_offset_flags & IPH_OFFSET) << 3,
+		    (frag_offset_flags & IPH_MF), ill, msg_len);
+		/* Count of bytes added and subtracted (freeb()ed) */
+		count = ipf->ipf_count - count;
+		if (count) {
+			/* Update per ipfb and ill byte counts */
+			ipfb->ipfb_count += count;
+			ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
+			ill->ill_frag_count += count;
+			ASSERT(ill->ill_frag_count > 0);
+		}
+		if (ret == IP_REASS_PARTIAL) {
+			goto reass_done;
+		} else if (ret == IP_REASS_FAILED) {
+			/* Reassembly failed. Free up all resources */
+			ill_frag_free_pkts(ill, ipfb, ipf, 1);
+			for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
+				IP_REASS_SET_START(t_mp, 0);
+				IP_REASS_SET_END(t_mp, 0);
+			}
 			freemsg(mp);
-			BUMP_MIB(&ip_mib, ipInHdrErrors);
+			goto reass_done;
+		}
+		/* We will reach here iff 'ret' is IP_REASS_COMPLETE */
+	}
+	/*
+	 * We have completed reassembly.  Unhook the frag header from
+	 * the reassembly list.
+	 *
+	 * Before we free the frag header, record the ECN info
+	 * to report back to the transport.
+	 */
+	ecn_info = ipf->ipf_ecn;
+	BUMP_MIB(&ip_mib, ipReasmOKs);
+	ipfp = ipf->ipf_ptphn;
+
+	/* We need to supply these to the caller */
+	if ((sum_flags = ipf->ipf_checksum_flags) != 0)
+		sum_val = ipf->ipf_checksum;
+	else
+		sum_val = 0;
+
+	mp1 = ipf->ipf_mp;
+	count = ipf->ipf_count;
+	ipf = ipf->ipf_hash_next;
+	if (ipf != NULL)
+		ipf->ipf_ptphn = ipfp;
+	ipfp[0] = ipf;
+	ill->ill_frag_count -= count;
+	ASSERT(ipfb->ipfb_count >= count);
+	ipfb->ipfb_count -= count;
+	ipfb->ipfb_frag_pkts--;
+	mutex_exit(&ipfb->ipfb_lock);
+	/* Ditch the frag header. */
+	mp = mp1->b_cont;
+
+	freeb(mp1);
+
+	/* Restore original IP length in header. */
+	packet_size = (uint32_t)msgdsize(mp);
+	if (packet_size > IP_MAXPACKET) {
+		freemsg(mp);
+		BUMP_MIB(&ip_mib, ipInHdrErrors);
+		return (B_FALSE);
+	}
+
+	if (DB_REF(mp) > 1) {
+		mblk_t *mp2 = copymsg(mp);
+
+		freemsg(mp);
+		if (mp2 == NULL) {
+			BUMP_MIB(&ip_mib, ipInDiscards);
 			return (B_FALSE);
 		}
-
-		if (mp->b_datap->db_ref > 1) {
-			mblk_t *mp2;
-
-			mp2 = copymsg(mp);
-			freemsg(mp);
-			if (!mp2) {
-				BUMP_MIB(&ip_mib, ipInDiscards);
-				return (B_FALSE);
-			}
-			mp = mp2;
-		}
-		ipha = (ipha_t *)mp->b_rptr;
-
-		ipha->ipha_length = htons((uint16_t)packet_size);
-		/* We're now complete, zip the frag state */
-		ipha->ipha_fragment_offset_and_flags = 0;
-		/* Record the ECN info. */
-		ipha->ipha_type_of_service &= 0xFC;
-		ipha->ipha_type_of_service |= ecn_info;
-		*mpp = mp;
-
-	}
+		mp = mp2;
+	}
+	ipha = (ipha_t *)mp->b_rptr;
+
+	ipha->ipha_length = htons((uint16_t)packet_size);
+	/* We're now complete, zip the frag state */
+	ipha->ipha_fragment_offset_and_flags = 0;
+	/* Record the ECN info. */
+	ipha->ipha_type_of_service &= 0xFC;
+	ipha->ipha_type_of_service |= ecn_info;
+	*mpp = mp;
+
+	/* Reassembly is successful; return checksum information if needed */
+	if (cksum_val != NULL)
+		*cksum_val = sum_val;
+	if (cksum_flags != NULL)
+		*cksum_flags = sum_flags;
+
 	return (B_TRUE);
 }
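(Note: the checksum bookkeeping threaded through the rewritten function
works because one's complement addition is associative and commutative;
each fragment's hardware-computed sum can be folded into a running
16-bit total regardless of arrival order.  The accumulation step in
isolation, mirroring the fold used above; the helper name is
illustrative:

#include <stdint.h>

static uint32_t
cksum_aggregate_sketch(uint32_t acc, uint32_t frag_sum)
{
	uint32_t sum = acc + frag_sum;

	sum = (sum & 0xFFFF) + (sum >> 16);	/* fold carries twice */
	sum = (sum & 0xFFFF) + (sum >> 16);
	return (sum);
})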
 
@@ -11156,16 +11343,12 @@
 {
 	uint32_t	sum;
 	uint32_t	u1;
-	uint32_t	u2;
 	boolean_t	mctl_present;
 	conn_t		*connp;
 	mblk_t		*first_mp;
-	mblk_t		*mp1;
-	dblk_t		*dp;
 	uint16_t	*up;
 	ill_t		*ill = (ill_t *)q->q_ptr;
-	uint32_t	ports;
-	boolean_t	cksum_computed = B_FALSE;
+	uint16_t	reass_hck_flags = 0;
 
 #define	rptr    ((uchar_t *)ipha)
 
@@ -11182,19 +11365,13 @@
 	    IP_SIMPLE_HDR_LENGTH_IN_WORDS);
 
 	/* IP options present */
-	if (u1)
+	if (u1 != 0)
 		goto ipoptions;
 
-#define	IS_IPHDR_HWCKSUM(mctl_present, mp, ill)				\
-	((!mctl_present) && (mp->b_datap->db_struioun.cksum.flags &	\
-	HCK_IPV4_HDRCKSUM) && (ill->ill_capabilities &			\
-	ILL_CAPAB_HCKSUM) && dohwcksum)
-
 	/* Check the IP header checksum.  */
-	if (IS_IPHDR_HWCKSUM(mctl_present, mp, ill)) {
+	if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
 		/* Clear the IP header h/w cksum flag */
-		mp->b_datap->db_struioun.cksum.flags &=
-		    ~HCK_IPV4_HDRCKSUM;
+		DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
 	} else {
 #define	uph	((uint16_t *)ipha)
 		sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] +
@@ -11207,7 +11384,7 @@
 		 * Don't verify header checksum if this packet is coming
 		 * back from AH/ESP as we already did it.
 		 */
-		if (!mctl_present && (sum && sum != 0xFFFF)) {
+		if (!mctl_present && sum != 0 && sum != 0xFFFF) {
 			BUMP_MIB(&ip_mib, ipInCksumErrs);
 			freemsg(first_mp);
 			return;
@@ -11236,133 +11413,52 @@
 	/* packet does not contain complete IP & UDP headers */
 	if ((mp->b_wptr - rptr) < (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE))
 		goto udppullup;
+
 	/* up points to UDP header */
 	up = (uint16_t *)((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH);
 #define	iphs    ((uint16_t *)ipha)
 
-#define	IP_CKSUM_RECV(len, u1, u2, mp, mp1, error, dp) {		\
-	boolean_t	doswcksum = B_TRUE;				\
-	uint_t		hcksumflags = 0;				\
-									\
-	hcksumflags = dp->db_struioun.cksum.flags;			\
-									\
-	/* Clear the hardware checksum flags; they have been consumed */\
-	dp->db_struioun.cksum.flags = 0;				\
-	if (hcksumflags && (ill->ill_capabilities & ILL_CAPAB_HCKSUM) &&\
-		dohwcksum) {						\
-		if (hcksumflags & HCK_FULLCKSUM) {			\
-			/* 						\
-			 * Full checksum has been computed by the	\
-			 * hardware and has been attached. 		\
-			 */						\
-			doswcksum = B_FALSE;				\
-			if (!(hcksumflags & HCK_FULLCKSUM_OK) &&	\
-			    (dp->db_cksum16 != 0xffff)) {		\
-				ipcsumdbg("full hwcksumerr\n", mp);	\
-				goto error;				\
-			}						\
-		} else if ((hcksumflags & HCK_PARTIALCKSUM) &&		\
-		    (((len = (IP_SIMPLE_HDR_LENGTH - dp->db_cksumstart))\
-		    & 1) == 0)) {					\
-			uint32_t	tot_len = 0;			\
-									\
-			doswcksum = B_FALSE;				\
-			/* Partial checksum computed */			\
-			u1 += dp->db_cksum16;				\
-			tot_len = mp->b_wptr - mp->b_rptr;		\
-			if (!mp1)					\
-				mp1 = mp;				\
-			else						\
-				tot_len += mp1->b_wptr - mp1->b_rptr;	\
-			if (len > 0) {					\
-				/* 					\
-				 * Prepended extraneous data. Adjust	\
-				 * checksum.				\
-				 */					\
-				u2 = IP_BCSUM_PARTIAL((uchar_t *)(rptr +\
-				    dp->db_cksumstart),	(int32_t)len, 	\
-				    0);					\
-			} else						\
-				u2 = 0;					\
-			if ((len = (dp->db_cksumend - tot_len)) > 0) {	\
-				/* 					\
-				 * Postpended extraneous data. Adjust	\
-				 * checksum.				\
-				 */					\
-				uint32_t	u3;			\
-									\
-				u3 = IP_BCSUM_PARTIAL(mp1->b_wptr, 	\
-				    (int32_t)len, 0);			\
-				if ((uintptr_t)mp1->b_wptr & 1)		\
-					/*				\
-					 * Postpended extraneous data	\
-					 * was odd byte aligned, so 	\
-					 * swap resulting checksum 	\
-					 * bytes.			\
-					 */				\
-					u2 += ((u3 << 8) & 0xffff) | 	\
-					    (u3 >> 8);			\
-				else					\
-					u2 += u3;			\
-				u2 = (u2 & 0xFFFF) + ((int)(u2) >> 16);	\
-			}						\
-			/*						\
-			 * One's complement subtract extraneous checksum\
-			 */						\
-			if (u2 >= u1)					\
-				u1 = ~(u2 - u1) & 0xFFFF;		\
-			else						\
-				u1 -= u2;				\
-			u1 = (u1 & 0xFFFF) + ((int)u1 >> 16);		\
-			if (~(u1) & 0xFFFF) {				\
-				ipcsumdbg("partial hwcksumerr\n", mp);	\
-				goto error;				\
-			}						\
-		} 							\
-	} 								\
-	if (doswcksum) {						\
-		IP_STAT(ip_in_sw_cksum);				\
-		if ((IP_CSUM(mp, (int32_t)((uchar_t *)up -		\
-		    (uchar_t *)ipha), u1)) != 0) {			\
-			ipcsumdbg("swcksumerr\n", mp);			\
-			goto error;					\
-		}							\
-	}								\
-}
-
-	dp = mp->b_datap;
 	/* if udp hdr cksum != 0, then need to checksum udp packet */
-	if (up[3]) {
-		cksum_computed = B_TRUE;
-		/* multiple mblks of udp data? */
-		if ((mp1 = mp->b_cont) != NULL) {
-			/* more than two? */
-			if (mp1->b_cont)
-				goto multipktudp;
-		}
+	if (up[3] != 0) {
+		mblk_t *mp1 = mp->b_cont;
+		boolean_t cksum_err;
+		uint16_t hck_flags = 0;
 
 		/* Pseudo-header checksum */
 		u1 = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
 		    iphs[9] + up[2];
-		if (!mctl_present) {
-			ssize_t len = 0;
-
-			IP_CKSUM_RECV(len, u1, u2, mp, mp1, udpcksumerr, dp);
-		} else {
-multipktudp:
+
+		/*
+		 * Revert to software checksum calculation if the interface
+		 * isn't capable of checksum offload or if IPsec is present.
+		 */
+		if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
+			hck_flags = DB_CKSUMFLAGS(mp);
+
+		if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
 			IP_STAT(ip_in_sw_cksum);
-			if ((IP_CSUM(mp, (int32_t)((uchar_t *)up -
-			    (uchar_t *)ipha), u1)) != 0) {
-udpcksumerr:
-				ip1dbg(("ip_udp_input: bad udp checksum\n"));
-				BUMP_MIB(&ip_mib, udpInCksumErrs);
-				freemsg(first_mp);
-				return;
-			}
-		}
-	}
-
-	/* broadcast IP packet? */
+
+		IP_CKSUM_RECV(hck_flags, u1,
+		    (uchar_t *)(rptr + DB_CKSUMSTART(mp)),
+		    (int32_t)((uchar_t *)up - rptr),
+		    mp, mp1, cksum_err);
+
+		if (cksum_err) {
+			BUMP_MIB(&ip_mib, udpInCksumErrs);
+
+			if (hck_flags & HCK_FULLCKSUM)
+				IP_STAT(ip_udp_in_full_hw_cksum_err);
+			else if (hck_flags & HCK_PARTIALCKSUM)
+				IP_STAT(ip_udp_in_part_hw_cksum_err);
+			else
+				IP_STAT(ip_udp_in_sw_cksum_err);
+
+			freemsg(first_mp);
+			return;
+		}
+	}
+
+	/* Non-fragmented broadcast or multicast packet? */
 	if (ire->ire_type == IRE_BROADCAST)
 		goto udpslowpath;
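(Note: the pseudo-header seed "u1" above, IP_UDP_CSUM_COMP plus
iphs[6..9] and up[2], is the RFC 768 pseudo header: source address,
destination address, protocol, and UDP length.  Expressed over 32-bit
addresses instead of raw header halfwords; byte-order folding is elided
and the helper is illustrative, not the kernel's:

#include <stdint.h>
#include <netinet/in.h>

static uint32_t
udp_pseudo_sum_sketch(uint32_t src, uint32_t dst, uint16_t udp_len)
{
	uint32_t sum;

	sum = (src >> 16) + (src & 0xFFFF);	/* iphs[6] + iphs[7] */
	sum += (dst >> 16) + (dst & 0xFFFF);	/* iphs[8] + iphs[9] */
	sum += IPPROTO_UDP;			/* IP_UDP_CSUM_COMP */
	sum += udp_len;				/* up[2] */
	return (sum);
})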
 
@@ -11371,7 +11467,7 @@
 		ASSERT(connp->conn_upq != NULL);
 		IP_STAT(ip_udp_fast_path);
 
-		if (!canputnext(connp->conn_upq)) {
+		if (CONN_UDP_FLOWCTLD(connp)) {
 			freemsg(mp);
 			BUMP_MIB(&ip_mib, udpInOverflows);
 		} else {
@@ -11383,7 +11479,8 @@
 			 */
 			if (ip_udp_check(q, connp, recv_ill,
 			    ipha, &mp, &first_mp, mctl_present)) {
-				putnext(connp->conn_upq, mp);
+				/* Send it upstream */
+				CONN_UDP_RECV(connp, mp);
 			}
 		}
 		/*
@@ -11416,9 +11513,13 @@
 	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
 	if (u1 & (IPH_MF | IPH_OFFSET)) {
 fragmented:
-		if (!ip_rput_fragment(q, &mp, ipha)) {
+		/*
+		 * "sum" and "reass_hck_flags" are non-zero if the
+		 * reassembled packet has valid hardware-computed
+		 * checksum information associated with it.
+		 */
+		if (!ip_rput_fragment(q, &mp, ipha, &sum, &reass_hck_flags))
 			goto slow_done;
-		}
 		/*
 		 * Make sure that first_mp points back to mp as
 		 * the mp we came in with could have changed in
@@ -11432,7 +11533,7 @@
 	/* Now we have a complete datagram, destined for this machine. */
 	u1 = IPH_HDR_LENGTH(ipha);
 	/* Pull up the UDP header, if necessary. */
-	if ((mp->b_wptr - mp->b_rptr) < (u1 + UDPH_SIZE)) {
+	if ((MBLKL(mp)) < (u1 + UDPH_SIZE)) {
 udppullup:
 		if (!pullupmsg(mp, u1 + UDPH_SIZE)) {
 			BUMP_MIB(&ip_mib, ipInDiscards);
@@ -11441,30 +11542,43 @@
 		}
 		ipha = (ipha_t *)mp->b_rptr;
 	}
-	/*
-	 * Validate the checksum.  This code is a bit funny looking
-	 * but may help out the compiler in this crucial spot.
+
+	/*
+	 * Validate the checksum for the reassembled packet; for the
+	 * pullup case we calculate the payload checksum in software.
 	 */
 	up = (uint16_t *)((uchar_t *)ipha + u1 + UDP_PORTS_OFFSET);
-	if (!cksum_computed && up[3]) {
-		IP_STAT(ip_in_sw_cksum);
-		sum = IP_CSUM(mp, (int32_t)((uchar_t *)up - (uchar_t *)ipha),
-		    IP_UDP_CSUM_COMP + iphs[6] +
-		    iphs[7] + iphs[8] +
-		    iphs[9] + up[2]);
-		if (sum != 0) {
-			ip1dbg(("ip_udp_input: bad udp checksum\n"));
-				BUMP_MIB(&ip_mib, udpInCksumErrs);
-				freemsg(first_mp);
-				goto slow_done;
+	if (up[3] != 0) {
+		boolean_t cksum_err;
+
+		if ((reass_hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
+			IP_STAT(ip_in_sw_cksum);
+
+		IP_CKSUM_RECV_REASS(reass_hck_flags,
+		    (int32_t)((uchar_t *)up - (uchar_t *)ipha),
+		    IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
+		    iphs[9] + up[2], sum, cksum_err);
+
+		if (cksum_err) {
+			BUMP_MIB(&ip_mib, udpInCksumErrs);
+
+			if (reass_hck_flags & HCK_FULLCKSUM)
+				IP_STAT(ip_udp_in_full_hw_cksum_err);
+			else if (reass_hck_flags & HCK_PARTIALCKSUM)
+				IP_STAT(ip_udp_in_part_hw_cksum_err);
+			else
+				IP_STAT(ip_udp_in_sw_cksum_err);
+
+			freemsg(first_mp);
+			goto slow_done;
 		}
 	}
 udpslowpath:
 
-	ports = *(uint32_t *)up;
-	/* Clear hardware checksum flag */
-	mp->b_datap->db_struioun.cksum.flags = 0;
-	ip_fanout_udp(q, first_mp, ill, ipha, ports,
+	/* Clear hardware checksum flag to be safe */
+	DB_CKSUMFLAGS(mp) = 0;
+
+	ip_fanout_udp(q, first_mp, ill, ipha, *(uint32_t *)up,
 	    (ire->ire_type == IRE_BROADCAST),
 	    IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_IP6INFO,
 	    mctl_present, B_TRUE, recv_ill, ire->ire_zoneid);
@@ -11473,6 +11587,7 @@
 	IP_STAT(ip_udp_slow_path);
 	return;
 
+#undef  iphs
 #undef  rptr
 }
 
@@ -11485,17 +11600,17 @@
 	conn_t		*connp;
 	uint32_t	sum;
 	uint32_t	u1;
-	uint32_t	u2;
 	uint16_t	*up;
 	int		offset;
 	ssize_t		len;
 	mblk_t		*mp1;
-	dblk_t		*dp;
 	boolean_t	syn_present = B_FALSE;
 	tcph_t		*tcph;
 	uint_t		ip_hdr_len;
 	ill_t		*ill = (ill_t *)q->q_ptr;
 	zoneid_t	zoneid = ire->ire_zoneid;
+	boolean_t	cksum_err;
+	uint16_t	hck_flags = 0;
 
 #define	rptr	((uchar_t *)ipha)
 
@@ -11514,10 +11629,9 @@
 		goto ipoptions;
 	} else {
 		/* Check the IP header checksum.  */
-		if (IS_IPHDR_HWCKSUM(mctl_present, mp, ill)) {
+		if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
 			/* Clear the IP header h/w cksum flag */
-			mp->b_datap->db_struioun.cksum.flags &=
-			    ~HCK_IPV4_HDRCKSUM;
+			DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
 		} else {
 #define	uph	((uint16_t *)ipha)
 			sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
@@ -11596,30 +11710,32 @@
 #endif
 	u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9];
 
-
-	/*
-	 * If the packet has gone through AH/ESP, do the checksum here
-	 * itself.
-	 *
-	 * If it has not gone through IPSEC processing and not a duped
-	 * mblk, then look for driver checksummed mblk. We validate or
-	 * postpone the checksum to TCP for single copy checksum.
-	 *
-	 * Note that we only honor HW cksum in the fastpath.
-	 */
-	dp = mp->b_datap;
-	if (!mctl_present) {
-		IP_CKSUM_RECV(len, u1, u2, mp, mp1, tcpcksumerr, dp);
-	} else {
+	/*
+	 * Revert to software checksum calculation if the interface
+	 * isn't capable of checksum offload or if IPsec is present.
+	 */
+	if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
+		hck_flags = DB_CKSUMFLAGS(mp);
+
+	if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
 		IP_STAT(ip_in_sw_cksum);
-		if ((IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr),
-		    u1)) != 0) {
-tcpcksumerr:
-			BUMP_MIB(&ip_mib, tcpInErrs);
-			ip1dbg(("ip_tcp_input: bad tcp checksum \n"));
-			freemsg(first_mp);
-			goto slow_done;
-		}
+
+	IP_CKSUM_RECV(hck_flags, u1,
+	    (uchar_t *)(rptr + DB_CKSUMSTART(mp)),
+	    (int32_t)((uchar_t *)up - rptr),
+	    mp, mp1, cksum_err);
+
+	if (cksum_err) {
+		BUMP_MIB(&ip_mib, tcpInErrs);
+
+		if (hck_flags & HCK_FULLCKSUM)
+			IP_STAT(ip_tcp_in_full_hw_cksum_err);
+		else if (hck_flags & HCK_PARTIALCKSUM)
+			IP_STAT(ip_tcp_in_part_hw_cksum_err);
+		else
+			IP_STAT(ip_tcp_in_sw_cksum_err);
+
+		goto error;
 	}
 
 try_again:
@@ -11654,7 +11770,7 @@
 	if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) {
 		if (IPCL_IS_TCP(connp)) {
 			mp->b_datap->db_struioflag |= STRUIO_EAGER;
-			mp->b_datap->db_cksumstart =
+			DB_CKSUMSTART(mp) =
 			    (intptr_t)ip_squeue_get(ill_ring);
 			if (IPCL_IS_FULLY_BOUND(connp) && !mctl_present &&
 			    !CONN_INBOUND_POLICY_PRESENT(connp)) {
@@ -11800,7 +11916,7 @@
 	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
 	if (u1 & (IPH_MF | IPH_OFFSET)) {
 fragmented:
-		if (!ip_rput_fragment(q, &mp, ipha)) {
+		if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) {
 			if (mctl_present)
 				freeb(first_mp);
 			goto slow_done;
@@ -11876,9 +11992,10 @@
 	 * ICMP's back, then this flag may need to be cleared in
 	 * other places as well.
 	 */
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	DB_CKSUMFLAGS(mp) = 0;
 
 	up = (uint16_t *)(rptr + u1 + TCP_PORTS_OFFSET);
+
 	u1 = (uint32_t)(len - u1);	/* TCP datagram length. */
 #ifdef	_BIG_ENDIAN
 	u1 += IPPROTO_TCP;
@@ -11890,7 +12007,7 @@
 	 * Not M_DATA mblk or it's a dup, so do the checksum now.
 	 */
 	IP_STAT(ip_in_sw_cksum);
-	if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1)) {
+	if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1) != 0) {
 		BUMP_MIB(&ip_mib, tcpInErrs);
 		goto error;
 	}
@@ -11937,12 +12054,12 @@
 		goto ipoptions;
 	} else {
 		/* Check the IP header checksum.  */
-		if (IS_IPHDR_HWCKSUM(mctl_present, mp, ill)) {
+		if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
 			/*
 			 * Since there is no SCTP h/w cksum support yet, just
 			 * clear the flag.
 			 */
-			mp->b_datap->db_struioun.cksum.flags = 0;
+			DB_CKSUMFLAGS(mp) = 0;
 		} else {
 #define	uph	((uint16_t *)ipha)
 			sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
@@ -12031,7 +12148,7 @@
 	return;
 
 ipoptions:
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	DB_CKSUMFLAGS(mp) = 0;
 	if (!ip_options_cksum(q, first_mp, ipha, ire))
 		goto slow_done;
 
@@ -12041,7 +12158,7 @@
 	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
 	if (u1 & (IPH_MF | IPH_OFFSET)) {
 fragmented:
-		if (!ip_rput_fragment(q, &mp, ipha))
+		if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL))
 			goto slow_done;
 		/*
 		 * Make sure that first_mp points back to mp as
@@ -12183,7 +12300,7 @@
 	 * Clear the indication that this may have a hardware checksum
 	 * as we are not using it
 	 */
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	DB_CKSUMFLAGS(mp) = 0;
 
 	/*
 	 * Now hand the packet to ip_newroute.
@@ -12351,7 +12468,7 @@
 			 * Clear the indication that this may have
 			 * hardware checksum as we are not using it.
 			 */
-			mp->b_datap->db_struioun.cksum.flags = 0;
+			DB_CKSUMFLAGS(mp) = 0;
 			icmp_unreachable(q, mp,
 			    ICMP_SOURCE_ROUTE_FAILED);
 			ire_refrele(ire);
@@ -12361,7 +12478,7 @@
 	}
 
 	/* Packet is being forwarded. Turning off hwcksum flag. */
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	DB_CKSUMFLAGS(mp) = 0;
 	if (ip_g_send_redirects) {
 		/*
 		 * Check whether the incoming interface and outgoing
@@ -12435,15 +12552,17 @@
 {
 	queue_t		*q;
 	ire_t		*ire;
+	uint16_t	hcksumflags;
 
 	q = *qp;
 	ire = *irep;
 
 	/*
 	 * Clear the indication that this may have hardware
-	 * checksum as we are not using it.
-	 */
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	 * checksum as we are not using it for forwarding.
+	 */
+	hcksumflags = DB_CKSUMFLAGS(mp);
+	DB_CKSUMFLAGS(mp) = 0;
 
 	/*
 	 * Directed broadcast forwarding: if the packet came in over a
@@ -12613,6 +12732,9 @@
 	}
 
 	*irep = ire;
+
+	/* Restore any hardware checksum flags */
+	DB_CKSUMFLAGS(mp) = hcksumflags;
 	return (B_FALSE);
 }
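(Note: where the old code unconditionally discarded the offload state,
the hunk above parks it in a local across the broadcast checks and puts
it back once the packet turns out to be locally destined, so the
receive path can still consume the hardware checksum.  The shape of the
save/restore, as a sketch only:

	uint16_t saved_flags = DB_CKSUMFLAGS(mp);

	DB_CKSUMFLAGS(mp) = 0;	/* forwarding must not trust offload */
	/* ... broadcast and forwarding checks ... */
	DB_CKSUMFLAGS(mp) = saved_flags;	/* local delivery: restore */)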
 
@@ -12632,7 +12754,7 @@
 		 * Clear the indication that this may have hardware
 		 * checksum as we are not using it.
 		 */
-		mp->b_datap->db_struioun.cksum.flags = 0;
+		DB_CKSUMFLAGS(mp) = 0;
 		retval = ip_mforward(ill, ipha, mp);
 		/* ip_mforward updates mib variables if needed */
 		/* clear b_prev - used by ip_mroute_decap */
@@ -12951,7 +13073,7 @@
 			/*
 			 * Also SIOC[GS]TUN* ioctls can come here.
 			 */
-			ip_ioctl_freemsg(mp);
+			inet_freemsg(mp);
 			TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
 			    "ip_input_end: q %p (%S)", q, "uninit");
 			return;
@@ -13300,9 +13422,20 @@
 			continue;
 		}
 
-		/* broadcast? */
+		/*
+		 * A broadcast IRE may indicate either a broadcast or
+		 * a multicast packet.
+		 */
 		if (ire->ire_type == IRE_BROADCAST) {
-			if (ip_rput_process_broadcast(&q, mp, &ire, ipha, ill,
+			/*
+			 * Skip broadcast checks if packet is UDP multicast;
+			 * we'd rather not enter ip_rput_process_broadcast()
+			 * unless the packet is broadcast for real, since
+			 * that routine is a no-op for multicast.
+			 */
+			if ((ipha->ipha_protocol != IPPROTO_UDP ||
+			    !CLASSD(ipha->ipha_dst)) &&
+			    ip_rput_process_broadcast(&q, mp, &ire, ipha, ill,
 			    dst, cgtp_flt_pkt, ll_multicast)) {
 				continue;
 			}
@@ -13533,24 +13666,6 @@
 }
 
 /*
- * This function is used to free a message that has gone through
- * mi_copyin processing which modifies the M_IOCTL mblk's b_next
- * and b_prev pointers. We use this function to set b_next/b_prev
- * to NULL and free them.
- */
-void
-ip_ioctl_freemsg(mblk_t *mp)
-{
-	mblk_t	*bp = mp;
-
-	for (; bp != NULL; bp = bp->b_cont) {
-		bp->b_prev = NULL;
-		bp->b_next = NULL;
-	}
-	freemsg(mp);
-}
-
-/*
  * Handling of DLPI messages that require exclusive access to the ipsq.
  *
  * Need to do ill_pending_mp_release on ioctl completion, which could
@@ -14483,7 +14598,7 @@
 					mp->b_cont->b_prev =
 					    mp1->b_cont->b_prev;
 				}
-				ip_ioctl_freemsg(mp1);
+				inet_freemsg(mp1);
 				ASSERT(ipsq->ipsq_current_ipif != NULL);
 				ASSERT(connp != NULL);
 				ip_ioctl_finish(CONNP_TO_WQ(connp), mp,
@@ -14515,7 +14630,7 @@
 					mp->b_cont->b_prev =
 					    mp1->b_cont->b_prev;
 				}
-				ip_ioctl_freemsg(mp1);
+				inet_freemsg(mp1);
 				if (iocp->ioc_error == 0)
 					mp->b_datap->db_type = M_IOCDATA;
 				ASSERT(connp != NULL);
@@ -14596,7 +14711,7 @@
 					mp->b_cont->b_prev =
 					    mp1->b_cont->b_prev;
 				}
-				ip_ioctl_freemsg(mp1);
+				inet_freemsg(mp1);
 				if (iocp->ioc_error == 0)
 					iocp->ioc_error = EINVAL;
 				ASSERT(connp != NULL);
@@ -15321,7 +15436,7 @@
 		 */
 		ASSERT(!mctl_present);
 		ASSERT(first_mp == mp);
-		if (!ip_rput_fragment(q, &mp, ipha)) {
+		if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) {
 			return;
 		}
 		/*
@@ -15337,7 +15452,7 @@
 	 * Clear hardware checksumming flag as it is currently only
 	 * used by TCP and UDP.
 	 */
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	DB_CKSUMFLAGS(mp) = 0;
 
 	/* Now we have a complete datagram, destined for this machine. */
 	u1 = IPH_HDR_LENGTH(ipha);
@@ -15839,7 +15954,7 @@
 bad_src_route:
 	q = WR(q);
 	/* make sure we clear any indication of a hardware checksum */
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	DB_CKSUMFLAGS(mp) = 0;
 	icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED);
 	return (B_FALSE);
 
@@ -16022,14 +16137,14 @@
 param_prob:
 	q = WR(q);
 	/* make sure we clear any indication of a hardware checksum */
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	DB_CKSUMFLAGS(mp) = 0;
 	icmp_param_problem(q, mp, (uint8_t)code);
 	return (-1);
 
 bad_src_route:
 	q = WR(q);
 	/* make sure we clear any indication of a hardware checksum */
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	DB_CKSUMFLAGS(mp) = 0;
 	icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED);
 	return (-1);
 }
@@ -17571,7 +17686,7 @@
  * upper level protocol.  We remove this conn from any fanout hash list it is
  * on, and zero out the bind information.  No reply is expected up above.
  */
-static void
+mblk_t *
 ip_unbind(queue_t *q, mblk_t *mp)
 {
 	conn_t	*connp = Q_TO_CONN(q);
@@ -17591,7 +17706,7 @@
 	 * original message.
 	 */
 	if (mp == NULL)
-		return;
+		return (NULL);
 
 	/*
 	 * Don't bzero the ports if it's TCP since TCP still needs the
@@ -17601,7 +17716,7 @@
 	if (!IPCL_IS_TCP(connp))
 		bzero(&connp->u_port, sizeof (connp->u_port));
 
-	qreply(q, mp);
+	return (mp);
 }
 
 /*
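(Note: with ip_unbind() now returning the acknowledgment mblk instead of
calling qreply() itself, each caller decides how the ack travels
upstream.  A hypothetical caller on the IP device path, shown only to
illustrate the new contract:

	mblk_t *ackmp = ip_unbind(q, mp);

	if (ackmp != NULL)
		qreply(q, ackmp);	/* UDP may reply on its own queue */)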
@@ -17657,7 +17772,9 @@
 	/* is queue flow controlled? */
 	if ((q->q_first != NULL || connp->conn_draining) &&
 	    (caller == IP_WPUT)) {
-		goto doputq;
+		ASSERT(!need_decref);
+		(void) putq(q, mp);
+		return;
 	}
 
 	/* Multidata transmit? */
@@ -17992,11 +18109,6 @@
 		CONN_DEC_REF(connp);
 	return;
 
-doputq:
-	ASSERT(!need_decref);
-	(void) putq(q, mp);
-	return;
-
 qnext:
 	/*
 	 * Upper Level Protocols pass down complete IP datagrams
@@ -18933,7 +19045,7 @@
  * the ipif refcnt has gone to zero and holding the ill_g_lock and ill_lock
  * the above holds.
  */
-static ipif_t *
+ipif_t *
 conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err)
 {
 	ipif_t	*ipif;
@@ -19414,7 +19526,6 @@
 	boolean_t	multirt_send = B_FALSE;
 	int		err;
 	zoneid_t	zoneid;
-	boolean_t	iphdrhwcksum = B_FALSE;
 
 	TRACE_1(TR_FAC_IP, TR_IP_WPUT_IRE_START,
 	    "ip_wput_ire_start: q %p", q);
@@ -19749,102 +19860,6 @@
 	/* pseudo checksum (do it in parts for IP header checksum) */
 	cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
 
-#define	FRAGMENT_NEEDED(mtu, size)	\
-	(((mtu) < (unsigned int)(size)) ? B_TRUE : B_FALSE)
-
-#define	IS_FASTPATH(ire, bp) 					\
-	((ire)->ire_fp_mp != NULL &&				\
-	(MBLKHEAD((bp)) >= (MBLKL((ire)->ire_fp_mp))))		\
-
-#define	IPH_UDPH_CHECKSUMP(ipha, hlen) \
-	((uint16_t *)(((uchar_t *)ipha)+(hlen + UDP_CHECKSUM_OFFSET)))
-#define	IPH_TCPH_CHECKSUMP(ipha, hlen) \
-	    ((uint16_t *)(((uchar_t *)ipha)+(hlen+TCP_CHECKSUM_OFFSET)))
-
-#define	IP_CKSUM_XMIT(ill, ire, mp, up, proto, hlen, max_frag,		\
-	    ipsec_len) { 						\
-	uint32_t	sum;						\
-	uint32_t	xmit_capab = HCKSUM_INET_FULL_V4 |		\
-			    HCKSUM_INET_PARTIAL | HCKSUM_IPHDRCKSUM;	\
-	boolean_t	cksum_offload = B_FALSE;			\
-									\
-	/*								\
-	 * The ire fp mp can change due to the arrival of a		\
-	 * DL_NOTE_FASTPATH_FLUSH in the case of IRE_BROADCAST		\
-	 * and IRE_MIPRTUN. Hence the ire_fp_mp has to be accessed	\
-	 * only under the ire_lock in such cases.			\
-	 */								\
-	LOCK_IRE_FP_MP(ire);						\
-	if ((ill) && (ill->ill_capabilities & ILL_CAPAB_HCKSUM) &&	\
-	    (ill->ill_hcksum_capab->ill_hcksum_txflags &		\
-	    xmit_capab) && (!FRAGMENT_NEEDED(max_frag, 			\
-	    (LENGTH + ipsec_len))) && (!(ire->ire_flags & 		\
-	    RTF_MULTIRT)) && (ipsec_len == 0) && 			\
-	    IS_FASTPATH((ire), (mp)) &&	(dohwcksum)) { 			\
-		/*							\
-		 * Underlying interface supports hardware checksumming.	\
-		 * So postpone the checksum to the interface driver	\
-		 */							\
-									\
-		if ((hlen) == IP_SIMPLE_HDR_LENGTH) {			       \
-			if (ill->ill_hcksum_capab->ill_hcksum_txflags &        \
-			    HCKSUM_IPHDRCKSUM) {			       \
-				mp->b_datap->db_struioun.cksum.flags |=	       \
-				    HCK_IPV4_HDRCKSUM;			       \
-				/* seed the cksum field to 0 */		       \
-				ipha->ipha_hdr_checksum = 0;		       \
-				iphdrhwcksum = B_TRUE;			       \
-			}						       \
-			/*						       \
-			 * If underlying h/w supports full h/w checksumming    \
-			 * and no IP options are present, then offload	       \
-			 * full checksumming to the hardware.		       \
-			 *						       \
-			 * If h/w can do partial checksumming then offload     \
-			 * unless the startpoint offset, including mac-header, \
-			 * is too big for the interface to some of our	       \
-			 * hardware (CE and ERI) which have 6 bit fields.      \
-			 * Sigh.					       \
-			 * Unhappily we don't have the mac-header size here    \
-			 * so punt for any options.			       \
-			 */						       \
-			if (ill->ill_hcksum_capab->ill_hcksum_txflags &        \
-			    HCKSUM_INET_FULL_V4) {			       \
-				UNLOCK_IRE_FP_MP(ire);			       \
-				/* Seed the checksum field to 0 */	       \
-				*up = 0;				       \
-				mp->b_datap->db_struioun.cksum.flags |=	       \
-				    HCK_FULLCKSUM;			       \
-				cksum_offload = B_TRUE;			       \
-			} else if (ill->ill_hcksum_capab->ill_hcksum_txflags & \
-			    HCKSUM_INET_PARTIAL) {			       \
-				UNLOCK_IRE_FP_MP(ire);			       \
-				sum = *up + cksum + proto;		       \
-				sum = (sum & 0xFFFF) + (sum >> 16);	       \
-				*up = (sum & 0xFFFF) + (sum >> 16);	       \
-				/*					       \
-				 * All offsets are relative to the beginning   \
-				 * of the IP header.			       \
-				 */					       \
-				mp->b_datap->db_cksumstart = hlen;	       \
-				mp->b_datap->db_cksumstuff = 		       \
-				    (PROTO == IPPROTO_UDP) ?		       \
-				    (hlen) + UDP_CHECKSUM_OFFSET :	       \
-				    (hlen) + TCP_CHECKSUM_OFFSET;	       \
-				mp->b_datap->db_cksumend = ipha->ipha_length;  \
-				mp->b_datap->db_struioun.cksum.flags |=	       \
-				    HCK_PARTIALCKSUM;			       \
-				cksum_offload = B_TRUE;			       \
-			}						       \
-		}							\
-	} 								\
-	if (!cksum_offload) {						\
-		UNLOCK_IRE_FP_MP(ire);					\
-		IP_STAT(ip_out_sw_cksum);				\
-		(sum) = IP_CSUM((mp), (hlen), cksum + proto);		\
-		*(up) = (uint16_t)((sum) ? (sum) : ~(sum));		\
-	}								\
-}
 	if (!IP_FLOW_CONTROLLED_ULP(PROTO)) {
 		queue_t *dev_q = stq->q_next;
 
@@ -19856,10 +19871,16 @@
 		    (ip_hdr_included != IP_HDR_INCLUDED)) {
 			hlen = (V_HLEN & 0xF) << 2;
 			up = IPH_UDPH_CHECKSUMP(ipha, hlen);
-			if (*up) {
-				IP_CKSUM_XMIT(ill, ire, mp, up,
-				    IP_UDP_CSUM_COMP, hlen, max_frag,
-				    ipsec_len);
+			if (*up != 0) {
+				IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO,
+				    hlen, LENGTH, max_frag, ipsec_len, cksum);
+				/* Software checksum? */
+				if (DB_CKSUMFLAGS(mp) == 0) {
+					IP_STAT(ip_out_sw_cksum);
+					IP_STAT_UPDATE(
+					    ip_udp_out_sw_cksum_bytes,
+					    LENGTH - hlen);
+				}
 			}
 		}
 	} else if (ip_hdr_included != IP_HDR_INCLUDED) {
@@ -19873,8 +19894,14 @@
 			 * replicated via several interfaces, and not all of
 			 * them may have this capability.
 			 */
-			IP_CKSUM_XMIT(ill, ire, mp, up,
-			    IP_TCP_CSUM_COMP, hlen, max_frag, ipsec_len);
+			IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, hlen,
+			    LENGTH, max_frag, ipsec_len, cksum);
+			/* Software checksum? */
+			if (DB_CKSUMFLAGS(mp) == 0) {
+				IP_STAT(ip_out_sw_cksum);
+				IP_STAT_UPDATE(ip_tcp_out_sw_cksum_bytes,
+				    LENGTH - hlen);
+			}
 		} else {
 			sctp_hdr_t	*sctph;
 
@@ -19904,7 +19931,7 @@
 	cksum += ttl_protocol;
 
 	/* fragment the packet */
-	if (FRAGMENT_NEEDED(max_frag, (LENGTH + ipsec_len)))
+	if (max_frag < (uint_t)(LENGTH + ipsec_len))
 		goto fragmentit;
 	/*
 	 * Don't use frag_flag if packet is pre-built or source
@@ -19918,8 +19945,8 @@
 		ipha->ipha_fragment_offset_and_flags |=
 		    htons(ire->ire_frag_flag);
 
-	if (!iphdrhwcksum) {
-		/* checksum */
+	if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
+		/* calculate IP header checksum */
 		cksum += ipha->ipha_ident;
 		cksum += (v_hlen_tos_len >> 16)+(v_hlen_tos_len & 0xFFFF);
 		cksum += ipha->ipha_fragment_offset_and_flags;
@@ -20258,7 +20285,11 @@
 			hlen = (V_HLEN & 0xF) << 2;
 			up = IPH_TCPH_CHECKSUMP(ipha, hlen);
 			IP_STAT(ip_out_sw_cksum);
+			IP_STAT_UPDATE(ip_tcp_out_sw_cksum_bytes,
+			    LENGTH - hlen);
 			*up = IP_CSUM(mp, hlen, cksum + IP_TCP_CSUM_COMP);
+			if (*up == 0)
+				*up = 0xFFFF;
 		} else if (PROTO == IPPROTO_SCTP &&
 		    (ip_hdr_included != IP_HDR_INCLUDED)) {
 			sctp_hdr_t	*sctph;
@@ -20338,17 +20369,18 @@
 				 */
 				hlen = (V_HLEN & 0xF) << 2;
 				up = IPH_UDPH_CHECKSUMP(ipha, hlen);
-				if (*up) {
-					uint_t	sum;
-
-					/*
-					 * NOTE: watch out for compiler high
-					 * bits
-					 */
-					IP_STAT(ip_out_sw_cksum);
-					sum = IP_CSUM(mp, hlen,
-					    cksum + IP_UDP_CSUM_COMP);
-					*up = (uint16_t)(sum ? sum : ~sum);
+				max_frag = ire->ire_max_frag;
+				if (*up != 0) {
+					IP_CKSUM_XMIT(ire_ill, ire, mp, ipha,
+					    up, PROTO, hlen, LENGTH, max_frag,
+					    ipsec_len, cksum);
+					/* Software checksum? */
+					if (DB_CKSUMFLAGS(mp) == 0) {
+						IP_STAT(ip_out_sw_cksum);
+						IP_STAT_UPDATE(
+						    ip_udp_out_sw_cksum_bytes,
+						    LENGTH - hlen);
+					}
 				}
 			}
 		}
@@ -20369,9 +20401,7 @@
 			    conn_multicast_loop));
 
 			/*  Forget header checksum offload */
-			mp->b_datap->db_struioun.cksum.flags &=
-			    ~HCK_IPV4_HDRCKSUM;
-			iphdrhwcksum = B_FALSE;
+			DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
 
 			/*
 			 * Local loopback of multicasts?  Check the
@@ -20459,10 +20489,8 @@
 		}
 		max_frag = ire->ire_max_frag;
 		cksum += ttl_protocol;
-		if (!FRAGMENT_NEEDED(max_frag, (LENGTH + ipsec_len))) {
+		if (max_frag >= (uint_t)(LENGTH + ipsec_len)) {
 			/* No fragmentation required for this one. */
-			/* Complete the IP header checksum. */
-			cksum += ipha->ipha_ident;
 			/*
 			 * Don't use frag_flag if packet is pre-built or source
 			 * routed or if multicast (since multicast packets do
@@ -20475,26 +20503,32 @@
 				ipha->ipha_fragment_offset_and_flags |=
 				    htons(ire->ire_frag_flag);
 
-			cksum += (v_hlen_tos_len >> 16)+
-			    (v_hlen_tos_len & 0xFFFF);
-			cksum += ipha->ipha_fragment_offset_and_flags;
-			hlen = (V_HLEN & 0xF) - IP_SIMPLE_HDR_LENGTH_IN_WORDS;
-			if (hlen) {
-			    checksumoptions:
-				/*
-				 * Account for the IP Options in the IP
-				 * header checksum.
-				 */
-				up = (uint16_t *)(rptr+IP_SIMPLE_HDR_LENGTH);
-				do {
-					cksum += up[0];
-					cksum += up[1];
-					up += 2;
-				} while (--hlen);
-			}
-			cksum = ((cksum & 0xFFFF) + (cksum >> 16));
-			cksum = ~(cksum + (cksum >> 16));
-			ipha->ipha_hdr_checksum = (uint16_t)cksum;
+			if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
+				/* Complete the IP header checksum. */
+				cksum += ipha->ipha_ident;
+				cksum += (v_hlen_tos_len >> 16)+
+				    (v_hlen_tos_len & 0xFFFF);
+				cksum += ipha->ipha_fragment_offset_and_flags;
+				hlen = (V_HLEN & 0xF) -
+				    IP_SIMPLE_HDR_LENGTH_IN_WORDS;
+				if (hlen) {
+				    checksumoptions:
+					/*
+					 * Account for the IP Options in the IP
+					 * header checksum.
+					 */
+					up = (uint16_t *)(rptr+
+					    IP_SIMPLE_HDR_LENGTH);
+					do {
+						cksum += up[0];
+						cksum += up[1];
+						up += 2;
+					} while (--hlen);
+				}
+				cksum = ((cksum & 0xFFFF) + (cksum >> 16));
+				cksum = ~(cksum + (cksum >> 16));
+				ipha->ipha_hdr_checksum = (uint16_t)cksum;
+			}
 			if (ipsec_len != 0) {
 				ipsec_out_process(q, first_mp, ire, ill_index);
 				if (!next_mp) {
@@ -20991,6 +21025,298 @@
 }
 
 /*
+ * Check if ip_wput_frag_mdt() and ip_wput_frag_mdt_v6() can handle a message
+ * block chain. We could rewrite to handle arbitrary message block chains but
+ * that would make the code complicated and slow. Right now there are three
+ * restrictions:
+ *
+ *   1. The first message block must contain the complete IP header and
+ *	at least 1 byte of payload data.
+ *   2. At most MULTIDATA_MAX_PBUFS non-empty message blocks are allowed
+ *	so that we can use a single Multidata message.
+ *   3. No frag may span more than two message blocks, so that we
+ *	never need more than two packet descriptors per frag.
+ *
+ * The above restrictions allow us to support userland applications (which
+ * will send down a single message block) and NFS over UDP (which will
+ * send down a chain of at most three message blocks).
+ *
+ * We also don't use MDT for payloads of ip_wput_frag_mdt_min bytes or
+ * fewer, because the setup overhead would outweigh the benefit.
+ */
+boolean_t
+ip_can_frag_mdt(mblk_t *mp, ssize_t hdr_len, ssize_t len)
+{
+	int	blocks;
+	ssize_t	total, missing, size;
+
+	ASSERT(mp != NULL);
+	ASSERT(hdr_len > 0);
+
+	size = MBLKL(mp) - hdr_len;
+	if (size <= 0)
+		return (B_FALSE);
+
+	/* The first mblk contains the header and some payload. */
+	blocks = 1;
+	total = size;
+	size %= len;
+	missing = (size == 0) ? 0 : (len - size);
+	mp = mp->b_cont;
+
+	while (mp != NULL) {
+		/*
+		 * Give up if we encounter a zero length message block.
+		 * In practice, this should rarely happen and therefore
+		 * not worth the trouble of freeing and re-linking the
+		 * mblk from the chain to handle such case.
+		 */
+		if ((size = MBLKL(mp)) == 0)
+			return (B_FALSE);
+
+		/* Too many payload buffers for a single Multidata message? */
+		if (++blocks > MULTIDATA_MAX_PBUFS)
+			return (B_FALSE);
+
+		total += size;
+		/* Would a frag span more than two message blocks? */
+		if (missing > size)
+			return (B_FALSE);
+		size -= missing;
+
+		size %= len;
+		missing = (size == 0) ? 0 : (len - size);
+
+		mp = mp->b_cont;
+	}
+
+	return (total > ip_wput_frag_mdt_min);
+}
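
The boundary bookkeeping above can be exercised outside the kernel. Below is a
minimal user-level sketch of the same walk (the can_frag helper and the block
sizes are hypothetical illustrations, not part of this changeset; the
MULTIDATA_MAX_PBUFS and ip_wput_frag_mdt_min checks are omitted):

	#include <stdio.h>
	#include <stddef.h>

	/*
	 * Walk a chain of payload sizes and reject it if any fragment of
	 * "len" bytes would span more than two blocks, mirroring the
	 * restriction checked by ip_can_frag_mdt().
	 */
	static int
	can_frag(const size_t *blks, int nblks, size_t len)
	{
		size_t size = blks[0] % len;	/* leftover in first block */
		size_t missing = (size == 0) ? 0 : len - size;
		int i;

		for (i = 1; i < nblks; i++) {
			if (blks[i] == 0)	/* zero-length mblk: give up */
				return (0);
			if (missing > blks[i])	/* frag spans 3+ blocks */
				return (0);
			size = (blks[i] - missing) % len;
			missing = (size == 0) ? 0 : len - size;
		}
		return (1);
	}

	int
	main(void)
	{
		/* NFS-over-UDP-like chain: 20 + 2960 + 1480 payload bytes */
		size_t chain[] = { 20, 2960, 1480 };

		printf("qualifies: %d\n", can_frag(chain, 3, 1480));
		return (0);
	}
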
+
+/*
+ * Outbound IPv4 fragmentation routine using MDT.
+ */
+static void
+ip_wput_frag_mdt(ire_t *ire, mblk_t *mp, ip_pkt_t pkt_type, int len,
+    uint32_t frag_flag, int offset)
+{
+	ipha_t		*ipha_orig;
+	int		i1, ip_data_end;
+	uint_t		pkts, wroff, hdr_chunk_len, pbuf_idx;
+	mblk_t		*hdr_mp, *md_mp = NULL;
+	unsigned char	*hdr_ptr, *pld_ptr;
+	multidata_t	*mmd;
+	ip_pdescinfo_t	pdi;
+
+	ASSERT(DB_TYPE(mp) == M_DATA);
+	ASSERT(MBLKL(mp) > sizeof (ipha_t));
+
+	ipha_orig = (ipha_t *)mp->b_rptr;
+	mp->b_rptr += sizeof (ipha_t);
+
+	/* Calculate how many packets we will send out */
+	i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp);
+	pkts = (i1 + len - 1) / len;
+	ASSERT(pkts > 1);
+
+	/* Allocate a message block which will hold all the IP Headers. */
+	wroff = ip_wroff_extra;
+	hdr_chunk_len = wroff + IP_SIMPLE_HDR_LENGTH;
+
+	i1 = pkts * hdr_chunk_len;
+	/*
+	 * Create the header buffer, the Multidata, and the destination
+	 * address and SAP attribute that should be associated with it.
+	 */
+	if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL ||
+	    ((hdr_mp->b_wptr += i1),
+	    (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) ||
+	    !ip_md_addr_attr(mmd, NULL, ire->ire_dlureq_mp)) {
+		freemsg(mp);
+		if (md_mp == NULL) {
+			freemsg(hdr_mp);
+		} else {
+free_mmd:		IP_STAT(ip_frag_mdt_discarded);
+			freemsg(md_mp);
+		}
+		IP_STAT(ip_frag_mdt_allocfail);
+		UPDATE_MIB(&ip_mib, ipOutDiscards, pkts);
+		return;
+	}
+	IP_STAT(ip_frag_mdt_allocd);
+
+	/*
+	 * Add a payload buffer to the Multidata; this operation must not
+	 * fail, or otherwise our logic in this routine is broken.  There
+	 * is no memory allocation done by the routine, so any returned
+	 * failure simply tells us that we've done something wrong.
+	 *
+	 * A failure tells us that either we're adding the same payload
+	 * buffer more than once, or we're trying to add more buffers than
+	 * allowed.  Neither case should happen, and we panic because
+	 * there's either horrible heap corruption or a programming
+	 * mistake.
+	 */
+	if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+		goto pbuf_panic;
+
+	hdr_ptr = hdr_mp->b_rptr;
+	pld_ptr = mp->b_rptr;
+
+	/* Establish the ending byte offset, based on the starting offset. */
+	offset <<= 3;
+	ip_data_end = offset + ntohs(ipha_orig->ipha_length) -
+	    IP_SIMPLE_HDR_LENGTH;
+
+	pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF;
+
+	while (pld_ptr < mp->b_wptr) {
+		ipha_t		*ipha;
+		uint16_t	offset_and_flags;
+		uint16_t	ip_len;
+		int		error;
+
+		ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr);
+		ipha = (ipha_t *)(hdr_ptr + wroff);
+		ASSERT(OK_32PTR(ipha));
+		*ipha = *ipha_orig;
+
+		if (ip_data_end - offset > len) {
+			offset_and_flags = IPH_MF;
+		} else {
+			/*
+			 * Last frag. Set len to the length of this last piece.
+			 */
+			len = ip_data_end - offset;
+			/* A frag of a frag might have IPH_MF non-zero */
+			offset_and_flags =
+			    ntohs(ipha->ipha_fragment_offset_and_flags) &
+			    IPH_MF;
+		}
+		offset_and_flags |= (uint16_t)(offset >> 3);
+		offset_and_flags |= (uint16_t)frag_flag;
+		/* Store the offset and flags in the IP header. */
+		ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags);
+
+		/* Store the length in the IP header. */
+		ip_len = (uint16_t)(len + IP_SIMPLE_HDR_LENGTH);
+		ipha->ipha_length = htons(ip_len);
+
+		/*
+		 * Set the IP header checksum.  Note that mp is just
+		 * the header, so this is easy to pass to ip_csum.
+		 */
+		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
+
+		/*
+		 * Record offset and size of header and data of the next packet
+		 * in the multidata message.
+		 */
+		PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, IP_SIMPLE_HDR_LENGTH, 0);
+		PDESC_PLD_INIT(&pdi);
+		i1 = MIN(mp->b_wptr - pld_ptr, len);
+		ASSERT(i1 > 0);
+		PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1);
+		if (i1 == len) {
+			pld_ptr += len;
+		} else {
+			i1 = len - i1;
+			mp = mp->b_cont;
+			ASSERT(mp != NULL);
+			ASSERT(MBLKL(mp) >= i1);
+			/*
+			 * Attach the next payload message block to the
+			 * multidata message.
+			 */
+			if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+				goto pbuf_panic;
+			PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1);
+			pld_ptr = mp->b_rptr + i1;
+		}
+
+		if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error,
+		    KM_NOSLEEP)) == NULL) {
+			/*
+			 * Any failure other than ENOMEM indicates that we
+			 * have passed in invalid pdesc info or parameters
+			 * to mmd_addpdesc, which must not happen.
+			 *
+			 * EINVAL is a result of failure on boundary checks
+			 * against the pdesc info contents.  It should not
+			 * happen, and we panic because there's either
+			 * horrible heap corruption or a programming
+			 * mistake.
+			 */
+			if (error != ENOMEM) {
+				cmn_err(CE_PANIC, "ip_wput_frag_mdt: "
+				    "pdesc logic error detected for "
+				    "mmd %p pinfo %p (%d)\n",
+				    (void *)mmd, (void *)&pdi, error);
+				/* NOTREACHED */
+			}
+			IP_STAT(ip_frag_mdt_addpdescfail);
+			/* Free unattached payload message blocks as well */
+			md_mp->b_cont = mp->b_cont;
+			goto free_mmd;
+		}
+
+		/* Advance fragment offset. */
+		offset += len;
+
+		/* Advance to location for next header in the buffer. */
+		hdr_ptr += hdr_chunk_len;
+
+		/* Did we reach the next payload message block? */
+		if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) {
+			mp = mp->b_cont;
+			/*
+			 * Attach the next message block with payload
+			 * data to the multidata message.
+			 */
+			if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+				goto pbuf_panic;
+			pld_ptr = mp->b_rptr;
+		}
+	}
+
+	ASSERT(hdr_mp->b_wptr == hdr_ptr);
+	ASSERT(mp->b_wptr == pld_ptr);
+
+	/* Update IP statistics */
+	UPDATE_MIB(&ip_mib, ipFragCreates, pkts);
+	BUMP_MIB(&ip_mib, ipFragOKs);
+	IP_STAT_UPDATE(ip_frag_mdt_pkt_out, pkts);
+
+	if (pkt_type == OB_PKT) {
+		ire->ire_ob_pkt_count += pkts;
+		if (ire->ire_ipif != NULL)
+			atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts);
+	} else {
+		/*
+		 * The type is IB_PKT in the forwarding path and in
+		 * the mobile IP case when the packet is being reverse-
+		 * tunneled to the home agent.
+		 */
+		ire->ire_ib_pkt_count += pkts;
+		ASSERT(!IRE_IS_LOCAL(ire));
+		if (ire->ire_type & IRE_BROADCAST)
+			atomic_add_32(&ire->ire_ipif->ipif_ib_pkt_count, pkts);
+		else
+			atomic_add_32(&ire->ire_ipif->ipif_fo_pkt_count, pkts);
+	}
+	ire->ire_last_used_time = lbolt;
+	/* Send it down */
+	putnext(ire->ire_stq, md_mp);
+	return;
+
+pbuf_panic:
+	cmn_err(CE_PANIC, "ip_wput_frag_mdt: payload buffer logic "
+	    "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp,
+	    pbuf_idx);
+	/* NOTREACHED */
+}
+
+/*
  * Outbound IP fragmentation routine.
  *
  * NOTE : This routine does not ire_refrele the ire that is passed in
@@ -21000,29 +21326,30 @@
 ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
     uint32_t frag_flag)
 {
-	int	i1;
-	mblk_t	*ll_hdr_mp;
-	int 	ll_hdr_len;
-	int	hdr_len;
-	mblk_t	*hdr_mp;
-	ipha_t	*ipha;
-	int	ip_data_end;
-	int	len;
-	mblk_t	*mp = mp_orig;
-	int	offset;
-	queue_t	*q;
+	int		i1;
+	mblk_t		*ll_hdr_mp;
+	int 		ll_hdr_len;
+	int		hdr_len;
+	mblk_t		*hdr_mp;
+	ipha_t		*ipha;
+	int		ip_data_end;
+	int		len;
+	mblk_t		*mp = mp_orig;
+	int		offset;
+	queue_t		*q;
 	uint32_t	v_hlen_tos_len;
-	mblk_t	*first_mp;
-	boolean_t mctl_present;
-	mblk_t	*xmit_mp;
-	mblk_t	*carve_mp;
-	ire_t   *ire1 = NULL;
-	ire_t   *save_ire = NULL;
-	mblk_t  *next_mp = NULL;
-	boolean_t last_frag = B_FALSE;
-	boolean_t multirt_send = B_FALSE;
-	ire_t *first_ire = NULL;
-	irb_t *irb = NULL;
+	mblk_t		*first_mp;
+	boolean_t	mctl_present;
+	ill_t		*ill;
+	mblk_t		*xmit_mp;
+	mblk_t		*carve_mp;
+	ire_t		*ire1 = NULL;
+	ire_t		*save_ire = NULL;
+	mblk_t  	*next_mp = NULL;
+	boolean_t	last_frag = B_FALSE;
+	boolean_t	multirt_send = B_FALSE;
+	ire_t		*first_ire = NULL;
+	irb_t		*irb = NULL;
 
 	TRACE_0(TR_FAC_IP, TR_IP_WPUT_FRAG_START,
 	    "ip_wput_frag_start:");
@@ -21036,6 +21363,7 @@
 		mctl_present = B_FALSE;
 	}
 
+	ASSERT(MBLKL(mp) >= sizeof (ipha_t));
 	ipha = (ipha_t *)mp->b_rptr;
 
 	/*
@@ -21079,8 +21407,37 @@
 	}
 
 	hdr_len = (V_HLEN & 0xF) << 2;
+
 	ipha->ipha_hdr_checksum = 0;
 
+	/*
+	 * Establish the number of bytes maximum per frag, after putting
+	 * in the header.
+	 */
+	len = (max_frag - hdr_len) & ~7;
+
+	/* Check if we can use MDT to send out the frags. */
+	ASSERT(!IRE_IS_LOCAL(ire));
+	if (hdr_len == IP_SIMPLE_HDR_LENGTH && ip_multidata_outbound &&
+	    !(ire->ire_flags & RTF_MULTIRT) && !IPP_ENABLED(IPP_LOCAL_OUT) &&
+	    (ill = ire_to_ill(ire)) != NULL && ILL_MDT_CAPABLE(ill) &&
+	    IP_CAN_FRAG_MDT(mp, IP_SIMPLE_HDR_LENGTH, len)) {
+		ASSERT(ill->ill_mdt_capab != NULL);
+		if (!ill->ill_mdt_capab->ill_mdt_on) {
+			/*
+			 * If MDT was turned off earlier and we can now
+			 * do MDT again (due to IPQoS policy removal,
+			 * etc.), then enable it for this interface.
+			 */
+			ill->ill_mdt_capab->ill_mdt_on = 1;
+			ip1dbg(("ip_wput_frag: enabled MDT for interface %s\n",
+			    ill->ill_name));
+		}
+		ip_wput_frag_mdt(ire, mp, pkt_type, len, frag_flag,
+		    offset);
+		return;
+	}
+
 	/* Get a copy of the header for the trailing frags */
 	hdr_mp = ip_wput_frag_copyhdr((uchar_t *)ipha, hdr_len, offset);
 	if (!hdr_mp) {
@@ -21100,12 +21457,6 @@
 	offset <<= 3;
 	ip_data_end = offset + ntohs(ipha->ipha_length) - hdr_len;
 
-	/*
-	 * Establish the number of bytes maximum per frag, after putting
-	 * in the header.
-	 */
-	len = (max_frag - hdr_len) & ~7;
-
 	/* Store the length of the first fragment in the IP header. */
 	i1 = len + hdr_len;
 	ASSERT(i1 <= IP_MAXPACKET);
@@ -22565,8 +22916,6 @@
 	zoneid_t zoneid;
 	uint32_t cksum;
 	uint16_t *up;
-	/* Hack until the UDP merge into IP happens. */
-	extern boolean_t udp_compute_checksum(void);
 #ifdef	_BIG_ENDIAN
 #define	LENGTH	(v_hlen_tos_len & 0xFFFF)
 #else
@@ -22741,6 +23090,8 @@
 
 		offset = IP_SIMPLE_HDR_LENGTH + UDP_CHECKSUM_OFFSET;
 		IP_STAT(ip_out_sw_cksum);
+		IP_STAT_UPDATE(ip_udp_out_sw_cksum_bytes,
+		    ntohs(htons(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH));
 #define	iphs	((uint16_t *)ipha)
 		cksum = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
 		    iphs[9] + ntohs(htons(ipha->ipha_length) -
@@ -23790,10 +24141,10 @@
 void
 ip_resume_tcp_bind(void *arg, mblk_t *mp, void *arg2)
 {
-	conn_t *connp = (conn_t *)arg;
+	conn_t *connp = arg;
 	tcp_t	*tcp;
 
-	ASSERT(connp != NULL && connp->conn_tcp != NULL);
+	ASSERT(connp != NULL && IPCL_IS_TCP(connp) && connp->conn_tcp != NULL);
 	tcp = connp->conn_tcp;
 
 	if (connp->conn_tcp->tcp_state == TCPS_CLOSED)
@@ -23801,7 +24152,6 @@
 	else
 		tcp_rput_other(tcp, mp);
 	CONN_OPER_PENDING_DONE(connp);
-
 }
 
 /* Called from ip_wput for all non data messages */
@@ -24031,31 +24381,48 @@
 		case T_BIND_REQ: {
 			/* Request can get queued in bind */
 			ASSERT(connp != NULL);
+			/*
+			 * Both TCP and UDP call ip_bind_{v4,v6}() directly
+			 * instead of going through this path.  We only get
+			 * here in the following cases:
+			 *
+			 * a. Bind retries, where ipsq is non-NULL.
+			 * b. T_BIND_REQ is issued from a non-TCP/UDP
+			 *    transport, e.g. ICMP for a raw socket,
+			 *    in which case ipsq will be NULL.
+			 */
+			ASSERT(ipsq != NULL ||
+			    (!IPCL_IS_TCP(connp) && !IPCL_IS_UDP(connp)));
+
 			/* Don't increment refcnt if this is a re-entry */
 			if (ipsq == NULL)
 				CONN_INC_REF(connp);
-			mp = connp->conn_af_isv6 ?
-			    ip_bind_v6(q, mp, connp, NULL) :
-				ip_bind_v4(q, mp, connp);
-			if (mp != NULL) {
-				tcp_t	*tcp;
-
-				tcp = connp->conn_tcp;
-				if (tcp != NULL) {
-					if (ipsq == NULL) {
-						tcp_rput_other(tcp, mp);
-					} else {
-						CONN_INC_REF(connp);
-						squeue_fill(connp->conn_sqp, mp,
-						    ip_resume_tcp_bind,
-						    connp, SQTAG_TCP_RPUTOTHER);
-						return;
-					}
-				} else {
-					qreply(q, mp);
-				}
-				CONN_OPER_PENDING_DONE(connp);
-			}
+			mp = connp->conn_af_isv6 ? ip_bind_v6(q, mp,
+			    connp, NULL) : ip_bind_v4(q, mp, connp);
+			if (mp == NULL)
+				return;
+			if (IPCL_IS_TCP(connp)) {
+				/*
+				 * In the case of a TCP endpoint we
+				 * come here only for bind retries.
+				 */
+				ASSERT(ipsq != NULL);
+				CONN_INC_REF(connp);
+				squeue_fill(connp->conn_sqp, mp,
+				    ip_resume_tcp_bind, connp,
+				    SQTAG_BIND_RETRY);
+				return;
+			} else if (IPCL_IS_UDP(connp)) {
+				/*
+				 * In the case of a UDP endpoint we
+				 * come here only for bind retries.
+				 */
+				ASSERT(ipsq != NULL);
+				udp_resume_bind(connp, mp);
+				return;
+			}
+			qreply(q, mp);
+			CONN_OPER_PENDING_DONE(connp);
 			return;
 		}
 		case T_SVR4_OPTMGMT_REQ:
@@ -24111,7 +24478,8 @@
 			}
 			return;
 		case T_UNBIND_REQ:
-			ip_unbind(q, mp);
+			mp = ip_unbind(q, mp);
+			qreply(q, mp);
 			return;
 		default:
 			/*
--- a/usr/src/uts/common/inet/ip/ip6.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip/ip6.c	Sat Oct 22 22:50:14 2005 -0700
@@ -58,6 +58,7 @@
 #include <sys/policy.h>
 #include <net/if.h>
 #include <net/if_arp.h>
+#include <net/if_types.h>
 #include <net/route.h>
 #include <net/if_dl.h>
 #include <sys/sockio.h>
@@ -74,9 +75,12 @@
 #include <inet/snmpcom.h>
 
 #include <inet/ip.h>
+#include <inet/ip_impl.h>
 #include <inet/ip6.h>
 #include <inet/ip6_asp.h>
 #include <inet/tcp.h>
+#include <inet/tcp_impl.h>
+#include <inet/udp_impl.h>
 #include <inet/ipp_common.h>
 
 #include <inet/ip_multi.h>
@@ -103,20 +107,51 @@
 /*
  * IP statistics.
  */
-#define	IP6_STAT(x)	(ip6_statistics.x.value.ui64++)
+#define	IP6_STAT(x)		(ip6_statistics.x.value.ui64++)
+#define	IP6_STAT_UPDATE(x, n)	(ip6_statistics.x.value.ui64 += (n))
 
 typedef struct ip6_stat {
 	kstat_named_t	ip6_udp_fast_path;
 	kstat_named_t	ip6_udp_slow_path;
 	kstat_named_t	ip6_udp_fannorm;
 	kstat_named_t	ip6_udp_fanmb;
+	kstat_named_t   ip6_out_sw_cksum;
+	kstat_named_t   ip6_in_sw_cksum;
+	kstat_named_t	ip6_tcp_in_full_hw_cksum_err;
+	kstat_named_t	ip6_tcp_in_part_hw_cksum_err;
+	kstat_named_t	ip6_tcp_in_sw_cksum_err;
+	kstat_named_t	ip6_tcp_out_sw_cksum_bytes;
+	kstat_named_t	ip6_udp_in_full_hw_cksum_err;
+	kstat_named_t	ip6_udp_in_part_hw_cksum_err;
+	kstat_named_t	ip6_udp_in_sw_cksum_err;
+	kstat_named_t	ip6_udp_out_sw_cksum_bytes;
+	kstat_named_t	ip6_frag_mdt_pkt_out;
+	kstat_named_t	ip6_frag_mdt_discarded;
+	kstat_named_t	ip6_frag_mdt_allocfail;
+	kstat_named_t	ip6_frag_mdt_addpdescfail;
+	kstat_named_t	ip6_frag_mdt_allocd;
 } ip6_stat_t;
 
 static ip6_stat_t ip6_statistics = {
-	{ "ip6_udp_fast_path", 	KSTAT_DATA_UINT64 },
-	{ "ip6_udp_slow_path", 	KSTAT_DATA_UINT64 },
-	{ "ip6_udp_fannorm", 	KSTAT_DATA_UINT64 },
-	{ "ip6_udp_fanmb", 	KSTAT_DATA_UINT64 },
+	{ "ip6_udp_fast_path",			KSTAT_DATA_UINT64 },
+	{ "ip6_udp_slow_path",			KSTAT_DATA_UINT64 },
+	{ "ip6_udp_fannorm",			KSTAT_DATA_UINT64 },
+	{ "ip6_udp_fanmb",			KSTAT_DATA_UINT64 },
+	{ "ip6_out_sw_cksum",			KSTAT_DATA_UINT64 },
+	{ "ip6_in_sw_cksum",			KSTAT_DATA_UINT64 },
+	{ "ip6_tcp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
+	{ "ip6_tcp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
+	{ "ip6_tcp_in_sw_cksum_err",		KSTAT_DATA_UINT64 },
+	{ "ip6_tcp_out_sw_cksum_bytes",		KSTAT_DATA_UINT64 },
+	{ "ip6_udp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
+	{ "ip6_udp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
+	{ "ip6_udp_in_sw_cksum_err",		KSTAT_DATA_UINT64 },
+	{ "ip6_udp_out_sw_cksum_bytes",		KSTAT_DATA_UINT64 },
+	{ "ip6_frag_mdt_pkt_out",		KSTAT_DATA_UINT64 },
+	{ "ip6_frag_mdt_discarded",		KSTAT_DATA_UINT64 },
+	{ "ip6_frag_mdt_allocfail",		KSTAT_DATA_UINT64 },
+	{ "ip6_frag_mdt_addpdescfail",		KSTAT_DATA_UINT64 },
+	{ "ip6_frag_mdt_allocd",		KSTAT_DATA_UINT64 },
 };
 
 static kstat_t *ip6_kstat;
@@ -221,7 +256,7 @@
 static int	ip_process_options_v6(queue_t *, mblk_t *, ip6_t *,
     uint8_t *, uint_t, uint8_t);
 static mblk_t	*ip_rput_frag_v6(queue_t *, mblk_t *, ip6_t *,
-    ip6_frag_t *, uint_t, uint_t *);
+    ip6_frag_t *, uint_t, uint_t *, uint32_t *, uint16_t *);
 static boolean_t	ip_source_routed_v6(ip6_t *, mblk_t *);
 static void	ip_wput_ire_v6(queue_t *, mblk_t *, ire_t *, int, int,
     conn_t *, int, int, int);
@@ -2302,7 +2337,8 @@
 			connp->conn_recv = tcp_input;
 	}
 	/* Update qinfo if v4/v6 changed */
-	if ((orig_pkt_isv6 != connp->conn_pkt_isv6) && !IS_TCP_CONN(connp)) {
+	if ((orig_pkt_isv6 != connp->conn_pkt_isv6) &&
+	    !(IPCL_IS_TCP(connp) || IPCL_IS_UDP(connp))) {
 		if (connp->conn_pkt_isv6)
 			ip_setqinfo(RD(q), IPV6_MINOR, B_TRUE);
 		else
@@ -2531,7 +2567,6 @@
     void *dummy_arg)
 {
 	conn_t	*connp = NULL;
-	tcp_t *tcp;
 	t_scalar_t prim;
 
 	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
@@ -2543,24 +2578,24 @@
 	prim = ((union T_primitives *)mp->b_rptr)->type;
 	ASSERT(prim == O_T_BIND_REQ || prim == T_BIND_REQ);
 
-	tcp = connp->conn_tcp;
-	if (tcp != NULL) {
+	if (IPCL_IS_TCP(connp)) {
 		/* Pass sticky_ipp for scope_id and pktinfo */
-		mp = ip_bind_v6(q, mp, connp, &tcp->tcp_sticky_ipp);
+		mp = ip_bind_v6(q, mp, connp, &connp->conn_tcp->tcp_sticky_ipp);
 	} else {
 		/* For UDP and ICMP */
 		mp = ip_bind_v6(q, mp, connp, NULL);
 	}
 	if (mp != NULL) {
-		if (tcp != NULL) {
+		if (IPCL_IS_TCP(connp)) {
 			CONN_INC_REF(connp);
-			squeue_fill(connp->conn_sqp, mp,
-			    ip_resume_tcp_bind, connp, SQTAG_TCP_RPUTOTHER);
-			return;
+			squeue_fill(connp->conn_sqp, mp, ip_resume_tcp_bind,
+			    connp, SQTAG_TCP_RPUTOTHER);
+		} else if (IPCL_IS_UDP(connp)) {
+			udp_resume_bind(connp, mp);
 		} else {
 			qreply(q, mp);
-		}
-		CONN_OPER_PENDING_DONE(connp);
+			CONN_OPER_PENDING_DONE(connp);
+		}
 	}
 }
 
@@ -2719,7 +2754,7 @@
 	if (ip_multidata_outbound && !ipsec_policy_set && dst_ire != NULL &&
 	    !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) &&
 	    (md_ill = ire_to_ill(dst_ire), md_ill != NULL) &&
-	    (md_ill->ill_capabilities & ILL_CAPAB_MDT)) {
+	    ILL_MDT_CAPABLE(md_ill)) {
 		md_dst_ire = dst_ire;
 		IRE_REFHOLD(md_dst_ire);
 	}
@@ -2936,7 +2971,7 @@
 		 */
 		error = ipcl_conn_insert_v6(connp, protocol, v6src, v6dst,
 		    connp->conn_ports,
-		    IS_TCP_CONN(connp) ? connp->conn_tcp->tcp_bound_if : 0);
+		    IPCL_IS_TCP(connp) ? connp->conn_tcp->tcp_bound_if : 0);
 	}
 	if (error == 0) {
 		connp->conn_fully_bound = B_TRUE;
@@ -3411,8 +3446,7 @@
 		ASSERT((dp->db_struioflag & STRUIO_IP) == 0);
 
 		/* Initiate IPPf processing, if needed. */
-		if (IPP_ENABLED(IPP_LOCAL_IN) &&
-			(flags & (IP6_NO_IPPOLICY|IP6_IN_NOCKSUM))) {
+		if (IPP_ENABLED(IPP_LOCAL_IN) && (flags & IP6_NO_IPPOLICY)) {
 			ill_index = ill->ill_phyint->phyint_ifindex;
 			ip_process(IPP_LOCAL_IN, &first_mp, ill_index);
 			if (first_mp == NULL) {
@@ -3447,14 +3481,14 @@
 			}
 
 			mp->b_datap->db_struioflag |= STRUIO_EAGER;
-			mp->b_datap->db_cksumstart = (intptr_t)sqp;
+			DB_CKSUMSTART(mp) = (intptr_t)sqp;
 
 			/*
 			 * db_cksumstuff is unused in the incoming
 			 * path; Thus store the ifindex here. It will
 			 * be cleared in tcp_conn_create_v6().
 			 */
-			mp->b_datap->db_cksumstuff =
+			DB_CKSUMSTUFF(mp) =
 			    (intptr_t)ill->ill_phyint->phyint_ifindex;
 			syn_present = B_TRUE;
 		}
@@ -3587,7 +3621,6 @@
     ill_t *ill, ill_t *inill, uint_t flags, boolean_t mctl_present,
     zoneid_t zoneid)
 {
-	queue_t		*rq;
 	uint32_t	dstport, srcport;
 	in6_addr_t	dst;
 	mblk_t		*first_mp;
@@ -3637,9 +3670,8 @@
 		/* Found a client */
 		CONN_INC_REF(connp);
 		mutex_exit(&connfp->connf_lock);
-		rq = connp->conn_rq;
-
-		if (!canputnext(rq)) {
+
+		if (CONN_UDP_FLOWCTLD(connp)) {
 			freemsg(first_mp);
 			BUMP_MIB(ill->ill_ip6_mib, udpInOverflows);
 			CONN_DEC_REF(connp);
@@ -3691,7 +3723,10 @@
 			}
 		}
 		BUMP_MIB(ill->ill_ip6_mib, ipv6InDelivers);
-		putnext(rq, mp);
+
+		/* Send it upstream */
+		CONN_UDP_RECV(connp, mp);
+
 		IP6_STAT(ip6_udp_fannorm);
 		CONN_DEC_REF(connp);
 		if (mctl_present)
@@ -3746,7 +3781,6 @@
 		mp1 = mctl_present ? first_mp1->b_cont : first_mp1;
 		CONN_INC_REF(connp);
 		mutex_exit(&connfp->connf_lock);
-		rq = connp->conn_rq;
 		/*
 		 * For link-local always add ifindex so that transport
 		 * can set sin6_scope_id. Avoid it for ICMP error
@@ -3762,7 +3796,7 @@
 			BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards);
 			goto next_one;
 		}
-		if (!canputnext(rq)) {
+		if (CONN_UDP_FLOWCTLD(connp)) {
 			BUMP_MIB(ill->ill_ip6_mib, udpInOverflows);
 			freemsg(mp1);
 			goto next_one;
@@ -3778,7 +3812,9 @@
 			if (mctl_present)
 				freeb(first_mp1);
 			BUMP_MIB(ill->ill_ip6_mib, ipv6InDelivers);
-			putnext(rq, mp1);
+
+			/* Send it upstream */
+			CONN_UDP_RECV(connp, mp1);
 		}
 next_one:
 		mutex_enter(&connfp->connf_lock);
@@ -3791,7 +3827,6 @@
 
 	/* Last one.  Send it upstream. */
 	mutex_exit(&connfp->connf_lock);
-	rq = connp->conn_rq;
 
 	/* Initiate IPPF processing */
 	if (IP6_IN_IPP(flags)) {
@@ -3830,7 +3865,7 @@
 			first_mp = mp;
 		}
 	}
-	if (!canputnext(rq)) {
+	if (CONN_UDP_FLOWCTLD(connp)) {
 		BUMP_MIB(ill->ill_ip6_mib, udpInOverflows);
 		freemsg(mp);
 	} else {
@@ -3844,7 +3879,9 @@
 			}
 		}
 		BUMP_MIB(ill->ill_ip6_mib, ipv6InDelivers);
-		putnext(rq, mp);
+
+		/* Send it upstream */
+		CONN_UDP_RECV(connp, mp);
 	}
 	IP6_STAT(ip6_udp_fanmb);
 	CONN_DEC_REF(connp);
@@ -6447,7 +6484,7 @@
 		 */
 		if ((mp->b_datap->db_type != M_PCPROTO) ||
 		    (dl->dl_primitive == DL_UNITDATA_IND)) {
-			ip_ioctl_freemsg(mp);
+			inet_freemsg(mp);
 			return;
 		}
 	}
@@ -6835,14 +6872,16 @@
 	mblk_t		*first_mp1;
 	boolean_t	no_forward;
 	ip6_hbh_t	*hbhhdr;
-	boolean_t	no_cksum = (flags & IP6_IN_NOCKSUM);
 	boolean_t	ll_multicast = (flags & IP6_IN_LLMCAST);
 	conn_t		*connp;
-	int		off;
 	ilm_t		*ilm;
 	uint32_t	ports;
 	uint_t		ipif_id = 0;
 	zoneid_t	zoneid = GLOBAL_ZONEID;
+	uint16_t	hck_flags, reass_hck_flags;
+	uint32_t	reass_sum;
+	boolean_t	cksum_err;
+	mblk_t		*mp1;
 
 	EXTRACT_PKT_MP(mp, first_mp, mctl_present);
 
@@ -6899,11 +6938,14 @@
 		pkt_len -= diff;
 	}
 
-	/*
-	 * XXX When zero-copy support is added, this turning off of
-	 * checksum flag  will need to be done more selectively.
-	 */
-	mp->b_datap->db_struioun.cksum.flags &= ~HCK_PARTIALCKSUM;
+	if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
+		hck_flags = DB_CKSUMFLAGS(mp);
+	else
+		hck_flags = 0;
+
+	/* Clear checksum flags in case we need to forward */
+	DB_CKSUMFLAGS(mp) = 0;
+	reass_sum = reass_hck_flags = 0;
 
 	nexthdr = ip6h->ip6_nxt;
 
@@ -7168,7 +7210,6 @@
 			/* TBD add site-local check at site boundary? */
 		} else if (ipv6_send_redirects) {
 			in6_addr_t	*v6targ;
-			mblk_t		*mp1;
 			in6_addr_t	gw_addr_v6;
 			ire_t		*src_ire_v6 = NULL;
 
@@ -7313,7 +7354,6 @@
 		case IPPROTO_TCP: {
 			uint16_t	*up;
 			uint32_t	sum;
-			dblk_t		*dp;
 			int		offset;
 
 			hdr_len = pkt_len - remlen;
@@ -7336,6 +7376,7 @@
 					freemsg(first_mp);
 					return;
 				}
+				hck_flags = 0;
 				ip6h = (ip6_t *)mp->b_rptr;
 				whereptr = (uint8_t *)ip6h + hdr_len;
 			}
@@ -7368,30 +7409,12 @@
 						freemsg(first_mp);
 						return;
 					}
+					hck_flags = 0;
 					ip6h = (ip6_t *)mp->b_rptr;
 					whereptr = (uint8_t *)ip6h + hdr_len;
 				}
 			}
 
-			/*
-			 * If packet is being looped back locally checksums
-			 * aren't used
-			 */
-			if (no_cksum) {
-				if (mp->b_datap->db_type == M_DATA) {
-					/*
-					 * M_DATA mblk, so init mblk (chain)
-					 * for no struio().
-					 */
-					mblk_t  *mp1 = mp;
-
-					do {
-						mp1->b_datap->db_struioflag = 0;
-					} while ((mp1 = mp1->b_cont) != NULL);
-				}
-				goto tcp_fanout;
-			}
-
 			up = (uint16_t *)&ip6h->ip6_src;
 			/*
 			 * TCP checksum calculation.  First sum up the
@@ -7400,44 +7423,38 @@
 			 *  -	Destination IPv6 address
 			 *  -	TCP payload length
 			 *  -	TCP protocol ID
-			 * XXX need zero-copy support here
 			 */
 			sum = htons(IPPROTO_TCP + remlen) +
 			    up[0] + up[1] + up[2] + up[3] +
 			    up[4] + up[5] + up[6] + up[7] +
 			    up[8] + up[9] + up[10] + up[11] +
 			    up[12] + up[13] + up[14] + up[15];
+
+			/* Fold initial sum */
 			sum = (sum & 0xffff) + (sum >> 16);
-			dp = mp->b_datap;
-			if (dp->db_type != M_DATA || dp->db_ref > 1) {
-				/*
-				 * Not M_DATA mblk or its a dup, so do the
-				 * checksum now.
-				 */
-				sum = IP_CSUM(mp, hdr_len, sum);
-				if (sum) {
-					/* checksum failed */
-					ip1dbg(("ip_rput_data_v6: TCP checksum"
-					    " failed %x off %d\n",
-					    sum, hdr_len));
-					BUMP_MIB(&ip_mib, tcpInErrs);
-					freemsg(first_mp);
-					return;
-				}
-			} else {
-				/*
-				 * M_DATA mblk and not a dup
-				 * compute checksum here
-				 */
-				off = (int)(whereptr - mp->b_rptr);
-
-				if (IP_CSUM(mp, off, sum)) {
-					BUMP_MIB(&ip_mib, tcpInErrs);
-					ipcsumdbg("ip_rput_data_v6 "
-					    "swcksumerr\n", mp);
-					freemsg(first_mp);
-					return;
-				}
+
+			mp1 = mp->b_cont;
+
+			if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
+				IP6_STAT(ip6_in_sw_cksum);
+
+			IP_CKSUM_RECV(hck_flags, sum, (uchar_t *)
+			    ((uchar_t *)mp->b_rptr + DB_CKSUMSTART(mp)),
+			    (int32_t)(whereptr - (uchar_t *)mp->b_rptr),
+			    mp, mp1, cksum_err);
+
+			if (cksum_err) {
+				BUMP_MIB(&ip_mib, tcpInErrs);
+
+				if (hck_flags & HCK_FULLCKSUM)
+					IP6_STAT(ip6_tcp_in_full_hw_cksum_err);
+				else if (hck_flags & HCK_PARTIALCKSUM)
+					IP6_STAT(ip6_tcp_in_part_hw_cksum_err);
+				else
+					IP6_STAT(ip6_tcp_in_sw_cksum_err);
+
+				freemsg(first_mp);
+				return;
 			}
 tcp_fanout:
 			ip_fanout_tcp_v6(q, first_mp, ip6h, ill, inill,
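
The pseudo-header seed computed above relies on folding a 32-bit accumulator
back into 16 bits before handing it to IP_CKSUM_RECV. A standalone sketch of
the fold, with a hypothetical partial sum (not taken from this changeset):

	#include <stdio.h>
	#include <stdint.h>

	/* Fold a 32-bit one's-complement accumulator into 16 bits. */
	static uint16_t
	fold16(uint32_t sum)
	{
		sum = (sum & 0xffff) + (sum >> 16);	/* fold carries */
		sum = (sum & 0xffff) + (sum >> 16);	/* fold any new carry */
		return ((uint16_t)sum);
	}

	int
	main(void)
	{
		/* hypothetical: protocol + length + address-word partial sum */
		uint32_t sum = 0x0006 + 0x05a0 + 0x2fffa;

		printf("seed: 0x%04x\n", fold16(sum));	/* prints 0x05a3 */
		return (0);
	}
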
@@ -7468,18 +7485,16 @@
 			}
 
 			sctph = (sctp_hdr_t *)(mp->b_rptr + hdr_len);
-			if (!no_cksum) {
-				/* checksum */
-				pktsum = sctph->sh_chksum;
-				sctph->sh_chksum = 0;
-				calcsum = sctp_cksum(mp, hdr_len);
-				if (calcsum != pktsum) {
-					BUMP_MIB(&sctp_mib, sctpChecksumError);
-					freemsg(mp);
-					return;
-				}
-				sctph->sh_chksum = pktsum;
-			}
+			/* checksum */
+			pktsum = sctph->sh_chksum;
+			sctph->sh_chksum = 0;
+			calcsum = sctp_cksum(mp, hdr_len);
+			if (calcsum != pktsum) {
+				BUMP_MIB(&sctp_mib, sctpChecksumError);
+				freemsg(mp);
+				return;
+			}
+			sctph->sh_chksum = pktsum;
 			ports = *(uint32_t *)(mp->b_rptr + hdr_len);
 			if ((connp = sctp_find_conn(&ip6h->ip6_src,
 			    &ip6h->ip6_dst, ports, ipif_id, zoneid)) == NULL) {
@@ -7501,8 +7516,6 @@
 
 			hdr_len = pkt_len - remlen;
 
-#define	UDPH_SIZE 8
-
 			if (hada_mp != NULL) {
 				ip0dbg(("udp hada drop\n"));
 				goto hada_drop;
@@ -7519,16 +7532,10 @@
 					freemsg(first_mp);
 					return;
 				}
+				hck_flags = 0;
 				ip6h = (ip6_t *)mp->b_rptr;
 				whereptr = (uint8_t *)ip6h + hdr_len;
 			}
-#undef UDPH_SIZE
-			/*
-			 * If packet is being looped back locally checksums
-			 * aren't used
-			 */
-			if (no_cksum)
-				goto udp_fanout;
 
 			/*
 			 *  Before going through the regular checksum
@@ -7568,15 +7575,37 @@
 			    up[8] + up[9] + up[10] + up[11] +
 			    up[12] + up[13] + up[14] + up[15];
 
+			/* Fold initial sum */
 			sum = (sum & 0xffff) + (sum >> 16);
-			/* Next sum in the UDP packet */
-			sum = IP_CSUM(mp, hdr_len, sum);
-			if (sum) {
-				/* UDP checksum failed */
-				ip1dbg(("ip_rput_data_v6: UDP checksum "
-				    "failed %x\n",
-				    sum));
+
+			if (reass_hck_flags != 0) {
+				hck_flags = reass_hck_flags;
+
+				IP_CKSUM_RECV_REASS(hck_flags,
+				    (int32_t)(whereptr - (uchar_t *)mp->b_rptr),
+				    sum, reass_sum, cksum_err);
+			} else {
+				mp1 = mp->b_cont;
+
+				IP_CKSUM_RECV(hck_flags, sum, (uchar_t *)
+				    ((uchar_t *)mp->b_rptr + DB_CKSUMSTART(mp)),
+				    (int32_t)(whereptr - (uchar_t *)mp->b_rptr),
+				    mp, mp1, cksum_err);
+			}
+
+			if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
+				IP6_STAT(ip6_in_sw_cksum);
+
+			if (cksum_err) {
 				BUMP_MIB(ill->ill_ip6_mib, udpInCksumErrs);
+
+				if (hck_flags & HCK_FULLCKSUM)
+					IP6_STAT(ip6_udp_in_full_hw_cksum_err);
+				else if (hck_flags & HCK_PARTIALCKSUM)
+					IP6_STAT(ip6_udp_in_part_hw_cksum_err);
+				else
+					IP6_STAT(ip6_udp_in_sw_cksum_err);
+
 				freemsg(first_mp);
 				return;
 			}
@@ -7592,13 +7621,6 @@
 				goto hada_drop;
 			}
 
-			/*
-			 * If packet is being looped back locally checksums
-			 * aren't used
-			 */
-			if (no_cksum)
-				goto icmp_fanout;
-
 			up = (uint16_t *)&ip6h->ip6_src;
 			sum = htons(IPPROTO_ICMPV6 + remlen) +
 			    up[0] + up[1] + up[2] + up[3] +
@@ -7607,7 +7629,7 @@
 			    up[12] + up[13] + up[14] + up[15];
 			sum = (sum & 0xffff) + (sum >> 16);
 			sum = IP_CSUM(mp, hdr_len, sum);
-			if (sum) {
+			if (sum != 0) {
 				/* IPv6 ICMP checksum failed */
 				ip1dbg(("ip_rput_data_v6: ICMPv6 checksum "
 				    "failed %x\n",
@@ -7795,6 +7817,7 @@
 					freemsg(mp);
 					return;
 				}
+				hck_flags = 0;
 				ip6h = (ip6_t *)mp->b_rptr;
 				whereptr = (uint8_t *)ip6h + pkt_len - remlen;
 			}
@@ -7820,8 +7843,12 @@
 				}
 			}
 
+			/* Restore the flags */
+			DB_CKSUMFLAGS(mp) = hck_flags;
+
 			mp = ip_rput_frag_v6(q, mp, ip6h, fraghdr,
-			    remlen - used, &prev_nexthdr_offset);
+			    remlen - used, &prev_nexthdr_offset,
+			    &reass_sum, &reass_hck_flags);
 			if (mp == NULL) {
 				/* Reassembly is still pending */
 				return;
@@ -8032,7 +8059,7 @@
 		return;
 	}
 
-	if (!canputnext(connp->conn_upq)) {
+	if (CONN_UDP_FLOWCTLD(connp)) {
 		freemsg(first_mp);
 		BUMP_MIB(ill->ill_ip6_mib, udpInOverflows);
 		CONN_DEC_REF(connp);
@@ -8062,7 +8089,9 @@
 	IP6_STAT(ip6_udp_fast_path);
 	BUMP_MIB(ill->ill_ip6_mib, ipv6InReceives);
 	BUMP_MIB(ill->ill_ip6_mib, ipv6InDelivers);
-	putnext(connp->conn_upq, mp);
+
+	/* Send it upstream */
+	CONN_UDP_RECV(connp, mp);
 
 	CONN_DEC_REF(connp);
 	freemsg(hada_mp);
@@ -8086,7 +8115,8 @@
  */
 static mblk_t *
 ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
-    ip6_frag_t *fraghdr, uint_t remlen, uint_t *prev_nexthdr_offset)
+    ip6_frag_t *fraghdr, uint_t remlen, uint_t *prev_nexthdr_offset,
+    uint32_t *cksum_val, uint16_t *cksum_flags)
 {
 	ill_t		*ill = (ill_t *)q->q_ptr;
 	uint32_t	ident = ntohl(fraghdr->ip6f_ident);
@@ -8107,6 +8137,62 @@
 	mblk_t		*tail_mp;
 	mblk_t		*t_mp;
 	boolean_t	pruned = B_FALSE;
+	uint32_t	sum_val;
+	uint16_t	sum_flags;
+
+
+	if (cksum_val != NULL)
+		*cksum_val = 0;
+	if (cksum_flags != NULL)
+		*cksum_flags = 0;
+
+	/*
+	 * We utilize hardware-computed checksum info only for UDP, since
+	 * IP fragmentation is a normal occurrence for the protocol.  In
+	 * addition, checksum offload support for IP fragments carrying
+	 * UDP payload is commonly implemented across network adapters.
+	 */
+	ASSERT(ill != NULL);
+	if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) &&
+	    (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
+		mblk_t *mp1 = mp->b_cont;
+		int32_t len;
+
+		/* Record checksum information from the packet */
+		sum_val = (uint32_t)DB_CKSUM16(mp);
+		sum_flags = DB_CKSUMFLAGS(mp);
+
+		/* fragmented payload offset from beginning of mblk */
+		offset = (uint16_t)((uchar_t *)&fraghdr[1] - mp->b_rptr);
+
+		if ((sum_flags & HCK_PARTIALCKSUM) &&
+		    (mp1 == NULL || mp1->b_cont == NULL) &&
+		    offset >= (uint16_t)DB_CKSUMSTART(mp) &&
+		    ((len = offset - (uint16_t)DB_CKSUMSTART(mp)) & 1) == 0) {
+			uint32_t adj;
+			/*
+			 * Partial checksum has been calculated by hardware
+			 * and attached to the packet; in addition, any
+			 * prepended extraneous data is even-byte aligned.
+			 * If any such data exists, we adjust the checksum;
+			 * this would also handle any postpended data.
+			 */
+			IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
+			    mp, mp1, len, adj);
+
+			/* One's complement subtract extraneous checksum */
+			if (adj >= sum_val)
+				sum_val = ~(adj - sum_val) & 0xFFFF;
+			else
+				sum_val -= adj;
+		}
+	} else {
+		sum_val = 0;
+		sum_flags = 0;
+	}
+
+	/* Clear hardware checksumming flag */
+	DB_CKSUMFLAGS(mp) = 0;
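
The one's-complement subtraction above (removing the contribution of
extraneous prepended or postpended bytes from the hardware sum) can be
checked in isolation; a user-level sketch with hypothetical values:

	#include <stdio.h>
	#include <stdint.h>

	/*
	 * Subtract "adj" from a 16-bit one's-complement accumulator,
	 * borrowing through the complement when adj exceeds the stored
	 * value, as in the adjustment above.
	 */
	static uint32_t
	cksum_sub(uint32_t sum_val, uint32_t adj)
	{
		if (adj >= sum_val)
			return (~(adj - sum_val) & 0xFFFF);
		return (sum_val - adj);
	}

	int
	main(void)
	{
		printf("0x%04x\n", cksum_sub(0x1234, 0x0034));	/* 0x1200 */
		printf("0x%04x\n", cksum_sub(0x0010, 0x0030));	/* 0xffdf */
		return (0);
	}
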
 
 	/*
 	 * Note: Fragment offset in header is in 8-octet units.
@@ -8159,7 +8245,6 @@
 	 * Drop the fragmented as early as possible, if
 	 * we don't have resource(s) to re-assemble.
 	 */
-
 	if (ip_reass_queue_bytes == 0) {
 		freemsg(mp);
 		return (NULL);
@@ -8183,12 +8268,11 @@
 	 * there is anything on the reassembly queue, the timer will
 	 * be running.
 	 */
-	msg_len = mp->b_datap->db_lim - mp->b_datap->db_base;
+	msg_len = MBLKSIZE(mp);
 	tail_mp = mp;
 	while (tail_mp->b_cont != NULL) {
 		tail_mp = tail_mp->b_cont;
-		msg_len += tail_mp->b_datap->db_lim -
-		    tail_mp->b_datap->db_base;
+		msg_len += MBLKSIZE(tail_mp);
 	}
 	/*
 	 * If the reassembly list for this ILL will get too big
@@ -8287,7 +8371,7 @@
 		ipf->ipf_timestamp = gethrestime_sec();
 		/* Record ipf generation and account for frag header */
 		ipf->ipf_gen = ill->ill_ipf_gen++;
-		ipf->ipf_count = mp1->b_datap->db_lim - mp1->b_datap->db_base;
+		ipf->ipf_count = MBLKSIZE(mp1);
 		ipf->ipf_protocol = nexthdr;
 		ipf->ipf_nf_hdr_len = 0;
 		ipf->ipf_prev_nexthdr_offset = 0;
@@ -8295,6 +8379,16 @@
 		ipf->ipf_ecn = ecn_info;
 		ipf->ipf_num_dups = 0;
 		ipfb->ipfb_frag_pkts++;
+		ipf->ipf_checksum = 0;
+		ipf->ipf_checksum_flags = 0;
+
+		/* Store checksum value in fragment header */
+		if (sum_flags != 0) {
+			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+			ipf->ipf_checksum = sum_val;
+			ipf->ipf_checksum_flags = sum_flags;
+		}
 
 		/*
 		 * We handle reassembly two ways.  In the easy case,
@@ -8326,6 +8420,10 @@
 			 * on easy reassembly.
 			 */
 			ipf->ipf_end = 0;
+
+			/* Forget checksum offload from now on */
+			ipf->ipf_checksum_flags = 0;
+
 			/*
 			 * ipf_hole_cnt is set by ip_reassemble.
 			 * ipf_count is updated by ip_reassemble.
@@ -8349,6 +8447,23 @@
 	}
 
 	/*
+	 * If this fragment's checksum flags differ from the stored ones
+	 * (it could be coming up from an interface different from the
+	 * previous one, and therefore with different checksum
+	 * capabilities), then forget any stored checksum state.
+	 * Otherwise add the value to the one stored in the fragment header.
+	 */
+	if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
+		sum_val += ipf->ipf_checksum;
+		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+		ipf->ipf_checksum = sum_val;
+	} else if (ipf->ipf_checksum_flags != 0) {
+		/* Forget checksum offload from now on */
+		ipf->ipf_checksum_flags = 0;
+	}
+
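
Combining per-fragment hardware sums as above is plain one's-complement
accumulation; a sketch with hypothetical fragment sums (not from this
changeset):

	#include <stdio.h>
	#include <stdint.h>

	/* Fold a 32-bit accumulator back into 16 bits (two passes). */
	static uint32_t
	fold(uint32_t sum)
	{
		sum = (sum & 0xFFFF) + (sum >> 16);
		return ((sum & 0xFFFF) + (sum >> 16));
	}

	int
	main(void)
	{
		/* hypothetical per-fragment sums reported by the NIC */
		uint32_t frags[] = { 0xbeef, 0xcafe, 0x1234 };
		uint32_t total = 0;
		int i;

		for (i = 0; i < 3; i++)
			total = fold(total + frags[i]);
		printf("combined: 0x%04x\n", total);	/* prints 0x9c22 */
		return (0);
	}
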
+	/*
 	 * We have a new piece of a datagram which is already being
 	 * reassembled.  Update the ECN info if all IP fragments
 	 * are ECN capable.  If there is one which is not, clear
@@ -8443,6 +8558,13 @@
 	nexthdr = ipf->ipf_protocol;
 	*prev_nexthdr_offset = ipf->ipf_prev_nexthdr_offset;
 	ipfp = ipf->ipf_ptphn;
+
+	/* We need to supply these to caller */
+	if ((sum_flags = ipf->ipf_checksum_flags) != 0)
+		sum_val = ipf->ipf_checksum;
+	else
+		sum_val = 0;
+
 	mp1 = ipf->ipf_mp;
 	count = ipf->ipf_count;
 	ipf = ipf->ipf_hash_next;
@@ -8508,6 +8630,12 @@
 	ip6h->ip6_vcf &= htonl(0xFFCFFFFF);
 	ip6h->ip6_vcf |= htonl(ecn_info << 20);
 
+	/* Reassembly is successful; return checksum information if needed */
+	if (cksum_val != NULL)
+		*cksum_val = sum_val;
+	if (cksum_flags != NULL)
+		*cksum_flags = sum_flags;
+
 	return (mp);
 }
 
@@ -9954,7 +10082,7 @@
 	if (q->q_next == NULL) {
 		connp = Q_TO_CONN(q);
 
-		if (IS_TCP_CONN(connp)) {
+		if (IPCL_IS_TCP(connp)) {
 			/* change conn_send for the tcp_v4_connections */
 			connp->conn_send = ip_output;
 		} else if (connp->conn_ulp == IPPROTO_SCTP) {
@@ -10426,12 +10554,52 @@
 		uint32_t	sum;
 		uint_t		ill_index =  ((ill_t *)ire->ire_stq->q_ptr)->
 		    ill_phyint->phyint_ifindex;
+		queue_t		*dev_q = ire->ire_stq->q_next;
 
 		/*
 		 * non-NULL send-to queue - packet is to be sent
 		 * out an interface.
 		 */
 
+		/* Driver is flow-controlling? */
+		if (!IP_FLOW_CONTROLLED_ULP(nexthdr) &&
+		    ((dev_q->q_next || dev_q->q_first) && !canput(dev_q))) {
+			/*
+			 * Queue the packet if we have a conn to give
+			 * back pressure.  We can't queue packets that
+			 * are intended for hardware acceleration, since
+			 * we've tossed that state already.  If the packet
+			 * is being fed back from ire_send_v6, we don't
+			 * know where in the queue to enqueue it, so we
+			 * discard the packet.
+			 */
+			ASSERT(mp == first_mp);
+			if (ip_output_queue && connp != NULL &&
+			    !mctl_present && caller != IRE_SEND) {
+				if (caller == IP_WSRV) {
+					connp->conn_did_putbq = 1;
+					(void) putbq(connp->conn_wq, mp);
+					conn_drain_insert(connp);
+					/*
+					 * caller == IP_WSRV implies we are
+					 * the service thread, and the
+					 * queue is already noenabled.
+					 * The check for canput and
+					 * the putbq is not atomic.
+					 * So we need to check again.
+					 */
+					if (canput(dev_q))
+						connp->conn_did_putbq = 0;
+				} else {
+					(void) putq(connp->conn_wq, mp);
+				}
+				return;
+			}
+			BUMP_MIB(mibptr, ipv6OutDiscards);
+			freemsg(mp);
+			return;
+		}
+
 		/*
 		 * Look for reachability confirmations from the transport.
 		 */
@@ -10490,20 +10658,20 @@
 			    up[12] + up[13] + up[14] + up[15];
 			sum = (sum & 0xffff) + (sum >> 16);
 			*insp = IP_CSUM(mp, hdr_length, sum);
+			if (*insp == 0)
+				*insp = 0xFFFF;
 		} else if (nexthdr == IPPROTO_TCP) {
 			uint16_t	*up;
 
 			/*
 			 * Check for full IPv6 header + enough TCP header
 			 * to get at the checksum field.
-			 * XXX need hardware checksum support.
-			 */
-#define	TCP_CSUM_OFFSET	16
-#define	TCP_CSUM_SIZE	2
+			 */
 			if ((mp->b_wptr - mp->b_rptr) <
-			    (hdr_length + TCP_CSUM_OFFSET + TCP_CSUM_SIZE)) {
+			    (hdr_length + TCP_CHECKSUM_OFFSET +
+			    TCP_CHECKSUM_SIZE)) {
 				if (!pullupmsg(mp, hdr_length +
-				    TCP_CSUM_OFFSET + TCP_CSUM_SIZE)) {
+				    TCP_CHECKSUM_OFFSET + TCP_CHECKSUM_SIZE)) {
 					ip1dbg(("ip_wput_v6: TCP hdr pullupmsg"
 					    " failed\n"));
 					BUMP_MIB(mibptr, ipv6OutDiscards);
@@ -10519,30 +10687,28 @@
 			 * into the tcp checksum field, so we don't
 			 * need to explicitly sum it in here.
 			 */
-			if (hdr_length == IPV6_HDR_LEN) {
-				/* src, dst, tcp consequtive */
-				up = (uint16_t *)(((uchar_t *)ip6h) +
-				    IPV6_HDR_LEN + TCP_CSUM_OFFSET);
-				*up = IP_CSUM(mp,
-				    IPV6_HDR_LEN - 2 * sizeof (in6_addr_t),
-				    htons(IPPROTO_TCP));
-			} else {
-				sum = htons(IPPROTO_TCP) +
-				    up[0] + up[1] + up[2] + up[3] +
-				    up[4] + up[5] + up[6] + up[7] +
-				    up[8] + up[9] + up[10] + up[11] +
-				    up[12] + up[13] + up[14] + up[15];
-				/*
-				 * Fold the initial sum.
-				 */
-				sum = (sum & 0xffff) + (sum >> 16);
-				up = (uint16_t *)(((uchar_t *)ip6h) +
-				    hdr_length + TCP_CSUM_OFFSET);
-				*up = IP_CSUM(mp, hdr_length, sum);
-			}
-#undef TCP_CSUM_OFFSET
-#undef TCP_CSUM_SIZE
-
+			sum = up[0] + up[1] + up[2] + up[3] +
+			    up[4] + up[5] + up[6] + up[7] +
+			    up[8] + up[9] + up[10] + up[11] +
+			    up[12] + up[13] + up[14] + up[15];
+
+			/* Fold the initial sum */
+			sum = (sum & 0xffff) + (sum >> 16);
+
+			up = (uint16_t *)(((uchar_t *)ip6h) +
+			    hdr_length + TCP_CHECKSUM_OFFSET);
+
+			IP_CKSUM_XMIT(ill, ire, mp, ip6h, up, IPPROTO_TCP,
+			    hdr_length, ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN,
+			    ire->ire_max_frag, mctl_present, sum);
+
+			/* Software checksum? */
+			if (DB_CKSUMFLAGS(mp) == 0) {
+				IP6_STAT(ip6_out_sw_cksum);
+				IP6_STAT_UPDATE(ip6_tcp_out_sw_cksum_bytes,
+				    (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN) -
+				    hdr_length);
+			}
 		} else if (nexthdr == IPPROTO_UDP) {
 			uint16_t	*up;
 
@@ -10550,12 +10716,10 @@
 			 * check for full IPv6 header + enough UDP header
 			 * to get at the UDP checksum field
 			 */
-#define	UDP_CSUM_OFFSET	6
-#define	UDP_CSUM_SIZE	2
 			if ((mp->b_wptr - mp->b_rptr) < (hdr_length +
-			    UDP_CSUM_OFFSET + UDP_CSUM_SIZE)) {
+			    UDP_CHECKSUM_OFFSET + UDP_CHECKSUM_SIZE)) {
 				if (!pullupmsg(mp, hdr_length +
-				    UDP_CSUM_OFFSET + UDP_CSUM_SIZE)) {
+				    UDP_CHECKSUM_OFFSET + UDP_CHECKSUM_SIZE)) {
 					ip1dbg(("ip_wput_v6: UDP hdr pullupmsg"
 					    " failed\n"));
 					BUMP_MIB(mibptr, ipv6OutDiscards);
@@ -10570,34 +10734,28 @@
 			 * into the udp checksum field, so we don't
 			 * need to explicitly sum it in here.
 			 */
-			if (hdr_length == IPV6_HDR_LEN) {
-				/* src, dst, udp consequtive */
-				up = (uint16_t *)(((uchar_t *)ip6h) +
-				    IPV6_HDR_LEN + UDP_CSUM_OFFSET);
-				*up = IP_CSUM(mp,
-				    IPV6_HDR_LEN - 2 * sizeof (in6_addr_t),
-				    htons(IPPROTO_UDP));
-			} else {
-				sum = htons(IPPROTO_UDP) +
-				    up[0] + up[1] + up[2] + up[3] +
-				    up[4] + up[5] + up[6] + up[7] +
-				    up[8] + up[9] + up[10] + up[11] +
-				    up[12] + up[13] + up[14] + up[15];
-				sum = (sum & 0xffff) + (sum >> 16);
-				up = (uint16_t *)(((uchar_t *)ip6h) +
-				    hdr_length + UDP_CSUM_OFFSET);
-				*up = IP_CSUM(mp, hdr_length, sum);
-			}
-
-			/*
-			 * According to RFC 2460, UDP in IPv6 shouldn't
-			 * appear with all zero checksum on the wire and
-			 * should be changed to 0xffff.
-			 */
-			if (*up == 0)
-				*up = 0xffff;
-#undef UDP_CSUM_OFFSET
-#undef UDP_CSUM_SIZE
+			sum = up[0] + up[1] + up[2] + up[3] +
+			    up[4] + up[5] + up[6] + up[7] +
+			    up[8] + up[9] + up[10] + up[11] +
+			    up[12] + up[13] + up[14] + up[15];
+
+			/* Fold the initial sum */
+			sum = (sum & 0xffff) + (sum >> 16);
+
+			up = (uint16_t *)(((uchar_t *)ip6h) +
+			    hdr_length + UDP_CHECKSUM_OFFSET);
+
+			IP_CKSUM_XMIT(ill, ire, mp, ip6h, up, IPPROTO_UDP,
+			    hdr_length, ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN,
+			    ire->ire_max_frag, mctl_present, sum);
+
+			/* Software checksum? */
+			if (DB_CKSUMFLAGS(mp) == 0) {
+				IP6_STAT(ip6_out_sw_cksum);
+				IP6_STAT_UPDATE(ip6_udp_out_sw_cksum_bytes,
+				    (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN) -
+				    hdr_length);
+			}
 		} else if (nexthdr == IPPROTO_ICMPV6) {
 			uint16_t	*up;
 			icmp6_t *icmp6;
@@ -10627,6 +10785,9 @@
 			    up[12] + up[13] + up[14] + up[15];
 			sum = (sum & 0xffff) + (sum >> 16);
 			icmp6->icmp6_cksum = IP_CSUM(mp, hdr_length, sum);
+			if (icmp6->icmp6_cksum == 0)
+				icmp6->icmp6_cksum = 0xFFFF;
+
 			/* Update output mib stats */
 			icmp_update_out_mib_v6(ill, icmp6);
 		} else if (nexthdr == IPPROTO_SCTP) {
@@ -10764,6 +10925,223 @@
 }
 
 /*
+ * Outbound IPv6 fragmentation routine using MDT.
+ */
+static void
+ip_wput_frag_mdt_v6(mblk_t *mp, ire_t *ire, size_t max_chunk,
+    size_t unfragmentable_len, uint8_t nexthdr, uint_t prev_nexthdr_offset)
+{
+	ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
+	uint_t		pkts, wroff, hdr_chunk_len, pbuf_idx;
+	mblk_t		*hdr_mp, *md_mp = NULL;
+	int		i1;
+	multidata_t	*mmd;
+	unsigned char	*hdr_ptr, *pld_ptr;
+	ip_pdescinfo_t	pdi;
+	uint32_t	ident;
+	size_t		len;
+	uint16_t	offset;
+	queue_t		*stq = ire->ire_stq;
+	ill_t		*ill = (ill_t *)stq->q_ptr;
+
+	ASSERT(DB_TYPE(mp) == M_DATA);
+	ASSERT(MBLKL(mp) > unfragmentable_len);
+
+	/*
+	 * Move the read pointer past the unfragmentable portion; we don't
+	 * want this part of the data in our fragments.
+	 */
+	mp->b_rptr += unfragmentable_len;
+
+	/* Calculate how many packets we will send out */
+	i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp);
+	pkts = (i1 + max_chunk - 1) / max_chunk;
+	ASSERT(pkts > 1);
+
+	/* Allocate a message block which will hold all the IP Headers. */
+	wroff = ip_wroff_extra;
+	hdr_chunk_len = wroff + unfragmentable_len + sizeof (ip6_frag_t);
+
+	i1 = pkts * hdr_chunk_len;
+	/*
+	 * Create the header buffer, the Multidata, and the destination
+	 * address and SAP attribute that should be associated with it.
+	 */
+	if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL ||
+	    ((hdr_mp->b_wptr += i1),
+	    (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) ||
+	    !ip_md_addr_attr(mmd, NULL, ire->ire_nce->nce_res_mp)) {
+		freemsg(mp);
+		if (md_mp == NULL) {
+			freemsg(hdr_mp);
+		} else {
+free_mmd:		IP6_STAT(ip6_frag_mdt_discarded);
+			freemsg(md_mp);
+		}
+		IP6_STAT(ip6_frag_mdt_allocfail);
+		BUMP_MIB(ill->ill_ip6_mib, ipv6OutFragFails);
+		UPDATE_MIB(ill->ill_ip6_mib, ipv6OutDiscards, pkts);
+		return;
+	}
+	IP6_STAT(ip6_frag_mdt_allocd);
+
+	/*
+	 * Add a payload buffer to the Multidata; this operation must not
+	 * fail, or otherwise our logic in this routine is broken.  There
+	 * is no memory allocation done by the routine, so any returned
+	 * failure simply tells us that we've done something wrong.
+	 *
+	 * A failure tells us that either we're adding the same payload
+	 * buffer more than once, or we're trying to add more buffers than
+	 * allowed.  Neither case should happen, and we panic because
+	 * there's either horrible heap corruption or a programming
+	 * mistake.
+	 */
+	if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) {
+		goto pbuf_panic;
+	}
+
+	hdr_ptr = hdr_mp->b_rptr;
+	pld_ptr = mp->b_rptr;
+
+	pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF;
+
+	ident = htonl(atomic_add_32_nv(&ire->ire_ident, 1));
+
+	/*
+	 * len is the total length of the fragmentable data in this
+	 * datagram.  For each fragment sent, we will decrement len
+	 * by the amount of fragmentable data sent in that fragment
+	 * until len reaches zero.
+	 */
+	len = ntohs(ip6h->ip6_plen) - (unfragmentable_len - IPV6_HDR_LEN);
+
+	offset = 0;
+	prev_nexthdr_offset += wroff;
+
+	while (len != 0) {
+		size_t		mlen;
+		ip6_t		*fip6h;
+		ip6_frag_t	*fraghdr;
+		int		error;
+
+		ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr);
+		mlen = MIN(len, max_chunk);
+		len -= mlen;
+
+		fip6h = (ip6_t *)(hdr_ptr + wroff);
+		ASSERT(OK_32PTR(fip6h));
+		bcopy(ip6h, fip6h, unfragmentable_len);
+		hdr_ptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT;
+
+		fip6h->ip6_plen = htons((uint16_t)(mlen +
+		    unfragmentable_len - IPV6_HDR_LEN + sizeof (ip6_frag_t)));
+
+		fraghdr = (ip6_frag_t *)((unsigned char *)fip6h +
+		    unfragmentable_len);
+		fraghdr->ip6f_nxt = nexthdr;
+		fraghdr->ip6f_reserved = 0;
+		fraghdr->ip6f_offlg = htons(offset) |
+		    ((len != 0) ? IP6F_MORE_FRAG : 0);
+		fraghdr->ip6f_ident = ident;
+
+		/*
+		 * Record offset and size of header and data of the next packet
+		 * in the multidata message.
+		 */
+		PDESC_HDR_ADD(&pdi, hdr_ptr, wroff,
+		    unfragmentable_len + sizeof (ip6_frag_t), 0);
+		PDESC_PLD_INIT(&pdi);
+		i1 = MIN(mp->b_wptr - pld_ptr, mlen);
+		ASSERT(i1 > 0);
+		PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1);
+		if (i1 == mlen) {
+			pld_ptr += mlen;
+		} else {
+			i1 = mlen - i1;
+			mp = mp->b_cont;
+			ASSERT(mp != NULL);
+			ASSERT(MBLKL(mp) >= i1);
+			/*
+			 * Attach the next payload message block to the
+			 * multidata message.
+			 */
+			if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+				goto pbuf_panic;
+			PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1);
+			pld_ptr = mp->b_rptr + i1;
+		}
+
+		if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error,
+		    KM_NOSLEEP)) == NULL) {
+			/*
+			 * Any failure other than ENOMEM indicates that we
+			 * have passed in invalid pdesc info or parameters
+			 * to mmd_addpdesc, which must not happen.
+			 *
+			 * EINVAL is a result of failure on boundary checks
+			 * against the pdesc info contents.  It should not
+			 * happen, and we panic because there's either
+			 * horrible heap corruption or a programming
+			 * mistake.
+			 */
+			if (error != ENOMEM) {
+				cmn_err(CE_PANIC, "ip_wput_frag_mdt_v6: "
+				    "pdesc logic error detected for "
+				    "mmd %p pinfo %p (%d)\n",
+				    (void *)mmd, (void *)&pdi, error);
+				/* NOTREACHED */
+			}
+			IP6_STAT(ip6_frag_mdt_addpdescfail);
+			/* Free unattached payload message blocks as well */
+			md_mp->b_cont = mp->b_cont;
+			goto free_mmd;
+		}
+
+		/* Advance fragment offset. */
+		offset += mlen;
+
+		/* Advance to location for next header in the buffer. */
+		hdr_ptr += hdr_chunk_len;
+
+		/* Did we reach the next payload message block? */
+		if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) {
+			mp = mp->b_cont;
+			/*
+			 * Attach the next message block with payload
+			 * data to the multidata message.
+			 */
+			if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+				goto pbuf_panic;
+			pld_ptr = mp->b_rptr;
+		}
+	}
+
+	ASSERT(hdr_mp->b_wptr == hdr_ptr);
+	ASSERT(mp->b_wptr == pld_ptr);
+
+	/* Update IP statistics */
+	UPDATE_MIB(ill->ill_ip6_mib, ipv6OutFragCreates, pkts);
+	BUMP_MIB(ill->ill_ip6_mib, ipv6OutFragOKs);
+	IP6_STAT_UPDATE(ip6_frag_mdt_pkt_out, pkts);
+
+	ire->ire_ob_pkt_count += pkts;
+	if (ire->ire_ipif != NULL)
+		atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts);
+
+	ire->ire_last_used_time = lbolt;
+	/* Send it down */
+	putnext(stq, md_mp);
+	return;
+
+pbuf_panic:
+	cmn_err(CE_PANIC, "ip_wput_frag_mdt_v6: payload buffer logic "
+	    "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp,
+	    pbuf_idx);
+	/* NOTREACHED */
+}
+
+/*
  * IPv6 fragmentation.  Essentially the same as IPv4 fragmentation.
  * We have not optimized this in terms of number of mblks
  * allocated. For instance, for each fragment sent we always allocate a
@@ -10779,7 +11157,7 @@
  */
 void
 ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp,
-    boolean_t caller, int max_frag)
+    int caller, int max_frag)
 {
 	ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
 	ip6_t		*fip6h;
@@ -10849,6 +11227,19 @@
 	}
 	unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h);
 
+	max_chunk = (min(max_frag, ire->ire_max_frag) - unfragmentable_len -
+	    sizeof (ip6_frag_t)) & ~7;
+
+	/* Check if we can use MDT to send out the frags. */
+	ASSERT(!IRE_IS_LOCAL(ire));
+	if (ip_multidata_outbound && reachable == 0 &&
+	    !(ire->ire_flags & RTF_MULTIRT) && ILL_MDT_CAPABLE(ill) &&
+	    IP_CAN_FRAG_MDT(mp, unfragmentable_len, max_chunk)) {
+		ip_wput_frag_mdt_v6(mp, ire, max_chunk, unfragmentable_len,
+		    nexthdr, prev_nexthdr_offset);
+		return;
+	}
+
 	/*
 	 * Allocate an mblk with enough room for the link-layer
 	 * header, the unfragmentable part of the datagram, and the
@@ -10875,7 +11266,7 @@
 
 	fraghdr->ip6f_nxt = nexthdr;
 	fraghdr->ip6f_reserved = 0;
-	fraghdr->ip6f_offlg = htons(0);
+	fraghdr->ip6f_offlg = 0;
 	fraghdr->ip6f_ident = htonl(ident);
 
 	/*
@@ -10886,9 +11277,6 @@
 	 */
 	len = ntohs(ip6h->ip6_plen) - (unfragmentable_len - IPV6_HDR_LEN);
 
-	max_chunk = (min(max_frag, ire->ire_max_frag) - unfragmentable_len -
-	    sizeof (ip6_frag_t)) & ~7;
-
 	/*
 	 * Move read ptr past unfragmentable portion, we don't want this part
 	 * of the data in our fragments.
@@ -11117,7 +11505,9 @@
 		}
 	}
 
-	if (IP_FLOW_CONTROLLED_ULP(ip6h->ip6_nxt) || canput(stq->q_next)) {
+	/* Flow-control check has been done in ip_wput_ire_v6 */
+	if (IP_FLOW_CONTROLLED_ULP(ip6h->ip6_nxt) || caller == IP_WPUT ||
+	    caller == IP_WSRV || canput(stq->q_next)) {
 		uint32_t ill_index;
 
 		/*
@@ -11164,7 +11554,7 @@
 				ill = ire_to_ill(ire);
 			}
 			IRB_REFRELE(irb);
-		} else if (connp != NULL && IS_TCP_CONN(connp) &&
+		} else if (connp != NULL && IPCL_IS_TCP(connp) &&
 		    connp->conn_mdt_ok && !connp->conn_tcp->tcp_mdt &&
 		    ILL_MDT_USABLE(ill)) {
 			/*
@@ -11583,7 +11973,7 @@
 				(void) putbq(connp->conn_wq, mp);
 				conn_drain_insert(connp);
 				/*
-				 * called_from_wsrv implies we are
+				 * caller == IP_WSRV implies we are
 				 * the service thread, and the
 				 * queue is already noenabled.
 				 * The check for canput and
--- a/usr/src/uts/common/inet/ip/ip_if.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip/ip_if.c	Sat Oct 22 22:50:14 2005 -0700
@@ -80,6 +80,7 @@
 #include <inet/ip_rts.h>
 #include <inet/ip_ndp.h>
 #include <inet/ip_if.h>
+#include <inet/ip_impl.h>
 #include <inet/tun.h>
 #include <inet/sctp_ip.h>
 
@@ -1232,10 +1233,10 @@
 	} else {
 		/*
 		 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
-		 * be just ip_ioctl_freemsg. we have to restart it
+		 * be just inet_freemsg. we have to restart it
 		 * otherwise the thread will be stuck.
 		 */
-		ip_ioctl_freemsg(mp);
+		inet_freemsg(mp);
 	}
 	return (B_TRUE);
 }
@@ -1344,10 +1345,10 @@
 		} else {
 			/*
 			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
-			 * this can't be just ip_ioctl_freemsg. we have to
+			 * this can't be just inet_freemsg.  We have to
 			 * restart it otherwise the thread will be stuck.
 			 */
-			ip_ioctl_freemsg(curr);
+			inet_freemsg(curr);
 		}
 	}
 }
@@ -1384,7 +1385,7 @@
 	if (curr != NULL) {
 		mutex_exit(&connp->conn_lock);
 		CONN_DEC_REF(connp);
-		ip_ioctl_freemsg(curr);
+		inet_freemsg(curr);
 		return;
 	}
 	/*
@@ -2042,7 +2043,7 @@
 	dl_capability_sub_t *dl_subcap;
 	int size;
 
-	if (!(ill->ill_capabilities & ILL_CAPAB_MDT))
+	if (!ILL_MDT_CAPABLE(ill))
 		return;
 
 	ASSERT(ill->ill_mdt_capab != NULL);
@@ -2857,6 +2858,9 @@
 	bcopy((void *)&poll, (void *)opoll, sizeof (dl_capab_poll_t));
 	ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
 
+	ip1dbg(("ill_capability_poll_capable: asking interface %s "
+	    "to enable polling\n", ill->ill_name));
+
 	/* nmp points to a DL_CAPABILITY_REQ message to enable polling */
 	ill_dlpi_send(ill, nmp);
 }
@@ -2944,6 +2948,8 @@
 			ASSERT(ill->ill_poll_capab != NULL);
 			ill->ill_capabilities |= ILL_CAPAB_POLL;
 		}
+		ip1dbg(("ill_capability_poll_ack: interface %s "
+		    "has enabled polling\n", ill->ill_name));
 		break;
 	}
 }
@@ -3048,8 +3054,9 @@
 		return;
 	}
 
-#define	CURR_HCKSUM_CAPAB \
-	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM)
+#define	CURR_HCKSUM_CAPAB				\
+	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
+	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)
 
 	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
 	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
@@ -3126,10 +3133,11 @@
 		 * hardware checksum acceleration.
 		 */
 		ill_dlpi_send(ill, nmp);
-	} else
+	} else {
 		ip1dbg(("ill_capability_hcksum_ack: interface %s has "
 		    "advertised %x hardware checksum capability flags\n",
 		    ill->ill_name, ihck->hcksum_txflags));
+	}
 }
 
 static void
@@ -3140,7 +3148,7 @@
 	dl_capability_sub_t *dl_subcap;
 	int size;
 
-	if (!(ill->ill_capabilities & ILL_CAPAB_HCKSUM))
+	if (!ILL_HCKSUM_CAPABLE(ill))
 		return;
 
 	ASSERT(ill->ill_hcksum_capab != NULL);
@@ -7300,7 +7308,7 @@
 				ASSERT(mp_next == NULL);
 				ipsq->ipsq_mptail = prev;
 			}
-			ip_ioctl_freemsg(mp);
+			inet_freemsg(mp);
 		} else {
 			prev = mp;
 		}
@@ -8838,7 +8846,7 @@
 		if (mp1 != NULL)
 			freeb(mp1);
 		if (pending_mp != NULL)
-			ip_ioctl_freemsg(pending_mp);
+			inet_freemsg(pending_mp);
 		return (ENOMEM);
 	}
 
@@ -8848,7 +8856,7 @@
 	    (caddr_t)&ipaddr);
 	if (mp2 == NULL) {
 		freeb(mp1);
-		ip_ioctl_freemsg(pending_mp);
+		inet_freemsg(pending_mp);
 		return (ENOMEM);
 	}
 	/* Put together the chain. */
@@ -9743,7 +9751,7 @@
 	pending_mp = ill_pending_mp_get(ill, &connp, ioc_id);
 	if (pending_mp == NULL) {
 		ASSERT(connp == NULL);
-		ip_ioctl_freemsg(mp);
+		inet_freemsg(mp);
 		return;
 	}
 	ASSERT(connp != NULL);
@@ -9760,7 +9768,7 @@
 	 */
 	orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next;
 	orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev;
-	ip_ioctl_freemsg(pending_mp);
+	inet_freemsg(pending_mp);
 
 	/*
 	 * We're done if there was an error or if this is not an SIOCG{X}ARP
@@ -18114,6 +18122,8 @@
 	icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
 	bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
 	icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0);
+	if (icmph->icmph_checksum == 0)
+		icmph->icmph_checksum = 0xffff;
 
 	put(ipif->ipif_wq, mp);
 
--- a/usr/src/uts/common/inet/ip/ip_multi.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip/ip_multi.c	Sat Oct 22 22:50:14 2005 -0700
@@ -65,6 +65,7 @@
 #include <inet/ipsec_impl.h>
 #include <inet/sctp_ip.h>
 #include <inet/ip_listutils.h>
+#include <inet/udp_impl.h>
 
 #include <netinet/igmp.h>
 
@@ -1186,14 +1187,39 @@
 ip_multicast_loopback(queue_t *q, ill_t *ill, mblk_t *mp_orig, int fanout_flags,
     zoneid_t zoneid)
 {
-	mblk_t		*mp;
-	mblk_t		*ipsec_mp;
-
-	/* TODO this could use dup'ed messages except for the IP header. */
-	mp = ip_copymsg(mp_orig);
+	mblk_t	*mp;
+	mblk_t	*ipsec_mp;
+
+	if (DB_TYPE(mp_orig) == M_DATA &&
+	    ((ipha_t *)mp_orig->b_rptr)->ipha_protocol == IPPROTO_UDP) {
+		uint_t hdrsz;
+
+		hdrsz = IPH_HDR_LENGTH((ipha_t *)mp_orig->b_rptr) +
+		    sizeof (udpha_t);
+		ASSERT(MBLKL(mp_orig) >= hdrsz);
+
+		if (((mp = allocb(hdrsz, BPRI_MED)) != NULL) &&
+		    (mp_orig = dupmsg(mp_orig)) != NULL) {
+			bcopy(mp_orig->b_rptr, mp->b_rptr, hdrsz);
+			mp->b_wptr += hdrsz;
+			mp->b_cont = mp_orig;
+			mp_orig->b_rptr += hdrsz;
+			if (MBLKL(mp_orig) == 0) {
+				mp->b_cont = mp_orig->b_cont;
+				mp_orig->b_cont = NULL;
+				freeb(mp_orig);
+			}
+		} else if (mp != NULL) {
+			freeb(mp);
+			mp = NULL;
+		}
+	} else {
+		mp = ip_copymsg(mp_orig);
+	}
+
 	if (mp == NULL)
 		return;
-	if (mp->b_datap->db_type == M_CTL) {
+	if (DB_TYPE(mp) == M_CTL) {
 		ipsec_mp = mp;
 		mp = mp->b_cont;
 	} else {
@@ -2553,7 +2579,7 @@
 	zoneid = connp->conn_zoneid;
 
 	/* don't allow multicast operations on a tcp conn */
-	if (IS_TCP_CONN(connp))
+	if (IPCL_IS_TCP(connp))
 		return (ENOPROTOOPT);
 
 	if (cmd == SIOCSIPMSFILTER || cmd == SIOCGIPMSFILTER) {
--- a/usr/src/uts/common/inet/ip/ip_ndp.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip/ip_ndp.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -144,7 +144,6 @@
 	mblk_t		*mp;
 	mblk_t		*template;
 	nce_t		**ncep;
-	int		err = 0;
 	boolean_t	dropped = B_FALSE;
 
 	ASSERT(MUTEX_HELD(&ndp_g_lock));
@@ -280,8 +279,15 @@
 		mutex_exit(&nce->nce_lock);
 		mutex_enter(&ndp_g_lock);
 	}
-done:
-	return (err);
+	/*
+	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
+	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
+	 * We call nce_fastpath from nce_update if the link layer
+	 * address of the peer changes.
+	 */
+	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
+		nce_fastpath(nce);
+	return (0);
 }
 
 int
@@ -1028,7 +1034,6 @@
 		 * Cache entry with a proper resolver cookie was
 		 * created.
 		 */
-		nce_fastpath(nce);
 		NCE_REFRELE(nce);
 		break;
 	case EEXIST:
@@ -1108,7 +1113,6 @@
 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
 		return (err);
 	}
-	nce_fastpath(nce);
 	NCE_REFRELE(nce);
 	return (0);
 }
@@ -2168,8 +2172,7 @@
 
 	ASSERT(ll_addr != NULL);
 	/* Always called before fast_path_probe */
-	if (nce->nce_fp_mp != NULL)
-		return;
+	ASSERT(nce->nce_fp_mp == NULL);
 	if (ill->ill_sap_length != 0) {
 		/*
 		 * Copy the SAP type specified in the
@@ -2265,8 +2268,8 @@
 		if (nce->nce_fp_mp != NULL) {
 			freemsg(nce->nce_fp_mp);
 			nce->nce_fp_mp = NULL;
-			need_fastpath_update = B_TRUE;
 		}
+		need_fastpath_update = B_TRUE;
 	}
 	mutex_exit(&nce->nce_lock);
 	if (need_stop_timer) {
--- a/usr/src/uts/common/inet/ip/ipclassifier.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip/ipclassifier.c	Sat Oct 22 22:50:14 2005 -0700
@@ -233,6 +233,7 @@
 #include <inet/ip_rts.h>
 #include <inet/optcom.h>
 #include <inet/ip_ndp.h>
+#include <inet/udp_impl.h>
 #include <inet/sctp_ip.h>
 
 #include <sys/ethernet.h>
@@ -351,8 +352,7 @@
 
 	ipcl_conn_cache = kmem_cache_create("ipcl_conn_cache",
 	    sizeof (conn_t), CACHE_ALIGN_SIZE,
-	    NULL, NULL,
-	    NULL, NULL, NULL, 0);
+	    NULL, NULL, NULL, NULL, NULL, 0);
 
 	ipcl_tcpconn_cache = kmem_cache_create("ipcl_tcpconn_cache",
 	    sizeof (itc_t), CACHE_ALIGN_SIZE,
@@ -501,17 +501,19 @@
 	case IPCL_IPCCONN:
 		connp = kmem_cache_alloc(ipcl_conn_cache, sleep);
 		if (connp == NULL)
-			return (connp);
+			return (NULL);
 		bzero(connp, sizeof (conn_t));
-		mutex_init(&connp->conn_lock, NULL,
-		    MUTEX_DEFAULT, NULL);
+		mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
 		cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
-		connp->conn_flags |= IPCL_IPCCONN;
+		connp->conn_flags = IPCL_IPCCONN;
 		connp->conn_ref = 1;
 		IPCL_DEBUG_LVL(1,
 		    ("ipcl_conn_create: connp = %p\n", (void *)connp));
 		ipcl_globalhash_insert(connp);
 		break;
+	default:
+		connp = NULL;
+		ASSERT(0);
 	}
 
 	return (connp);
@@ -521,7 +523,6 @@
 ipcl_conn_destroy(conn_t *connp)
 {
 	mblk_t	*mp;
-	tcp_t	*tcp = connp->conn_tcp;
 
 	ASSERT(!MUTEX_HELD(&connp->conn_lock));
 	ASSERT(connp->conn_ref == 0);
@@ -531,6 +532,8 @@
 
 	cv_destroy(&connp->conn_cv);
 	if (connp->conn_flags & IPCL_TCPCONN) {
+		tcp_t	*tcp = connp->conn_tcp;
+
 		mutex_destroy(&connp->conn_lock);
 		ASSERT(connp->conn_tcp != NULL);
 		tcp_free(tcp);
@@ -567,6 +570,7 @@
 	} else if (connp->conn_flags & IPCL_SCTPCONN) {
 		sctp_free(connp);
 	} else {
+		ASSERT(connp->conn_udp == NULL);
 		mutex_destroy(&connp->conn_lock);
 		kmem_cache_free(ipcl_conn_cache, connp);
 	}
@@ -1863,6 +1867,57 @@
 	return (NULL);
 }
 
+/*
+ * ipcl_get_next_conn
+ *	Get the next entry in the conn global list
+ *	and put a reference on the next_conn.
+ *	Decrement the reference on the current conn.
+ *
+ * This is an iterator-based walker function that also provides for
+ * some selection by the caller.  It walks through the conn_hash bucket
+ * searching for the next valid connp in the list, and selects connections
+ * that are neither closed nor condemned.  It also REFHOLDs the conn,
+ * thus ensuring that the conn exists when the caller uses it.
+ */
+conn_t *
+ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
+{
+	conn_t	*next_connp;
+
+	if (connfp == NULL)
+		return (NULL);
+
+	mutex_enter(&connfp->connf_lock);
+
+	next_connp = (connp == NULL) ?
+	    connfp->connf_head : connp->conn_g_next;
+
+	while (next_connp != NULL) {
+		mutex_enter(&next_connp->conn_lock);
+		if (!(next_connp->conn_flags & conn_flags) ||
+		    (next_connp->conn_state_flags &
+		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
+			/*
+			 * This conn has been condemned or
+			 * is closing, or the flags don't match
+			 */
+			mutex_exit(&next_connp->conn_lock);
+			next_connp = next_connp->conn_g_next;
+			continue;
+		}
+		CONN_INC_REF_LOCKED(next_connp);
+		mutex_exit(&next_connp->conn_lock);
+		break;
+	}
+
+	mutex_exit(&connfp->connf_lock);
+
+	if (connp != NULL)
+		CONN_DEC_REF(connp);
+
+	return (next_connp);
+}
+
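+/*
+ * Illustrative use of the walker above (a sketch; the bucket and flag
+ * choices are hypothetical):
+ *
+ *	conn_t *connp = NULL;
+ *
+ *	while ((connp = ipcl_get_next_conn(connfp, connp,
+ *	    IPCL_UDP)) != NULL) {
+ *		... examine connp; the walker holds the reference ...
+ *	}
+ *
+ * Each call drops the reference on the previously returned conn, so
+ * CONN_DEC_REF is only needed if the caller abandons the walk early.
+ */
+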
 #ifdef CONN_DEBUG
 /*
  * Trace of the last NBUF refhold/refrele
--- a/usr/src/uts/common/inet/ip/tun.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip/tun.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -3693,6 +3693,8 @@
 	*nicmp = *icmp;
 	nicmp->icmph_checksum = 0;
 	nicmp->icmph_checksum = IP_CSUM(send_mp, sizeof (ipha_t), 0);
+	if (nicmp->icmph_checksum == 0)
+		nicmp->icmph_checksum = 0xffff;
 
 	/* let ip know we are an icmp message */
 	atomic_add_64(&atp->tun_HCInOctets,
@@ -3757,6 +3759,8 @@
 	    up[12] + up[13] + up[14] + up[15];
 	sum = (sum & 0xffff) + (sum >> 16);
 	nicmp6->icmp6_cksum = IP_CSUM(send_mp, IPV6_HDR_LEN, sum);
+	if (nicmp6->icmp6_cksum == 0)
+		nicmp6->icmp6_cksum = 0xffff;
 
 	/* let ip know we are an icmp message */
 	atomic_add_64(&atp->tun_HCInOctets,
--- a/usr/src/uts/common/inet/ip6.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip6.h	Sat Oct 22 22:50:14 2005 -0700
@@ -370,8 +370,7 @@
 		    uint16_t *, uint8_t **);
 extern int	ip_hdr_length_v6(mblk_t *, ip6_t *);
 extern uint32_t	ip_massage_options_v6(ip6_t *, ip6_rthdr_t *);
-extern void	ip_wput_frag_v6(mblk_t *, ire_t *, uint_t, conn_t *,
-		    boolean_t, int);
+extern void	ip_wput_frag_v6(mblk_t *, ire_t *, uint_t, conn_t *, int, int);
 extern void 	ip_wput_ipsec_out_v6(queue_t *, mblk_t *, ip6_t *, ill_t *,
     ire_t *);
 extern int	ip_total_hdrs_len_v6(ip6_pkt_t *);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/inet/ip_impl.h	Sat Oct 22 22:50:14 2005 -0700
@@ -0,0 +1,493 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_INET_IP_IMPL_H
+#define	_INET_IP_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * IP implementation private declarations.  These interfaces are
+ * used to build the IP module and are not meant to be accessed
+ * by any modules except IP itself.  They are undocumented and are
+ * subject to change without notice.
+ */
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#define	IP_MOD_ID		5701
+
+#ifdef	_BIG_ENDIAN
+#define	IP_HDR_CSUM_TTL_ADJUST	256
+#define	IP_TCP_CSUM_COMP	IPPROTO_TCP
+#define	IP_UDP_CSUM_COMP	IPPROTO_UDP
+#else
+#define	IP_HDR_CSUM_TTL_ADJUST	1
+#define	IP_TCP_CSUM_COMP	(IPPROTO_TCP << 8)
+#define	IP_UDP_CSUM_COMP	(IPPROTO_UDP << 8)
+#endif
+
+#define	TCP_CHECKSUM_OFFSET	16
+#define	TCP_CHECKSUM_SIZE	2
+
+#define	UDP_CHECKSUM_OFFSET	6
+#define	UDP_CHECKSUM_SIZE	2
+
+#define	IPH_TCPH_CHECKSUMP(ipha, hlen)	\
+	((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + TCP_CHECKSUM_OFFSET)))
+
+#define	IPH_UDPH_CHECKSUMP(ipha, hlen)	\
+	((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + UDP_CHECKSUM_OFFSET)))
+
+#define	ILL_HCKSUM_CAPABLE(ill)		\
+	(((ill)->ill_capabilities & ILL_CAPAB_HCKSUM) != 0)
+/*
+ * Macro that performs software checksum calculation on the IP header.
+ */
+#define	IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) {		\
+	(sum) += (ttl_protocol) + (ipha)->ipha_ident +			\
+	    ((v_hlen_tos_len) >> 16) +					\
+	    ((v_hlen_tos_len) & 0xFFFF) +				\
+	    (ipha)->ipha_fragment_offset_and_flags;			\
+	(sum) = (((sum) & 0xFFFF) + ((sum) >> 16));			\
+	(sum) = ~((sum) + ((sum) >> 16));				\
+	(ipha)->ipha_hdr_checksum = (uint16_t)(sum);			\
+}
+
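+/*
+ * The one's-complement folding used by IP_HDR_CKSUM (and the macros
+ * below) is equivalent to the following plain function; this is only
+ * an illustrative sketch and the name is hypothetical:
+ *
+ *	static uint16_t
+ *	ip_csum_fold(uint32_t sum)
+ *	{
+ *		sum = (sum & 0xFFFF) + (sum >> 16);
+ *		sum = (sum & 0xFFFF) + (sum >> 16);
+ *		return ((uint16_t)~sum);
+ *	}
+ */
+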
+#define	IS_IP_HDR_HWCKSUM(ipsec, mp, ill)				\
+	((!ipsec) && (DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) &&		\
+	ILL_HCKSUM_CAPABLE(ill) && dohwcksum)
+
+/*
+ * This macro acts as a wrapper around IP_CKSUM_XMIT_FAST, and it performs
+ * several checks on the IRE and ILL (among other things) in order to see
+ * whether or not hardware checksum offload is allowed for the outgoing
+ * packet.  It assumes that the caller has held a reference to the IRE.
+ */
+#define	IP_CKSUM_XMIT(ill, ire, mp, ihp, up, proto, start, end,		\
+	    max_frag, ipsec_len, pseudo) {				\
+	uint32_t _hck_flags;						\
+	/*								\
+	 * We offload checksum calculation to hardware when IPsec isn't	\
+	 * present and fragmentation isn't required.  We also check	\
+	 * if M_DATA fastpath is safe to be used on the	corresponding	\
+	 * IRE; this check is performed without grabbing ire_lock but	\
+	 * instead by holding a reference to it.  This is sufficient	\
+	 * for IRE_CACHE; for IRE_BROADCAST on non-Ethernet links, the	\
+	 * DL_NOTE_FASTPATH_FLUSH indication could come up from the	\
+	 * driver and trigger the IRE (hence fp_mp) deletion.  This is	\
+	 * why only IRE_CACHE type is eligible for offload.		\
+	 *								\
+	 * The presence of IP options also forces the network stack to	\
+	 * calculate the checksum in software.  This is because:	\
+	 *								\
+	 * Wrap around: certain partial-checksum NICs (eri, ce) limit	\
+	 * the width of the "start offset" field to 6 bits.  This	\
+	 * effectively caps the offset at 64 bytes, starting		\
+	 * from the MAC header.  When the cumulative MAC and IP headers	\
+	 * exceed such limit, the offset will wrap around.  This causes	\
+	 * the checksum to be calculated at the wrong place.		\
+	 *								\
+	 * IPv4 source routing: none of the full-checksum capable NICs	\
+	 * correctly handles the IPv4 source-routing option for		\
+	 * purposes of calculating the pseudo-header; the actual	\
+	 * destination differs from the destination in the header,	\
+	 * which is that of the next hop.  (This case may not be	\
+	 * true for NICs which can parse IPv6 extension headers, but	\
+	 * we choose to simplify the implementation by not offloading	\
+	 * checksum when they are present.)				\
+	 *								\
+	 */								\
+	if ((ill) != NULL && ILL_HCKSUM_CAPABLE(ill) &&			\
+	    !((ire)->ire_flags & RTF_MULTIRT) &&			\
+	    (!((ire)->ire_type & (IRE_BROADCAST|IRE_MIPRTUN)) ||	\
+	    (ill)->ill_type == IFT_ETHER) &&				\
+	    (ipsec_len) == 0 &&						\
+	    (((ire)->ire_ipversion == IPV4_VERSION &&			\
+	    (start) == IP_SIMPLE_HDR_LENGTH &&				\
+	    (ire)->ire_fp_mp != NULL &&					\
+	    MBLKHEAD(mp) >= MBLKL((ire)->ire_fp_mp)) ||			\
+	    ((ire)->ire_ipversion == IPV6_VERSION &&			\
+	    (start) == IPV6_HDR_LEN &&					\
+	    (ire)->ire_nce->nce_fp_mp != NULL &&			\
+	    MBLKHEAD(mp) >= MBLKL((ire)->ire_nce->nce_fp_mp))) &&	\
+	    (max_frag) >= (uint_t)((end) + (ipsec_len)) &&		\
+	    dohwcksum) {						\
+		_hck_flags = (ill)->ill_hcksum_capab->ill_hcksum_txflags; \
+	} else {							\
+		_hck_flags = 0;						\
+	}								\
+	IP_CKSUM_XMIT_FAST((ire)->ire_ipversion, _hck_flags, mp, ihp,	\
+	    up, proto, start, end, pseudo);				\
+}
+
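+/*
+ * Illustrative call shape for the wrapper above (a sketch; the actual
+ * argument values are the caller's):
+ *
+ *	up = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
+ *	IP_CKSUM_XMIT(ill, ire, mp, ipha, up, IPPROTO_UDP,
+ *	    IP_SIMPLE_HDR_LENGTH, end, max_frag, ipsec_len, cksum);
+ *
+ * where "cksum" holds the partial pseudo-header sum computed by the
+ * caller and "end" marks where the checksummed data ends.
+ */
+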
+/*
+ * Based on the device capabilities, this macro either marks an outgoing
+ * packet with hardware checksum offload information or calculate the
+ * checksum in software.  If the latter is performed, the checksum field
+ * of the dblk is cleared; otherwise it will be non-zero and contain the
+ * necessary flag(s) for the driver.
+ */
+#define	IP_CKSUM_XMIT_FAST(ipver, hck_flags, mp, ihp, up, proto, start,	\
+	    end, pseudo) {						\
+	uint32_t _sum;							\
+	/*								\
+	 * Underlying interface supports hardware checksum offload for	\
+	 * the payload; leave the payload checksum for the hardware to	\
+	 * calculate.  N.B: We only need to set up checksum info on the	\
+	 * first mblk.							\
+	 */								\
+	DB_CKSUMFLAGS(mp) = 0;						\
+	if (((ipver) == IPV4_VERSION &&					\
+	    ((hck_flags) & HCKSUM_INET_FULL_V4)) ||			\
+	    ((ipver) == IPV6_VERSION &&					\
+	    ((hck_flags) & HCKSUM_INET_FULL_V6))) {			\
+		/*							\
+		 * Hardware calculates pseudo-header, header and the	\
+		 * payload checksums, so clear the checksum field in	\
+		 * the protocol header.					\
+		 */							\
+		*(up) = 0;						\
+		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;			\
+	} else if ((hck_flags) & HCKSUM_INET_PARTIAL)  {		\
+		/*							\
+		 * Partial checksum offload has been enabled.  Fill	\
+		 * the checksum field in the protocol header with the	\
+		 * pseudo-header checksum value.			\
+		 */							\
+		_sum = ((proto) == IPPROTO_UDP) ?			\
+		    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP;		\
+		_sum += *(up) + (pseudo);				\
+		_sum = (_sum & 0xFFFF) + (_sum >> 16);			\
+		*(up) = (_sum & 0xFFFF) + (_sum >> 16);			\
+		/*							\
+		 * Offsets are relative to beginning of IP header.	\
+		 */							\
+		DB_CKSUMSTART(mp) = (start);				\
+		DB_CKSUMSTUFF(mp) = ((proto) == IPPROTO_UDP) ?		\
+		    (start) + UDP_CHECKSUM_OFFSET :			\
+		    (start) + TCP_CHECKSUM_OFFSET;			\
+		DB_CKSUMEND(mp) = (end);				\
+		DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;			\
+	} else {							\
+		/*							\
+		 * Software checksumming.				\
+		 */							\
+		_sum = ((proto) == IPPROTO_UDP) ?			\
+		    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP;		\
+		_sum += (pseudo);					\
+		_sum = IP_CSUM(mp, start, _sum);			\
+		*(up) = (uint16_t)(_sum ? _sum : ~_sum);		\
+	}								\
+	/*								\
+	 * Hardware supports IP header checksum offload; clear the	\
+	 * contents of IP header checksum field as expected by NIC.	\
+	 * Do this only if we offloaded either full or partial sum.	\
+	 */								\
+	if ((ipver) == IPV4_VERSION && DB_CKSUMFLAGS(mp) != 0 &&	\
+	    ((hck_flags) & HCKSUM_IPHDRCKSUM)) {			\
+		DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;			\
+		((ipha_t *)(ihp))->ipha_hdr_checksum = 0;		\
+	}								\
+}
+
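+/*
+ * Worked example for the partial-checksum branch above, with
+ * hypothetical little-endian values IP_UDP_CSUM_COMP = 0x1100,
+ * *up = 0xF234 and pseudo = 0x2345:
+ *
+ *	_sum = 0x1100 + 0xF234 + 0x2345 = 0x12679
+ *	_sum = 0x2679 + 0x1 = 0x267A
+ *	*up  = 0x267A				(no residual carry)
+ *
+ * leaving the adjusted pseudo-header sum in the checksum field for
+ * the NIC to complete.
+ */
+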
+/*
+ * Macro to inspect the checksum of a fully-reassembled incoming datagram.
+ */
+#define	IP_CKSUM_RECV_REASS(hck_flags, off, pseudo, sum, err) {		\
+	(err) = B_FALSE;						\
+	if ((hck_flags) & HCK_FULLCKSUM) {				\
+		/*							\
+		 * The sum of all fragment checksums should		\
+		 * result in -0 (0xFFFF); anything else is invalid.	\
+		 */							\
+		if ((sum) != 0xFFFF)					\
+			(err) = B_TRUE;					\
+	} else if ((hck_flags) & HCK_PARTIALCKSUM) {			\
+		(sum) += (pseudo);					\
+		(sum) = ((sum) & 0xFFFF) + ((sum) >> 16);		\
+		(sum) = ((sum) & 0xFFFF) + ((sum) >> 16);		\
+		if (~(sum) & 0xFFFF)					\
+			(err) = B_TRUE;					\
+	} else if (((sum) = IP_CSUM(mp, off, pseudo)) != 0) {		\
+		(err) = B_TRUE;						\
+	}								\
+}
+
+/*
+ * This macro inspects an incoming packet to see if the checksum value
+ * contained in it is valid; if the hardware has provided the information,
+ * the value is verified, otherwise it performs software checksumming.
+ * The checksum value is returned to caller.
+ */
+#define	IP_CKSUM_RECV(hck_flags, sum, cksum_start, ulph_off, mp, mp1, err) { \
+	int32_t _len;							\
+									\
+	(err) = B_FALSE;						\
+	if ((hck_flags) & HCK_FULLCKSUM) {				\
+		/*							\
+		 * Full checksum has been computed by the hardware	\
+		 * and has been attached.  If the driver wants us to	\
+		 * verify the correctness of the attached value, in	\
+		 * order to protect against faulty hardware, compare	\
+		 * it against -0 (0xFFFF) to see if it's valid.		\
+		 */							\
+		(sum) = DB_CKSUM16(mp);					\
+		if (!((hck_flags) & HCK_FULLCKSUM_OK) && (sum) != 0xFFFF) \
+			(err) = B_TRUE;					\
+	} else if (((hck_flags) & HCK_PARTIALCKSUM) &&			\
+	    ((mp1) == NULL || (mp1)->b_cont == NULL) &&			\
+	    (ulph_off) >= DB_CKSUMSTART(mp) &&				\
+	    ((_len = (ulph_off) - DB_CKSUMSTART(mp)) & 1) == 0) {	\
+		uint32_t _adj;						\
+		/*							\
+		 * Partial checksum has been calculated by hardware	\
+		 * and attached to the packet; in addition, any		\
+		 * prepended extraneous data is even byte aligned,	\
+		 * and there are at most two mblks associated with	\
+		 * the packet.  If any such data exists, we adjust	\
+		 * the checksum; also take care of any postpended data.	\
+		 */							\
+		IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, _len, _adj);	\
+		/*							\
+		 * One's complement subtract extraneous checksum	\
+		 */							\
+		(sum) += DB_CKSUM16(mp);				\
+		if (_adj >= (sum))					\
+			(sum) = ~(_adj - (sum)) & 0xFFFF;		\
+		else							\
+			(sum) -= _adj;					\
+		(sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16);		\
+		(sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16);		\
+		if (~(sum) & 0xFFFF)					\
+			(err) = B_TRUE;					\
+	} else if (((sum) = IP_CSUM(mp, ulph_off, sum)) != 0) {		\
+		(err) = B_TRUE;						\
+	}								\
+}
+
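+/*
+ * Typical call shape (a sketch; the arguments mirror the macro's
+ * formals and are supplied by the receive path):
+ *
+ *	IP_CKSUM_RECV(hck_flags, sum, cksum_start, ulph_off,
+ *	    mp, mp1, cksum_err);
+ *	if (cksum_err)
+ *		... count the bad checksum and drop the packet ...
+ *
+ * "sum" enters holding the pseudo-header sum and exits holding the
+ * verified (or software-computed) result.
+ */
+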
+/*
+ * Macro to adjust a given checksum value depending on any prepended
+ * or postpended data on the packet.  It expects the start offset to
+ * begin at an even boundary and that the packet consists of at most
+ * two mblks.
+ */
+#define	IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj) {		\
+	/*								\
+	 * Prepended extraneous data; adjust checksum.			\
+	 */								\
+	if ((len) > 0)							\
+		(adj) = IP_BCSUM_PARTIAL(cksum_start, len, 0);		\
+	else								\
+		(adj) = 0;						\
+	/*								\
+	 * len is now the total length of mblk(s)			\
+	 */								\
+	(len) = MBLKL(mp);						\
+	if ((mp1) == NULL)						\
+		(mp1) = (mp);						\
+	else								\
+		(len) += MBLKL(mp1);					\
+	/*								\
+	 * Postpended extraneous data; adjust checksum.			\
+	 */								\
+	if (((len) = (DB_CKSUMEND(mp) - len)) > 0) {			\
+		uint32_t _pad;						\
+									\
+		_pad = IP_BCSUM_PARTIAL((mp1)->b_wptr, len, 0);		\
+		/*							\
+		 * If the postpended extraneous data was odd		\
+		 * byte aligned, swap resulting checksum bytes.		\
+		 */							\
+		if ((uintptr_t)(mp1)->b_wptr & 1)			\
+			(adj) += ((_pad << 8) & 0xFFFF) | (_pad >> 8);	\
+		else							\
+			(adj) += _pad;					\
+		(adj) = ((adj) & 0xFFFF) + ((int)(adj) >> 16);		\
+	}								\
+}
+
+#define	ILL_MDT_CAPABLE(ill)		\
+	(((ill)->ill_capabilities & ILL_CAPAB_MDT) != 0)
+
+/*
+ * ioctl identifier and structure for Multidata Transmit update
+ * private M_CTL communication from IP to ULP.
+ */
+#define	MDT_IOC_INFO_UPDATE	(('M' << 8) + 1020)
+
+typedef struct ip_mdt_info_s {
+	uint_t	mdt_info_id;	/* MDT_IOC_INFO_UPDATE */
+	ill_mdt_capab_t	mdt_capab; /* ILL MDT capabilities */
+} ip_mdt_info_t;
+
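+/*
+ * Sketch of how a ULP read side might recognize the update (the
+ * surrounding dispatch logic is assumed):
+ *
+ *	ip_mdt_info_t *mdti = (ip_mdt_info_t *)mp->b_rptr;
+ *
+ *	if (DB_TYPE(mp) == M_CTL &&
+ *	    MBLKL(mp) >= sizeof (ip_mdt_info_t) &&
+ *	    mdti->mdt_info_id == MDT_IOC_INFO_UPDATE) {
+ *		... refresh cached capabilities from mdti->mdt_capab ...
+ *	}
+ */
+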
+/*
+ * Macro that determines whether or not a given ILL is allowed for MDT.
+ */
+#define	ILL_MDT_USABLE(ill)						\
+	(ILL_MDT_CAPABLE(ill) &&					\
+	ill->ill_mdt_capab != NULL &&					\
+	ill->ill_mdt_capab->ill_mdt_version == MDT_VERSION_2 &&		\
+	ill->ill_mdt_capab->ill_mdt_on != 0)
+
+/*
+ * Macro that determines whether or not a given CONN may be considered
+ * for fast path prior to proceeding further with Multidata.
+ */
+#define	CONN_IS_MD_FASTPATH(connp)	\
+	((connp)->conn_dontroute == 0 &&	/* SO_DONTROUTE */	\
+	(connp)->conn_nofailover_ill == NULL &&	/* IPIF_NOFAILOVER */	\
+	(connp)->conn_xmit_if_ill == NULL &&	/* IP_XMIT_IF */	\
+	(connp)->conn_outgoing_pill == NULL &&	/* IP{V6}_BOUND_PIF */	\
+	(connp)->conn_outgoing_ill == NULL)	/* IP{V6}_BOUND_IF */
+
+/* Definitions for fragmenting IP packets using MDT. */
+
+/*
+ * Smaller and private version of pdescinfo_t used specifically for IP,
+ * which allows for only a single payload span per packet.
+ */
+typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2)	ip_pdescinfo_t;
+
+/*
+ * Macro version of ip_can_frag_mdt() which avoids the function call if we
+ * only examine a single message block.
+ */
+#define	IP_CAN_FRAG_MDT(mp, hdr_len, len)			\
+	(((mp)->b_cont == NULL) ?				\
+	(MBLKL(mp) >= ((hdr_len) + ip_wput_frag_mdt_min)) :	\
+	ip_can_frag_mdt((mp), (hdr_len), (len)))
+
+/*
+ * Macro that determines whether or not a given IPC requires
+ * outbound IPSEC processing.
+ */
+#define	CONN_IPSEC_OUT_ENCAPSULATED(connp)	\
+	((connp)->conn_out_enforce_policy ||	\
+	((connp)->conn_latch != NULL &&		\
+	(connp)->conn_latch->ipl_out_policy != NULL))
+
+/*
+ * These are used by the synchronous streams code in tcp and udp.
+ */
+#define	STR_WAKEUP_CLEAR(stp) {						\
+	mutex_enter(&stp->sd_lock);					\
+	stp->sd_wakeq &= ~RSLEEP;					\
+	mutex_exit(&stp->sd_lock);					\
+}
+
+#define	STR_WAKEUP_SET(stp) {						\
+	mutex_enter(&stp->sd_lock);					\
+	if (stp->sd_flag & RSLEEP) {					\
+		stp->sd_flag &= ~RSLEEP;				\
+		cv_broadcast(&_RD(stp->sd_wrq)->q_wait);		\
+	} else {							\
+		stp->sd_wakeq |= RSLEEP;				\
+	}								\
+	mutex_exit(&stp->sd_lock);					\
+}
+
+#define	STR_SENDSIG(stp) {						\
+	int _events;							\
+	mutex_enter(&stp->sd_lock);					\
+	if ((_events = stp->sd_sigflags & (S_INPUT | S_RDNORM)) != 0)	\
+		strsendsig(stp->sd_siglist, _events, 0, 0);		\
+	if (stp->sd_rput_opt & SR_POLLIN) {				\
+		stp->sd_rput_opt &= ~SR_POLLIN;				\
+		mutex_exit(&stp->sd_lock);				\
+		pollwakeup(&stp->sd_pollist, POLLIN | POLLRDNORM);	\
+	} else {							\
+		mutex_exit(&stp->sd_lock);				\
+	}								\
+}
+
+#define	CONN_UDP_SYNCSTR(connp)						\
+	(IPCL_IS_UDP(connp) && (connp)->conn_udp->udp_direct_sockfs)
+
+/*
+ * Macro that checks whether or not a particular UDP conn is
+ * flow-controlling on the read-side.  If udp module is directly
+ * above ip, check to see if the drain queue is full; note here
+ * that we check this without any lock protection because this
+ * is a coarse granularity inbound flow-control.  If the module
+ * above ip is not udp, then use canputnext to determine the
+ * flow-control.
+ *
+ * Note that these checks are done after the conn is found in
+ * the UDP fanout table.  A UDP conn in that table may have its
+ * IPCL_UDP bit cleared from the conn_flags when the application
+ * pops the udp module without issuing an unbind; in this case
+ * IP will still receive packets for the conn and deliver them
+ * upstream via putnext.  This is the reason why we have to test
+ * against IPCL_UDP.
+ */
+#define	CONN_UDP_FLOWCTLD(connp)					\
+	((CONN_UDP_SYNCSTR(connp) &&					\
+	(connp)->conn_udp->udp_drain_qfull) ||				\
+	(!CONN_UDP_SYNCSTR(connp) && !canputnext((connp)->conn_rq)))
+
+/*
+ * Macro that delivers a given message upstream; if udp module
+ * is directly above ip, the message is passed directly into
+ * the stream-less entry point.  Otherwise putnext is used.
+ */
+#define	CONN_UDP_RECV(connp, mp) {					\
+	if (IPCL_IS_UDP(connp))						\
+		udp_conn_recv(connp, mp);				\
+	else								\
+		putnext((connp)->conn_rq, mp);				\
+}
+
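+/*
+ * The two macros above are meant to pair up in the fanout path;
+ * an illustrative sketch:
+ *
+ *	if (CONN_UDP_FLOWCTLD(connp))
+ *		freemsg(mp);		... and count the drop ...
+ *	else
+ *		CONN_UDP_RECV(connp, mp);
+ */
+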
+#define	ILL_POLL_CAPABLE(ill)	\
+	(((ill)->ill_capabilities & ILL_CAPAB_POLL) != 0)
+
+/*
+ * Macro that hands off one or more messages directly to DLD
+ * when the interface is marked with ILL_CAPAB_POLL.
+ */
+#define	IP_POLL_ILL_TX(ill, mp) {					\
+	ill_poll_capab_t *ill_poll = ill->ill_poll_capab;		\
+	ASSERT(ILL_POLL_CAPABLE(ill));					\
+	ASSERT(ill_poll != NULL);					\
+	ASSERT(ill_poll->ill_tx != NULL);				\
+	ASSERT(ill_poll->ill_tx_handle != NULL);			\
+	ill_poll->ill_tx(ill_poll->ill_tx_handle, mp);			\
+}
+
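+/*
+ * Illustrative sketch of the intended use (the else arm assumes the
+ * conventional STREAMS transmit path):
+ *
+ *	if (ILL_POLL_CAPABLE(ill))
+ *		IP_POLL_ILL_TX(ill, mp);
+ *	else
+ *		putnext(ill->ill_wq, mp);
+ */
+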
+extern int	ip_wput_frag_mdt_min;
+extern boolean_t ip_can_frag_mdt(mblk_t *, ssize_t, ssize_t);
+
+#endif	/* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _INET_IP_IMPL_H */
--- a/usr/src/uts/common/inet/ipclassifier.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ipclassifier.h	Sat Oct 22 22:50:14 2005 -0700
@@ -37,6 +37,7 @@
 #include <inet/ip.h>
 #include <inet/mi.h>
 #include <inet/tcp.h>
+#include <inet/udp_impl.h>
 #include <inet/ip6.h>
 #include <netinet/in.h>		/* for IPPROTO_* constants */
 #include <sys/sdt.h>
@@ -58,17 +59,19 @@
  */
 
 /* Conn Flags */
-#define	IPCL_BOUND		0x80000000	/* Conn in bind table */
-#define	IPCL_CONNECTED		0x40000000	/* Conn in connected table */
-#define	IPCL_TCP4		0x08000000	/* A TCP connection */
-#define	IPCL_TCP6		0x04000000	/* A TCP6 connection */
-#define	IPCL_EAGER		0x01000000	/* Incoming connection */
+#define	IPCL_UDPMOD		0x00020000	/* Is UDP module instance */
+#define	IPCL_TCPMOD		0x00040000	/* Is TCP module instance */
+#define	IPCL_FULLY_BOUND	0x00080000	/* Bound to correct squeue */
+#define	IPCL_CHECK_POLICY	0x00100000	/* Needs policy checking */
+#define	IPCL_SOCKET		0x00200000	/* Sockfs connection */
+#define	IPCL_ACCEPTOR		0x00400000	/* Sockfs priv acceptor */
 #define	IPCL_CL_LISTENER	0x00800000	/* Cluster listener */
-#define	IPCL_ACCEPTOR		0x00400000	/* Sockfs priv acceptor */
-#define	IPCL_SOCKET		0x00200000	/* Sockfs connection */
-#define	IPCL_CHECK_POLICY	0x00100000	/* Needs policy checking */
-#define	IPCL_FULLY_BOUND	0x00080000	/* Bound to correct squeue */
-#define	IPCL_TCPMOD		0x00040000	/* Is tcp module instance */
+#define	IPCL_EAGER		0x01000000	/* Incoming connection */
+#define	IPCL_UDP		0x02000000	/* A UDP connection */
+#define	IPCL_TCP6		0x04000000	/* A TCP6 connection */
+#define	IPCL_TCP4		0x08000000	/* A TCP connection */
+#define	IPCL_CONNECTED		0x40000000	/* Conn in connected table */
+#define	IPCL_BOUND		0x80000000	/* Conn in bind table */
 
 /* Flags identifying the type of conn */
 #define	IPCL_TCPCONN		0x00000001	/* Flag to indicate cache */
@@ -81,8 +84,6 @@
 #define	IPCL_REMOVED		0x00000020
 #define	IPCL_REUSED		0x00000040
 
-#define	IS_TCP_CONN(connp)	(((connp)->conn_flags & IPCL_TCP) != 0)
-
 #define	IPCL_IS_TCP4(connp)						\
 	(((connp)->conn_flags & IPCL_TCP4))
 
@@ -108,6 +109,13 @@
 #define	IPCL_IS_TCP(connp)						\
 	((connp)->conn_flags & (IPCL_TCP4|IPCL_TCP6))
 
+/*
+ * IPCL_UDP is set on the conn when udp is directly above ip;
+ * this flag is cleared the moment udp is popped.
+ */
+#define	IPCL_IS_UDP(connp)						\
+	((connp)->conn_flags & IPCL_UDP)
+
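+/*
+ * For example, CONN_UDP_RECV in <inet/ip_impl.h> keys off this flag
+ * to choose between the direct udp entry point and putnext:
+ *
+ *	if (IPCL_IS_UDP(connp))
+ *		udp_conn_recv(connp, mp);
+ *	else
+ *		putnext((connp)->conn_rq, mp);
+ */
+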
 #define	IPCL_IS_IPTUN(connp)						\
 	((connp)->conn_ulp == IPPROTO_ENCAP || \
 	(connp)->conn_ulp == IPPROTO_IPV6)
@@ -169,6 +177,8 @@
 		pad_to_bit_31 : 2;
 
 	tcp_t		*conn_tcp;		/* Pointer to the tcp struct */
+	udp_t		*conn_udp;		/* Pointer to the udp struct */
+
 	squeue_t	*conn_sqp;		/* Squeue for processing */
 	edesc_rpf	conn_recv;		/* Pointer to recv routine */
 	void		*conn_pad1;
@@ -483,6 +493,7 @@
 		    uint32_t);
 extern int	ipcl_conn_insert_v6(conn_t *, uint8_t, const in6_addr_t *,
 		    const in6_addr_t *, uint32_t, uint_t);
+extern conn_t	*ipcl_get_next_conn(connf_t *, conn_t *, uint32_t);
 
 void ipcl_proto_insert(conn_t *, uint8_t);
 void ipcl_proto_insert_v6(conn_t *, uint8_t);
--- a/usr/src/uts/common/inet/ipp_common.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ipp_common.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2002, 2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -52,7 +52,7 @@
 
 /* Apply IPQoS policies for inbound traffic? */
 #define	IP6_IN_IPP(flags) (IPP_ENABLED(IPP_LOCAL_IN) &&	\
-	(!((flags) & (IP6_NO_IPPOLICY|IP6_IN_NOCKSUM))))
+	(!((flags) & IP6_NO_IPPOLICY)))
 
 /* Apply IPQoS policies for outbound traffic? */
 #define	IP6_OUT_IPP(flags)	\
--- a/usr/src/uts/common/inet/led.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/led.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -44,12 +44,12 @@
 #include <sys/types.h>
 
 /*
- * Intel x86 can handle unaligned access. However, the checksum routine
+ * x86 can handle unaligned access. However, the checksum routine
  * assumes that the source is 16 bit aligned so we always make sure
  * that packet headers are 16 bit aligned.
  */
 #define	OK_16PTR(p)	(!((uintptr_t)(p) & 0x1))
-#if defined(__i386)
+#if defined(__x86)
 #define	OK_32PTR(p)	OK_16PTR(p)
 #else
 #define	OK_32PTR(p)	(!((uintptr_t)(p) & 0x3))
--- a/usr/src/uts/common/inet/optcom.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/optcom.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -82,8 +82,6 @@
 static t_uscalar_t optcom_max_optbuf_len(opdes_t *, uint_t);
 static boolean_t opt_bloated_maxsize(opdes_t *);
 
-extern optdb_obj_t tcp_opt_obj;
-
 /* Common code for sending back a T_ERROR_ACK. */
 void
 optcom_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
@@ -220,9 +218,12 @@
 	opdes_t	*optd;
 	boolean_t	pass_to_next = B_FALSE;
 	boolean_t	pass_to_ip = B_FALSE;
+	boolean_t	is_tcp;
 	struct T_optmgmt_ack *toa;
 	struct T_optmgmt_req *tor;
 
+	is_tcp = (dbobjp == &tcp_opt_obj);
+
 	/*
 	 * Allocate M_CTL and prepend to the packet for restarting this
 	 * option if needed. IP may need to queue and restart the option
@@ -550,14 +551,14 @@
 				opt1->len = opt->len;
 				bcopy(&opt[1], &opt1[1], opt->len);
 				/*
-				 * Pass the option down to IP only if
-				 * TCP hasn't processed it.
+				 * Pass the option down to IP only
+				 * if TCP hasn't processed it.
 				 */
-				if (dbobjp == &tcp_opt_obj)
+				if (is_tcp)
 					pass_to_ip = B_TRUE;
+			} else {
+				opt1->len = (t_uscalar_t)len;
 			}
-			else
-				opt1->len = (t_uscalar_t)len;
 			opt1 = (struct opthdr *)((uchar_t *)&opt1[1] +
 			    _TPI_ALIGN_OPT(opt1->len));
 		} /* end for loop */
@@ -639,10 +640,10 @@
 				optcom_err_ack(q, mp, TSYSERR, error);
 				freeb(first_mp);
 				return (0);
-			} else if (error < 0 && dbobjp == &tcp_opt_obj) {
+			} else if (error < 0 && is_tcp) {
 				/*
-				 * Pass the option down to IP only if
-				 * TCP hasn't processed it.
+				 * Pass the option down to IP only
+				 * if TCP hasn't processed it.
 				 */
 				pass_to_ip = B_TRUE;
 			}
--- a/usr/src/uts/common/inet/optcom.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/optcom.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -205,6 +205,18 @@
 #define	SETFN_CONN_NEGOTIATE		4 /* semantics for T_CONN_*_REQ */
 
 /*
+ * Object to represent database of options to search passed to
+ * {sock,tpi}optcom_req() interface routine to take care of option
+ * management and associated methods.
+ */
+extern optdb_obj_t tcp_opt_obj;
+extern optdb_obj_t udp_opt_obj;
+extern optdb_obj_t ip_opt_obj;
+
+extern uint_t	tcp_max_optsize;
+extern uint_t	udp_max_optsize;
+
+/*
  * Function prototypes
  */
 extern void optcom_err_ack(queue_t *, mblk_t *, t_scalar_t, int);
--- a/usr/src/uts/common/inet/snmpcom.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/snmpcom.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 1992,1997-2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -51,6 +51,11 @@
 #include <inet/optcom.h>
 #include <inet/snmpcom.h>
 
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <inet/tcp.h>
+#include <inet/udp_impl.h>
+
 #define	DEFAULT_LENGTH	sizeof (long)
 #define	DATA_MBLK_SIZE	1024
 #define	TOAHDR_SIZE	(sizeof (struct T_optmgmt_ack) +\
@@ -90,10 +95,7 @@
  * ctl buffer.
  */
 int
-snmp_append_data(mpdata, blob, len)
-	mblk_t	*mpdata;
-	char	*blob;
-	int	len;
+snmp_append_data(mblk_t *mpdata, char *blob, int len)
 {
 
 	if (!mpdata)
@@ -169,12 +171,7 @@
  *   for them: getfn() returns 0, setfn() returns 1.
  */
 boolean_t
-snmpcom_req(q, mp, setfn, getfn, credp)
-	queue_t	*q;
-	mblk_t	*mp;
-	pfi_t	setfn;
-	pfi_t	getfn;
-	cred_t	*credp;
+snmpcom_req(queue_t *q, mblk_t *mp, pfi_t setfn, pfi_t getfn, cred_t *credp)
 {
 	mblk_t			*mpctl;
 	struct opthdr		*req;
@@ -184,6 +181,7 @@
 	sor_t			*sreq;
 	struct T_optmgmt_req	*tor = (struct T_optmgmt_req *)mp->b_rptr;
 	struct T_optmgmt_ack	*toa;
+	boolean_t		pass_to_ip = B_FALSE;
 
 	if (mp->b_cont) {	/* don't deal with multiple mblk's */
 		freemsg(mp->b_cont);
@@ -209,6 +207,10 @@
 			req_start->level <= EXPER_RANGE_END)))
 		return (B_FALSE);
 
+	if (setfn == tcp_snmp_set || setfn == udp_snmp_set ||
+	    getfn == tcp_snmp_get || getfn == udp_snmp_get)
+		pass_to_ip = B_TRUE;
+
 	switch (tor->MGMT_flags) {
 
 	case T_NEGOTIATE:
@@ -235,8 +237,10 @@
 				(uchar_t *)&req[1], req->len))
 				goto bad_req4;
 		}
-		if (q->q_next)
+		if (q->q_next != NULL)
 			putnext(q, mp);
+		else if (pass_to_ip)
+			ip_output(Q_TO_CONN(q), mp, q, IP_WPUT);
 		else
 			freemsg(mp);
 		return (B_TRUE);
@@ -268,9 +272,12 @@
 		 * this is bottom module of stream, send up an EOD ctl msg,
 		 * otherwise pass onto the next guy for processing.
 		 */
-		if (q->q_next) {
+		if (q->q_next != NULL) {
 			putnext(q, mp);
 			return (B_TRUE);
+		} else if (pass_to_ip) {
+			ip_output(Q_TO_CONN(q), mp, q, IP_WPUT);
+			return (B_TRUE);
 		}
 		if (mp->b_cont) {
 			freemsg(mp->b_cont);
--- a/usr/src/uts/common/inet/squeue.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/squeue.c	Sat Oct 22 22:50:14 2005 -0700
@@ -729,7 +729,8 @@
 #endif
 #if SQUEUE_DEBUG
 	conn_t 	*connp = (conn_t *)arg;
-	ASSERT(connp->conn_tcp->tcp_connp == connp);
+	ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp);
+	ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp);
 #endif
 
 	ASSERT(proc != NULL);
@@ -954,9 +955,10 @@
 	ASSERT(sqp != NULL);
 	ASSERT(mp != NULL);
 	ASSERT(mp->b_next == NULL);
-	ASSERT(connp->conn_tcp->tcp_connp == connp);
+	ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp);
+	ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp);
+	ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
 
-	ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
 	mutex_enter(&sqp->sq_lock);
 
 	being_processed = (sqp->sq_state & SQS_PROC);
@@ -1100,7 +1102,8 @@
 	ASSERT(sqp != NULL);
 	ASSERT(mp != NULL);
 	ASSERT(mp->b_next == NULL);
-	ASSERT(connp->conn_tcp->tcp_connp == connp);
+	ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp);
+	ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp);
 
 	ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
 	mutex_enter(&sqp->sq_lock);
--- a/usr/src/uts/common/inet/tcp.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/tcp.h	Sat Oct 22 22:50:14 2005 -0700
@@ -286,11 +286,8 @@
 		tcp_accept_error : 1,	/* Error during TLI accept */
 
 		tcp_send_discon_ind : 1, /* TLI accept err, send discon ind */
-		tcp_fused : 1,		/* loopback tcp in fusion mode */
-		tcp_unfusable : 1,	/* fusion not allowed on endpoint */
-		tcp_fused_sigurg : 1,	/* send SIGURG upon draining */
 		tcp_cork : 1,		/* tcp_cork option */
-		tcp_pad_to_bit_31 : 15;
+		tcp_pad_to_bit_31 : 18;
 
 	uint32_t	tcp_if_mtu;	/* Outgoing interface MTU. */
 
@@ -514,10 +511,29 @@
 #define	tcp_ipp_use_min_mtu	tcp_sticky_ipp.ipp_use_min_mtu
 	struct tcp_s *tcp_saved_listener;	/* saved value of listener */
 
+	uint32_t	tcp_in_ack_unsent;	/* ACK for unsent data cnt. */
+
+	/*
+	 * The following fusion-related fields are protected by squeue.
+	 */
 	struct tcp_s *tcp_loopback_peer;	/* peer tcp for loopback */
 	mblk_t	*tcp_fused_sigurg_mp;		/* M_PCSIG mblk for SIGURG */
+	size_t	tcp_fuse_rcv_hiwater;		/* fusion receive queue size */
+	uint_t	tcp_fuse_rcv_unread_hiwater;	/* max # of outstanding pkts */
+	/*
+	 * The following fusion-related fields and bit fields are to be
+	 * manipulated with squeue protection or with tcp_fuse_lock held.
+	 */
+	kmutex_t tcp_fuse_lock;
+	uint_t tcp_fuse_rcv_unread_cnt;	/* # of outstanding pkts */
+	uint32_t
+		tcp_fused : 1,		/* loopback tcp in fusion mode */
+		tcp_unfusable : 1,	/* fusion not allowed on endpoint */
+		tcp_fused_sigurg : 1,	/* send SIGURG upon draining */
+		tcp_direct_sockfs : 1,	/* direct calls to sockfs */
 
-	uint32_t	tcp_in_ack_unsent;	/* ACK for unsent data cnt. */
+		tcp_fuse_syncstr_stopped : 1, /* synchronous streams stopped */
+		tcp_fuse_to_bit_31 : 27;
 
 	/*
 	 * This variable is accessed without any lock protection
@@ -525,6 +541,8 @@
 	 * with the rest which require such condition.
 	 */
 	boolean_t	tcp_issocket;	/* this is a socket tcp */
+
+	uint32_t	tcp_squeue_bytes;
 } tcp_t;
 
 extern void 	tcp_free(tcp_t *tcp);
@@ -537,7 +555,8 @@
 extern void	tcp_rput_data(void *arg, mblk_t *mp, void *arg2);
 extern void 	*tcp_get_conn(void *arg);
 extern void	tcp_time_wait_collector(void *arg);
-
+extern int	tcp_snmp_get(queue_t *, mblk_t *);
+extern int	tcp_snmp_set(queue_t *, int, int, uchar_t *, int len);
 /*
  * The TCP Fanout structure.
  * The hash tables and their linkage (tcp_*_hash_next, tcp_ptp*hn) are
@@ -610,18 +629,6 @@
 #pragma pack()
 #endif
 
-/* Named Dispatch Parameter Management Structure */
-typedef struct tcpparam_s {
-	uint32_t	tcp_param_min;
-	uint32_t	tcp_param_max;
-	uint32_t	tcp_param_val;
-	char		*tcp_param_name;
-} tcpparam_t;
-
-extern tcpparam_t	tcp_param_arr[];
-
-extern boolean_t	do_tcp_fusion;
-
 #if (defined(_KERNEL) || defined(_KMEMUSER))
 extern void tcp_rput_other(tcp_t *tcp, mblk_t *mp);
 #endif
--- a/usr/src/uts/common/inet/tcp/tcp.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/tcp/tcp.c	Sat Oct 22 22:50:14 2005 -0700
@@ -73,6 +73,7 @@
 
 #include <inet/common.h>
 #include <inet/ip.h>
+#include <inet/ip_impl.h>
 #include <inet/ip6.h>
 #include <inet/ip_ndp.h>
 #include <inet/mi.h>
@@ -82,6 +83,7 @@
 #include <inet/snmpcom.h>
 #include <inet/kstatcom.h>
 #include <inet/tcp.h>
+#include <inet/tcp_impl.h>
 #include <net/pfkeyv2.h>
 #include <inet/ipsec_info.h>
 #include <inet/ipdrop.h>
@@ -230,8 +232,6 @@
 squeue_func_t tcp_squeue_close_proc;
 squeue_func_t tcp_squeue_wput_proc;
 
-extern vmem_t *ip_minor_arena;
-
 /*
  * This controls how tiny a write must be before we try to copy it
  * into the the mblk on the tail of the transmit queue.  Not much
@@ -278,9 +278,6 @@
  * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
  */
 
-#define	TCP_COUNTERS 1
-#define	TCP_CLD_COUNTERS 0
-
 #ifndef TCP_DEBUG_COUNTER
 #ifdef DEBUG
 #define	TCP_DEBUG_COUNTER 1
@@ -289,6 +286,7 @@
 #endif
 #endif
 
+#define	TCP_CLD_COUNTERS 0
 
 #define	TCP_TAG_CLEAN_DEATH 1
 #define	TCP_MAX_CLEAN_DEATH_TAG 32
@@ -297,20 +295,6 @@
 static int _lint_dummy_;
 #endif
 
-#if TCP_COUNTERS
-#define	TCP_STAT(x)		(tcp_statistics.x.value.ui64++)
-#define	TCP_STAT_UPDATE(x, n)	(tcp_statistics.x.value.ui64 += (n))
-#define	TCP_STAT_SET(x, n)	(tcp_statistics.x.value.ui64 = (n))
-#elif defined(lint)
-#define	TCP_STAT(x)		ASSERT(_lint_dummy_ == 0);
-#define	TCP_STAT_UPDATE(x, n)	ASSERT(_lint_dummy_ == 0);
-#define	TCP_STAT_SET(x, n)	ASSERT(_lint_dummy_ == 0);
-#else
-#define	TCP_STAT(x)
-#define	TCP_STAT_UPDATE(x, n)
-#define	TCP_STAT_SET(x, n)
-#endif
-
 #if TCP_CLD_COUNTERS
 static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
 #define	TCP_CLD_STAT(x) tcp_clean_death_stat[x]++
@@ -328,96 +312,7 @@
 #define	TCP_DBGSTAT(x)
 #endif
 
-typedef struct tcp_stat {
-	kstat_named_t	tcp_time_wait;
-	kstat_named_t	tcp_time_wait_syn;
-	kstat_named_t	tcp_time_wait_syn_success;
-	kstat_named_t	tcp_time_wait_syn_fail;
-	kstat_named_t	tcp_reinput_syn;
-	kstat_named_t	tcp_ip_output;
-	kstat_named_t	tcp_detach_non_time_wait;
-	kstat_named_t	tcp_detach_time_wait;
-	kstat_named_t	tcp_time_wait_reap;
-	kstat_named_t	tcp_clean_death_nondetached;
-	kstat_named_t	tcp_reinit_calls;
-	kstat_named_t	tcp_eager_err1;
-	kstat_named_t	tcp_eager_err2;
-	kstat_named_t	tcp_eager_blowoff_calls;
-	kstat_named_t	tcp_eager_blowoff_q;
-	kstat_named_t	tcp_eager_blowoff_q0;
-	kstat_named_t	tcp_not_hard_bound;
-	kstat_named_t	tcp_no_listener;
-	kstat_named_t	tcp_found_eager;
-	kstat_named_t	tcp_wrong_queue;
-	kstat_named_t	tcp_found_eager_binding1;
-	kstat_named_t	tcp_found_eager_bound1;
-	kstat_named_t	tcp_eager_has_listener1;
-	kstat_named_t	tcp_open_alloc;
-	kstat_named_t	tcp_open_detached_alloc;
-	kstat_named_t	tcp_rput_time_wait;
-	kstat_named_t	tcp_listendrop;
-	kstat_named_t	tcp_listendropq0;
-	kstat_named_t	tcp_wrong_rq;
-	kstat_named_t	tcp_rsrv_calls;
-	kstat_named_t	tcp_eagerfree2;
-	kstat_named_t	tcp_eagerfree3;
-	kstat_named_t	tcp_eagerfree4;
-	kstat_named_t	tcp_eagerfree5;
-	kstat_named_t	tcp_timewait_syn_fail;
-	kstat_named_t	tcp_listen_badflags;
-	kstat_named_t	tcp_timeout_calls;
-	kstat_named_t	tcp_timeout_cached_alloc;
-	kstat_named_t	tcp_timeout_cancel_reqs;
-	kstat_named_t	tcp_timeout_canceled;
-	kstat_named_t	tcp_timermp_alloced;
-	kstat_named_t	tcp_timermp_freed;
-	kstat_named_t	tcp_timermp_allocfail;
-	kstat_named_t	tcp_timermp_allocdblfail;
-	kstat_named_t	tcp_push_timer_cnt;
-	kstat_named_t	tcp_ack_timer_cnt;
-	kstat_named_t	tcp_ire_null1;
-	kstat_named_t	tcp_ire_null;
-	kstat_named_t	tcp_ip_send;
-	kstat_named_t	tcp_ip_ire_send;
-	kstat_named_t   tcp_wsrv_called;
-	kstat_named_t   tcp_flwctl_on;
-	kstat_named_t	tcp_timer_fire_early;
-	kstat_named_t	tcp_timer_fire_miss;
-	kstat_named_t	tcp_freelist_cleanup;
-	kstat_named_t	tcp_rput_v6_error;
-	kstat_named_t	tcp_out_sw_cksum;
-	kstat_named_t	tcp_zcopy_on;
-	kstat_named_t	tcp_zcopy_off;
-	kstat_named_t	tcp_zcopy_backoff;
-	kstat_named_t	tcp_zcopy_disable;
-	kstat_named_t	tcp_mdt_pkt_out;
-	kstat_named_t	tcp_mdt_pkt_out_v4;
-	kstat_named_t	tcp_mdt_pkt_out_v6;
-	kstat_named_t	tcp_mdt_discarded;
-	kstat_named_t	tcp_mdt_conn_halted1;
-	kstat_named_t	tcp_mdt_conn_halted2;
-	kstat_named_t	tcp_mdt_conn_halted3;
-	kstat_named_t	tcp_mdt_conn_resumed1;
-	kstat_named_t	tcp_mdt_conn_resumed2;
-	kstat_named_t	tcp_mdt_legacy_small;
-	kstat_named_t	tcp_mdt_legacy_all;
-	kstat_named_t	tcp_mdt_legacy_ret;
-	kstat_named_t	tcp_mdt_allocfail;
-	kstat_named_t	tcp_mdt_addpdescfail;
-	kstat_named_t	tcp_mdt_allocd;
-	kstat_named_t	tcp_mdt_linked;
-	kstat_named_t	tcp_fusion_flowctl;
-	kstat_named_t	tcp_fusion_backenabled;
-	kstat_named_t	tcp_fusion_urg;
-	kstat_named_t	tcp_fusion_putnext;
-	kstat_named_t	tcp_fusion_unfusable;
-	kstat_named_t	tcp_fusion_aborted;
-	kstat_named_t	tcp_fusion_unqualified;
-	kstat_named_t	tcp_in_ack_unsent_drop;
-} tcp_stat_t;
-
-#if (TCP_COUNTERS || TCP_DEBUG_COUNTER)
-static tcp_stat_t tcp_statistics = {
+tcp_stat_t tcp_statistics = {
 	{ "tcp_time_wait",		KSTAT_DATA_UINT64 },
 	{ "tcp_time_wait_syn",		KSTAT_DATA_UINT64 },
 	{ "tcp_time_wait_success",	KSTAT_DATA_UINT64 },
@@ -475,6 +370,7 @@
 	{ "tcp_freelist_cleanup",	KSTAT_DATA_UINT64 },
 	{ "tcp_rput_v6_error",		KSTAT_DATA_UINT64 },
 	{ "tcp_out_sw_cksum",		KSTAT_DATA_UINT64 },
+	{ "tcp_out_sw_cksum_bytes",	KSTAT_DATA_UINT64 },
 	{ "tcp_zcopy_on",		KSTAT_DATA_UINT64 },
 	{ "tcp_zcopy_off",		KSTAT_DATA_UINT64 },
 	{ "tcp_zcopy_backoff",		KSTAT_DATA_UINT64 },
@@ -502,13 +398,14 @@
 	{ "tcp_fusion_unfusable",	KSTAT_DATA_UINT64 },
 	{ "tcp_fusion_aborted",		KSTAT_DATA_UINT64 },
 	{ "tcp_fusion_unqualified",	KSTAT_DATA_UINT64 },
+	{ "tcp_fusion_rrw_busy",	KSTAT_DATA_UINT64 },
+	{ "tcp_fusion_rrw_msgcnt",	KSTAT_DATA_UINT64 },
 	{ "tcp_in_ack_unsent_drop",	KSTAT_DATA_UINT64 },
+	{ "tcp_sock_fallback",		KSTAT_DATA_UINT64 },
 };
 
 static kstat_t *tcp_kstat;
 
-#endif
-
 /*
  * Call either ip_output or ip_output_v6. This replaces putnext() calls on the
  * tcp write side.
@@ -519,12 +416,6 @@
 	connp->conn_send(connp, (mp), (q), IP_WPUT);			\
 }
 
-/*
- * Was this tcp created via socket() interface?
- */
-#define	TCP_IS_SOCKET(tcp) ((tcp)->tcp_issocket)
-
-
 /* Macros for timestamp comparisons */
 #define	TSTMP_GEQ(a, b)	((int32_t)((a)-(b)) >= 0)
 #define	TSTMP_LT(a, b)	((int32_t)((a)-(b)) < 0)
@@ -569,8 +460,6 @@
  */
 #define	TCP_OLD_URP_INTERPRETATION	1
 
-#define	TCP_IS_DETACHED(tcp)		((tcp)->tcp_detached)
-
 #define	TCP_IS_DETACHED_NONEAGER(tcp)	\
 	(TCP_IS_DETACHED(tcp) && \
 	    (!(tcp)->tcp_hard_binding))
@@ -687,22 +576,6 @@
 kmem_cache_t	*tcp_sack_info_cache;
 kmem_cache_t	*tcp_iphc_cache;
 
-#define	TCP_TIMER(tcp, f, tim) tcp_timeout(tcp->tcp_connp, f, tim)
-#define	TCP_TIMER_CANCEL(tcp, id) tcp_timeout_cancel(tcp->tcp_connp, id)
-
-/*
- * To restart the TCP retransmission timer.
- */
-#define	TCP_TIMER_RESTART(tcp, intvl) \
-{ \
-	if ((tcp)->tcp_timer_tid != 0) { \
-		(void) TCP_TIMER_CANCEL((tcp),	\
-					(tcp)->tcp_timer_tid); \
-	} \
-	(tcp)->tcp_timer_tid = TCP_TIMER((tcp), tcp_timer, \
-	    MSEC_TO_TICK(intvl)); \
-}
-
 /*
  * For scalability, we must not run a timer for every TCP connection
  * in TIME_WAIT state.  To see why, consider (for time wait interval of
@@ -951,7 +824,6 @@
 static mblk_t	*tcp_ire_mp(mblk_t *mp);
 static void	tcp_iss_init(tcp_t *tcp);
 static void	tcp_keepalive_killer(void *arg);
-static int	tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk);
 static int	tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt);
 static void	tcp_mss_set(tcp_t *tcp, uint32_t size);
 static int	tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
@@ -985,7 +857,6 @@
 		    tcp_t *thisstream, cred_t *cr);
 
 static uint_t	tcp_rcv_drain(queue_t *q, tcp_t *tcp);
-static void	tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len);
 static void	tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
 static boolean_t tcp_send_rst_chk(void);
 static void	tcp_ss_rexmit(tcp_t *tcp);
@@ -994,9 +865,6 @@
 static void	tcp_rput_common(tcp_t *tcp, mblk_t *mp);
 static void	tcp_rsrv(queue_t *q);
 static int	tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd);
-static int	tcp_snmp_get(queue_t *q, mblk_t *mpctl);
-static int	tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr,
-		    int len);
 static int	tcp_snmp_state(tcp_t *tcp);
 static int	tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
 		    cred_t *cr);
@@ -1018,7 +886,6 @@
 static void	tcp_timer_callback(void *);
 static in_port_t tcp_update_next_port(in_port_t port, boolean_t random);
 static in_port_t tcp_get_next_priv_port(void);
-static void	tcp_wput(queue_t *q, mblk_t *mp);
 static void	tcp_wput_sock(queue_t *q, mblk_t *mp);
 void		tcp_wput_accept(queue_t *q, mblk_t *mp);
 static void	tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
@@ -1044,7 +911,6 @@
 		    boolean_t sendall, uint32_t *seg_len, boolean_t rexmit);
 static void	tcp_ack_timer(void *arg);
 static mblk_t	*tcp_ack_mp(tcp_t *tcp);
-static void	tcp_push_timer(void *arg);
 static void	tcp_xmit_early_reset(char *str, mblk_t *mp,
 		    uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len);
 static void	tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
@@ -1076,9 +942,6 @@
 boolean_t	tcp_reserved_port_check(in_port_t);
 static tcp_t	*tcp_alloc_temp_tcp(in_port_t);
 static int	tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *);
-static void	tcp_timers_stop(tcp_t *);
-static timeout_id_t tcp_timeout(conn_t *, void (*)(void *), clock_t);
-static clock_t	tcp_timeout_cancel(conn_t *, timeout_id_t);
 static mblk_t	*tcp_mdt_info_mp(mblk_t *);
 static void	tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t);
 static int	tcp_mdt_add_attrs(multidata_t *, const mblk_t *,
@@ -1098,7 +961,6 @@
 static void	tcp_kstat_fini(void);
 static int	tcp_kstat_update(kstat_t *kp, int rw);
 void		tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp);
-conn_t		*tcp_get_next_conn(connf_t *, conn_t *);
 static int	tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
 			tcph_t *tcph, uint_t ipvers, mblk_t *idmp);
 static int	tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
@@ -1118,14 +980,6 @@
 static mblk_t	*tcp_zcopy_backoff(tcp_t *, mblk_t *, int);
 static void	tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t);
 
-static void	tcp_fuse(tcp_t *, uchar_t *, tcph_t *);
-static void	tcp_unfuse(tcp_t *);
-static boolean_t tcp_fuse_output(tcp_t *, mblk_t *);
-static void	tcp_fuse_output_urg(tcp_t *, mblk_t *);
-static boolean_t tcp_fuse_rcv_drain(queue_t *, tcp_t *, mblk_t **);
-
-extern mblk_t	*allocb_tryhard(size_t);
-
 /*
  * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
  *
@@ -1155,17 +1009,12 @@
 static int	tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
     boolean_t);
 
-
-static void	tcp_clrqfull(tcp_t *);
-static void	tcp_setqfull(tcp_t *);
-
 static struct module_info tcp_rinfo =  {
-#define	TCP_MODULE_ID	5105
-	TCP_MODULE_ID, "tcp", 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
+	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
 };
 
 static struct module_info tcp_winfo =  {
-	TCP_MODULE_ID, "tcp", 0, INFPSZ, 127, 16
+	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
 };
 
 /*
@@ -1173,11 +1022,12 @@
  * to pass through.
  */
 struct qinit tcp_mod_rinit = {
-	(pfi_t)putnext, NULL, tcp_open, tcp_modclose, NULL, &tcp_rinfo
+	(pfi_t)putnext, NULL, tcp_open, ip_snmpmod_close, NULL, &tcp_rinfo,
 };
 
 struct qinit tcp_mod_winit = {
-	(pfi_t)tcp_wput_mod, NULL, tcp_open, tcp_modclose, NULL, &tcp_rinfo
+	(pfi_t)ip_snmpmod_wput, NULL, tcp_open, ip_snmpmod_close, NULL,
+	&tcp_rinfo
 };
 
 /*
@@ -1210,11 +1060,18 @@
 	(pfi_t)tcp_wput_accept, NULL, NULL, NULL, NULL, &tcp_winfo
 };
 
+/*
+ * Entry points for TCP loopback (read side only)
+ */
+struct qinit tcp_loopback_rinit = {
+	(pfi_t)0, (pfi_t)tcp_rsrv, tcp_open, tcp_close, (pfi_t)0,
+	&tcp_rinfo, NULL, tcp_fuse_rrw, tcp_fuse_rinfop, STRUIOT_STANDARD
+};
+
 struct streamtab tcpinfo = {
 	&tcp_rinit, &tcp_winit
 };
 
-
 extern squeue_func_t tcp_squeue_wput_proc;
 extern squeue_func_t tcp_squeue_timer_proc;
 
@@ -1306,15 +1163,6 @@
 mib2_tcp_t	tcp_mib;	/* SNMP fixed size info */
 kstat_t		*tcp_mibkp;	/* kstat exporting tcp_mib data */
 
-/*
- * Object to represent database of options to search passed to
- * {sock,tpi}optcom_req() interface routine to take care of option
- * management and associated methods.
- * XXX These and other externs should ideally move to a TCP header
- */
-extern optdb_obj_t	tcp_opt_obj;
-extern uint_t		tcp_max_optsize;
-
 boolean_t tcp_icmp_source_quench = B_FALSE;
 /*
  * Following assumes TPI alignment requirements stay along 32 bit
@@ -1454,76 +1302,6 @@
 };
 /* END CSTYLED */
 
-
-#define	tcp_time_wait_interval			tcp_param_arr[0].tcp_param_val
-#define	tcp_conn_req_max_q			tcp_param_arr[1].tcp_param_val
-#define	tcp_conn_req_max_q0			tcp_param_arr[2].tcp_param_val
-#define	tcp_conn_req_min			tcp_param_arr[3].tcp_param_val
-#define	tcp_conn_grace_period			tcp_param_arr[4].tcp_param_val
-#define	tcp_cwnd_max_				tcp_param_arr[5].tcp_param_val
-#define	tcp_dbg					tcp_param_arr[6].tcp_param_val
-#define	tcp_smallest_nonpriv_port		tcp_param_arr[7].tcp_param_val
-#define	tcp_ip_abort_cinterval			tcp_param_arr[8].tcp_param_val
-#define	tcp_ip_abort_linterval			tcp_param_arr[9].tcp_param_val
-#define	tcp_ip_abort_interval			tcp_param_arr[10].tcp_param_val
-#define	tcp_ip_notify_cinterval			tcp_param_arr[11].tcp_param_val
-#define	tcp_ip_notify_interval			tcp_param_arr[12].tcp_param_val
-#define	tcp_ipv4_ttl				tcp_param_arr[13].tcp_param_val
-#define	tcp_keepalive_interval_high		tcp_param_arr[14].tcp_param_max
-#define	tcp_keepalive_interval			tcp_param_arr[14].tcp_param_val
-#define	tcp_keepalive_interval_low		tcp_param_arr[14].tcp_param_min
-#define	tcp_maxpsz_multiplier			tcp_param_arr[15].tcp_param_val
-#define	tcp_mss_def_ipv4			tcp_param_arr[16].tcp_param_val
-#define	tcp_mss_max_ipv4			tcp_param_arr[17].tcp_param_val
-#define	tcp_mss_min				tcp_param_arr[18].tcp_param_val
-#define	tcp_naglim_def				tcp_param_arr[19].tcp_param_val
-#define	tcp_rexmit_interval_initial		tcp_param_arr[20].tcp_param_val
-#define	tcp_rexmit_interval_max			tcp_param_arr[21].tcp_param_val
-#define	tcp_rexmit_interval_min			tcp_param_arr[22].tcp_param_val
-#define	tcp_deferred_ack_interval		tcp_param_arr[23].tcp_param_val
-#define	tcp_snd_lowat_fraction			tcp_param_arr[24].tcp_param_val
-#define	tcp_sth_rcv_hiwat			tcp_param_arr[25].tcp_param_val
-#define	tcp_sth_rcv_lowat			tcp_param_arr[26].tcp_param_val
-#define	tcp_dupack_fast_retransmit		tcp_param_arr[27].tcp_param_val
-#define	tcp_ignore_path_mtu			tcp_param_arr[28].tcp_param_val
-#define	tcp_smallest_anon_port			tcp_param_arr[29].tcp_param_val
-#define	tcp_largest_anon_port			tcp_param_arr[30].tcp_param_val
-#define	tcp_xmit_hiwat				tcp_param_arr[31].tcp_param_val
-#define	tcp_xmit_lowat				tcp_param_arr[32].tcp_param_val
-#define	tcp_recv_hiwat				tcp_param_arr[33].tcp_param_val
-#define	tcp_recv_hiwat_minmss			tcp_param_arr[34].tcp_param_val
-#define	tcp_fin_wait_2_flush_interval		tcp_param_arr[35].tcp_param_val
-#define	tcp_co_min				tcp_param_arr[36].tcp_param_val
-#define	tcp_max_buf				tcp_param_arr[37].tcp_param_val
-#define	tcp_strong_iss				tcp_param_arr[38].tcp_param_val
-#define	tcp_rtt_updates				tcp_param_arr[39].tcp_param_val
-#define	tcp_wscale_always			tcp_param_arr[40].tcp_param_val
-#define	tcp_tstamp_always			tcp_param_arr[41].tcp_param_val
-#define	tcp_tstamp_if_wscale			tcp_param_arr[42].tcp_param_val
-#define	tcp_rexmit_interval_extra		tcp_param_arr[43].tcp_param_val
-#define	tcp_deferred_acks_max			tcp_param_arr[44].tcp_param_val
-#define	tcp_slow_start_after_idle		tcp_param_arr[45].tcp_param_val
-#define	tcp_slow_start_initial			tcp_param_arr[46].tcp_param_val
-#define	tcp_co_timer_interval			tcp_param_arr[47].tcp_param_val
-#define	tcp_sack_permitted			tcp_param_arr[48].tcp_param_val
-#define	tcp_trace				tcp_param_arr[49].tcp_param_val
-#define	tcp_compression_enabled			tcp_param_arr[50].tcp_param_val
-#define	tcp_ipv6_hoplimit			tcp_param_arr[51].tcp_param_val
-#define	tcp_mss_def_ipv6			tcp_param_arr[52].tcp_param_val
-#define	tcp_mss_max_ipv6			tcp_param_arr[53].tcp_param_val
-#define	tcp_rev_src_routes			tcp_param_arr[54].tcp_param_val
-#define	tcp_local_dack_interval			tcp_param_arr[55].tcp_param_val
-#define	tcp_ndd_get_info_interval		tcp_param_arr[56].tcp_param_val
-#define	tcp_local_dacks_max			tcp_param_arr[57].tcp_param_val
-#define	tcp_ecn_permitted			tcp_param_arr[58].tcp_param_val
-#define	tcp_rst_sent_rate_enabled		tcp_param_arr[59].tcp_param_val
-#define	tcp_rst_sent_rate			tcp_param_arr[60].tcp_param_val
-#define	tcp_push_timer_interval			tcp_param_arr[61].tcp_param_val
-#define	tcp_use_smss_as_mss_opt			tcp_param_arr[62].tcp_param_val
-#define	tcp_keepalive_abort_interval_high	tcp_param_arr[63].tcp_param_max
-#define	tcp_keepalive_abort_interval		tcp_param_arr[63].tcp_param_val
-#define	tcp_keepalive_abort_interval_low	tcp_param_arr[63].tcp_param_min
-
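
The block of #defines deleted above aliased each ndd tunable onto a slot of tcp_param_arr; the aliases evidently move out of tcp.c along with the other centralized definitions. The underlying pattern is one table row per tunable carrying its bounds and current value, with a positional macro hiding the index. A sketch with hypothetical names and illustrative values only:

    /* One row per tunable: bounds, current value, ndd name. */
    typedef struct tunable {
        long        min;
        long        max;
        long        val;
        const char *name;
    } tunable_t;

    static tunable_t params[] = {
        { 1000, 10 * 60 * 1000, 60 * 1000, "time_wait_interval" },
        { 1,    4096,           128,       "conn_req_max_q"     },
    };

    /* Positional aliases, as in the removed block. */
    #define time_wait_interval  params[0].val
    #define conn_req_max_q      params[1].val

The cost of the scheme is that the aliases are brittle against reordering of the array, which is one reason to keep table and aliases in a single header.
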
 /*
  * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of
  * each header fragment in the header buffer.  Each parameter value has
@@ -1720,642 +1498,6 @@
  */
 int cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg);
 
-#define	IPH_TCPH_CHECKSUMP(ipha, hlen) \
-	((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + 16)))
-
-#ifdef  _BIG_ENDIAN
-#define	IP_TCP_CSUM_COMP	IPPROTO_TCP
-#else
-#define	IP_TCP_CSUM_COMP	(IPPROTO_TCP << 8)
-#endif
-
-#define	IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) {		\
-	(sum) += (ttl_protocol) + (ipha)->ipha_ident +			\
-	    ((v_hlen_tos_len) >> 16) +					\
-	    ((v_hlen_tos_len) & 0xFFFF) +				\
-	    (ipha)->ipha_fragment_offset_and_flags;			\
-	(sum) = (((sum) & 0xFFFF) + ((sum) >> 16));			\
-	(sum) = ~((sum) + ((sum) >> 16));				\
-	(ipha)->ipha_hdr_checksum = (uint16_t)(sum);			\
-}
-
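
The removed IP_HDR_CKSUM macro finishes a 16-bit one's-complement sum by folding the carry bits back into the low word and inverting. A self-contained version of the same computation over a raw IPv4 header (the checksum field must be zero on entry, and hlen must be even); this is a sketch, not the kernel macro:

    #include <stdint.h>
    #include <stddef.h>

    static uint16_t
    ipv4_hdr_cksum(const uint8_t *hdr, size_t hlen)
    {
        uint32_t sum = 0;
        size_t i;

        /* Sum the header as big-endian 16-bit words. */
        for (i = 0; i + 1 < hlen; i += 2)
            sum += (uint32_t)((hdr[i] << 8) | hdr[i + 1]);

        /* Fold twice, as in IP_HDR_CKSUM: one fold can carry again. */
        sum = (sum & 0xFFFF) + (sum >> 16);
        sum = (sum & 0xFFFF) + (sum >> 16);
        return ((uint16_t)~sum);
    }
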
-/*
- * Macros that determine whether or not IP processing is needed for TCP.
- */
-#define	TCP_IPOPT_POLICY_V4(tcp)					\
-	((tcp)->tcp_ipversion == IPV4_VERSION &&			\
-	((tcp)->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH ||		\
-	CONN_OUTBOUND_POLICY_PRESENT((tcp)->tcp_connp) ||		\
-	CONN_INBOUND_POLICY_PRESENT((tcp)->tcp_connp)))
-
-#define	TCP_IPOPT_POLICY_V6(tcp)					\
-	((tcp)->tcp_ipversion == IPV6_VERSION &&			\
-	((tcp)->tcp_ip_hdr_len != IPV6_HDR_LEN ||			\
-	CONN_OUTBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp) ||		\
-	CONN_INBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp)))
-
-#define	TCP_LOOPBACK_IP(tcp)						\
-	(TCP_IPOPT_POLICY_V4(tcp) || TCP_IPOPT_POLICY_V6(tcp) ||	\
-	!CONN_IS_MD_FASTPATH((tcp)->tcp_connp))
-
-boolean_t do_tcp_fusion = B_TRUE;
-
-/*
- * This routine gets called by the eager tcp upon changing state from
- * SYN_RCVD to ESTABLISHED.  It fuses a direct path between itself
- * and the active connect tcp such that the regular tcp processings
- * may be bypassed under allowable circumstances.  Because the fusion
- * requires both endpoints to be in the same squeue, it does not work
- * for simultaneous active connects because there is no easy way to
- * switch from one squeue to another once the connection is created.
- * This is different from the eager tcp case where we assign it the
- * same squeue as the one given to the active connect tcp during open.
- */
-static void
-tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
-{
-	conn_t *peer_connp, *connp = tcp->tcp_connp;
-	tcp_t *peer_tcp;
-
-	ASSERT(!tcp->tcp_fused);
-	ASSERT(tcp->tcp_loopback);
-	ASSERT(tcp->tcp_loopback_peer == NULL);
-	/*
-	 * We need to check the listener tcp to make sure it's a socket
-	 * endpoint, but we can't really use tcp_listener since we get
-	 * here after sending up T_CONN_IND and tcp_wput_accept() may be
-	 * called independently, at which point tcp_listener is cleared;
-	 * this is why we use tcp_saved_listener.  The listener itself
-	 * is guaranteed to be around until tcp_accept_finish() is called
-	 * on this eager -- this won't happen until we're done since
-	 * we're inside the eager's perimeter now.
-	 */
-	ASSERT(tcp->tcp_saved_listener != NULL);
-
-	/*
-	 * Lookup peer endpoint; search for the remote endpoint having
-	 * the reversed address-port quadruplet in ESTABLISHED state,
-	 * which is guaranteed to be unique in the system.  Zone check
-	 * is applied accordingly for loopback address, but not for
-	 * local address since we want fusion to happen across Zones.
-	 */
-	if (tcp->tcp_ipversion == IPV4_VERSION) {
-		peer_connp = ipcl_conn_tcp_lookup_reversed_ipv4(connp,
-		    (ipha_t *)iphdr, tcph);
-	} else {
-		peer_connp = ipcl_conn_tcp_lookup_reversed_ipv6(connp,
-		    (ip6_t *)iphdr, tcph);
-	}
-
-	/*
-	 * We can only proceed if peer exists, resides in the same squeue
-	 * as our conn and is not raw-socket.  The squeue assignment of
-	 * this eager tcp was done earlier at the time of SYN processing
-	 * in ip_fanout_tcp{_v6}.  Note that a matching squeue by itself
-	 * doesn't guarantee a safe condition to fuse, hence we perform
-	 * additional tests below.
-	 */
-	ASSERT(peer_connp == NULL || peer_connp != connp);
-	if (peer_connp == NULL || peer_connp->conn_sqp != connp->conn_sqp ||
-	    !IPCL_IS_TCP(peer_connp)) {
-		if (peer_connp != NULL) {
-			TCP_STAT(tcp_fusion_unqualified);
-			CONN_DEC_REF(peer_connp);
-		}
-		return;
-	}
-	peer_tcp = peer_connp->conn_tcp;	/* active connect tcp */
-
-	ASSERT(peer_tcp != NULL && peer_tcp != tcp && !peer_tcp->tcp_fused);
-	ASSERT(peer_tcp->tcp_loopback && peer_tcp->tcp_loopback_peer == NULL);
-	ASSERT(peer_connp->conn_sqp == connp->conn_sqp);
-
-	/*
-	 * Fuse the endpoints; we perform further checks against both
-	 * tcp endpoints to ensure that a fusion is allowed to happen.
-	 * In particular we bail out for TPI, non-simple TCP/IP or if
-	 * IPsec/IPQoS policy exists.  We could actually do it for the
-	 * XTI/TLI/TPI case but this requires more testing, so for now
-	 * we handle only the socket case.
-	 */
-	if (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable &&
-	    TCP_IS_SOCKET(tcp->tcp_saved_listener) && TCP_IS_SOCKET(peer_tcp) &&
-	    !TCP_LOOPBACK_IP(tcp) && !TCP_LOOPBACK_IP(peer_tcp) &&
-	    !IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) {
-		mblk_t *mp;
-		struct stroptions *stropt;
-		queue_t *peer_rq = peer_tcp->tcp_rq;
-		size_t sth_hiwat;
-
-		ASSERT(!TCP_IS_DETACHED(peer_tcp) && peer_rq != NULL);
-
-		/*
-		 * We need to drain data on both endpoints during unfuse.
-		 * If we need to send up SIGURG at the time of draining,
-		 * we want to be sure that an mblk is readily available.
-		 * This is why we pre-allocate the M_PCSIG mblks for both
-		 * endpoints which will only be used during/after unfuse.
-		 */
-		if ((mp = allocb(1, BPRI_HI)) == NULL) {
-			CONN_DEC_REF(peer_connp);
-			return;
-		}
-		ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
-		tcp->tcp_fused_sigurg_mp = mp;
-
-		if ((mp = allocb(1, BPRI_HI)) == NULL) {
-			freeb(tcp->tcp_fused_sigurg_mp);
-			tcp->tcp_fused_sigurg_mp = NULL;
-			CONN_DEC_REF(peer_connp);
-			return;
-		}
-		ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL);
-		peer_tcp->tcp_fused_sigurg_mp = mp;
-
-		/* Allocate M_SETOPTS mblk */
-		mp = allocb(sizeof (*stropt), BPRI_HI);
-		if (mp == NULL) {
-			freeb(tcp->tcp_fused_sigurg_mp);
-			tcp->tcp_fused_sigurg_mp = NULL;
-			freeb(peer_tcp->tcp_fused_sigurg_mp);
-			peer_tcp->tcp_fused_sigurg_mp = NULL;
-			CONN_DEC_REF(peer_connp);
-			return;
-		}
-
-		/* Fuse both endpoints */
-		peer_tcp->tcp_loopback_peer = tcp;
-		tcp->tcp_loopback_peer = peer_tcp;
-		peer_tcp->tcp_fused = tcp->tcp_fused = B_TRUE;
-
-		/*
-		 * We never use regular tcp paths in fusion and should
-		 * therefore clear tcp_unsent on both endpoints.  Having
-		 * them set to non-zero values means asking for trouble
-		 * especially after unfuse, where we may end up sending
-		 * through regular tcp paths which expect xmit_list and
-		 * friends to be correctly setup.
-		 */
-		peer_tcp->tcp_unsent = tcp->tcp_unsent = 0;
-
-		tcp_timers_stop(tcp);
-		tcp_timers_stop(peer_tcp);
-
-		/*
-		 * Set the stream head's write offset value to zero, since we
-		 * won't be needing any room for TCP/IP headers, and tell it
-		 * to not break up the writes.  This would reduce the amount
-		 * of work done by kmem.  In addition, we set the receive
-		 * buffer to twice that of q_hiwat in order to simulate the
-		 * non-fusion case.  Note that we can only do this for the
-		 * active connect tcp since our eager is still detached;
-		 * it will be dealt with later in tcp_accept_finish().
-		 */
-		DB_TYPE(mp) = M_SETOPTS;
-		mp->b_wptr += sizeof (*stropt);
-
-		sth_hiwat = peer_rq->q_hiwat << 1;
-		if (sth_hiwat > tcp_max_buf)
-			sth_hiwat = tcp_max_buf;
-
-		stropt = (struct stroptions *)mp->b_rptr;
-		stropt->so_flags = SO_MAXBLK | SO_WROFF | SO_HIWAT;
-		stropt->so_maxblk = tcp_maxpsz_set(peer_tcp, B_FALSE);
-		stropt->so_wroff = 0;
-		stropt->so_hiwat = MAX(sth_hiwat, tcp_sth_rcv_hiwat);
-
-		/* Send the options up */
-		putnext(peer_rq, mp);
-	} else {
-		TCP_STAT(tcp_fusion_unqualified);
-	}
-	CONN_DEC_REF(peer_connp);
-}
-
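
tcp_fuse() above is removed from tcp.c but evidently relocated within this changeset, since later hunks still call tcp_fuse_output(), tcp_fuse_rrw() and friends. One detail worth noting is how it unwinds partially-completed allocations by hand at each failure point: every later failure frees what the earlier successes produced. The same shape written as a goto unwind, with hypothetical names and malloc standing in for allocb():

    #include <stdlib.h>

    struct fuse_bufs { void *sig_a, *sig_b, *opts; };

    /* Allocate all three buffers or none; returns 0 on success. */
    static int
    fuse_bufs_alloc(struct fuse_bufs *fb)
    {
        if ((fb->sig_a = malloc(1)) == NULL)
            goto fail0;
        if ((fb->sig_b = malloc(1)) == NULL)
            goto fail1;
        if ((fb->opts = malloc(64)) == NULL)
            goto fail2;
        return (0);

    fail2:
        free(fb->sig_b);
        fb->sig_b = NULL;
    fail1:
        free(fb->sig_a);
        fb->sig_a = NULL;
    fail0:
        return (-1);
    }
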
-/*
- * Unfuse a previously-fused pair of tcp loopback endpoints.
- */
-static void
-tcp_unfuse(tcp_t *tcp)
-{
-	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
-
-	ASSERT(tcp->tcp_fused && peer_tcp != NULL);
-	ASSERT(peer_tcp->tcp_fused && peer_tcp->tcp_loopback_peer == tcp);
-	ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
-	ASSERT(tcp->tcp_unsent == 0 && peer_tcp->tcp_unsent == 0);
-	ASSERT(tcp->tcp_fused_sigurg_mp != NULL);
-	ASSERT(peer_tcp->tcp_fused_sigurg_mp != NULL);
-
-	/*
-	 * Drain any pending data; the detached check is needed because
-	 * we may be called from tcp_fuse_output().  Note that in case of
-	 * a detached tcp, the draining will happen later after the tcp
-	 * is unfused.  For non-urgent data, this can be handled by the
-	 * regular tcp_rcv_drain().  If we have urgent data sitting in
-	 * the receive list, we will need to send up a SIGURG signal first
-	 * before draining the data.  All of these will be handled by the
-	 * code in tcp_fuse_rcv_drain() when called from tcp_rcv_drain().
-	 */
-	if (!TCP_IS_DETACHED(tcp)) {
-		(void) tcp_fuse_rcv_drain(tcp->tcp_rq, tcp,
-		    &tcp->tcp_fused_sigurg_mp);
-	}
-	if (!TCP_IS_DETACHED(peer_tcp)) {
-		(void) tcp_fuse_rcv_drain(peer_tcp->tcp_rq, peer_tcp,
-		    &peer_tcp->tcp_fused_sigurg_mp);
-	}
-	/* Lift up any flow-control conditions */
-	if (tcp->tcp_flow_stopped) {
-		tcp_clrqfull(tcp);
-		tcp->tcp_flow_stopped = B_FALSE;
-		TCP_STAT(tcp_fusion_backenabled);
-	}
-	if (peer_tcp->tcp_flow_stopped) {
-		tcp_clrqfull(peer_tcp);
-		peer_tcp->tcp_flow_stopped = B_FALSE;
-		TCP_STAT(tcp_fusion_backenabled);
-	}
-
-	/* Free up M_PCSIG mblk(s) if not needed */
-	if (!tcp->tcp_fused_sigurg && tcp->tcp_fused_sigurg_mp != NULL) {
-		freeb(tcp->tcp_fused_sigurg_mp);
-		tcp->tcp_fused_sigurg_mp = NULL;
-	}
-	if (!peer_tcp->tcp_fused_sigurg &&
-	    peer_tcp->tcp_fused_sigurg_mp != NULL) {
-		freeb(peer_tcp->tcp_fused_sigurg_mp);
-		peer_tcp->tcp_fused_sigurg_mp = NULL;
-	}
-
-	/*
-	 * Update th_seq and th_ack in the header template
-	 */
-	U32_TO_ABE32(tcp->tcp_snxt, tcp->tcp_tcph->th_seq);
-	U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
-	U32_TO_ABE32(peer_tcp->tcp_snxt, peer_tcp->tcp_tcph->th_seq);
-	U32_TO_ABE32(peer_tcp->tcp_rnxt, peer_tcp->tcp_tcph->th_ack);
-
-	/* Unfuse the endpoints */
-	peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE;
-	peer_tcp->tcp_loopback_peer = tcp->tcp_loopback_peer = NULL;
-}
-
-/*
- * Fusion output routine for urgent data.  This routine is called by
- * tcp_fuse_output() for handling non-M_DATA mblks.
- */
-static void
-tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp)
-{
-	mblk_t *mp1;
-	struct T_exdata_ind *tei;
-	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
-	mblk_t *head, *prev_head = NULL;
-
-	ASSERT(tcp->tcp_fused);
-	ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
-	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
-	ASSERT(mp->b_cont != NULL && DB_TYPE(mp->b_cont) == M_DATA);
-	ASSERT(MBLKL(mp) >= sizeof (*tei) && MBLKL(mp->b_cont) > 0);
-
-	/*
-	 * Urgent data arrives in the form of T_EXDATA_REQ from above.
-	 * Each occurrence denotes a new urgent pointer.  For each new
-	 * urgent pointer we signal (SIGURG) the receiving app to indicate
-	 * that it needs to go into urgent mode.  This is similar to the
-	 * urgent data handling in the regular tcp.  We don't need to keep
-	 * track of where the urgent pointer is, because each T_EXDATA_REQ
-	 * "advances" the urgent pointer for us.
-	 *
-	 * The actual urgent data carried by T_EXDATA_REQ is then prepended
-	 * by a T_EXDATA_IND before being enqueued behind any existing data
-	 * destined for the receiving app.  There is only a single urgent
-	 * pointer (out-of-band mark) for a given tcp.  If the new urgent
-	 * data arrives before the receiving app reads some existing urgent
-	 * data, the previous marker is lost.  This behavior is emulated
-	 * accordingly below, by removing any existing T_EXDATA_IND messages
-	 * and essentially converting old urgent data into non-urgent.
-	 */
-	ASSERT(tcp->tcp_valid_bits & TCP_URG_VALID);
-	/* Let sender get out of urgent mode */
-	tcp->tcp_valid_bits &= ~TCP_URG_VALID;
-
-	/*
-	 * Send up SIGURG to the receiving peer; if the peer is detached
-	 * or if we can't allocate the M_PCSIG, indicate that we need to
-	 * signal upon draining to the peer by marking tcp_fused_sigurg.
-	 * This flag will only get cleared once SIGURG is delivered and
-	 * is not affected by the tcp_fused flag -- delivery will still
-	 * happen even after an endpoint is unfused, to handle the case
-	 * where the sending endpoint immediately closes/unfuses after
-	 * sending urgent data and the accept is not yet finished.
-	 */
-	if (!TCP_IS_DETACHED(peer_tcp) &&
-	    ((mp1 = allocb(1, BPRI_HI)) != NULL ||
-	    (mp1 = allocb_tryhard(1)) != NULL)) {
-		peer_tcp->tcp_fused_sigurg = B_FALSE;
-		/* Send up the signal */
-		DB_TYPE(mp1) = M_PCSIG;
-		*mp1->b_wptr++ = (uchar_t)SIGURG;
-		putnext(peer_tcp->tcp_rq, mp1);
-	} else {
-		peer_tcp->tcp_fused_sigurg = B_TRUE;
-	}
-
-	/* Reuse T_EXDATA_REQ mblk for T_EXDATA_IND */
-	DB_TYPE(mp) = M_PROTO;
-	tei = (struct T_exdata_ind *)mp->b_rptr;
-	tei->PRIM_type = T_EXDATA_IND;
-	tei->MORE_flag = 0;
-	mp->b_wptr = (uchar_t *)&tei[1];
-
-	TCP_STAT(tcp_fusion_urg);
-	BUMP_MIB(&tcp_mib, tcpOutUrg);
-
-	head = peer_tcp->tcp_rcv_list;
-	while (head != NULL) {
-		/*
-		 * Remove existing T_EXDATA_IND, keep the data which follows
-		 * it and relink our list.  Note that we don't modify the
-		 * tcp_rcv_last_tail since it never points to T_EXDATA_IND.
-		 */
-		if (DB_TYPE(head) != M_DATA) {
-			mp1 = head;
-
-			ASSERT(DB_TYPE(mp1->b_cont) == M_DATA);
-			head = mp1->b_cont;
-			mp1->b_cont = NULL;
-			head->b_next = mp1->b_next;
-			mp1->b_next = NULL;
-			if (prev_head != NULL)
-				prev_head->b_next = head;
-			if (peer_tcp->tcp_rcv_list == mp1)
-				peer_tcp->tcp_rcv_list = head;
-			if (peer_tcp->tcp_rcv_last_head == mp1)
-				peer_tcp->tcp_rcv_last_head = head;
-			freeb(mp1);
-		}
-		prev_head = head;
-		head = head->b_next;
-	}
-}
-
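
The while loop at the end of tcp_fuse_output_urg() splices stale T_EXDATA_IND wrappers out of the peer's receive list, promoting each wrapper's M_DATA continuation into its place so that old urgent data degrades into ordinary data. Stripped of the mblk details, it is the classic delete-while-walking pattern for a singly linked list. A self-contained sketch with markers reduced to a flag and no payload promotion (both simplifications relative to the code above):

    #include <stddef.h>

    typedef struct node {
        struct node *next;
        int is_marker;
    } node_t;

    static node_t *
    remove_markers(node_t *head)
    {
        node_t *prev = NULL, *n = head;

        while (n != NULL) {
            if (n->is_marker) {
                node_t *dead = n;
                n = n->next;
                if (prev == NULL)
                    head = n;       /* marker was the list head */
                else
                    prev->next = n;
                (void)dead;         /* caller would free/recycle here */
            } else {
                prev = n;
                n = n->next;
            }
        }
        return (head);
    }
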
-/*
- * Fusion output routine, called by tcp_output() and tcp_wput_proto().
- */
-static boolean_t
-tcp_fuse_output(tcp_t *tcp, mblk_t *mp)
-{
-	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
-	queue_t *peer_rq;
-	mblk_t *mp_tail = mp;
-	uint32_t send_size = 0;
-
-	ASSERT(tcp->tcp_fused);
-	ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
-	ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
-	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO ||
-	    DB_TYPE(mp) == M_PCPROTO);
-
-	peer_rq = peer_tcp->tcp_rq;
-
-	/* If this connection requires IP, unfuse and use regular path */
-	if (TCP_LOOPBACK_IP(tcp) || TCP_LOOPBACK_IP(peer_tcp) ||
-	    IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) {
-		TCP_STAT(tcp_fusion_aborted);
-		tcp_unfuse(tcp);
-		return (B_FALSE);
-	}
-
-	for (;;) {
-		if (DB_TYPE(mp_tail) == M_DATA)
-			send_size += MBLKL(mp_tail);
-		if (mp_tail->b_cont == NULL)
-			break;
-		mp_tail = mp_tail->b_cont;
-	}
-
-	if (send_size == 0) {
-		freemsg(mp);
-		return (B_TRUE);
-	}
-
-	/*
-	 * Handle urgent data; we either send up SIGURG to the peer now
-	 * or do it later when we drain, in case the peer is detached
-	 * or if we're short of memory for M_PCSIG mblk.
-	 */
-	if (DB_TYPE(mp) != M_DATA)
-		tcp_fuse_output_urg(tcp, mp);
-
-	/*
-	 * Enqueue data into the peer's receive list; we may or may not
-	 * drain the contents depending on the conditions below.
-	 */
-	tcp_rcv_enqueue(peer_tcp, mp, send_size);
-
-	/* In case it wrapped around and also to keep it constant */
-	peer_tcp->tcp_rwnd += send_size;
-
-	/*
-	 * If peer is detached, exercise flow-control when needed; we will
-	 * get back-enabled either in tcp_accept_finish() or tcp_unfuse().
-	 */
-	if (TCP_IS_DETACHED(peer_tcp) &&
-	    peer_tcp->tcp_rcv_cnt > peer_rq->q_hiwat) {
-		tcp_setqfull(tcp);
-		tcp->tcp_flow_stopped = B_TRUE;
-		TCP_STAT(tcp_fusion_flowctl);
-	}
-
-	loopback_packets++;
-	tcp->tcp_last_sent_len = send_size;
-
-	/* Need to adjust the following SNMP MIB-related variables */
-	tcp->tcp_snxt += send_size;
-	tcp->tcp_suna = tcp->tcp_snxt;
-	peer_tcp->tcp_rnxt += send_size;
-	peer_tcp->tcp_rack = peer_tcp->tcp_rnxt;
-
-	BUMP_MIB(&tcp_mib, tcpOutDataSegs);
-	UPDATE_MIB(&tcp_mib, tcpOutDataBytes, send_size);
-
-	BUMP_MIB(&tcp_mib, tcpInSegs);
-	BUMP_MIB(&tcp_mib, tcpInDataInorderSegs);
-	UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, send_size);
-
-	BUMP_LOCAL(tcp->tcp_obsegs);
-	BUMP_LOCAL(peer_tcp->tcp_ibsegs);
-
-	if (!TCP_IS_DETACHED(peer_tcp)) {
-		/*
-		 * If we can't send SIGURG above due to lack of memory,
-		 * schedule push timer and try again.  Otherwise drain
-		 * the data if we're not flow-controlled.
-		 */
-		if (peer_tcp->tcp_fused_sigurg) {
-			if (peer_tcp->tcp_push_tid == 0) {
-				peer_tcp->tcp_push_tid =
-				    TCP_TIMER(peer_tcp, tcp_push_timer,
-				    MSEC_TO_TICK(tcp_push_timer_interval));
-			}
-		} else if (!tcp->tcp_flow_stopped) {
-			if (!canputnext(peer_rq)) {
-				tcp_setqfull(tcp);
-				tcp->tcp_flow_stopped = B_TRUE;
-				TCP_STAT(tcp_fusion_flowctl);
-			} else {
-				ASSERT(peer_tcp->tcp_rcv_list != NULL);
-				(void) tcp_fuse_rcv_drain(peer_rq,
-				    peer_tcp, NULL);
-				TCP_STAT(tcp_fusion_putnext);
-			}
-		}
-	}
-	return (B_TRUE);
-}
-
-/*
- * This routine gets called to deliver data upstream on a fused or
- * previously fused tcp loopback endpoint; the latter happens only
- * when there is a pending SIGURG signal plus urgent data that could
- * not be sent upstream earlier.
- */
-static boolean_t
-tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp)
-{
-	mblk_t *mp;
-#ifdef DEBUG
-	uint_t cnt = 0;
-#endif
-
-	ASSERT(tcp->tcp_loopback);
-	ASSERT(tcp->tcp_fused || tcp->tcp_fused_sigurg);
-	ASSERT(!tcp->tcp_fused || tcp->tcp_loopback_peer != NULL);
-	ASSERT(sigurg_mpp != NULL || tcp->tcp_fused);
-
-	/* No need for the push timer now, in case it was scheduled */
-	if (tcp->tcp_push_tid != 0) {
-		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
-		tcp->tcp_push_tid = 0;
-	}
-	/*
-	 * If there's urgent data sitting in receive list and we didn't
-	 * get a chance to send up a SIGURG signal, make sure we send
-	 * it first before draining in order to ensure that SIOCATMARK
-	 * works properly.
-	 */
-	if (tcp->tcp_fused_sigurg) {
-		/*
-		 * sigurg_mpp is normally NULL, i.e. when we're still
-		 * fused and didn't get here because of tcp_unfuse().
-		 * In this case try hard to allocate the M_PCSIG mblk.
-		 */
-		if (sigurg_mpp == NULL &&
-		    (mp = allocb(1, BPRI_HI)) == NULL &&
-		    (mp = allocb_tryhard(1)) == NULL) {
-			/* Alloc failed; try again next time */
-			tcp->tcp_push_tid = TCP_TIMER(tcp, tcp_push_timer,
-			    MSEC_TO_TICK(tcp_push_timer_interval));
-			return (B_TRUE);
-		} else if (sigurg_mpp != NULL) {
-			/*
-			 * Use the supplied M_PCSIG mblk; it means we're
-			 * either unfused or in the process of unfusing,
-			 * and the drain must happen now.
-			 */
-			mp = *sigurg_mpp;
-			*sigurg_mpp = NULL;
-		}
-		ASSERT(mp != NULL);
-
-		tcp->tcp_fused_sigurg = B_FALSE;
-		/* Send up the signal */
-		DB_TYPE(mp) = M_PCSIG;
-		*mp->b_wptr++ = (uchar_t)SIGURG;
-		putnext(q, mp);
-		/*
-		 * Let the regular tcp_rcv_drain() path handle
-		 * draining the data if we're no longer fused.
-		 */
-		if (!tcp->tcp_fused)
-			return (B_FALSE);
-	}
-
-	/* Drain the data */
-	while ((mp = tcp->tcp_rcv_list) != NULL) {
-		tcp->tcp_rcv_list = mp->b_next;
-		mp->b_next = NULL;
-#ifdef DEBUG
-		cnt += msgdsize(mp);
-#endif
-		putnext(q, mp);
-	}
-
-	ASSERT(cnt == tcp->tcp_rcv_cnt);
-	tcp->tcp_rcv_last_head = NULL;
-	tcp->tcp_rcv_last_tail = NULL;
-	tcp->tcp_rcv_cnt = 0;
-	tcp->tcp_rwnd = q->q_hiwat;
-
-	return (B_TRUE);
-}
-
-/*
- * This is the walker function, which is TCP specific.
- * It walks through the conn_hash bucket searching for the
- * next valid connp/tcp in the list, selecting connp/tcp
- * which haven't closed or condemned. It also REFHOLDS the
- * reference for the tcp, ensuring that the tcp exists
- * when the caller uses the tcp.
- *
- * tcp_get_next_conn
- * 	get the next entry in the conn global list
- * 	and put a reference on the next_conn.
- * 	decrement the reference on the current conn.
- */
-conn_t *
-tcp_get_next_conn(connf_t *connfp, conn_t *connp)
-{
-	conn_t	*next_connp;
-
-	if (connfp == NULL)
-		return (NULL);
-
-	mutex_enter(&connfp->connf_lock);
-
-	next_connp = (connp == NULL) ?
-	    connfp->connf_head : connp->conn_g_next;
-
-	while (next_connp != NULL) {
-		mutex_enter(&next_connp->conn_lock);
-		if ((next_connp->conn_state_flags &
-		    (CONN_CONDEMNED | CONN_INCIPIENT)) ||
-			!IPCL_IS_TCP(next_connp)) {
-			/*
-			 * This conn has been condemned or
-			 * is closing.
-			 */
-			mutex_exit(&next_connp->conn_lock);
-			next_connp = next_connp->conn_g_next;
-			continue;
-		}
-		ASSERT(next_connp->conn_tcp != NULL);
-		CONN_INC_REF_LOCKED(next_connp);
-		mutex_exit(&next_connp->conn_lock);
-		break;
-	}
-
-	mutex_exit(&connfp->connf_lock);
-
-	if (connp != NULL) {
-		CONN_DEC_REF(connp);
-	}
-
-	return (next_connp);
-}
-
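
The walker removed above is replaced later in this diff by ipcl_get_next_conn(connfp, connp, IPCL_TCP), which generalizes it with a conn-class argument. The pattern either way: under the bucket lock, advance from the current element, skip condemned entries, take a reference on the survivor, then drop the caller's reference on the element it arrived with. A user-space analogue with hypothetical types; unlike the original, which drops the old reference outside the bucket lock via CONN_DEC_REF, this sketch folds both reference operations under one lock for brevity:

    #include <pthread.h>
    #include <stddef.h>

    typedef struct conn {
        struct conn *next;
        int refcnt;             /* protected by the bucket lock here */
        int condemned;
    } conn_t;

    typedef struct bucket {
        pthread_mutex_t lock;
        conn_t *head;
    } bucket_t;

    static conn_t *
    next_conn(bucket_t *b, conn_t *cur)
    {
        conn_t *n;

        pthread_mutex_lock(&b->lock);
        n = (cur == NULL) ? b->head : cur->next;
        while (n != NULL && n->condemned)
            n = n->next;
        if (n != NULL)
            n->refcnt++;        /* hold it for the caller */
        if (cur != NULL)
            cur->refcnt--;      /* release the previous hold */
        pthread_mutex_unlock(&b->lock);
        return (n);
    }
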
 /*
  * Figure out the value of the window scale option.  Note that the rwnd is
  * ASSUMED to be rounded up to the nearest MSS before the calculation.
@@ -2808,7 +1950,7 @@
 		acceptor = tcp_acceptor_hash_lookup(acceptor_id);
 		if (acceptor == NULL) {
 			if (listener->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_ERROR|SL_TRACE,
 				    "tcp_accept: did not find acceptor 0x%x\n",
 				    acceptor_id);
@@ -3737,7 +2879,7 @@
 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 			    "tcp_bind: bad req, len %u",
 			    (uint_t)(mp->b_wptr - mp->b_rptr));
 		}
@@ -3768,7 +2910,7 @@
 			goto do_bind;
 		}
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 			    "tcp_bind: bad state, %d", tcp->tcp_state);
 		}
 		tcp_err_ack(tcp, mp, TOUTSTATE, 0);
@@ -3805,7 +2947,7 @@
 		    sizeof (sin_t));
 		if (sin == NULL || !OK_32PTR((char *)sin)) {
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_ERROR|SL_TRACE,
 				    "tcp_bind: bad address parameter, "
 				    "offset %d, len %d",
@@ -3835,7 +2977,7 @@
 		    tbr->ADDR_offset, sizeof (sin6_t));
 		if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_ERROR|SL_TRACE,
 				    "tcp_bind: bad IPv6 address parameter, "
 				    "offset %d, len %d", tbr->ADDR_offset,
@@ -3857,7 +2999,7 @@
 
 	default:
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 			    "tcp_bind: bad address length, %d",
 			    tbr->ADDR_length);
 		}
@@ -3945,7 +3087,7 @@
 
 			if (secpolicy_net_privaddr(cr, requested_port) != 0) {
 				if (tcp->tcp_debug) {
-					(void) strlog(TCP_MODULE_ID, 0, 1,
+					(void) strlog(TCP_MOD_ID, 0, 1,
 					    SL_ERROR|SL_TRACE,
 					    "tcp_bind: no priv for port %d",
 					    requested_port);
@@ -3963,7 +3105,7 @@
 	if (allocated_port == 0) {
 		if (bind_to_req_port_only) {
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_ERROR|SL_TRACE,
 				    "tcp_bind: requested addr busy");
 			}
@@ -3971,7 +3113,7 @@
 		} else {
 			/* If we are out of ports, fail the bind. */
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_ERROR|SL_TRACE,
 				    "tcp_bind: out of ports?");
 			}
@@ -4436,7 +3578,7 @@
 			(void) putnextctl1(q, M_FLUSH, FLUSHR);
 		}
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE|SL_ERROR,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
 			    "tcp_clean_death: discon err %d", err);
 		}
 		mp = mi_tpi_discon_ind(NULL, err, 0);
@@ -4444,7 +3586,7 @@
 			putnext(q, mp);
 		} else {
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_ERROR|SL_TRACE,
 				    "tcp_clean_death, sending M_ERROR");
 			}
@@ -4476,7 +3618,6 @@
 	if (tcp->tcp_state > TCPS_LISTEN) {
 		tcp_acceptor_hash_remove(tcp);
 		if (tcp->tcp_flow_stopped) {
-			tcp->tcp_flow_stopped = B_FALSE;
 			tcp_clrqfull(tcp);
 		}
 
@@ -4621,23 +3762,6 @@
 	return (0);
 }
 
-int
-tcp_modclose(queue_t *q)
-{
-	conn_t *connp = Q_TO_CONN(q);
-	ASSERT((connp->conn_flags & IPCL_TCPMOD) != 0);
-
-	qprocsoff(q);
-
-	if (connp->conn_cred != NULL) {
-		crfree(connp->conn_cred);
-		connp->conn_cred = NULL;
-	}
-	CONN_DEC_REF(connp);
-	q->q_ptr = WR(q)->q_ptr = NULL;
-	return (0);
-}
-
 static int
 tcpclose_accept(queue_t *q)
 {
@@ -4798,7 +3922,6 @@
 		tcp_acceptor_hash_remove(tcp);
 
 		if (tcp->tcp_flow_stopped) {
-			tcp->tcp_flow_stopped = B_FALSE;
 			tcp_clrqfull(tcp);
 		}
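
This hunk and several later ones drop the manual tcp_flow_stopped assignment that used to sit next to tcp_clrqfull()/tcp_setqfull(); the flag maintenance has presumably been folded into the helpers themselves, which keeps the flag and the actual queue-full state from drifting apart. The invariant, as a sketch with hypothetical names:

    #include <stdbool.h>

    struct ep { bool flow_stopped; };

    static void
    set_qfull(struct ep *e)
    {
        if (!e->flow_stopped) {
            e->flow_stopped = true;
            /* mark the queue full / assert backpressure here */
        }
    }

    static void
    clr_qfull(struct ep *e)
    {
        if (e->flow_stopped) {
            e->flow_stopped = false;
            /* backenable the queue here */
        }
    }
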
 
@@ -4922,7 +4045,7 @@
 /*
  * Stop all TCP timers, and free the timer mblks if requested.
  */
-static void
+void
 tcp_timers_stop(tcp_t *tcp)
 {
 	if (tcp->tcp_timer_tid != 0) {
@@ -5285,7 +4408,7 @@
 		return (B_FALSE);
 
 	if (tcp->tcp_debug) {
-		(void) strlog(TCP_MODULE_ID, 0, 3, SL_TRACE,
+		(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
 		    "tcp_drop_q0: listen half-open queue (max=%d) overflow"
 		    " (%d pending) on %s, drop one", tcp_conn_req_max_q0,
 		    tcp->tcp_conn_req_cnt_q0,
@@ -5371,8 +4494,8 @@
 		connp->conn_remv6 = ip6h->ip6_src;
 
 		/* db_cksumstuff is set at ip_fanout_tcp_v6 */
-		ifindex = (int)mp->b_datap->db_cksumstuff;
-		mp->b_datap->db_cksumstuff = 0;
+		ifindex = (int)DB_CKSUMSTUFF(mp);
+		DB_CKSUMSTUFF(mp) = 0;
 
 		sin6 = sin6_null;
 		sin6.sin6_addr = ip6h->ip6_src;
@@ -5727,8 +4850,8 @@
 		mp->b_datap->db_struioflag &= ~STRUIO_POLICY;
 	}
 
-	new_sqp = (squeue_t *)mp->b_datap->db_cksumstart;
-	mp->b_datap->db_cksumstart = 0;
+	new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
+	DB_CKSUMSTART(mp) = 0;
 
 	ASSERT(OK_32PTR(mp->b_rptr));
 	ipvers = IPH_HDR_VERSION(mp->b_rptr);
@@ -6012,7 +5135,7 @@
 		TCP_STAT(tcp_listendrop);
 		BUMP_MIB(&tcp_mib, tcpListenDrop);
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE|SL_ERROR,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
 			    "tcp_conn_request: listen backlog (max=%d) "
 			    "overflow (%d pending) on %s",
 			    tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q,
@@ -6037,7 +5160,7 @@
 			mutex_exit(&tcp->tcp_eager_lock);
 			BUMP_MIB(&tcp_mib, tcpListenDropQ0);
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 3, SL_TRACE,
+				(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
 				    "tcp_conn_request: listen half-open queue "
 				    "(max=%d) full (%d pending) on %s",
 				    tcp_conn_req_max_q0,
@@ -6058,8 +5181,8 @@
 	 * otherwise an error case if neither of them is set.
 	 */
 	if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
-		new_sqp = (squeue_t *)mp->b_datap->db_cksumstart;
-		mp->b_datap->db_cksumstart = 0;
+		new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
+		DB_CKSUMSTART(mp) = 0;
 		mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
 		econnp = (conn_t *)tcp_get_conn(arg2);
 		if (econnp == NULL)
@@ -6420,7 +5543,7 @@
 	uint32_t	conn_flags;
 
 	if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
-		new_sqp = (squeue_t *)mp->b_datap->db_cksumstart;
+		new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
 	} else {
 		goto done;
 	}
@@ -7174,7 +6297,7 @@
 	 */
 	if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) {
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 			    "tcp_disconnect: bad state, %d", tcp->tcp_state);
 		}
 		tcp_err_ack(tcp, mp, TOUTSTATE, 0);
@@ -7988,10 +7111,6 @@
 	/* Cancel outstanding timers */
 	tcp_timers_stop(tcp);
 
-	if (tcp->tcp_flow_stopped) {
-		tcp->tcp_flow_stopped = B_FALSE;
-		tcp_clrqfull(tcp);
-	}
 	/*
 	 * Reset everything in the state vector, after updating global
 	 * MIB data from instance counters.
@@ -8006,6 +7125,10 @@
 		tcp_zcopy_notify(tcp);
 	tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL;
 	tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;
+	if (tcp->tcp_flow_stopped &&
+	    TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+		tcp_clrqfull(tcp);
+	}
 	tcp_close_mpp(&tcp->tcp_reass_head);
 	tcp->tcp_reass_tail = NULL;
 	if (tcp->tcp_rcv_list != NULL) {
@@ -8193,7 +7316,6 @@
 	tcp->tcp_fin_sent = 0;
 	tcp->tcp_ordrel_done = 0;
 
-	ASSERT(tcp->tcp_flow_stopped == 0);
 	tcp->tcp_debug = 0;
 	tcp->tcp_dontroute = 0;
 	tcp->tcp_broadcast = 0;
@@ -8390,14 +7512,22 @@
 	ASSERT(tcp->tcp_rthdrlen == 0);
 	PRESERVE(tcp->tcp_drop_opt_ack_cnt);
 
+	/* Reset fusion-related fields */
 	tcp->tcp_fused = B_FALSE;
 	tcp->tcp_unfusable = B_FALSE;
 	tcp->tcp_fused_sigurg = B_FALSE;
+	tcp->tcp_direct_sockfs = B_FALSE;
+	tcp->tcp_fuse_syncstr_stopped = B_FALSE;
 	tcp->tcp_loopback_peer = NULL;
+	tcp->tcp_fuse_rcv_hiwater = 0;
+	tcp->tcp_fuse_rcv_unread_hiwater = 0;
+	tcp->tcp_fuse_rcv_unread_cnt = 0;
 
 	tcp->tcp_in_ack_unsent = 0;
 	tcp->tcp_cork = B_FALSE;
 
+	PRESERVE(tcp->tcp_squeue_bytes);
+
 #undef	DONTCARE
 #undef	PRESERVE
 }
@@ -8469,10 +7599,16 @@
 	tcp->tcp_mdt_hdr_head = 0;
 	tcp->tcp_mdt_hdr_tail = 0;
 
+	/* Reset fusion-related fields */
 	tcp->tcp_fused = B_FALSE;
 	tcp->tcp_unfusable = B_FALSE;
 	tcp->tcp_fused_sigurg = B_FALSE;
+	tcp->tcp_direct_sockfs = B_FALSE;
+	tcp->tcp_fuse_syncstr_stopped = B_FALSE;
 	tcp->tcp_loopback_peer = NULL;
+	tcp->tcp_fuse_rcv_hiwater = 0;
+	tcp->tcp_fuse_rcv_unread_hiwater = 0;
+	tcp->tcp_fuse_rcv_unread_cnt = 0;
 
 	/* Initialize the header template */
 	if (tcp->tcp_ipversion == IPV4_VERSION) {
@@ -9505,7 +8641,7 @@
 	    MSEC_TO_TICK(firetime));
 }
 
-static int
+int
 tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
 {
 	queue_t	*q = tcp->tcp_rq;
@@ -9515,7 +8651,10 @@
 	if (TCP_IS_DETACHED(tcp))
 		return (mss);
 
-	if (tcp->tcp_mdt || tcp->tcp_maxpsz == 0) {
+	if (tcp->tcp_fused) {
+		maxpsz = tcp_fuse_maxpsz_set(tcp);
+		mss = INFPSZ;
+	} else if (tcp->tcp_mdt || tcp->tcp_maxpsz == 0) {
 		/*
 		 * Set the sd_qn_maxpsz according to the socket send buffer
 		 * size, and sd_maxblk to INFPSZ (-1).  This will essentially
@@ -9545,9 +8684,6 @@
 	if (set_maxblk)
 		(void) mi_set_sth_maxblk(q, mss);
 
-	if (tcp->tcp_loopback)
-		(void) mi_set_sth_copyopt(tcp->tcp_rq, COPYCACHED);
-
 	return (mss);
 }
 
@@ -9868,7 +9004,6 @@
 		 */
 		connp->conn_flags |= IPCL_SOCKET;
 		tcp->tcp_issocket = 1;
-
 		WR(q)->q_qinfo = &tcp_sock_winit;
 	} else {
 #ifdef	_ILP32
@@ -10452,32 +9587,45 @@
 			if (!checkonly)
 				tcp->tcp_dgram_errind = onoff;
 			break;
-		case SO_SNDBUF:
+		case SO_SNDBUF: {
+			tcp_t *peer_tcp;
+
 			if (*i1 > tcp_max_buf) {
 				*outlenp = 0;
 				return (ENOBUFS);
 			}
-			if (!checkonly) {
-				tcp->tcp_xmit_hiwater = *i1;
-				if (tcp_snd_lowat_fraction != 0)
-					tcp->tcp_xmit_lowater =
-					    tcp->tcp_xmit_hiwater /
-					    tcp_snd_lowat_fraction;
-				(void) tcp_maxpsz_set(tcp, B_TRUE);
-				/*
-				 * If we are flow-controlled, recheck the
-				 * condition. There are apps that increase
-				 * SO_SNDBUF size when flow-controlled
-				 * (EWOULDBLOCK), and expect the flow control
-				 * condition to be lifted right away.
-				 */
-				if (tcp->tcp_flow_stopped &&
-				    tcp->tcp_unsent < tcp->tcp_xmit_hiwater) {
-					tcp->tcp_flow_stopped = B_FALSE;
-					tcp_clrqfull(tcp);
-				}
-			}
-			break;
+			if (checkonly)
+				break;
+
+			tcp->tcp_xmit_hiwater = *i1;
+			if (tcp_snd_lowat_fraction != 0)
+				tcp->tcp_xmit_lowater =
+				    tcp->tcp_xmit_hiwater /
+				    tcp_snd_lowat_fraction;
+			(void) tcp_maxpsz_set(tcp, B_TRUE);
+			/*
+			 * If we are flow-controlled, recheck the condition.
+			 * There are apps that increase SO_SNDBUF size when
+			 * flow-controlled (EWOULDBLOCK), and expect the flow
+			 * control condition to be lifted right away.
+			 *
+			 * For the fused tcp loopback case, in order to avoid
+			 * a race with the peer's tcp_fuse_rrw() we need to
+			 * hold its fuse_lock while accessing tcp_flow_stopped.
+			 */
+			peer_tcp = tcp->tcp_loopback_peer;
+			ASSERT(!tcp->tcp_fused || peer_tcp != NULL);
+			if (tcp->tcp_fused)
+				mutex_enter(&peer_tcp->tcp_fuse_lock);
+
+			if (tcp->tcp_flow_stopped &&
+			    TCP_UNSENT_BYTES(tcp) < tcp->tcp_xmit_hiwater) {
+				tcp_clrqfull(tcp);
+			}
+			if (tcp->tcp_fused)
+				mutex_exit(&peer_tcp->tcp_fuse_lock);
+			break;
+		}
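
The rewritten SO_SNDBUF case rechecks the flow-control condition after the send buffer grows, and for fused loopback it takes the peer's tcp_fuse_lock so the check cannot race with the peer's tcp_fuse_rrw(). A user-space analogue of that shape, with a pthread mutex standing in for the fuse lock and all names hypothetical:

    #include <pthread.h>
    #include <stdbool.h>

    struct ep {
        pthread_mutex_t fuse_lock;
        bool flow_stopped;
        unsigned unsent, xmit_hiwater;
        struct ep *peer;        /* NULL when not fused */
    };

    static void
    sndbuf_grew(struct ep *e, unsigned hiwater)
    {
        e->xmit_hiwater = hiwater;
        if (e->peer != NULL)
            pthread_mutex_lock(&e->peer->fuse_lock);
        if (e->flow_stopped && e->unsent < e->xmit_hiwater)
            e->flow_stopped = false;    /* lift flow control now */
        if (e->peer != NULL)
            pthread_mutex_unlock(&e->peer->fuse_lock);
    }
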
 		case SO_RCVBUF:
 			if (*i1 > tcp_max_buf) {
 				*outlenp = 0;
@@ -11892,7 +11040,7 @@
  * M_DATA messages are added to the current element.
  * Other messages are added as new (b_next) elements.
  */
-static void
+void
 tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len)
 {
 	ASSERT(seg_len == msgdsize(mp));
@@ -12380,7 +11528,7 @@
 		BUMP_MIB(&ip_mib, ipsecInSucceeded);
 		return (B_TRUE);
 	}
-	(void) strlog(TCP_MODULE_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE,
+	(void) strlog(TCP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE,
 	    "tcp inbound policy mismatch: %s, packet dropped\n",
 	    reason);
 	BUMP_MIB(&ip_mib, ipsecInFailed);
@@ -13469,7 +12617,7 @@
 			 */
 			seg_len -= gap;
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+				(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 				    "tcp_rput: unacceptable, gap %d, rgap %d, "
 				    "flags 0x%x, seg_seq %u, seg_ack %u, "
 				    "seg_len %d, rnxt %u, snxt %u, %s",
@@ -13873,7 +13021,7 @@
 			tcp->tcp_urp_mark_mp = mp1;
 			flags |= TH_SEND_URP_MARK;
 #ifdef DEBUG
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 			    "tcp_rput: sent M_PCSIG 2 seq %x urp %x "
 			    "last %x, %s",
 			    seg_seq, urp, tcp->tcp_urp_last,
@@ -14012,7 +13160,7 @@
 				mp1->b_wptr = (uchar_t *)&tei[1];
 				tcp->tcp_urp_mp = mp1;
 #ifdef DEBUG
-				(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+				(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 				    "tcp_rput: allocated exdata_ind %s",
 				    tcp_display(tcp, NULL,
 				    DISP_PORT_ONLY));
@@ -14059,7 +13207,7 @@
 				tcp->tcp_urp_mark_mp->b_flag |= MSGMARKNEXT;
 			}
 #ifdef DEBUG
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 			    "tcp_rput: AT MARK, len %d, flags 0x%x, %s",
 			    seg_len, flags,
 			    tcp_display(tcp, NULL, DISP_PORT_ONLY));
@@ -14067,7 +13215,7 @@
 		} else {
 			/* Data left until we hit mark */
 #ifdef DEBUG
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 			    "tcp_rput: URP %d bytes left, %s",
 			    urp - seg_len, tcp_display(tcp, NULL,
 			    DISP_PORT_ONLY));
@@ -14990,7 +14138,7 @@
 		/* Ready for a new signal. */
 		tcp->tcp_urp_last_valid = B_FALSE;
 #ifdef DEBUG
-		(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 		    "tcp_rput: sending exdata_ind %s",
 		    tcp_display(tcp, NULL, DISP_PORT_ONLY));
 #endif /* DEBUG */
@@ -15026,7 +14174,7 @@
 			    tcp->tcp_fused_sigurg);
 			if (flags & TH_MARKNEXT_NEEDED) {
 #ifdef DEBUG
-				(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+				(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 				    "tcp_rput: sending MSGMARKNEXT %s",
 				    tcp_display(tcp, NULL,
 				    DISP_PORT_ONLY));
@@ -15167,7 +14315,7 @@
 		mp1 = tcp->tcp_urp_mark_mp;
 		tcp->tcp_urp_mark_mp = NULL;
 #ifdef DEBUG
-		(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 		    "tcp_rput: sending zero-length %s %s",
 		    ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" :
 		    "MSGNOTMARKNEXT"),
@@ -15853,7 +15001,7 @@
 			return;
 		case T_ERROR_ACK:
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_TRACE|SL_ERROR,
 				    "tcp_rput_other: case T_ERROR_ACK, "
 				    "ERROR_prim == %d",
@@ -15984,11 +15132,20 @@
 		ASSERT(tcp->tcp_connp->conn_sqp ==
 		    peer_tcp->tcp_connp->conn_sqp);
 
+		/*
+		 * Normally we would not get backenabled in synchronous
+		 * streams mode, but in case this happens, we need to stop
+		 * synchronous streams temporarily to prevent a race with
+		 * tcp_fuse_rrw() or tcp_fuse_rinfop().  It is safe to access
+		 * tcp_rcv_list here because those entry points will return
+		 * right away when synchronous streams is stopped.
+		 */
+		TCP_FUSE_SYNCSTR_STOP(tcp);
 		if (tcp->tcp_rcv_list != NULL)
 			(void) tcp_rcv_drain(tcp->tcp_rq, tcp);
 
 		tcp_clrqfull(peer_tcp);
-		peer_tcp->tcp_flow_stopped = B_FALSE;
+		TCP_FUSE_SYNCSTR_RESUME(tcp);
 		TCP_STAT(tcp_fusion_backenabled);
 		return;
 	}
@@ -16118,6 +15275,30 @@
 	uint32_t	max_transmittable_rwnd;
 	boolean_t	tcp_detached = TCP_IS_DETACHED(tcp);
 
+	if (tcp->tcp_fused) {
+		size_t sth_hiwat;
+		tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+		ASSERT(peer_tcp != NULL);
+		/*
+		 * Record the stream head's high water mark for
+		 * this endpoint; this is used for flow-control
+		 * purposes in tcp_fuse_output().
+		 */
+		sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd);
+		if (!tcp_detached)
+			(void) mi_set_sth_hiwat(tcp->tcp_rq, sth_hiwat);
+
+		/*
+		 * In the fusion case, the maxpsz stream head value of
+		 * our peer is set according to its send buffer size
+		 * and our receive buffer size; since the latter may
+		 * have changed we need to update the peer's maxpsz.
+		 */
+		(void) tcp_maxpsz_set(peer_tcp, B_TRUE);
+		return (rwnd);
+	}
+
 	if (tcp_detached)
 		old_max_rwnd = tcp->tcp_rwnd;
 	else
@@ -16196,23 +15377,16 @@
 	 * Set the Stream head high water mark. This doesn't have to be
 	 * here, since we are simply using default values, but we would
 	 * prefer to choose these values algorithmically, with a likely
-	 * relationship to rwnd.  For fused loopback tcp, we double the
-	 * amount of buffer in order to simulate the normal tcp case.
-	 */
-	if (tcp->tcp_fused) {
-		(void) mi_set_sth_hiwat(tcp->tcp_rq, MAX(rwnd << 1,
-		    tcp_sth_rcv_hiwat));
-	} else {
-		(void) mi_set_sth_hiwat(tcp->tcp_rq, MAX(rwnd,
-		    tcp_sth_rcv_hiwat));
-	}
+	 * relationship to rwnd.
+	 */
+	(void) mi_set_sth_hiwat(tcp->tcp_rq, MAX(rwnd, tcp_sth_rcv_hiwat));
 	return (rwnd);
 }
 
 /*
  * Return SNMP stuff in the buffer in mpdata.
  */
-static int
+int
 tcp_snmp_get(queue_t *q, mblk_t *mpctl)
 {
 	mblk_t			*mpdata;
@@ -16261,7 +15435,8 @@
 
 		connp = NULL;
 
-		while ((connp = tcp_get_next_conn(connfp, connp))) {
+		while ((connp =
+		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
 			tcp_t *tcp;
 
 			if (connp->conn_zoneid != zoneid)
@@ -16406,7 +15581,7 @@
 
 /* Return 0 if invalid set request, 1 otherwise, including non-tcp requests  */
 /* ARGSUSED */
-static int
+int
 tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len)
 {
 	mib2_tcpConnEntry_t	*tce = (mib2_tcpConnEntry_t *)ptr;
@@ -16627,7 +15802,8 @@
 
 		connp = NULL;
 
-		while ((connp = tcp_get_next_conn(connfp, connp))) {
+		while ((connp =
+		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
 			tcp = connp->conn_tcp;
 			if (zoneid != GLOBAL_ZONEID &&
 			    zoneid != connp->conn_zoneid)
@@ -16723,7 +15899,8 @@
 	for (i = 0; i < ipcl_bind_fanout_size; i++) {
 		connfp =  &ipcl_bind_fanout[i];
 		connp = NULL;
-		while ((connp = tcp_get_next_conn(connfp, connp))) {
+		while ((connp =
+		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
 			tcp = connp->conn_tcp;
 			if (zoneid != GLOBAL_ZONEID &&
 			    zoneid != connp->conn_zoneid)
@@ -16770,7 +15947,8 @@
 	for (i = 0; i < ipcl_conn_fanout_size; i++) {
 		connfp =  &ipcl_conn_fanout[i];
 		connp = NULL;
-		while ((connp = tcp_get_next_conn(connfp, connp))) {
+		while ((connp =
+		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
 			tcp = connp->conn_tcp;
 			if (zoneid != GLOBAL_ZONEID &&
 			    zoneid != connp->conn_zoneid)
@@ -16927,7 +16105,7 @@
 			 */
 			if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
 				if (tcp->tcp_debug) {
-					(void) strlog(TCP_MODULE_ID, 0, 1,
+					(void) strlog(TCP_MOD_ID, 0, 1,
 					    SL_TRACE, "tcp_timer: zero win");
 				}
 			} else {
@@ -17040,7 +16218,7 @@
 		return;
 	default:
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE|SL_ERROR,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
 			    "tcp_timer: strange state (%d) %s",
 			    tcp->tcp_state, tcp_display(tcp, NULL,
 			    DISP_PORT_ONLY));
@@ -17372,52 +16550,6 @@
 }
 
 /*
- * Write side put procedure for TCP module instance.
- * TCP as a module is only used for MIB browsers that push TCP over IP or
- * ARP. The only supported primitives are T_SVR4_OPTMGMT_REQ and
- * T_OPTMGMT_REQ. M_FLUSH messages are only passed downstream; we don't flush
- * our queues as we never enqueue messages there. All ioctls are NAKed and
- * everything else is freed.
- */
-static void
-tcp_wput_mod(queue_t *q, mblk_t *mp)
-{
-	switch (DB_TYPE(mp)) {
-	case M_PROTO:
-	case M_PCPROTO:
-		if ((MBLKL(mp) >= sizeof (t_scalar_t)) &&
-		    ((((union T_primitives *)mp->b_rptr)->type ==
-			T_SVR4_OPTMGMT_REQ) ||
-		    (((union T_primitives *)mp->b_rptr)->type ==
-			T_OPTMGMT_REQ))) {
-			/*
-			 * This is the only TPI primitive supported. Its
-			 * handling does not require tcp_t, but it does require
-			 * conn_t to check permissions.
-			 */
-			cred_t	*cr = DB_CREDDEF(mp, Q_TO_CONN(q)->conn_cred);
-			if (!snmpcom_req(q, mp, tcp_snmp_set,
-			    tcp_snmp_get, cr)) {
-				freemsg(mp);
-				return;
-			}
-		} else if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, ENOTSUP))
-		    != NULL)
-			qreply(q, mp);
-		break;
-	case M_FLUSH:
-		putnext(q, mp);
-		break;
-	case M_IOCTL:
-		miocnak(q, mp, 0, ENOTSUP);
-		break;
-	default:
-		freemsg(mp);
-		break;
-	}
-}
-
-/*
  * The TCP fast path write put procedure.
  * NOTE: the logic of the fast path is duplicated from tcp_wput_data()
  */
@@ -17441,6 +16573,7 @@
 	int		usable;
 	conn_t		*connp = (conn_t *)arg;
 	tcp_t		*tcp = connp->conn_tcp;
+	uint32_t	msize;
 
 	/*
 	 * Try and ASSERT the minimum possible references on the
@@ -17455,8 +16588,15 @@
 	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
 
 	/* Bypass tcp protocol for fused tcp loopback */
-	if (tcp->tcp_fused && tcp_fuse_output(tcp, mp))
-		return;
+	if (tcp->tcp_fused) {
+		msize = msgdsize(mp);
+		mutex_enter(&connp->conn_lock);
+		tcp->tcp_squeue_bytes -= msize;
+		mutex_exit(&connp->conn_lock);
+
+		if (tcp_fuse_output(tcp, mp, msize))
+			return;
+	}
 
 	mss = tcp->tcp_mss;
 	if (tcp->tcp_xmit_zc_clean)
@@ -17482,6 +16622,11 @@
 	    (len == 0) ||
 	    (len > mss) ||
 	    (tcp->tcp_valid_bits != 0)) {
+		msize = msgdsize(mp);
+		mutex_enter(&connp->conn_lock);
+		tcp->tcp_squeue_bytes -= msize;
+		mutex_exit(&connp->conn_lock);
+
 		tcp_wput_data(tcp, mp, B_FALSE);
 		return;
 	}
@@ -17489,6 +16634,10 @@
 	ASSERT(tcp->tcp_xmit_tail_unsent == 0);
 	ASSERT(tcp->tcp_fin_sent == 0);
 
+	mutex_enter(&connp->conn_lock);
+	tcp->tcp_squeue_bytes -= len;
+	mutex_exit(&connp->conn_lock);
+
 	/* queue new packet onto retransmission queue */
 	if (tcp->tcp_xmit_head == NULL) {
 		tcp->tcp_xmit_head = mp;
@@ -17536,6 +16685,11 @@
 		goto slow;
 	}
 
+	if (tcp->tcp_flow_stopped &&
+	    TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+		tcp_clrqfull(tcp);
+	}
+
 	/*
 	 * determine if anything to send (Nagle).
 	 *
@@ -17789,6 +16943,13 @@
 	mp = NULL;
 
 	/*
+	 * For a loopback connection with tcp_direct_sockfs on, note that
+	 * we don't have to protect tcp_rcv_list yet because synchronous
+	 * streams has not yet been enabled and tcp_fuse_rrw() cannot
+	 * possibly race with us.
+	 */
+
+	/*
 	 * Set the max window size (tcp_rq->q_hiwat) of the acceptor
 	 * properly.  This is the first time we know of the acceptor's
 	 * queue.  So we do it here.
@@ -17828,9 +16989,8 @@
 	/* Allocate room for SACK options if needed. */
 	stropt->so_flags |= SO_WROFF;
 	if (tcp->tcp_fused) {
-		size_t sth_hiwat;
-
 		ASSERT(tcp->tcp_loopback);
+		ASSERT(tcp->tcp_loopback_peer != NULL);
 		/*
 		 * For fused tcp loopback, set the stream head's write
 		 * offset value to zero since we won't be needing any room
@@ -17839,16 +16999,16 @@
 		 * Non-fused tcp loopback case is handled separately below.
 		 */
 		stropt->so_wroff = 0;
-
-		/*
-		 * Override q_hiwat and set it to be twice that of the
-		 * previous value; this is to simulate non-fusion case.
-		 */
-		sth_hiwat = q->q_hiwat << 1;
-		if (sth_hiwat > tcp_max_buf)
-			sth_hiwat = tcp_max_buf;
-
-		stropt->so_hiwat = MAX(sth_hiwat, tcp_sth_rcv_hiwat);
+		/*
+		 * Record the stream head's high water mark for this endpoint;
+		 * this is used for flow-control purposes in tcp_fuse_output().
+		 */
+		stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(tcp, q->q_hiwat);
+		/*
+		 * Update the peer's transmit parameters according to
+		 * our recently calculated high water mark value.
+		 */
+		(void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE);
 	} else if (tcp->tcp_snd_sack_ok) {
 		stropt->so_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN +
 		    (tcp->tcp_loopback ? 0 : tcp_wroff_xtra);
@@ -17857,15 +17017,6 @@
 		    tcp_wroff_xtra);
 	}
 
-	/*
-	 * If loopback, set COPYCACHED option to make sure NOT to use
-	 * non-temporal access.
-	 */
-	if (tcp->tcp_loopback) {
-		stropt->so_flags |= SO_COPYOPT;
-		stropt->so_copyopt = COPYCACHED;
-	}
-
 	/* Send the options up */
 	putnext(q, stropt_mp);
 
@@ -17909,7 +17060,6 @@
 			ASSERT(peer_tcp->tcp_fused);
 
 			tcp_clrqfull(peer_tcp);
-			peer_tcp->tcp_flow_stopped = B_FALSE;
 			TCP_STAT(tcp_fusion_backenabled);
 		}
 	}
@@ -17924,11 +17074,9 @@
 				 * tcp_clean_death was deferred
 				 * for T_ORDREL_IND - do it now
 				 */
-				(void) tcp_clean_death(
-					tcp,
-					    tcp->tcp_client_errno, 21);
-				tcp->tcp_deferred_clean_death =
-				    B_FALSE;
+				(void) tcp_clean_death(tcp,
+				    tcp->tcp_client_errno, 21);
+				tcp->tcp_deferred_clean_death = B_FALSE;
 			}
 		} else {
 			/*
@@ -17942,8 +17090,14 @@
 		tcp->tcp_hard_binding = B_FALSE;
 		tcp->tcp_hard_bound = B_TRUE;
 	}
+
 	tcp->tcp_detached = B_FALSE;
 
+	/* We can enable synchronous streams now */
+	if (tcp->tcp_fused) {
+		tcp_fuse_syncstr_enable_pair(tcp);
+	}
+
 	if (tcp->tcp_ka_enabled) {
 		tcp->tcp_ka_last_intrvl = 0;
 		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
@@ -18236,7 +17390,7 @@
 	}
 }
 
-static void
+void
 tcp_wput(queue_t *q, mblk_t *mp)
 {
 	conn_t	*connp = Q_TO_CONN(q);
@@ -18245,12 +17399,27 @@
 	t_scalar_t type;
 	uchar_t *rptr;
 	struct iocblk	*iocp;
+	uint32_t	msize;
 
 	ASSERT(connp->conn_ref >= 2);
 
 	switch (DB_TYPE(mp)) {
 	case M_DATA:
-		CONN_INC_REF(connp);
+		tcp = connp->conn_tcp;
+		ASSERT(tcp != NULL);
+
+		msize = msgdsize(mp);
+
+		mutex_enter(&connp->conn_lock);
+		CONN_INC_REF_LOCKED(connp);
+
+		tcp->tcp_squeue_bytes += msize;
+		if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) {
+			mutex_exit(&connp->conn_lock);
+			tcp_setqfull(tcp);
+		} else
+			mutex_exit(&connp->conn_lock);
+
 		(*tcp_squeue_wput_proc)(connp->conn_sqp, mp,
 		    tcp_output, connp, SQTAG_TCP_OUTPUT);
 		return;
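
The new M_DATA arm of tcp_wput() is the heart of the transmit-side flow-control fix: each message is charged to tcp_squeue_bytes under conn_lock before being queued to the squeue, so TCP_UNSENT_BYTES() can count data that is queued but not yet processed and assert backpressure early; the matching credits appear in the tcp_output() hunks just above. The accounting shape, sketched with hypothetical names and a plain counter under a mutex:

    #include <pthread.h>

    struct ep {
        pthread_mutex_t lock;
        unsigned squeue_bytes, unsent, xmit_hiwater;
    };

    /* TCP_UNSENT_BYTES analogue: unsent data plus bytes in the squeue. */
    static unsigned
    total_unsent(struct ep *e)
    {
        return (e->squeue_bytes + e->unsent);
    }

    static int
    enqueue_charge(struct ep *e, unsigned msize)
    {
        int backpressure;

        pthread_mutex_lock(&e->lock);
        e->squeue_bytes += msize;
        backpressure = (total_unsent(e) > e->xmit_hiwater);
        pthread_mutex_unlock(&e->lock);
        return (backpressure);  /* caller asserts qfull when set */
    }
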
@@ -18265,7 +17434,7 @@
 			type = ((union T_primitives *)rptr)->type;
 		} else {
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_ERROR|SL_TRACE,
 				    "tcp_wput_proto, dropping one...");
 			}
@@ -18292,7 +17461,7 @@
 		/*
 		 * Most ioctls can be processed right away without going via
 		 * squeues - process them right here. Those that do require
-		 * squeue (currently TCP_IOC_DEFAULT_Q and SIOCPOPSOCKFS)
+		 * squeue (currently TCP_IOC_DEFAULT_Q and _SIOCSOCKFALLBACK)
 		 * are processed by tcp_wput_ioctl().
 		 */
 		iocp = (struct iocblk *)mp->b_rptr;
@@ -18372,7 +17541,7 @@
 	ASSERT(wq->q_qinfo == &tcp_sock_winit);
 	wq->q_qinfo = &tcp_winit;
 
-	ASSERT(IS_TCP_CONN(connp));
+	ASSERT(IPCL_IS_TCP(connp));
 	ASSERT(TCP_IS_SOCKET(tcp));
 
 	if (DB_TYPE(mp) == M_PCPROTO &&
@@ -18540,7 +17709,6 @@
 	mutex_exit(&stp->sd_lock);
 }
 
-
 static void
 tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
 {
@@ -18555,7 +17723,6 @@
 	uint32_t	hcksum_txflags = 0;
 	mblk_t		*ire_fp_mp;
 	uint_t		ire_fp_mp_len;
-	ill_poll_capab_t *ill_poll;
 
 	ASSERT(DB_TYPE(mp) == M_DATA);
 
@@ -18699,7 +17866,7 @@
 		 */
 	}
 
-	if ((ill->ill_capabilities & ILL_CAPAB_HCKSUM) && dohwcksum) {
+	if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
 		ASSERT(ill->ill_hcksum_capab != NULL);
 		hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags;
 	}
@@ -18710,53 +17877,21 @@
 	ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION);
 	up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
 
-	/*
-	 * Underlying interface supports hardware checksum offload for
-	 * the tcp payload, along with M_DATA fast path; leave the payload
-	 * checksum for the hardware to calculate.
-	 *
-	 * N.B: We only need to set up checksum info on the first mblk.
-	 */
-	if (hcksum_txflags & HCKSUM_INET_FULL_V4) {
-		/*
-		 * Hardware calculates pseudo-header, header and payload
-		 * checksums, so clear checksum field in TCP header.
-		 */
-		*up = 0;
-		mp->b_datap->db_struioun.cksum.flags |= HCK_FULLCKSUM;
-	} else if (hcksum_txflags & HCKSUM_INET_PARTIAL) {
-		uint32_t sum;
-		/*
-		 * Partial checksum offload has been enabled.  Fill the
-		 * checksum field in the TCP header with the pseudo-header
-		 * checksum value.
-		 */
-		sum = *up + cksum + IP_TCP_CSUM_COMP;
-		sum = (sum & 0xFFFF) + (sum >> 16);
-		*up = (sum & 0xFFFF) + (sum >> 16);
-		mp->b_datap->db_cksumstart = IP_SIMPLE_HDR_LENGTH;
-		mp->b_datap->db_cksumstuff = IP_SIMPLE_HDR_LENGTH + 16;
-		mp->b_datap->db_cksumend = ntohs(ipha->ipha_length);
-		mp->b_datap->db_struioun.cksum.flags |= HCK_PARTIALCKSUM;
-	} else {
-		/* software checksumming */
+	IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up,
+	    IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum);
+
+	/* Software checksum? */
+	if (DB_CKSUMFLAGS(mp) == 0) {
 		TCP_STAT(tcp_out_sw_cksum);
-		*up = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH,
-		    cksum + IP_TCP_CSUM_COMP);
-		mp->b_datap->db_struioun.cksum.flags = 0;
+		TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes,
+		    ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH);
 	}
 
 	ipha->ipha_fragment_offset_and_flags |=
 	    (uint32_t)htons(ire->ire_frag_flag);
 
-	/*
-	 * Hardware supports IP header checksum offload; clear contents
-	 * of IP header checksum field.  Otherwise we calculate it.
-	 */
-	if (hcksum_txflags & HCKSUM_IPHDRCKSUM) {
-		ipha->ipha_hdr_checksum = 0;
-		mp->b_datap->db_struioun.cksum.flags |= HCK_IPV4_HDRCKSUM;
-	} else {
+	/* Calculate IP header checksum if hardware isn't capable */
+	if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
 		IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0],
 		    ((uint16_t *)ipha)[4]);
 	}
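For reference, the two-step one's-complement fold that the removed partial-offload code applied to the pseudo-header sum (now hidden inside IP_CKSUM_XMIT_FAST) can be sketched in user-level C; this is an illustrative sketch only, not part of the changeset:

#include <stdint.h>

/*
 * Sketch only: fold a 32-bit accumulated sum into a 16-bit
 * one's-complement checksum.  Two folds suffice because the
 * first can leave at most a one-bit carry in the high half.
 */
static uint16_t
cksum_fold(uint32_t sum)
{
	sum = (sum & 0xFFFF) + (sum >> 16);	/* fold high half */
	sum = (sum & 0xFFFF) + (sum >> 16);	/* fold the carry */
	return ((uint16_t)sum);
}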
@@ -18769,13 +17904,13 @@
 	ire->ire_last_used_time = lbolt;
 	BUMP_MIB(&ip_mib, ipOutRequests);
 
-	if (ill->ill_capabilities & ILL_CAPAB_POLL) {
-		ill_poll = ill->ill_poll_capab;
-		ASSERT(ill_poll != NULL);
-		ASSERT(ill_poll->ill_tx != NULL);
-		ASSERT(ill_poll->ill_tx_handle != NULL);
-
-		ill_poll->ill_tx(ill_poll->ill_tx_handle, mp);
+	if (ILL_POLL_CAPABLE(ill)) {
+		/*
+		 * Send the packet directly to DLD, where it may be queued
+		 * depending on the availability of transmit resources at
+		 * the media layer.
+		 */
+		IP_POLL_ILL_TX(ill, mp);
 	} else {
 		putnext(ire->ire_stq, mp);
 	}
@@ -18876,7 +18011,7 @@
 			    DISP_ADDR_AND_PORT));
 #else
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_TRACE|SL_ERROR,
 				    "tcp_wput_data: data after ordrel, %s\n",
 				    tcp_display(tcp, NULL,
@@ -18888,6 +18023,10 @@
 		    (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) != 0)
 			tcp_zcopy_notify(tcp);
 		freemsg(mp);
+		if (tcp->tcp_flow_stopped &&
+		    TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+			tcp_clrqfull(tcp);
+		}
 		return;
 	}
 
@@ -19214,15 +18353,12 @@
 		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
 	}
 	/* Note that len is the amount we just sent but with a negative sign */
-	len += tcp->tcp_unsent;
-	tcp->tcp_unsent = len;
+	tcp->tcp_unsent += len;
 	if (tcp->tcp_flow_stopped) {
-		if (len <= tcp->tcp_xmit_lowater) {
-			tcp->tcp_flow_stopped = B_FALSE;
+		if (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
 			tcp_clrqfull(tcp);
 		}
-	} else if (len >= tcp->tcp_xmit_hiwater) {
-		tcp->tcp_flow_stopped = B_TRUE;
+	} else if (TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater) {
 		tcp_setqfull(tcp);
 	}
 }
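The hunk above stops manipulating tcp_flow_stopped directly and instead drives tcp_setqfull()/tcp_clrqfull() from TCP_UNSENT_BYTES(tcp), which now also counts squeue-enqueued bytes. A hedged, user-level model of the hysteresis involved (simplified names; not kernel code):

#include <stdbool.h>

/* Sketch only: stop at the high water mark, resume below the low one. */
typedef struct {
	long	unsent;		/* cf. TCP_UNSENT_BYTES(tcp) */
	long	lowater;	/* cf. tcp_xmit_lowater */
	long	hiwater;	/* cf. tcp_xmit_hiwater */
	bool	stopped;	/* cf. tcp_flow_stopped */
} flow_t;

static void
flow_update(flow_t *f, long delta)
{
	f->unsent += delta;
	if (f->stopped) {
		if (f->unsent <= f->lowater)
			f->stopped = false;	/* cf. tcp_clrqfull() */
	} else if (f->unsent >= f->hiwater) {
		f->stopped = true;		/* cf. tcp_setqfull() */
	}
}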
@@ -19361,6 +18497,12 @@
 }
 
 /*
+ * Smaller and private version of pdescinfo_t used specifically for TCP,
+ * which allows for only two payload spans per packet.
+ */
+typedef struct tcp_pdescinfo_s PDESCINFO_STRUCT(2) tcp_pdescinfo_t;
+
+/*
  * tcp_multisend() is called by tcp_wput_data() for the Multidata
  * Transmit scheme, and returns one of the following:
  *
@@ -19404,9 +18546,6 @@
 #define	IPVER(ip6h)	((((uint32_t *)ip6h)[0] >> 4) & 0x7)
 #endif
 
-#define	TCP_CSUM_OFFSET	16
-#define	TCP_CSUM_SIZE	2
-
 #define	PREP_NEW_MULTIDATA() {			\
 	mmd = NULL;				\
 	md_mp = md_hbuf = NULL;			\
@@ -19542,8 +18681,7 @@
 
 	ill = ire_to_ill(ire);
 	ASSERT(ill != NULL);
-	ASSERT((ill->ill_capabilities & ILL_CAPAB_MDT) == 0 ||
-	    ill->ill_mdt_capab != NULL);
+	ASSERT(!ILL_MDT_CAPABLE(ill) || ill->ill_mdt_capab != NULL);
 
 	if (!tcp->tcp_ire_ill_check_done) {
 		tcp_ire_ill_check(tcp, ire, ill, B_TRUE);
@@ -19576,16 +18714,16 @@
 
 	/* does the interface support hardware checksum offload? */
 	hwcksum_flags = 0;
-	if ((ill->ill_capabilities & ILL_CAPAB_HCKSUM) &&
+	if (ILL_HCKSUM_CAPABLE(ill) &&
 	    (ill->ill_hcksum_capab->ill_hcksum_txflags &
-	    (HCKSUM_INET_FULL_V4 | HCKSUM_INET_PARTIAL | HCKSUM_IPHDRCKSUM)) &&
-	    dohwcksum) {
+	    (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6 | HCKSUM_INET_PARTIAL |
+	    HCKSUM_IPHDRCKSUM)) && dohwcksum) {
 		if (ill->ill_hcksum_capab->ill_hcksum_txflags &
 		    HCKSUM_IPHDRCKSUM)
 			hwcksum_flags = HCK_IPV4_HDRCKSUM;
 
 		if (ill->ill_hcksum_capab->ill_hcksum_txflags &
-		    HCKSUM_INET_FULL_V4)
+		    (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6))
 			hwcksum_flags |= HCK_FULLCKSUM;
 		else if (ill->ill_hcksum_capab->ill_hcksum_txflags &
 		    HCKSUM_INET_PARTIAL)
@@ -19726,10 +18864,16 @@
 			 * checksum offload; these are currently for IPv4.
 			 * For full checksum offload, they are set to zero.
 			 */
-			if (af == AF_INET &&
-			    (hwcksum_flags & HCK_PARTIALCKSUM)) {
-				start = IP_SIMPLE_HDR_LENGTH;
-				stuff = IP_SIMPLE_HDR_LENGTH + TCP_CSUM_OFFSET;
+			if ((hwcksum_flags & HCK_PARTIALCKSUM)) {
+				if (af == AF_INET) {
+					start = IP_SIMPLE_HDR_LENGTH;
+					stuff = IP_SIMPLE_HDR_LENGTH +
+					    TCP_CHECKSUM_OFFSET;
+				} else {
+					start = IPV6_HDR_LEN;
+					stuff = IPV6_HDR_LEN +
+					    TCP_CHECKSUM_OFFSET;
+				}
 			} else {
 				start = stuff = 0;
 			}
@@ -19748,8 +18892,8 @@
 			    /* fastpath mblk */
 			    (af == AF_INET) ? ire->ire_dlureq_mp :
 			    ire->ire_nce->nce_res_mp,
-			    /* hardware checksum enabled (IPv4 only) */
-			    (af == AF_INET && hwcksum_flags != 0),
+			    /* hardware checksum enabled */
+			    (hwcksum_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)),
 			    /* hardware checksum offsets */
 			    start, stuff, 0,
 			    /* hardware checksum flag */
@@ -20224,8 +19368,8 @@
 				ASSERT(IPVER(ip6h) == IPV6_VERSION);
 				ASSERT(ip6h->ip6_nxt == IPPROTO_TCP);
 				ASSERT(PDESC_HDRL(pkt_info) >=
-				    (IPV6_HDR_LEN + TCP_CSUM_OFFSET +
-				    TCP_CSUM_SIZE));
+				    (IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET +
+				    TCP_CHECKSUM_SIZE));
 				ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
 
 				if (tcp->tcp_ip_forward_progress) {
@@ -20273,29 +19417,45 @@
 				/* offset for TCP header checksum */
 				up = IPH_TCPH_CHECKSUMP(ipha,
 				    IP_SIMPLE_HDR_LENGTH);
-
-				if (hwcksum_flags & HCK_FULLCKSUM) {
-					/*
-					 * Hardware calculates pseudo-header,
-					 * header and payload checksums, so
-					 * zero out this field.
-					 */
-					*up = 0;
-				} else if (hwcksum_flags & HCK_PARTIALCKSUM) {
-					uint32_t sum;
-
-					/* pseudo-header checksumming */
-					sum = *up + cksum + IP_TCP_CSUM_COMP;
-					sum = (sum & 0xFFFF) + (sum >> 16);
-					*up = (sum & 0xFFFF) + (sum >> 16);
-				} else {
-					/* software checksumming */
-					TCP_STAT(tcp_out_sw_cksum);
-					*up = IP_MD_CSUM(pkt,
-					    IP_SIMPLE_HDR_LENGTH,
-					    cksum + IP_TCP_CSUM_COMP);
-				}
-
+			} else {
+				up = (uint16_t *)&ip6h->ip6_src;
+
+				/* calculate pseudo-header checksum */
+				cksum = up[0] + up[1] + up[2] + up[3] +
+				    up[4] + up[5] + up[6] + up[7] +
+				    up[8] + up[9] + up[10] + up[11] +
+				    up[12] + up[13] + up[14] + up[15];
+
+				/* Fold the initial sum */
+				cksum = (cksum & 0xffff) + (cksum >> 16);
+
+				up = (uint16_t *)(((uchar_t *)ip6h) +
+				    IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET);
+			}
+
+			if (hwcksum_flags & HCK_FULLCKSUM) {
+				/* clear checksum field for hardware */
+				*up = 0;
+			} else if (hwcksum_flags & HCK_PARTIALCKSUM) {
+				uint32_t sum;
+
+				/* pseudo-header checksumming */
+				sum = *up + cksum + IP_TCP_CSUM_COMP;
+				sum = (sum & 0xFFFF) + (sum >> 16);
+				*up = (sum & 0xFFFF) + (sum >> 16);
+			} else {
+				/* software checksumming */
+				TCP_STAT(tcp_out_sw_cksum);
+				TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes,
+				    tcp->tcp_hdr_len + tcp->tcp_last_sent_len);
+				*up = IP_MD_CSUM(pkt, tcp->tcp_ip_hdr_len,
+				    cksum + IP_TCP_CSUM_COMP);
+				if (*up == 0)
+					*up = 0xFFFF;
+			}
+
+			/* IPv4 header checksum */
+			if (af == AF_INET) {
 				ipha->ipha_fragment_offset_and_flags |=
 				    (uint32_t)htons(ire->ire_frag_flag);
 
@@ -20306,19 +19466,6 @@
 					    ((uint32_t *)ipha)[0],
 					    ((uint16_t *)ipha)[4]);
 				}
-			} else {
-				up = (uint16_t *)(((uchar_t *)ip6h) +
-				    IPV6_HDR_LEN + TCP_CSUM_OFFSET);
-
-				/*
-				 * Software checksumming (hardware checksum
-				 * offload for IPv6 will hopefully be
-				 * implemented one day).
-				 */
-				TCP_STAT(tcp_out_sw_cksum);
-				*up = IP_MD_CSUM(pkt,
-				    IPV6_HDR_LEN - 2 * sizeof (in6_addr_t),
-				    htons(IPPROTO_TCP));
 			}
 
 			/* advance header offset */
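The IPv6 branch added above computes a partial pseudo-header sum over the source and destination addresses as sixteen 16-bit words and folds it once; the partial-offload and software paths then fold again after further additions. A hedged user-level sketch of that step (not part of the changeset):

#include <stdint.h>

/*
 * Sketch only: sum sixteen 16-bit words (the IPv6 source and
 * destination addresses) and perform one folding pass; callers
 * fold again after adding the remaining pseudo-header fields.
 */
static uint32_t
ipv6_addr_partial_sum(const uint16_t w[16])
{
	uint32_t sum = 0;
	int i;

	for (i = 0; i < 16; i++)
		sum += w[i];
	return ((sum & 0xffff) + (sum >> 16));
}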
@@ -20373,8 +19520,6 @@
 #undef PREP_NEW_MULTIDATA
 #undef PREP_NEW_PBUF
 #undef IPVER
-#undef TCP_CSUM_OFFSET
-#undef TCP_CSUM_SIZE
 
 	IRE_REFRELE(ire);
 	return (0);
@@ -20999,7 +20144,7 @@
 	 */
 	if (ip_multidata_outbound && check_mdt &&
 	    !(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
-	    ill != NULL && (ill->ill_capabilities & ILL_CAPAB_MDT) &&
+	    ill != NULL && ILL_MDT_CAPABLE(ill) &&
 	    !CONN_IPSEC_OUT_ENCAPSULATED(connp) &&
 	    !(ire->ire_flags & RTF_MULTIRT) &&
 	    !IPP_ENABLED(IPP_LOCAL_OUT) &&
@@ -21112,7 +20257,6 @@
 		 * tcp_xmit_lowater, so re-enable flow.
 		 */
 		if (tcp->tcp_flow_stopped) {
-			tcp->tcp_flow_stopped = B_FALSE;
 			tcp_clrqfull(tcp);
 		}
 	}
@@ -21305,26 +20449,47 @@
 		}
 		tcp_def_q_set(tcp, mp);
 		return;
-	case SIOCPOPSOCKFS:
-		/*
-		 * sockfs is being I_POP'ed, reset the flag
-		 * indicating this
-		 */
-		tcp->tcp_issocket = B_FALSE;
-
-		/*
-		 * Insert this socket into the acceptor hash.
-		 * We might need it for T_CONN_RES message
-		 */
+	case _SIOCSOCKFALLBACK:
+		/*
+		 * Either sockmod is about to be popped and the socket
+		 * would now be treated as a plain stream, or a module
+		 * is about to be pushed so we could no longer use read-
+		 * side synchronous streams for fused loopback tcp.
+		 * Drain any queued data and disable direct sockfs
+		 * interface from now on.
+		 */
+		if (!tcp->tcp_issocket) {
+			DB_TYPE(mp) = M_IOCNAK;
+			iocp->ioc_error = EINVAL;
+		} else {
 #ifdef	_ILP32
-		tcp->tcp_acceptor_id = (t_uscalar_t)RD(q);
+			tcp->tcp_acceptor_id = (t_uscalar_t)RD(q);
 #else
-		tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev;
+			tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev;
 #endif
-		tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
-		mp->b_datap->db_type = M_IOCACK;
+			/*
+			 * Insert this socket into the acceptor hash.
+			 * We might need it for T_CONN_RES message
+			 */
+			tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
+
+			if (tcp->tcp_fused) {
+				/*
+				 * This is a fused loopback tcp; disable
+				 * read-side synchronous streams interface
+				 * and drain any queued data.  It is okay
+				 * to do this for non-synchronous streams
+				 * fused tcp as well.
+				 */
+				tcp_fuse_disable_pair(tcp, B_FALSE);
+			}
+			tcp->tcp_issocket = B_FALSE;
+			TCP_STAT(tcp_sock_fallback);
+
+			DB_TYPE(mp) = M_IOCACK;
+			iocp->ioc_error = 0;
+		}
 		iocp->ioc_count = 0;
-		iocp->ioc_error = 0;
 		iocp->ioc_rval = 0;
 		qreply(q, mp);
 		return;
@@ -21364,7 +20529,9 @@
 	if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
 		type = ((union T_primitives *)rptr)->type;
 		if (type == T_EXDATA_REQ) {
-			len = msgdsize(mp->b_cont) - 1;
+			uint32_t msize = msgdsize(mp->b_cont);
+
+			len = msize - 1;
 			if (len < 0) {
 				freemsg(mp);
 				return;
@@ -21381,7 +20548,7 @@
 			tcp->tcp_valid_bits |= TCP_URG_VALID;
 
 			/* Bypass tcp protocol for fused tcp loopback */
-			if (tcp->tcp_fused && tcp_fuse_output(tcp, mp))
+			if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
 				return;
 		} else if (type != T_DATA_REQ) {
 			goto non_urgent_data;
@@ -21393,7 +20560,7 @@
 		return;
 	} else {
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 			    "tcp_wput_proto, dropping one...");
 		}
 		freemsg(mp);
@@ -21454,7 +20621,7 @@
 			 * the other side. Just ignore it.
 			 */
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_ERROR|SL_TRACE,
 				    "tcp_wput_proto, T_ORDREL_REQ out of "
 				    "state %s",
@@ -21468,7 +20635,7 @@
 		break;
 	default:
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 			    "tcp_wput_proto, bogus TPI msg, type %d",
 			    tprim->type);
 		}
@@ -21530,7 +20697,7 @@
 
 	/* If a text string is passed in with the request, pass it to strlog. */
 	if (str != NULL && tcp->tcp_debug) {
-		(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 		    "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x",
 		    str, seq, ack, ctl);
 	}
@@ -21737,7 +20904,7 @@
 	}
 
 	if (str && q && tcp_dbg) {
-		(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 		    "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, "
 		    "flags 0x%x",
 		    str, seq, ack, ctl);
@@ -22478,7 +21645,7 @@
 }
 
 /* This function handles the push timeout. */
-static void
+void
 tcp_push_timer(void *arg)
 {
 	conn_t	*connp = (conn_t *)arg;
@@ -22488,10 +21655,18 @@
 
 	ASSERT(tcp->tcp_listener == NULL);
 
+	/*
+	 * We need to stop synchronous streams temporarily to prevent a race
+	 * with tcp_fuse_rrw() or tcp_fuse_rinfop().  It is safe to access
+	 * tcp_rcv_list here because those entry points will return right
+	 * away when synchronous streams is stopped.
+	 */
+	TCP_FUSE_SYNCSTR_STOP(tcp);
 	tcp->tcp_push_tid = 0;
 	if ((tcp->tcp_rcv_list != NULL) &&
 	    (tcp_rcv_drain(tcp->tcp_rq, tcp) == TH_ACK_NEEDED))
 		tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
+	TCP_FUSE_SYNCSTR_RESUME(tcp);
 }
 
 /*
@@ -24059,15 +23234,14 @@
 	tcp_iss_key_init((uint8_t *)&tcp_g_t_info_ack,
 	    sizeof (tcp_g_t_info_ack));
 
-#if TCP_COUNTERS || TCP_DEBUG_COUNTER
-	if ((tcp_kstat = kstat_create("tcp", 0, "tcpstat",
+	if ((tcp_kstat = kstat_create(TCP_MOD_NAME, 0, "tcpstat",
 		"net", KSTAT_TYPE_NAMED,
 		sizeof (tcp_statistics) / sizeof (kstat_named_t),
 		KSTAT_FLAG_VIRTUAL)) != NULL) {
 		tcp_kstat->ks_data = &tcp_statistics;
 		kstat_install(tcp_kstat);
 	}
-#endif
+
 	tcp_kstat_init();
 }
 
@@ -24181,7 +23355,8 @@
 		connfp = &ipcl_globalhash_fanout[i];
 		connp = NULL;
 
-		while ((connp = tcp_get_next_conn(connfp, connp))) {
+		while ((connp =
+		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
 
 			tcp = connp->conn_tcp;
 			cl_tcpi.cl_tcpi_version = CL_TCPI_V1;
@@ -24373,7 +23548,7 @@
 	 */
 	if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
 		logflags |= SL_CONSOLE;
-	(void) strlog(TCP_MODULE_ID, 0, 1, logflags,
+	(void) strlog(TCP_MOD_ID, 0, 1, logflags,
 		"TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, "
 		"start = %d, end = %d\n", lbuf, lport, rbuf, rport,
 		acp->ac_start, acp->ac_end);
@@ -24529,7 +23704,7 @@
 	 */
 	if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
 		logflags |= SL_CONSOLE;
-	(void) strlog(TCP_MODULE_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: "
+	(void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: "
 	    "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' '));
 	if (err == 0 && count == 0)
 		err = ENOENT;
@@ -24846,7 +24021,7 @@
 	}
 done:
 	if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
-		mp->b_datap->db_cksumstart = 0;
+		DB_CKSUMSTART(mp) = 0;
 		mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
 		TCP_STAT(tcp_time_wait_syn_fail);
 	}
@@ -24965,7 +24140,7 @@
 /*
  * TCP Timers Implementation.
  */
-static timeout_id_t
+timeout_id_t
 tcp_timeout(conn_t *connp, void (*f)(void *), clock_t tim)
 {
 	mblk_t *mp;
@@ -25038,7 +24213,7 @@
  * it. But since both should execute on the same squeue, this race should not
  * occur.
  */
-static clock_t
+clock_t
 tcp_timeout_cancel(conn_t *connp, timeout_id_t id)
 {
 	mblk_t	*mp = (mblk_t *)id;
@@ -25165,30 +24340,48 @@
  * End of TCP Timers implementation.
  */
 
-static void
+/*
+ * The tcp_{set,clr}qfull() functions set or clear QFULL on the
+ * specified backing STREAMS q.  Note that the caller may base the
+ * decision to call on the tcp_t.tcp_flow_stopped value, which when
+ * checked outside the q's lock is only an advisory check.
+ */
+
+void
 tcp_setqfull(tcp_t *tcp)
 {
 	queue_t *q = tcp->tcp_wq;
 
 	if (!(q->q_flag & QFULL)) {
-		TCP_STAT(tcp_flwctl_on);
 		mutex_enter(QLOCK(q));
-		q->q_flag |= QFULL;
-		mutex_exit(QLOCK(q));
-	}
-}
-
-static void
+		if (!(q->q_flag & QFULL)) {
+			/* still need to set QFULL */
+			q->q_flag |= QFULL;
+			tcp->tcp_flow_stopped = B_TRUE;
+			mutex_exit(QLOCK(q));
+			TCP_STAT(tcp_flwctl_on);
+		} else {
+			mutex_exit(QLOCK(q));
+		}
+	}
+}
+
+void
 tcp_clrqfull(tcp_t *tcp)
 {
 	queue_t *q = tcp->tcp_wq;
 
 	if (q->q_flag & QFULL) {
 		mutex_enter(QLOCK(q));
-		q->q_flag &= ~QFULL;
-		mutex_exit(QLOCK(q));
-		if (q->q_flag & QWANTW)
-			qbackenable(q, 0);
+		if (q->q_flag & QFULL) {
+			q->q_flag &= ~QFULL;
+			tcp->tcp_flow_stopped = B_FALSE;
+			mutex_exit(QLOCK(q));
+			if (q->q_flag & QWANTW)
+				qbackenable(q, 0);
+		} else {
+			mutex_exit(QLOCK(q));
+		}
 	}
 }
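The rewritten helpers above test QFULL twice: once unlocked as an advisory check, then again under QLOCK before changing it, so tcp_flow_stopped is only toggled while the queue lock is held. A hedged generic sketch of this check/lock/re-check pattern (user-level, simplified names):

#include <pthread.h>
#include <stdbool.h>

/* Sketch only: double-checked flag update under a lock. */
typedef struct {
	pthread_mutex_t	lock;	/* cf. QLOCK(q) */
	bool		full;	/* cf. q_flag & QFULL */
} qstate_t;

static void
set_full(qstate_t *q)
{
	if (!q->full) {			/* advisory, unlocked */
		pthread_mutex_lock(&q->lock);
		if (!q->full)		/* authoritative, locked */
			q->full = true;
		pthread_mutex_unlock(&q->lock);
	}
}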
 
@@ -25254,8 +24447,8 @@
 		{ "connTableSize6",	KSTAT_DATA_INT32, 0 }
 	};
 
-	tcp_mibkp = kstat_create("tcp", 0, "tcp", "mib2", KSTAT_TYPE_NAMED,
-	    NUM_OF_FIELDS(tcp_named_kstat_t), 0);
+	tcp_mibkp = kstat_create(TCP_MOD_NAME, 0, TCP_MOD_NAME,
+	    "mib2", KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0);
 
 	if (tcp_mibkp == NULL)
 		return;
@@ -25304,7 +24497,8 @@
 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
 		connfp = &ipcl_globalhash_fanout[i];
 		connp = NULL;
-		while ((connp = tcp_get_next_conn(connfp, connp))) {
+		while ((connp =
+		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
 			tcp = connp->conn_tcp;
 			switch (tcp_snmp_state(tcp)) {
 			case MIB2_TCP_established:
@@ -25401,7 +24595,7 @@
 	tcph = (tcph_t *)&mp->b_rptr[hdr_len];
 	if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) {
 		mp->b_datap->db_struioflag |= STRUIO_EAGER;
-		mp->b_datap->db_cksumstart = (intptr_t)sqp;
+		DB_CKSUMSTART(mp) = (intptr_t)sqp;
 	}
 
 	squeue_fill(connp->conn_sqp, mp, connp->conn_recv, connp,
--- a/usr/src/uts/common/inet/tcp/tcp6ddi.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/tcp/tcp6ddi.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -37,7 +37,13 @@
 #define	INET_DEVDESC	"TCP6 STREAMS driver %I%"
 #define	INET_MODDESC	"TCP6 STREAMS module %I%"
 #define	INET_DEVMINOR	TCP_MINOR6
-#define	INET_DEVMTFLAGS	D_MP
+/*
+ * Note that unlike UDP, TCP uses synchronous STREAMS only
+ * for TCP Fusion (loopback); this is why we don't define
+ * D_SYNCSTR here.  Since TCP as a module is used only for
+ * SNMP purposes, we define _D_DIRECT for the device instance.
+ */
+#define	INET_DEVMTFLAGS	(D_MP|_D_DIRECT)
 #define	INET_MODMTFLAGS	D_MP
 
 #include "../inetddi.c"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c	Sat Oct 22 22:50:14 2005 -0700
@@ -0,0 +1,1087 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/strsubr.h>
+#include <sys/debug.h>
+#include <sys/cmn_err.h>
+#include <sys/tihdr.h>
+
+#include <inet/common.h>
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+#include <inet/tcp.h>
+#include <inet/tcp_impl.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ipclassifier.h>
+#include <inet/ipp_common.h>
+
+/*
+ * This file implements TCP fusion - a protocol-less data path for TCP
+ * loopback connections.  The fusion of two local TCP endpoints occurs
+ * at connection establishment time.  Various conditions (see details
+ * in tcp_fuse()) need to be met for fusion to be successful.  If it
+ * fails, we fall back to the regular TCP data path; if it succeeds,
+ * both endpoints proceed to use tcp_fuse_output() as the transmit path.
+ * tcp_fuse_output() enqueues application data directly onto the peer's
+ * receive queue; no protocol processing is involved.  After enqueueing
+ * the data, the sender can either push (putnext) data up the receiver's
+ * read queue; or the sender can simply return and let the receiver
+ * retrieve the enqueued data via the synchronous streams entry point
+ * tcp_fuse_rrw().  The latter path is taken if synchronous streams is
+ * enabled (the default).  It is disabled if sockfs no longer resides
+ * directly on top of tcp module due to a module insertion or removal.
+ * It also needs to be temporarily disabled when sending urgent data
+ * because the tcp_fuse_rrw() path bypasses the M_PROTO processing done
+ * by the strsock_proto() hook.
+ *
+ * Synchronization is handled by the squeue and the mutex tcp_fuse_lock.
+ * One of the requirements for fusion to succeed is that both endpoints
+ * need to be using the same squeue.  This ensures that neither side
+ * can disappear while the other side is still sending data.  By itself,
+ * squeue is not sufficient for guaranteeing safety when synchronous
+ * streams is enabled.  The reason is that tcp_fuse_rrw() doesn't enter
+ * the squeue and its access to tcp_rcv_list and other fusion-related
+ * fields needs to be synchronized with the sender.  tcp_fuse_lock is
+ * used for this purpose.  When there is urgent data, the sender needs
+ * to push the data up the receiver's streams read queue.  In order to
+ * avoid holding the tcp_fuse_lock across putnext(), the sender sets
+ * the peer tcp's tcp_fuse_syncstr_stopped bit and releases tcp_fuse_lock
+ * (see macro TCP_FUSE_SYNCSTR_STOP()).  If tcp_fuse_rrw() enters after
+ * this point, it will see that synchronous streams is temporarily
+ * stopped and it will immediately return EBUSY without accessing the
+ * tcp_rcv_list or other fields protected by the tcp_fuse_lock.  This
+ * will result in strget() calling getq_noenab() to dequeue data from
+ * the stream head instead.  After the sender has finished pushing up
+ * all urgent data, it will clear the tcp_fuse_syncstr_stopped bit using
+ * TCP_FUSE_SYNCSTR_RESUME and the receiver may then resume using
+ * tcp_fuse_rrw() to retrieve data from tcp_rcv_list.
+ *
+ * The following note applies only to the synchronous streams mode.
+ *
+ * Flow control is done by checking the size of the receive buffer and
+ * the number of data blocks, each against its own limit.  This is
+ * different from regular streams flow control, where the cumulative
+ * size check dominates the block count check -- a streams queue high
+ * water mark typically represents bytes.  Each enqueue triggers
+ * notifications to the receiving process; a build-up of data blocks
+ * indicates a slow receiver, and the sender should be blocked or
+ * informed at the earliest moment instead of further wasting system
+ * resources.  In effect, this is equivalent to limiting the number
+ * of outstanding segments in flight.
+ */
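A hedged user-level analogy of the stop/resume handshake described above (simplified names and types; the kernel uses tcp_fuse_lock, tcp_fuse_syncstr_stopped and tcp_rcv_list):

#include <pthread.h>
#include <errno.h>
#include <stddef.h>

typedef struct {
	pthread_mutex_t	lock;		/* cf. tcp_fuse_lock */
	int		stopped;	/* cf. tcp_fuse_syncstr_stopped */
	void		*rcv_list;	/* cf. tcp_rcv_list */
} fuse_ep_t;

/* Sketch only: reader bails out while the sender has stopped syncstr. */
static int
reader_rrw(fuse_ep_t *ep, void **datap)
{
	pthread_mutex_lock(&ep->lock);
	if (ep->stopped) {
		pthread_mutex_unlock(&ep->lock);
		return (EBUSY);	/* strget() then reads the stream head */
	}
	*datap = ep->rcv_list;	/* dequeue everything at once */
	ep->rcv_list = NULL;
	pthread_mutex_unlock(&ep->lock);
	return (0);
}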
+
+/*
+ * Macros that determine whether or not IP processing is needed for TCP.
+ */
+#define	TCP_IPOPT_POLICY_V4(tcp)					\
+	((tcp)->tcp_ipversion == IPV4_VERSION &&			\
+	((tcp)->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH ||		\
+	CONN_OUTBOUND_POLICY_PRESENT((tcp)->tcp_connp) ||		\
+	CONN_INBOUND_POLICY_PRESENT((tcp)->tcp_connp)))
+
+#define	TCP_IPOPT_POLICY_V6(tcp)					\
+	((tcp)->tcp_ipversion == IPV6_VERSION &&			\
+	((tcp)->tcp_ip_hdr_len != IPV6_HDR_LEN ||			\
+	CONN_OUTBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp) ||		\
+	CONN_INBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp)))
+
+#define	TCP_LOOPBACK_IP(tcp)						\
+	(TCP_IPOPT_POLICY_V4(tcp) || TCP_IPOPT_POLICY_V6(tcp) ||	\
+	!CONN_IS_MD_FASTPATH((tcp)->tcp_connp))
+
+/*
+ * Setting this to B_FALSE disables fusion altogether; loopback
+ * connections then go through the regular protocol paths.
+ */
+boolean_t do_tcp_fusion = B_TRUE;
+
+/*
+ * Enabling this flag allows sockfs to retrieve data directly
+ * from a fused tcp endpoint using synchronous streams interface.
+ */
+boolean_t do_tcp_direct_sockfs = B_TRUE;
+
+/*
+ * This is the minimum number of outstanding writes allowed on
+ * a synchronous streams-enabled receiving endpoint before the
+ * sender gets flow-controlled.  Setting this value to 0 means
+ * that the data block limit is equivalent to the byte count
+ * limit, which essentially disables the check.
+ */
+#define	TCP_FUSION_RCV_UNREAD_MIN	8
+uint_t tcp_fusion_rcv_unread_min = TCP_FUSION_RCV_UNREAD_MIN;
+
+static void	tcp_fuse_syncstr_enable(tcp_t *);
+static void	tcp_fuse_syncstr_disable(tcp_t *);
+static void	strrput_sig(queue_t *, boolean_t);
+
+/*
+ * This routine gets called by the eager tcp upon changing state from
+ * SYN_RCVD to ESTABLISHED.  It fuses a direct path between itself
+ * and the active connect tcp such that the regular tcp processing
+ * may be bypassed under allowable circumstances.  Because the fusion
+ * requires both endpoints to be in the same squeue, it does not work
+ * for simultaneous active connects because there is no easy way to
+ * switch from one squeue to another once the connection is created.
+ * This is different from the eager tcp case where we assign it the
+ * same squeue as the one given to the active connect tcp during open.
+ */
+void
+tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
+{
+	conn_t *peer_connp, *connp = tcp->tcp_connp;
+	tcp_t *peer_tcp;
+
+	ASSERT(!tcp->tcp_fused);
+	ASSERT(tcp->tcp_loopback);
+	ASSERT(tcp->tcp_loopback_peer == NULL);
+	/*
+	 * We need to inherit q_hiwat of the listener tcp, but we can't
+	 * really use tcp_listener since we get here after sending up
+	 * T_CONN_IND and tcp_wput_accept() may be called independently,
+	 * at which point tcp_listener is cleared; this is why we use
+	 * tcp_saved_listener.  The listener itself is guaranteed to be
+	 * around until tcp_accept_finish() is called on this eager --
+	 * this won't happen until we're done since we're inside the
+	 * eager's perimeter now.
+	 */
+	ASSERT(tcp->tcp_saved_listener != NULL);
+
+	/*
+	 * Lookup peer endpoint; search for the remote endpoint having
+	 * the reversed address-port quadruplet in ESTABLISHED state,
+	 * which is guaranteed to be unique in the system.  Zone check
+	 * is applied accordingly for loopback address, but not for
+	 * local address since we want fusion to happen across Zones.
+	 */
+	if (tcp->tcp_ipversion == IPV4_VERSION) {
+		peer_connp = ipcl_conn_tcp_lookup_reversed_ipv4(connp,
+		    (ipha_t *)iphdr, tcph);
+	} else {
+		peer_connp = ipcl_conn_tcp_lookup_reversed_ipv6(connp,
+		    (ip6_t *)iphdr, tcph);
+	}
+
+	/*
+	 * We can only proceed if the peer exists, resides on the same
+	 * squeue as our conn, and is not a raw socket.  The squeue
+	 * assignment of this eager tcp was done earlier at the time of
+	 * SYN processing in ip_fanout_tcp{_v6}.  Note that sharing a
+	 * squeue by itself doesn't guarantee a safe condition to fuse,
+	 * hence we perform additional tests below.
+	 */
+	ASSERT(peer_connp == NULL || peer_connp != connp);
+	if (peer_connp == NULL || peer_connp->conn_sqp != connp->conn_sqp ||
+	    !IPCL_IS_TCP(peer_connp)) {
+		if (peer_connp != NULL) {
+			TCP_STAT(tcp_fusion_unqualified);
+			CONN_DEC_REF(peer_connp);
+		}
+		return;
+	}
+	peer_tcp = peer_connp->conn_tcp;	/* active connect tcp */
+
+	ASSERT(peer_tcp != NULL && peer_tcp != tcp && !peer_tcp->tcp_fused);
+	ASSERT(peer_tcp->tcp_loopback && peer_tcp->tcp_loopback_peer == NULL);
+	ASSERT(peer_connp->conn_sqp == connp->conn_sqp);
+
+	/*
+	 * Fuse the endpoints; we perform further checks against both
+	 * tcp endpoints to ensure that a fusion is allowed to happen.
+	 * In particular we bail out for non-simple TCP/IP or if IPsec/
+	 * IPQoS policy exists.
+	 */
+	if (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable &&
+	    !TCP_LOOPBACK_IP(tcp) && !TCP_LOOPBACK_IP(peer_tcp) &&
+	    !IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) {
+		mblk_t *mp;
+		struct stroptions *stropt;
+		queue_t *peer_rq = peer_tcp->tcp_rq;
+
+		ASSERT(!TCP_IS_DETACHED(peer_tcp) && peer_rq != NULL);
+		ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
+		ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL);
+
+		/*
+		 * We need to drain data on both endpoints during unfuse.
+		 * If we need to send up SIGURG at the time of draining,
+		 * we want to be sure that an mblk is readily available.
+		 * This is why we pre-allocate the M_PCSIG mblks for both
+		 * endpoints which will only be used during/after unfuse.
+		 */
+		if ((mp = allocb(1, BPRI_HI)) == NULL)
+			goto failed;
+
+		tcp->tcp_fused_sigurg_mp = mp;
+
+		if ((mp = allocb(1, BPRI_HI)) == NULL)
+			goto failed;
+
+		peer_tcp->tcp_fused_sigurg_mp = mp;
+
+		/* Allocate M_SETOPTS mblk */
+		if ((mp = allocb(sizeof (*stropt), BPRI_HI)) == NULL)
+			goto failed;
+
+		/* Fuse both endpoints */
+		peer_tcp->tcp_loopback_peer = tcp;
+		tcp->tcp_loopback_peer = peer_tcp;
+		peer_tcp->tcp_fused = tcp->tcp_fused = B_TRUE;
+
+		/*
+		 * We never use regular tcp paths in fusion and should
+		 * therefore clear tcp_unsent on both endpoints.  Having
+		 * them set to non-zero values means asking for trouble
+		 * especially after unfuse, where we may end up sending
+		 * through regular tcp paths which expect xmit_list and
+		 * friends to be correctly setup.
+		 */
+		peer_tcp->tcp_unsent = tcp->tcp_unsent = 0;
+
+		tcp_timers_stop(tcp);
+		tcp_timers_stop(peer_tcp);
+
+		/*
+		 * At this point we are a detached eager tcp and therefore
+		 * don't have a queue assigned to us until accept happens.
+		 * In the meantime the peer endpoint may immediately send
+		 * us data as soon as fusion is finished, and we need to be
+		 * able to flow control it in case it sends down a huge
+		 * amount of data while we're still detached.  To prevent
+		 * that we
+		 * inherit the listener's q_hiwat value; this is temporary
+		 * since we'll repeat the process in tcp_accept_finish().
+		 */
+		(void) tcp_fuse_set_rcv_hiwat(tcp,
+		    tcp->tcp_saved_listener->tcp_rq->q_hiwat);
+
+		/*
+		 * Set the stream head's write offset value to zero since we
+		 * won't be needing any room for TCP/IP headers; tell it to
+		 * not break up the writes (this would reduce the amount of
+		 * work done by kmem); and configure our receive buffer.
+		 * Note that we can only do this for the active connect tcp
+		 * since our eager is still detached; it will be dealt with
+		 * later in tcp_accept_finish().
+		 */
+		DB_TYPE(mp) = M_SETOPTS;
+		mp->b_wptr += sizeof (*stropt);
+
+		stropt = (struct stroptions *)mp->b_rptr;
+		stropt->so_flags = SO_MAXBLK | SO_WROFF | SO_HIWAT;
+		stropt->so_maxblk = tcp_maxpsz_set(peer_tcp, B_FALSE);
+		stropt->so_wroff = 0;
+
+		/*
+		 * Record the stream head's high water mark for
+		 * peer endpoint; this is used for flow-control
+		 * purposes in tcp_fuse_output().
+		 */
+		stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(peer_tcp,
+		    peer_rq->q_hiwat);
+
+		/* Send the options up */
+		putnext(peer_rq, mp);
+	} else {
+		TCP_STAT(tcp_fusion_unqualified);
+	}
+	CONN_DEC_REF(peer_connp);
+	return;
+
+failed:
+	if (tcp->tcp_fused_sigurg_mp != NULL) {
+		freeb(tcp->tcp_fused_sigurg_mp);
+		tcp->tcp_fused_sigurg_mp = NULL;
+	}
+	if (peer_tcp->tcp_fused_sigurg_mp != NULL) {
+		freeb(peer_tcp->tcp_fused_sigurg_mp);
+		peer_tcp->tcp_fused_sigurg_mp = NULL;
+	}
+	CONN_DEC_REF(peer_connp);
+}
+
+/*
+ * Unfuse a previously-fused pair of tcp loopback endpoints.
+ */
+void
+tcp_unfuse(tcp_t *tcp)
+{
+	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+	ASSERT(tcp->tcp_fused && peer_tcp != NULL);
+	ASSERT(peer_tcp->tcp_fused && peer_tcp->tcp_loopback_peer == tcp);
+	ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
+	ASSERT(tcp->tcp_unsent == 0 && peer_tcp->tcp_unsent == 0);
+	ASSERT(tcp->tcp_fused_sigurg_mp != NULL);
+	ASSERT(peer_tcp->tcp_fused_sigurg_mp != NULL);
+
+	/*
+	 * We disable synchronous streams, drain any queued data and
+	 * clear tcp_direct_sockfs.  The synchronous streams entry
+	 * points will become no-ops after this point.
+	 */
+	tcp_fuse_disable_pair(tcp, B_TRUE);
+
+	/*
+	 * Update th_seq and th_ack in the header template
+	 */
+	U32_TO_ABE32(tcp->tcp_snxt, tcp->tcp_tcph->th_seq);
+	U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
+	U32_TO_ABE32(peer_tcp->tcp_snxt, peer_tcp->tcp_tcph->th_seq);
+	U32_TO_ABE32(peer_tcp->tcp_rnxt, peer_tcp->tcp_tcph->th_ack);
+
+	/* Unfuse the endpoints */
+	peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE;
+	peer_tcp->tcp_loopback_peer = tcp->tcp_loopback_peer = NULL;
+}
+
+/*
+ * Fusion output routine for urgent data.  This routine is called by
+ * tcp_fuse_output() for handling non-M_DATA mblks.
+ */
+void
+tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp)
+{
+	mblk_t *mp1;
+	struct T_exdata_ind *tei;
+	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+	mblk_t *head, *prev_head = NULL;
+
+	ASSERT(tcp->tcp_fused);
+	ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
+	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
+	ASSERT(mp->b_cont != NULL && DB_TYPE(mp->b_cont) == M_DATA);
+	ASSERT(MBLKL(mp) >= sizeof (*tei) && MBLKL(mp->b_cont) > 0);
+
+	/*
+	 * Urgent data arrives in the form of T_EXDATA_REQ from above.
+	 * Each occurrence denotes a new urgent pointer.  For each new
+	 * urgent pointer we signal (SIGURG) the receiving app to indicate
+	 * that it needs to go into urgent mode.  This is similar to the
+	 * urgent data handling in the regular tcp.  We don't need to keep
+	 * track of where the urgent pointer is, because each T_EXDATA_REQ
+	 * "advances" the urgent pointer for us.
+	 *
+	 * The actual urgent data carried by T_EXDATA_REQ is then prepended
+	 * by a T_EXDATA_IND before being enqueued behind any existing data
+	 * destined for the receiving app.  There is only a single urgent
+	 * pointer (out-of-band mark) for a given tcp.  If the new urgent
+	 * data arrives before the receiving app reads some existing urgent
+	 * data, the previous marker is lost.  This behavior is emulated
+	 * accordingly below, by removing any existing T_EXDATA_IND messages
+	 * and essentially converting old urgent data into non-urgent.
+	 */
+	ASSERT(tcp->tcp_valid_bits & TCP_URG_VALID);
+	/* Let sender get out of urgent mode */
+	tcp->tcp_valid_bits &= ~TCP_URG_VALID;
+
+	/*
+	 * This flag indicates that a signal needs to be sent up.
+	 * This flag will only get cleared once SIGURG is delivered and
+	 * is not affected by the tcp_fused flag -- delivery will still
+	 * happen even after an endpoint is unfused, to handle the case
+	 * where the sending endpoint immediately closes/unfuses after
+	 * sending urgent data and the accept is not yet finished.
+	 */
+	peer_tcp->tcp_fused_sigurg = B_TRUE;
+
+	/* Reuse T_EXDATA_REQ mblk for T_EXDATA_IND */
+	DB_TYPE(mp) = M_PROTO;
+	tei = (struct T_exdata_ind *)mp->b_rptr;
+	tei->PRIM_type = T_EXDATA_IND;
+	tei->MORE_flag = 0;
+	mp->b_wptr = (uchar_t *)&tei[1];
+
+	TCP_STAT(tcp_fusion_urg);
+	BUMP_MIB(&tcp_mib, tcpOutUrg);
+
+	head = peer_tcp->tcp_rcv_list;
+	while (head != NULL) {
+		/*
+		 * Remove existing T_EXDATA_IND, keep the data which follows
+		 * it and relink our list.  Note that we don't modify the
+		 * tcp_rcv_last_tail since it never points to T_EXDATA_IND.
+		 */
+		if (DB_TYPE(head) != M_DATA) {
+			mp1 = head;
+
+			ASSERT(DB_TYPE(mp1->b_cont) == M_DATA);
+			head = mp1->b_cont;
+			mp1->b_cont = NULL;
+			head->b_next = mp1->b_next;
+			mp1->b_next = NULL;
+			if (prev_head != NULL)
+				prev_head->b_next = head;
+			if (peer_tcp->tcp_rcv_list == mp1)
+				peer_tcp->tcp_rcv_list = head;
+			if (peer_tcp->tcp_rcv_last_head == mp1)
+				peer_tcp->tcp_rcv_last_head = head;
+			freeb(mp1);
+		}
+		prev_head = head;
+		head = head->b_next;
+	}
+}
+
+/*
+ * Fusion output routine, called by tcp_output() and tcp_wput_proto().
+ */
+boolean_t
+tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
+{
+	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+	queue_t *peer_rq;
+	uint_t max_unread;
+	boolean_t flow_stopped;
+	boolean_t urgent = (DB_TYPE(mp) != M_DATA);
+
+	ASSERT(tcp->tcp_fused);
+	ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
+	ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
+	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO ||
+	    DB_TYPE(mp) == M_PCPROTO);
+
+	peer_rq = peer_tcp->tcp_rq;
+	max_unread = peer_tcp->tcp_fuse_rcv_unread_hiwater;
+
+	/* If this connection requires IP, unfuse and use regular path */
+	if (TCP_LOOPBACK_IP(tcp) || TCP_LOOPBACK_IP(peer_tcp) ||
+	    IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) {
+		TCP_STAT(tcp_fusion_aborted);
+		tcp_unfuse(tcp);
+		return (B_FALSE);
+	}
+
+	if (send_size == 0) {
+		freemsg(mp);
+		return (B_TRUE);
+	}
+
+	/*
+	 * Handle urgent data; we either send up SIGURG to the peer now
+	 * or do it later when we drain, in case the peer is detached
+	 * or if we're short of memory for M_PCSIG mblk.
+	 */
+	if (urgent) {
+		/*
+		 * We stop synchronous streams when we have urgent data
+		 * queued to prevent tcp_fuse_rrw() from pulling it.  If
+		 * for some reason the urgent data can't be delivered
+		 * below, synchronous streams will remain stopped until
+		 * someone drains the tcp_rcv_list.
+		 */
+		TCP_FUSE_SYNCSTR_STOP(peer_tcp);
+		tcp_fuse_output_urg(tcp, mp);
+	}
+
+	mutex_enter(&peer_tcp->tcp_fuse_lock);
+	/*
+	 * Wake up and signal the peer; it is okay to do this before
+	 * enqueueing because we are holding the lock.  One of the
+	 * advantages of synchronous streams is the ability for us to
+	 * find out when the application performs a read on the socket,
+	 * by way of the tcp_fuse_rrw() entry point being called.  Any
+	 * data that gets enqueued onto the receiver is treated as if
+	 * it has arrived at the receiving endpoint, thus generating
+	 * SIGPOLL/SIGIO for an asynchronous socket just as in the strrput()
+	 * case.  However, we only wake up the application when necessary,
+	 * i.e. during the first enqueue.  When tcp_fuse_rrw() is called
+	 * it will send everything upstream.
+	 */
+	if (peer_tcp->tcp_direct_sockfs && !urgent &&
+	    !TCP_IS_DETACHED(peer_tcp)) {
+		if (peer_tcp->tcp_rcv_list == NULL)
+			STR_WAKEUP_SET(STREAM(peer_tcp->tcp_rq));
+		/* Update poll events and send SIGPOLL/SIGIO if necessary */
+		STR_SENDSIG(STREAM(peer_tcp->tcp_rq));
+	}
+
+	/*
+	 * Enqueue data into the peer's receive list; we may or may not
+	 * drain the contents depending on the conditions below.
+	 */
+	tcp_rcv_enqueue(peer_tcp, mp, send_size);
+
+	/* In case it wrapped around and also to keep it constant */
+	peer_tcp->tcp_rwnd += send_size;
+
+	/*
+	 * Exercise flow-control when needed; we will get back-enabled
+	 * in either tcp_accept_finish(), tcp_unfuse(), or tcp_fuse_rrw().
+	 * If tcp_direct_sockfs is on or if the peer endpoint is detached,
+	 * we emulate streams flow control by checking the peer's queue
+	 * size and high water mark; otherwise we simply use canputnext()
+	 * to decide if we need to stop our flow.
+	 *
+	 * The outstanding unread data block check does not apply for a
+	 * detached receiver; this is to avoid unnecessary blocking of the
+	 * sender while the accept is still in progress; this behavior
+	 * is quite similar to that of the regular tcp.
+	 */
+	if (TCP_IS_DETACHED(peer_tcp) || max_unread == 0)
+		max_unread = UINT_MAX;
+
+	flow_stopped = tcp->tcp_flow_stopped;
+	if (!flow_stopped &&
+	    (((peer_tcp->tcp_direct_sockfs || TCP_IS_DETACHED(peer_tcp)) &&
+	    (peer_tcp->tcp_rcv_cnt >= peer_tcp->tcp_fuse_rcv_hiwater ||
+	    ++peer_tcp->tcp_fuse_rcv_unread_cnt >= max_unread)) ||
+	    (!peer_tcp->tcp_direct_sockfs &&
+	    !TCP_IS_DETACHED(peer_tcp) && !canputnext(peer_tcp->tcp_rq)))) {
+		tcp_setqfull(tcp);
+		flow_stopped = B_TRUE;
+		TCP_STAT(tcp_fusion_flowctl);
+		DTRACE_PROBE4(tcp__fuse__output__flowctl, tcp_t *, tcp,
+		    uint_t, send_size, uint_t, peer_tcp->tcp_rcv_cnt,
+		    uint_t, peer_tcp->tcp_fuse_rcv_unread_cnt);
+	} else if (flow_stopped &&
+	    TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+		tcp_clrqfull(tcp);
+	}
+
+	loopback_packets++;
+	tcp->tcp_last_sent_len = send_size;
+
+	/* Need to adjust the following SNMP MIB-related variables */
+	tcp->tcp_snxt += send_size;
+	tcp->tcp_suna = tcp->tcp_snxt;
+	peer_tcp->tcp_rnxt += send_size;
+	peer_tcp->tcp_rack = peer_tcp->tcp_rnxt;
+
+	BUMP_MIB(&tcp_mib, tcpOutDataSegs);
+	UPDATE_MIB(&tcp_mib, tcpOutDataBytes, send_size);
+
+	BUMP_MIB(&tcp_mib, tcpInSegs);
+	BUMP_MIB(&tcp_mib, tcpInDataInorderSegs);
+	UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, send_size);
+
+	BUMP_LOCAL(tcp->tcp_obsegs);
+	BUMP_LOCAL(peer_tcp->tcp_ibsegs);
+
+	mutex_exit(&peer_tcp->tcp_fuse_lock);
+
+	DTRACE_PROBE2(tcp__fuse__output, tcp_t *, tcp, uint_t, send_size);
+
+	if (!TCP_IS_DETACHED(peer_tcp)) {
+		/*
+		 * Drain the peer's receive queue if it has urgent data or if
+		 * we're not flow-controlled.  There is no need for draining
+		 * normal data when tcp_direct_sockfs is on because the peer
+		 * will pull the data via tcp_fuse_rrw().
+		 */
+		if (urgent || (!flow_stopped && !peer_tcp->tcp_direct_sockfs)) {
+			ASSERT(peer_tcp->tcp_rcv_list != NULL);
+			(void) tcp_fuse_rcv_drain(peer_rq, peer_tcp, NULL);
+			/*
+			 * If synchronous streams was stopped above due
+			 * to the presence of urgent data, re-enable it.
+			 */
+			if (urgent)
+				TCP_FUSE_SYNCSTR_RESUME(peer_tcp);
+		}
+	}
+	return (B_TRUE);
+}
+
+/*
+ * This routine gets called to deliver data upstream on a fused or
+ * previously fused tcp loopback endpoint; the latter happens only
+ * when there is a pending SIGURG signal plus urgent data that can't
+ * be sent upstream in the past.
+ */
+boolean_t
+tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp)
+{
+	mblk_t *mp;
+#ifdef DEBUG
+	uint_t cnt = 0;
+#endif
+
+	ASSERT(tcp->tcp_loopback);
+	ASSERT(tcp->tcp_fused || tcp->tcp_fused_sigurg);
+	ASSERT(!tcp->tcp_fused || tcp->tcp_loopback_peer != NULL);
+	ASSERT(sigurg_mpp != NULL || tcp->tcp_fused);
+
+	/* No need for the push timer now, in case it was scheduled */
+	if (tcp->tcp_push_tid != 0) {
+		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
+		tcp->tcp_push_tid = 0;
+	}
+	/*
+	 * If there's urgent data sitting in the receive list and we didn't
+	 * get a chance to send up a SIGURG signal, make sure we send
+	 * it first before draining in order to ensure that SIOCATMARK
+	 * works properly.
+	 */
+	if (tcp->tcp_fused_sigurg) {
+		/*
+		 * sigurg_mpp is normally NULL, i.e. when we're still
+		 * fused and didn't get here because of tcp_unfuse().
+		 * In this case try hard to allocate the M_PCSIG mblk.
+		 */
+		if (sigurg_mpp == NULL &&
+		    (mp = allocb(1, BPRI_HI)) == NULL &&
+		    (mp = allocb_tryhard(1)) == NULL) {
+			/* Alloc failed; try again next time */
+			tcp->tcp_push_tid = TCP_TIMER(tcp, tcp_push_timer,
+			    MSEC_TO_TICK(tcp_push_timer_interval));
+			return (B_TRUE);
+		} else if (sigurg_mpp != NULL) {
+			/*
+			 * Use the supplied M_PCSIG mblk; it means we're
+			 * either unfused or in the process of unfusing,
+			 * and the drain must happen now.
+			 */
+			mp = *sigurg_mpp;
+			*sigurg_mpp = NULL;
+		}
+		ASSERT(mp != NULL);
+
+		tcp->tcp_fused_sigurg = B_FALSE;
+		/* Send up the signal */
+		DB_TYPE(mp) = M_PCSIG;
+		*mp->b_wptr++ = (uchar_t)SIGURG;
+		putnext(q, mp);
+		/*
+		 * Let the regular tcp_rcv_drain() path handle
+		 * draining the data if we're no longer fused.
+		 */
+		if (!tcp->tcp_fused)
+			return (B_FALSE);
+	}
+
+	/*
+	 * In the synchronous streams case, we generate SIGPOLL/SIGIO for
+	 * each M_DATA that gets enqueued onto the receiver.  At this point
+	 * we are about to drain any queued data via putnext().  In order
+	 * to avoid extraneous signal generation from strrput(), we set
+	 * STRGETINPROG flag at the stream head prior to the draining and
+	 * restore it afterwards.  This masks out signal generation only
+	 * for M_DATA messages and does not affect urgent data.
+	 */
+	if (tcp->tcp_direct_sockfs)
+		strrput_sig(q, B_FALSE);
+
+	/* Drain the data */
+	while ((mp = tcp->tcp_rcv_list) != NULL) {
+		tcp->tcp_rcv_list = mp->b_next;
+		mp->b_next = NULL;
+#ifdef DEBUG
+		cnt += msgdsize(mp);
+#endif
+		putnext(q, mp);
+		TCP_STAT(tcp_fusion_putnext);
+	}
+
+	if (tcp->tcp_direct_sockfs)
+		strrput_sig(q, B_TRUE);
+
+	ASSERT(cnt == tcp->tcp_rcv_cnt);
+	tcp->tcp_rcv_last_head = NULL;
+	tcp->tcp_rcv_last_tail = NULL;
+	tcp->tcp_rcv_cnt = 0;
+	tcp->tcp_fuse_rcv_unread_cnt = 0;
+	tcp->tcp_rwnd = q->q_hiwat;
+
+	return (B_TRUE);
+}
+
+/*
+ * Synchronous stream entry point for sockfs to retrieve
+ * data directly from tcp_rcv_list.
+ */
+int
+tcp_fuse_rrw(queue_t *q, struiod_t *dp)
+{
+	tcp_t *tcp = Q_TO_CONN(q)->conn_tcp;
+	mblk_t *mp;
+
+	mutex_enter(&tcp->tcp_fuse_lock);
+	/*
+	 * If someone had turned off tcp_direct_sockfs or if synchronous
+	 * streams is temporarily disabled, we return EBUSY.  This causes
+	 * strget() to dequeue data from the stream head instead.
+	 */
+	if (!tcp->tcp_direct_sockfs || tcp->tcp_fuse_syncstr_stopped) {
+		mutex_exit(&tcp->tcp_fuse_lock);
+		TCP_STAT(tcp_fusion_rrw_busy);
+		return (EBUSY);
+	}
+
+	if ((mp = tcp->tcp_rcv_list) != NULL) {
+		tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+		DTRACE_PROBE3(tcp__fuse__rrw, tcp_t *, tcp,
+		    uint32_t, tcp->tcp_rcv_cnt, ssize_t, dp->d_uio.uio_resid);
+
+		tcp->tcp_rcv_list = NULL;
+		TCP_STAT(tcp_fusion_rrw_msgcnt);
+
+		/*
+		 * At this point nothing should be left in tcp_rcv_list.
+		 * The only possible case where we would have a chain of
+		 * b_next-linked messages is urgent data, but we wouldn't
+		 * be here if that's true since urgent data is delivered
+		 * via putnext() and synchronous streams is stopped until
+		 * tcp_fuse_rcv_drain() is finished.
+		 */
+		ASSERT(DB_TYPE(mp) == M_DATA && mp->b_next == NULL);
+
+		tcp->tcp_rcv_last_head = NULL;
+		tcp->tcp_rcv_last_tail = NULL;
+		tcp->tcp_rcv_cnt = 0;
+		tcp->tcp_fuse_rcv_unread_cnt = 0;
+
+		if (peer_tcp->tcp_flow_stopped) {
+			tcp_clrqfull(peer_tcp);
+			TCP_STAT(tcp_fusion_backenabled);
+		}
+	}
+
+	/*
+	 * Either we just dequeued everything or we get here from sockfs
+	 * and have nothing to return; in this case clear RSLEEP.
+	 */
+	ASSERT(tcp->tcp_rcv_last_head == NULL);
+	ASSERT(tcp->tcp_rcv_last_tail == NULL);
+	ASSERT(tcp->tcp_rcv_cnt == 0);
+	ASSERT(tcp->tcp_fuse_rcv_unread_cnt == 0);
+	STR_WAKEUP_CLEAR(STREAM(q));
+
+	mutex_exit(&tcp->tcp_fuse_lock);
+	dp->d_mp = mp;
+	return (0);
+}
+
+/*
+ * Synchronous stream entry point used by certain ioctls to retrieve
+ * information about or peek into the tcp_rcv_list.
+ */
+int
+tcp_fuse_rinfop(queue_t *q, infod_t *dp)
+{
+	tcp_t	*tcp = Q_TO_CONN(q)->conn_tcp;
+	mblk_t	*mp;
+	uint_t	cmd = dp->d_cmd;
+	int	res = 0;
+	int	error = 0;
+	struct stdata *stp = STREAM(q);
+
+	mutex_enter(&tcp->tcp_fuse_lock);
+	/* If shutdown on read has happened, return nothing */
+	mutex_enter(&stp->sd_lock);
+	if (stp->sd_flag & STREOF) {
+		mutex_exit(&stp->sd_lock);
+		goto done;
+	}
+	mutex_exit(&stp->sd_lock);
+
+	/*
+	 * It is OK not to return an answer if tcp_rcv_list is
+	 * currently not accessible.
+	 */
+	if (!tcp->tcp_direct_sockfs || tcp->tcp_fuse_syncstr_stopped ||
+	    (mp = tcp->tcp_rcv_list) == NULL)
+		goto done;
+
+	if (cmd & INFOD_COUNT) {
+		/*
+		 * We have at least one message and
+		 * could return only one at a time.
+		 */
+		dp->d_count++;
+		res |= INFOD_COUNT;
+	}
+	if (cmd & INFOD_BYTES) {
+		/*
+		 * Return size of all data messages.
+		 */
+		dp->d_bytes += tcp->tcp_rcv_cnt;
+		res |= INFOD_BYTES;
+	}
+	if (cmd & INFOD_FIRSTBYTES) {
+		/*
+		 * Return size of first data message.
+		 */
+		dp->d_bytes = msgdsize(mp);
+		res |= INFOD_FIRSTBYTES;
+		dp->d_cmd &= ~INFOD_FIRSTBYTES;
+	}
+	if (cmd & INFOD_COPYOUT) {
+		mblk_t *mp1;
+		int n;
+
+		if (DB_TYPE(mp) == M_DATA) {
+			mp1 = mp;
+		} else {
+			mp1 = mp->b_cont;
+			ASSERT(mp1 != NULL);
+		}
+
+		/*
+		 * Return data contents of first message.
+		 */
+		ASSERT(DB_TYPE(mp1) == M_DATA);
+		while (mp1 != NULL && dp->d_uiop->uio_resid > 0) {
+			n = MIN(dp->d_uiop->uio_resid, MBLKL(mp1));
+			if (n != 0 && (error = uiomove((char *)mp1->b_rptr, n,
+			    UIO_READ, dp->d_uiop)) != 0) {
+				goto done;
+			}
+			mp1 = mp1->b_cont;
+		}
+		res |= INFOD_COPYOUT;
+		dp->d_cmd &= ~INFOD_COPYOUT;
+	}
+done:
+	mutex_exit(&tcp->tcp_fuse_lock);
+
+	dp->d_res |= res;
+
+	return (error);
+}
+
+/*
+ * Enable synchronous streams on a fused tcp loopback endpoint.
+ */
+static void
+tcp_fuse_syncstr_enable(tcp_t *tcp)
+{
+	queue_t *rq = tcp->tcp_rq;
+	struct stdata *stp = STREAM(rq);
+
+	/* We can only enable synchronous streams for sockfs mode */
+	tcp->tcp_direct_sockfs = tcp->tcp_issocket && do_tcp_direct_sockfs;
+
+	if (!tcp->tcp_direct_sockfs)
+		return;
+
+	mutex_enter(&stp->sd_lock);
+	mutex_enter(QLOCK(rq));
+
+	/*
+	 * We replace our q_qinfo with one that has the qi_rwp entry point.
+	 * Clear SR_SIGALLDATA because we generate the equivalent signal(s)
+	 * for every enqueued data in tcp_fuse_output().
+	 */
+	rq->q_qinfo = &tcp_loopback_rinit;
+	rq->q_struiot = tcp_loopback_rinit.qi_struiot;
+	stp->sd_struiordq = rq;
+	stp->sd_rput_opt &= ~SR_SIGALLDATA;
+
+	mutex_exit(QLOCK(rq));
+	mutex_exit(&stp->sd_lock);
+}
+
+/*
+ * Disable synchronous streams on a fused tcp loopback endpoint.
+ */
+static void
+tcp_fuse_syncstr_disable(tcp_t *tcp)
+{
+	queue_t *rq = tcp->tcp_rq;
+	struct stdata *stp = STREAM(rq);
+
+	if (!tcp->tcp_direct_sockfs)
+		return;
+
+	mutex_enter(&stp->sd_lock);
+	mutex_enter(QLOCK(rq));
+
+	/*
+	 * Reset q_qinfo to point to the default tcp entry points.
+	 * Also restore SR_SIGALLDATA so that strrput() can generate
+	 * the signals again for future M_DATA messages.
+	 */
+	rq->q_qinfo = &tcp_rinit;
+	rq->q_struiot = tcp_rinit.qi_struiot;
+	stp->sd_struiordq = NULL;
+	stp->sd_rput_opt |= SR_SIGALLDATA;
+	tcp->tcp_direct_sockfs = B_FALSE;
+
+	mutex_exit(QLOCK(rq));
+	mutex_exit(&stp->sd_lock);
+}
+
+/*
+ * Enable synchronous streams on a pair of fused tcp endpoints.
+ */
+void
+tcp_fuse_syncstr_enable_pair(tcp_t *tcp)
+{
+	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+	ASSERT(tcp->tcp_fused);
+	ASSERT(peer_tcp != NULL);
+
+	tcp_fuse_syncstr_enable(tcp);
+	tcp_fuse_syncstr_enable(peer_tcp);
+}
+
+/*
+ * Allow or disallow signals to be generated by strrput().
+ */
+static void
+strrput_sig(queue_t *q, boolean_t on)
+{
+	struct stdata *stp = STREAM(q);
+
+	mutex_enter(&stp->sd_lock);
+	if (on)
+		stp->sd_flag &= ~STRGETINPROG;
+	else
+		stp->sd_flag |= STRGETINPROG;
+	mutex_exit(&stp->sd_lock);
+}
+
+/*
+ * Disable synchronous streams on a pair of fused tcp endpoints and drain
+ * any queued data; called either during unfuse or upon transitioning from
+ * a socket to a stream endpoint due to _SIOCSOCKFALLBACK.
+ */
+void
+tcp_fuse_disable_pair(tcp_t *tcp, boolean_t unfusing)
+{
+	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+	ASSERT(tcp->tcp_fused);
+	ASSERT(peer_tcp != NULL);
+
+	/*
+	 * We need to prevent tcp_fuse_rrw() from entering before
+	 * we can disable synchronous streams.
+	 */
+	TCP_FUSE_SYNCSTR_STOP(tcp);
+	TCP_FUSE_SYNCSTR_STOP(peer_tcp);
+
+	/*
+	 * Drain any pending data; the detached check is needed because
+	 * we may be called as a result of a tcp_unfuse() triggered by
+	 * tcp_fuse_output().  Note that in case of a detached tcp, the
+	 * draining will happen later after the tcp is unfused.  For non-
+	 * urgent data, this can be handled by the regular tcp_rcv_drain().
+	 * If we have urgent data sitting in the receive list, we will
+	 * need to send up a SIGURG signal first before draining the data.
+	 * All of these will be handled by the code in tcp_fuse_rcv_drain()
+	 * when called from tcp_rcv_drain().
+	 */
+	if (!TCP_IS_DETACHED(tcp)) {
+		(void) tcp_fuse_rcv_drain(tcp->tcp_rq, tcp,
+		    (unfusing ? &tcp->tcp_fused_sigurg_mp : NULL));
+	}
+	if (!TCP_IS_DETACHED(peer_tcp)) {
+		(void) tcp_fuse_rcv_drain(peer_tcp->tcp_rq, peer_tcp,
+		    (unfusing ? &peer_tcp->tcp_fused_sigurg_mp : NULL));
+	}
+
+	/* Lift up any flow-control conditions */
+	if (tcp->tcp_flow_stopped) {
+		tcp_clrqfull(tcp);
+		TCP_STAT(tcp_fusion_backenabled);
+	}
+	if (peer_tcp->tcp_flow_stopped) {
+		tcp_clrqfull(peer_tcp);
+		TCP_STAT(tcp_fusion_backenabled);
+	}
+
+	/* Disable synchronous streams */
+	tcp_fuse_syncstr_disable(tcp);
+	tcp_fuse_syncstr_disable(peer_tcp);
+}
+
+/*
+ * Calculate the size of receive buffer for a fused tcp endpoint.
+ */
+size_t
+tcp_fuse_set_rcv_hiwat(tcp_t *tcp, size_t rwnd)
+{
+	ASSERT(tcp->tcp_fused);
+
+	/* Ensure that value is within the maximum upper bound */
+	if (rwnd > tcp_max_buf)
+		rwnd = tcp_max_buf;
+
+	/* Obey the absolute minimum tcp receive high water mark */
+	if (rwnd < tcp_sth_rcv_hiwat)
+		rwnd = tcp_sth_rcv_hiwat;
+
+	/*
+	 * Round up to system page size in case SO_RCVBUF is modified
+	 * after SO_SNDBUF; the latter is also similarly rounded up.
+	 */
+	rwnd = P2ROUNDUP_TYPED(rwnd, PAGESIZE, size_t);
+	tcp->tcp_fuse_rcv_hiwater = rwnd;
+	return (rwnd);
+}
+
+/*
+ * Calculate the maxpsz value and the peer's limit of outstanding
+ * unread data blocks for a fused tcp endpoint.
+ */
+int
+tcp_fuse_maxpsz_set(tcp_t *tcp)
+{
+	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+	uint_t sndbuf = tcp->tcp_xmit_hiwater;
+	uint_t maxpsz = sndbuf;
+
+	ASSERT(tcp->tcp_fused);
+	ASSERT(peer_tcp != NULL);
+	ASSERT(peer_tcp->tcp_fuse_rcv_hiwater != 0);
+	/*
+	 * In the fused loopback case, we want the stream head to split
+	 * up larger writes into smaller chunks for a more accurate flow-
+	 * control accounting.  Our maxpsz is half of the sender's send
+	 * buffer or the receiver's receive buffer, whichever is smaller.
+	 * We round up the buffer to system page size due to the lack of
+	 * TCP MSS concept in Fusion.
+	 */
+	if (maxpsz > peer_tcp->tcp_fuse_rcv_hiwater)
+		maxpsz = peer_tcp->tcp_fuse_rcv_hiwater;
+	maxpsz = P2ROUNDUP_TYPED(maxpsz, PAGESIZE, uint_t) >> 1;
+
+	/*
+	 * Calculate the peer's limit for the number of outstanding unread
+	 * data blocks.  This is the number of data blocks that are allowed
+	 * to reside in the receiver's queue before the sender gets flow
+	 * controlled.  It is used only in the synchronous streams mode as
+	 * a way to throttle the sender when it performs consecutive writes
+	 * faster than can be read.  The value is derived from SO_SNDBUF in
+	 * order to give the sender some control; we divide it by a large
+	 * value (16KB) to produce a fairly low initial limit.
+	 */
+	if (tcp_fusion_rcv_unread_min == 0) {
+		/* A value of 0 means that we disable the check */
+		peer_tcp->tcp_fuse_rcv_unread_hiwater = 0;
+	} else {
+		peer_tcp->tcp_fuse_rcv_unread_hiwater =
+		    MAX(sndbuf >> 14, tcp_fusion_rcv_unread_min);
+	}
+	return (maxpsz);
+}
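To make the computation above concrete, a worked example with assumed values (PAGESIZE 8192, SO_SNDBUF 48KB, peer receive high water 32KB, tcp_fusion_rcv_unread_min 8); this is a hedged user-level sketch, not kernel code:

#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	P2ROUNDUP(x, a)	(-(-(x) & -(a)))	/* power-of-2 round-up */

int
main(void)
{
	unsigned int sndbuf = 49152;	/* assumed SO_SNDBUF */
	unsigned int rcv_hiwat = 32768;	/* assumed tcp_fuse_rcv_hiwater */
	unsigned int pagesize = 8192;	/* assumed PAGESIZE */
	unsigned int maxpsz, unread;

	maxpsz = (sndbuf < rcv_hiwat) ? sndbuf : rcv_hiwat;
	maxpsz = P2ROUNDUP(maxpsz, pagesize) >> 1;
	unread = MAX(sndbuf >> 14, 8);	/* 49152 >> 14 == 3, so 8 wins */

	/* prints: maxpsz=16384 unread=8 */
	(void) printf("maxpsz=%u unread=%u\n", maxpsz, unread);
	return (0);
}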
--- a/usr/src/uts/common/inet/tcp/tcpddi.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/tcp/tcpddi.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -38,7 +38,13 @@
 #define	INET_DEVDESC	"TCP STREAMS driver %I%"
 #define	INET_MODDESC	"TCP STREAMS module %I%"
 #define	INET_DEVMINOR	TCP_MINOR
-#define	INET_DEVMTFLAGS	D_MP
+/*
+ * Note that unlike UDP, TCP uses synchronous STREAMS only
+ * for TCP Fusion (loopback); this is why we don't define
+ * D_SYNCSTR here.  Since TCP as a module is used only for
+ * SNMP purposes, we define _D_DIRECT for the device instance.
+ */
+#define	INET_DEVMTFLAGS	(D_MP|_D_DIRECT)
 #define	INET_MODMTFLAGS	D_MP
 
 #include "../inetddi.c"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/inet/tcp_impl.h	Sat Oct 22 22:50:14 2005 -0700
@@ -0,0 +1,332 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_INET_TCP_IMPL_H
+#define	_INET_TCP_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * TCP implementation private declarations.  These interfaces are
+ * used to build the IP module and are not meant to be accessed
+ * by any modules except IP itself.  They are undocumented and are
+ * subject to change without notice.
+ */
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#include <inet/tcp.h>
+
+#define	TCP_MOD_ID	5105
+
+/*
+ * Was this tcp created via the socket() interface?
+ */
+#define	TCP_IS_SOCKET(tcp)	((tcp)->tcp_issocket)
+
+/*
+ * Is this tcp not attached to any upper client?
+ */
+#define	TCP_IS_DETACHED(tcp)	((tcp)->tcp_detached)
+
+#define	TCP_TIMER(tcp, f, tim)		\
+	tcp_timeout(tcp->tcp_connp, f, tim)
+#define	TCP_TIMER_CANCEL(tcp, id)	\
+	tcp_timeout_cancel(tcp->tcp_connp, id)
+
+/*
+ * To restart the TCP retransmission timer.
+ */
+#define	TCP_TIMER_RESTART(tcp, intvl) {					\
+	if ((tcp)->tcp_timer_tid != 0)					\
+		(void) TCP_TIMER_CANCEL((tcp), (tcp)->tcp_timer_tid);	\
+	(tcp)->tcp_timer_tid = TCP_TIMER((tcp), tcp_timer,		\
+	    MSEC_TO_TICK(intvl));					\
+}
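
For instance, a retransmit path would typically re-arm the timer with the
current RTO.  A hedged one-liner, with tcp_rto as the assumed name of the
per-connection RTO in milliseconds:

	/* re-arm the retransmission timer after (re)sending a segment */
	TCP_TIMER_RESTART(tcp, tcp->tcp_rto);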
+
+/*
+ * This stops synchronous streams for a fused tcp endpoint
+ * and prevents tcp_rrw() from pulling data from it.
+ */
+#define	TCP_FUSE_SYNCSTR_STOP(tcp) {				\
+	if ((tcp)->tcp_direct_sockfs) {				\
+		mutex_enter(&(tcp)->tcp_fuse_lock);		\
+		(tcp)->tcp_fuse_syncstr_stopped = B_TRUE;	\
+		mutex_exit(&(tcp)->tcp_fuse_lock);		\
+	}							\
+}
+
+/*
+ * This resumes synchronous streams for this fused tcp endpoint
+ * and allows tcp_rrw() to pull data from it again.
+ */
+#define	TCP_FUSE_SYNCSTR_RESUME(tcp) {				\
+	if ((tcp)->tcp_direct_sockfs) {				\
+		mutex_enter(&(tcp)->tcp_fuse_lock);		\
+		(tcp)->tcp_fuse_syncstr_stopped = B_FALSE;	\
+		mutex_exit(&(tcp)->tcp_fuse_lock);		\
+	}							\
+}
+
+/*
+ * Write-side flow-control is implemented via the per instance STREAMS
+ * write-side Q by explicitly setting QFULL to stop the flow of mblk_t(s)
+ * and clearing QFULL and calling qbackenable() to restart the flow based
+ * on the number of TCP unsent bytes (i.e. those not on the wire waiting
+ * for a remote ACK).
+ *
+ * This differs from a standard STREAMS kmod, where the framework would
+ * automatically flow-control based on the defined hiwat/lowat values as
+ * mblk_t's are enqueued/dequeued on the STREAMS Q.
+ *
+ * As of FireEngine, TCP write-side flow-control needs to take into account
+ * not only the unsent tcp_xmit list bytes but also any squeue_t enqueued
+ * bytes (i.e. from tcp_wput() -> tcp_output()).
+ *
+ * This is accomplished by adding a new tcp_t field, tcp_squeue_bytes, to
+ * count the number of bytes enqueued by tcp_wput() and the number of bytes
+ * dequeued and processed by tcp_output().
+ *
+ * So, the total number of unsent bytes is (squeue_bytes + unsent), with all
+ * flow-control uses of unsent replaced with the macro TCP_UNSENT_BYTES.
+ */
+extern void	tcp_clrqfull(tcp_t *);
+extern void	tcp_setqfull(tcp_t *);
+
+#define	TCP_UNSENT_BYTES(tcp) \
+	((tcp)->tcp_squeue_bytes + (tcp)->tcp_unsent)
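
A minimal sketch of the check this implies on the transmit path is shown
below; tcp_xmit_lowater and tcp_flow_stopped are assumed per-connection
fields named for illustration only (tcp_setqfull/tcp_clrqfull above are
the real hooks):

static void
tcp_flow_check(tcp_t *tcp)
{
	/* stop the flow once aggregate unsent bytes reach the high water */
	if (!tcp->tcp_flow_stopped &&
	    TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater)
		tcp_setqfull(tcp);	/* sets QFULL on the write-side Q */
	else if (tcp->tcp_flow_stopped &&
	    TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater)
		tcp_clrqfull(tcp);	/* clears QFULL, qbackenable()s */
}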
+
+/* Named Dispatch Parameter Management Structure */
+typedef struct tcpparam_s {
+	uint32_t	tcp_param_min;
+	uint32_t	tcp_param_max;
+	uint32_t	tcp_param_val;
+	char		*tcp_param_name;
+} tcpparam_t;
+
+extern tcpparam_t tcp_param_arr[];
+
+#define	tcp_time_wait_interval			tcp_param_arr[0].tcp_param_val
+#define	tcp_conn_req_max_q			tcp_param_arr[1].tcp_param_val
+#define	tcp_conn_req_max_q0			tcp_param_arr[2].tcp_param_val
+#define	tcp_conn_req_min			tcp_param_arr[3].tcp_param_val
+#define	tcp_conn_grace_period			tcp_param_arr[4].tcp_param_val
+#define	tcp_cwnd_max_				tcp_param_arr[5].tcp_param_val
+#define	tcp_dbg					tcp_param_arr[6].tcp_param_val
+#define	tcp_smallest_nonpriv_port		tcp_param_arr[7].tcp_param_val
+#define	tcp_ip_abort_cinterval			tcp_param_arr[8].tcp_param_val
+#define	tcp_ip_abort_linterval			tcp_param_arr[9].tcp_param_val
+#define	tcp_ip_abort_interval			tcp_param_arr[10].tcp_param_val
+#define	tcp_ip_notify_cinterval			tcp_param_arr[11].tcp_param_val
+#define	tcp_ip_notify_interval			tcp_param_arr[12].tcp_param_val
+#define	tcp_ipv4_ttl				tcp_param_arr[13].tcp_param_val
+#define	tcp_keepalive_interval_high		tcp_param_arr[14].tcp_param_max
+#define	tcp_keepalive_interval			tcp_param_arr[14].tcp_param_val
+#define	tcp_keepalive_interval_low		tcp_param_arr[14].tcp_param_min
+#define	tcp_maxpsz_multiplier			tcp_param_arr[15].tcp_param_val
+#define	tcp_mss_def_ipv4			tcp_param_arr[16].tcp_param_val
+#define	tcp_mss_max_ipv4			tcp_param_arr[17].tcp_param_val
+#define	tcp_mss_min				tcp_param_arr[18].tcp_param_val
+#define	tcp_naglim_def				tcp_param_arr[19].tcp_param_val
+#define	tcp_rexmit_interval_initial		tcp_param_arr[20].tcp_param_val
+#define	tcp_rexmit_interval_max			tcp_param_arr[21].tcp_param_val
+#define	tcp_rexmit_interval_min			tcp_param_arr[22].tcp_param_val
+#define	tcp_deferred_ack_interval		tcp_param_arr[23].tcp_param_val
+#define	tcp_snd_lowat_fraction			tcp_param_arr[24].tcp_param_val
+#define	tcp_sth_rcv_hiwat			tcp_param_arr[25].tcp_param_val
+#define	tcp_sth_rcv_lowat			tcp_param_arr[26].tcp_param_val
+#define	tcp_dupack_fast_retransmit		tcp_param_arr[27].tcp_param_val
+#define	tcp_ignore_path_mtu			tcp_param_arr[28].tcp_param_val
+#define	tcp_smallest_anon_port			tcp_param_arr[29].tcp_param_val
+#define	tcp_largest_anon_port			tcp_param_arr[30].tcp_param_val
+#define	tcp_xmit_hiwat				tcp_param_arr[31].tcp_param_val
+#define	tcp_xmit_lowat				tcp_param_arr[32].tcp_param_val
+#define	tcp_recv_hiwat				tcp_param_arr[33].tcp_param_val
+#define	tcp_recv_hiwat_minmss			tcp_param_arr[34].tcp_param_val
+#define	tcp_fin_wait_2_flush_interval		tcp_param_arr[35].tcp_param_val
+#define	tcp_co_min				tcp_param_arr[36].tcp_param_val
+#define	tcp_max_buf				tcp_param_arr[37].tcp_param_val
+#define	tcp_strong_iss				tcp_param_arr[38].tcp_param_val
+#define	tcp_rtt_updates				tcp_param_arr[39].tcp_param_val
+#define	tcp_wscale_always			tcp_param_arr[40].tcp_param_val
+#define	tcp_tstamp_always			tcp_param_arr[41].tcp_param_val
+#define	tcp_tstamp_if_wscale			tcp_param_arr[42].tcp_param_val
+#define	tcp_rexmit_interval_extra		tcp_param_arr[43].tcp_param_val
+#define	tcp_deferred_acks_max			tcp_param_arr[44].tcp_param_val
+#define	tcp_slow_start_after_idle		tcp_param_arr[45].tcp_param_val
+#define	tcp_slow_start_initial			tcp_param_arr[46].tcp_param_val
+#define	tcp_co_timer_interval			tcp_param_arr[47].tcp_param_val
+#define	tcp_sack_permitted			tcp_param_arr[48].tcp_param_val
+#define	tcp_trace				tcp_param_arr[49].tcp_param_val
+#define	tcp_compression_enabled			tcp_param_arr[50].tcp_param_val
+#define	tcp_ipv6_hoplimit			tcp_param_arr[51].tcp_param_val
+#define	tcp_mss_def_ipv6			tcp_param_arr[52].tcp_param_val
+#define	tcp_mss_max_ipv6			tcp_param_arr[53].tcp_param_val
+#define	tcp_rev_src_routes			tcp_param_arr[54].tcp_param_val
+#define	tcp_local_dack_interval			tcp_param_arr[55].tcp_param_val
+#define	tcp_ndd_get_info_interval		tcp_param_arr[56].tcp_param_val
+#define	tcp_local_dacks_max			tcp_param_arr[57].tcp_param_val
+#define	tcp_ecn_permitted			tcp_param_arr[58].tcp_param_val
+#define	tcp_rst_sent_rate_enabled		tcp_param_arr[59].tcp_param_val
+#define	tcp_rst_sent_rate			tcp_param_arr[60].tcp_param_val
+#define	tcp_push_timer_interval			tcp_param_arr[61].tcp_param_val
+#define	tcp_use_smss_as_mss_opt			tcp_param_arr[62].tcp_param_val
+#define	tcp_keepalive_abort_interval_high	tcp_param_arr[63].tcp_param_max
+#define	tcp_keepalive_abort_interval		tcp_param_arr[63].tcp_param_val
+#define	tcp_keepalive_abort_interval_low	tcp_param_arr[63].tcp_param_min
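
Each #define above makes an array slot read like a plain global, with the
min/max entries in the same slot bounding any ndd set.  A hedged sketch of
the set path this implies (tcp_param_set_sketch is an illustrative name):

static int
tcp_param_set_sketch(tcpparam_t *pa, uint32_t new_value)
{
	if (new_value < pa->tcp_param_min || new_value > pa->tcp_param_max)
		return (EINVAL);
	pa->tcp_param_val = new_value;
	return (0);
}

For example, tcp_param_set_sketch(&tcp_param_arr[37], 4*1024*1024) would
be the equivalent of setting tcp_max_buf to 4MB via ndd.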
+
+/* Kstats */
+typedef struct tcp_stat {
+	kstat_named_t	tcp_time_wait;
+	kstat_named_t	tcp_time_wait_syn;
+	kstat_named_t	tcp_time_wait_syn_success;
+	kstat_named_t	tcp_time_wait_syn_fail;
+	kstat_named_t	tcp_reinput_syn;
+	kstat_named_t	tcp_ip_output;
+	kstat_named_t	tcp_detach_non_time_wait;
+	kstat_named_t	tcp_detach_time_wait;
+	kstat_named_t	tcp_time_wait_reap;
+	kstat_named_t	tcp_clean_death_nondetached;
+	kstat_named_t	tcp_reinit_calls;
+	kstat_named_t	tcp_eager_err1;
+	kstat_named_t	tcp_eager_err2;
+	kstat_named_t	tcp_eager_blowoff_calls;
+	kstat_named_t	tcp_eager_blowoff_q;
+	kstat_named_t	tcp_eager_blowoff_q0;
+	kstat_named_t	tcp_not_hard_bound;
+	kstat_named_t	tcp_no_listener;
+	kstat_named_t	tcp_found_eager;
+	kstat_named_t	tcp_wrong_queue;
+	kstat_named_t	tcp_found_eager_binding1;
+	kstat_named_t	tcp_found_eager_bound1;
+	kstat_named_t	tcp_eager_has_listener1;
+	kstat_named_t	tcp_open_alloc;
+	kstat_named_t	tcp_open_detached_alloc;
+	kstat_named_t	tcp_rput_time_wait;
+	kstat_named_t	tcp_listendrop;
+	kstat_named_t	tcp_listendropq0;
+	kstat_named_t	tcp_wrong_rq;
+	kstat_named_t	tcp_rsrv_calls;
+	kstat_named_t	tcp_eagerfree2;
+	kstat_named_t	tcp_eagerfree3;
+	kstat_named_t	tcp_eagerfree4;
+	kstat_named_t	tcp_eagerfree5;
+	kstat_named_t	tcp_timewait_syn_fail;
+	kstat_named_t	tcp_listen_badflags;
+	kstat_named_t	tcp_timeout_calls;
+	kstat_named_t	tcp_timeout_cached_alloc;
+	kstat_named_t	tcp_timeout_cancel_reqs;
+	kstat_named_t	tcp_timeout_canceled;
+	kstat_named_t	tcp_timermp_alloced;
+	kstat_named_t	tcp_timermp_freed;
+	kstat_named_t	tcp_timermp_allocfail;
+	kstat_named_t	tcp_timermp_allocdblfail;
+	kstat_named_t	tcp_push_timer_cnt;
+	kstat_named_t	tcp_ack_timer_cnt;
+	kstat_named_t	tcp_ire_null1;
+	kstat_named_t	tcp_ire_null;
+	kstat_named_t	tcp_ip_send;
+	kstat_named_t	tcp_ip_ire_send;
+	kstat_named_t	tcp_wsrv_called;
+	kstat_named_t	tcp_flwctl_on;
+	kstat_named_t	tcp_timer_fire_early;
+	kstat_named_t	tcp_timer_fire_miss;
+	kstat_named_t	tcp_freelist_cleanup;
+	kstat_named_t	tcp_rput_v6_error;
+	kstat_named_t	tcp_out_sw_cksum;
+	kstat_named_t	tcp_out_sw_cksum_bytes;
+	kstat_named_t	tcp_zcopy_on;
+	kstat_named_t	tcp_zcopy_off;
+	kstat_named_t	tcp_zcopy_backoff;
+	kstat_named_t	tcp_zcopy_disable;
+	kstat_named_t	tcp_mdt_pkt_out;
+	kstat_named_t	tcp_mdt_pkt_out_v4;
+	kstat_named_t	tcp_mdt_pkt_out_v6;
+	kstat_named_t	tcp_mdt_discarded;
+	kstat_named_t	tcp_mdt_conn_halted1;
+	kstat_named_t	tcp_mdt_conn_halted2;
+	kstat_named_t	tcp_mdt_conn_halted3;
+	kstat_named_t	tcp_mdt_conn_resumed1;
+	kstat_named_t	tcp_mdt_conn_resumed2;
+	kstat_named_t	tcp_mdt_legacy_small;
+	kstat_named_t	tcp_mdt_legacy_all;
+	kstat_named_t	tcp_mdt_legacy_ret;
+	kstat_named_t	tcp_mdt_allocfail;
+	kstat_named_t	tcp_mdt_addpdescfail;
+	kstat_named_t	tcp_mdt_allocd;
+	kstat_named_t	tcp_mdt_linked;
+	kstat_named_t	tcp_fusion_flowctl;
+	kstat_named_t	tcp_fusion_backenabled;
+	kstat_named_t	tcp_fusion_urg;
+	kstat_named_t	tcp_fusion_putnext;
+	kstat_named_t	tcp_fusion_unfusable;
+	kstat_named_t	tcp_fusion_aborted;
+	kstat_named_t	tcp_fusion_unqualified;
+	kstat_named_t	tcp_fusion_rrw_busy;
+	kstat_named_t	tcp_fusion_rrw_msgcnt;
+	kstat_named_t	tcp_in_ack_unsent_drop;
+	kstat_named_t	tcp_sock_fallback;
+} tcp_stat_t;
+
+extern tcp_stat_t tcp_statistics;
+
+#define	TCP_STAT(x)		(tcp_statistics.x.value.ui64++)
+#define	TCP_STAT_UPDATE(x, n)	(tcp_statistics.x.value.ui64 += (n))
+#define	TCP_STAT_SET(x, n)	(tcp_statistics.x.value.ui64 = (n))
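
A hedged usage example, e.g. from a transmit path that had to fall back
to software checksumming (mp being the segment about to be sent):

	/* one more software-checksummed segment, plus its payload bytes */
	TCP_STAT(tcp_out_sw_cksum);
	TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes, msgdsize(mp));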
+
+extern struct qinit tcp_loopback_rinit, tcp_rinit;
+extern boolean_t do_tcp_fusion;
+
+extern int	tcp_maxpsz_set(tcp_t *, boolean_t);
+extern void	tcp_timers_stop(tcp_t *);
+extern void	tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t);
+extern void	tcp_push_timer(void *);
+extern timeout_id_t tcp_timeout(conn_t *, void (*)(void *), clock_t);
+extern clock_t	tcp_timeout_cancel(conn_t *, timeout_id_t);
+
+extern void	tcp_fuse(tcp_t *, uchar_t *, tcph_t *);
+extern void	tcp_unfuse(tcp_t *);
+extern boolean_t tcp_fuse_output(tcp_t *, mblk_t *, uint32_t);
+extern void	tcp_fuse_output_urg(tcp_t *, mblk_t *);
+extern boolean_t tcp_fuse_rcv_drain(queue_t *, tcp_t *, mblk_t **);
+extern void	tcp_fuse_syncstr_enable_pair(tcp_t *);
+extern void	tcp_fuse_disable_pair(tcp_t *, boolean_t);
+extern int	tcp_fuse_rrw(queue_t *, struiod_t *);
+extern int	tcp_fuse_rinfop(queue_t *, infod_t *);
+extern size_t	tcp_fuse_set_rcv_hiwat(tcp_t *, size_t);
+extern int	tcp_fuse_maxpsz_set(tcp_t *);
+
+#endif	/* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _INET_TCP_IMPL_H */
--- a/usr/src/uts/common/inet/udp/udp.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/udp/udp.c	Sat Oct 22 22:50:14 2005 -0700
@@ -31,6 +31,8 @@
 
 #include <sys/types.h>
 #include <sys/stream.h>
+#include <sys/dlpi.h>
+#include <sys/pattr.h>
 #include <sys/stropts.h>
 #include <sys/strlog.h>
 #include <sys/strsun.h>
@@ -50,6 +52,7 @@
 #include <sys/zone.h>
 
 #include <sys/socket.h>
+#include <sys/sockio.h>
 #include <sys/vtrace.h>
 #include <sys/debug.h>
 #include <sys/isa_defs.h>
@@ -59,11 +62,15 @@
 #include <netinet/icmp6.h>
 #include <netinet/udp.h>
 #include <net/if.h>
+#include <net/route.h>
 
 #include <inet/common.h>
 #include <inet/ip.h>
+#include <inet/ip_impl.h>
 #include <inet/ip6.h>
 #include <inet/ip_ire.h>
+#include <inet/ip_if.h>
+#include <inet/ip_multi.h>
 #include <inet/mi.h>
 #include <inet/mib2.h>
 #include <inet/nd.h>
@@ -71,9 +78,12 @@
 #include <inet/snmpcom.h>
 #include <inet/kstatcom.h>
 #include <inet/udp_impl.h>
+#include <inet/ipclassifier.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ipp_common.h>
 
 /*
- * The ipsec_info.h header file is here since it has the defination for the
+ * The ipsec_info.h header file is here since it has the definition for the
  * M_CTL message types used by IP to convey information to the ULP. The
  * ipsec_info.h needs the pfkeyv2.h, hence the latter's presence.
  */
@@ -81,40 +91,138 @@
 #include <inet/ipsec_info.h>
 
 /*
- * Object to represent database of options to search passed to
- * {sock,tpi}optcom_req() interface routine to take care of option
- * management and associated methods.
- * XXX. These and other externs should really move to a udp header file.
- */
-extern optdb_obj_t	udp_opt_obj;
-extern uint_t		udp_max_optsize;
-
-
-/*
  * Synchronization notes:
  *
- * UDP uses a combination of the queue-pair STREAMS perimeter, a global
- * lock and a set of bind hash locks to protect its data structures.
+ * UDP uses a combination of its internal perimeter, a global lock and
+ * a set of bind hash locks to protect its data structures.  Please see
+ * the note above udp_mode_assertions for details about the internal
+ * perimeter.
  *
- * The queue-pair perimeter is not acquired exclusively in the put
- * procedures thus when udp_rput or udp_wput needs exclusive access to
- * the udp_t instance structure it will use qwriter(..., PERIM_INNER) to
- * asynchronously acquire exclusive access to the udp_t instance.
- *
- * When UDP global data needs to be modified the udp_g_lock mutex is acquired.
- * Currently, udp_g_head and udp_g_epriv_ports[] are protected by it.
- *
- * When an UDP endpoint is bound to a local port, it is inserted into
+ * When a UDP endpoint is bound to a local port, it is inserted into
  * a bind hash list.  The list consists of an array of udp_fanout_t buckets.
  * The size of the array is controlled by the udp_bind_fanout_size variable.
  * This variable can be changed in /etc/system if the default value is
- * not large enough.  Each bind hash bucket is protected by a per bucket lock.
- * It protects the udp_bind_hash and udp_ptpbhn fields in the udp_t
+ * not large enough.  Each bind hash bucket is protected by a per bucket
+ * lock.  It protects the udp_bind_hash and udp_ptpbhn fields in the udp_t
  * structure.  An UDP endpoint is removed from the bind hash list only
  * when it is being unbound or being closed.  The per bucket lock also
- * protects an UDP endpoint's state changes.
+ * protects a UDP endpoint's state changes.
+ *
+ * Plumbing notes:
+ *
+ * Both udp and ip are merged, but the streams plumbing is kept unchanged
+ * in that udp is always pushed atop /dev/ip.  This is done to preserve
+ * backwards compatibility for certain applications which rely on such
+ * plumbing geometry to do things such as issuing I_POP on the stream
+ * in order to obtain direct access to /dev/ip, etc.
+ *
+ * All UDP processing happens in the /dev/ip instance; the udp module
+ * instance does not possess any state about the endpoint, and merely
+ * acts as a dummy module whose presence keeps the streams plumbing
+ * appearance unchanged.  At open time /dev/ip allocates a conn_t that
+ * happens to embed a udp_t.  This stays dormant until the time udp is
+ * pushed, which indicates to /dev/ip that it must convert itself from
+ * an IP to a UDP endpoint.
+ *
+ * We only allow for the following plumbing cases:
+ *
+ * Normal:
+ *	/dev/ip is first opened and later udp is pushed directly on top.
+ *	This is the default action that happens when a udp socket or
+ *	/dev/udp is opened.  The conn_t created by /dev/ip instance is
+ *	now shared and is marked with IPCL_UDP.
+ *
+ * SNMP-only:
+ *	udp is pushed on top of a module other than /dev/ip.  When this
+ *	happens it will support only SNMP semantics.  A new conn_t is
+ *	allocated and marked with IPCL_UDPMOD.
+ *
+ * The above cases imply that we don't support any intermediate module
+ * residing between /dev/ip and udp -- in fact, we have never supported
+ * such a scenario in the past, as the inter-layer communication semantics
+ * have always been private.  Also note that the normal case allows for SNMP
+ * requests to be processed in addition to the rest of UDP operations.
+ *
+ * The normal case plumbing is depicted by the following diagram:
+ *
+ *	+---------------+---------------+
+ *	|		|		| udp
+ *	|     udp_wq	|    udp_rq	|
+ *	|		|    UDP_RD	|
+ *	|		|		|
+ *	+---------------+---------------+
+ *		|		^
+ *		v		|
+ *	+---------------+---------------+
+ *	|		|		| /dev/ip
+ *	|     ip_wq	|     ip_rq	| conn_t
+ *	|     UDP_WR	|		|
+ *	|		|		|
+ *	+---------------+---------------+
+ *
+ * Messages arriving at udp_wq from above will end up in ip_wq before
+ * they get processed, i.e. udp write entry points will advance udp_wq
+ * and use its q_next value as ip_wq in order to use the conn_t that
+ * is stored in its q_ptr.  Likewise, messages generated by ip to the
+ * module above udp will appear to originate from udp_rq, i.e.
+ * putnext() calls to the module above udp are done using udp_rq
+ * instead of ip_rq in order to avoid udp_rput(), which does nothing
+ * more than calling putnext().
+ *
+ * The above implies the following rules of thumb:
+ *
+ *   1. udp_t is obtained from conn_t, which is created by the /dev/ip
+ *	instance and is stored in q_ptr of both ip_wq and ip_rq.  There
+ *	is no direct reference to conn_t from either udp_wq or udp_rq.
+ *
+ *   2. Write-side entry points of udp can obtain the conn_t via the
+ *	Q_TO_CONN() macro, using the queue value obtained from UDP_WR().
+ *
+ *   3. While in /dev/ip context, putnext() to the module above udp can
+ *	be done by supplying the queue value obtained from UDP_RD().
+ *
  */
 
+static queue_t *UDP_WR(queue_t *);
+static queue_t *UDP_RD(queue_t *);
+
+udp_stat_t udp_statistics = {
+	{ "udp_ip_send",		KSTAT_DATA_UINT64 },
+	{ "udp_ip_ire_send",		KSTAT_DATA_UINT64 },
+	{ "udp_ire_null",		KSTAT_DATA_UINT64 },
+	{ "udp_drain",			KSTAT_DATA_UINT64 },
+	{ "udp_sock_fallback",		KSTAT_DATA_UINT64 },
+	{ "udp_rrw_busy",		KSTAT_DATA_UINT64 },
+	{ "udp_rrw_msgcnt",		KSTAT_DATA_UINT64 },
+	{ "udp_out_sw_cksum",		KSTAT_DATA_UINT64 },
+	{ "udp_out_sw_cksum_bytes",	KSTAT_DATA_UINT64 },
+	{ "udp_out_opt",		KSTAT_DATA_UINT64 },
+	{ "udp_out_err_notconn",	KSTAT_DATA_UINT64 },
+	{ "udp_out_err_output",		KSTAT_DATA_UINT64 },
+	{ "udp_out_err_tudr",		KSTAT_DATA_UINT64 },
+	{ "udp_in_pktinfo",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvdstaddr",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvopts",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvif",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvslla",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvucred",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvttl",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvhopopts",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvhoplimit",	KSTAT_DATA_UINT64 },
+	{ "udp_in_recvdstopts",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvrtdstopts",	KSTAT_DATA_UINT64 },
+	{ "udp_in_recvrthdr",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvpktinfo",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvtclass",		KSTAT_DATA_UINT64 },
+#ifdef DEBUG
+	{ "udp_data_conn",		KSTAT_DATA_UINT64 },
+	{ "udp_data_notconn",		KSTAT_DATA_UINT64 },
+#endif
+};
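
These counters are bumped from the data paths; a hedged example, assuming
udp_impl.h defines UDP_STAT() and UDP_STAT_UPDATE() counterparts to the
TCP_STAT() macros:

	/* account a datagram whose checksum was computed in software */
	UDP_STAT(udp_out_sw_cksum);
	UDP_STAT_UPDATE(udp_out_sw_cksum_bytes, msgdsize(mp));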
+
+static kstat_t *udp_ksp;
+struct kmem_cache *udp_cache;
+
 /*
  * Bind hash list size and hash function.  It has to be a power of 2 for
  * hashing.
@@ -151,14 +259,6 @@
 	"later.\n"
 #define	NDD_OUT_OF_BUF_MSG	"<< Out of buffer >>\n"
 
-/* Named Dispatch Parameter Management Structure */
-typedef struct udpparam_s {
-	uint32_t udp_param_min;
-	uint32_t udp_param_max;
-	uint32_t udp_param_value;
-	char	*udp_param_name;
-} udpparam_t;
-
 static void	udp_addr_req(queue_t *q, mblk_t *mp);
 static void	udp_bind(queue_t *q, mblk_t *mp);
 static void	udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp);
@@ -188,15 +288,6 @@
 static  int	udp_unitdata_opt_process(queue_t *q, mblk_t *mp,
 		    int *errorp, void *thisdg_attrs);
 static boolean_t udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
-int		udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name,
-		    uchar_t *ptr);
-int		udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name,
-		    uchar_t *ptr);
-int		udp_opt_set(queue_t *q, uint_t optset_context,
-		    int level, int name,
-		    uint_t inlen, uchar_t *invalp,
-		    uint_t *outlenp, uchar_t *outvalp,
-		    void *thisdg_attrs, cred_t *cr, mblk_t *mblk);
 static int	udp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
 static boolean_t udp_param_register(udpparam_t *udppa, int cnt);
 static int	udp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
@@ -205,62 +296,91 @@
 		    uchar_t **optbufp, uint_t *optlenp);
 static void	udp_report_item(mblk_t *mp, udp_t *udp);
 static void	udp_rput(queue_t *q, mblk_t *mp);
+static void	udp_rput_other(queue_t *, mblk_t *);
+static int	udp_rinfop(queue_t *q, infod_t *dp);
+static int	udp_rrw(queue_t *q, struiod_t *dp);
 static	void	udp_rput_bind_ack(queue_t *q, mblk_t *mp);
-static void	udp_rput_other(queue_t *q, mblk_t *mp);
-static int	udp_snmp_get(queue_t *q, mblk_t *mpctl);
-static int	udp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
-		    uchar_t *ptr, int len);
 static int	udp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
 		    cred_t *cr);
-static void	udp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
+static void	udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha);
+static void	udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr,
+		    t_scalar_t destlen, t_scalar_t err);
 static void	udp_unbind(queue_t *q, mblk_t *mp);
 static in_port_t udp_update_next_port(in_port_t port, boolean_t random);
 static void	udp_wput(queue_t *q, mblk_t *mp);
-static void	udp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6,
-		    t_scalar_t tudr_optlen);
+static mblk_t	*udp_output_v4(conn_t *, mblk_t *mp, ipaddr_t v4dst,
+		    uint16_t port, uint_t srcid, int *error);
+static mblk_t	*udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6,
+		    t_scalar_t tudr_optlen, int *error);
 static void	udp_wput_other(queue_t *q, mblk_t *mp);
 static void	udp_wput_iocdata(queue_t *q, mblk_t *mp);
+static void	udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr,
+		    socklen_t addrlen);
+static size_t	udp_set_rcv_hiwat(udp_t *udp, size_t size);
 
 static void	udp_kstat_init(void);
 static void	udp_kstat_fini(void);
 static int	udp_kstat_update(kstat_t *kp, int rw);
+static void	udp_input_wrapper(void *arg, mblk_t *mp, void *arg2);
+static void	udp_rput_other_wrapper(void *arg, mblk_t *mp, void *arg2);
+static void	udp_wput_other_wrapper(void *arg, mblk_t *mp, void *arg2);
+static void	udp_resume_bind_cb(void *arg, mblk_t *mp, void *arg2);
+
+static void	udp_rcv_enqueue(queue_t *q, udp_t *udp, mblk_t *mp,
+		    uint_t pkt_len);
+static void	udp_rcv_drain(queue_t *q, udp_t *udp, boolean_t closing);
+static void	udp_enter(conn_t *, mblk_t *, sqproc_t, uint8_t);
+static void	udp_exit(conn_t *);
+static void	udp_become_writer(conn_t *, mblk_t *, sqproc_t, uint8_t);
+#ifdef DEBUG
+static void	udp_mode_assertions(udp_t *, int);
+#endif /* DEBUG */
 
 major_t UDP6_MAJ;
-#define		UDP6		"udp6"
-
-#define		UDP_MAXPACKET_IPV4	\
-	(IP_MAXPACKET - UDPH_SIZE - IP_SIMPLE_HDR_LENGTH)
-#define		UDP_MAXPACKET_IPV6	\
-	(IP_MAXPACKET - UDPH_SIZE - IPV6_HDR_LEN)
-
-static struct module_info info =  {
-	5607, "udp", 1, INFPSZ, 512, 128
+#define	UDP6 "udp6"
+
+#define	UDP_RECV_HIWATER	(56 * 1024)
+#define	UDP_RECV_LOWATER	128
+#define	UDP_XMIT_HIWATER	(56 * 1024)
+#define	UDP_XMIT_LOWATER	1024
+
+static struct module_info udp_info =  {
+	UDP_MOD_ID, UDP_MOD_NAME, 1, INFPSZ, UDP_RECV_HIWATER, UDP_RECV_LOWATER
+};
+
+static struct qinit udp_rinit = {
+	(pfi_t)udp_rput, NULL, udp_open, udp_close, NULL,
+	&udp_info, NULL, udp_rrw, udp_rinfop, STRUIOT_STANDARD
 };
 
-static struct qinit rinit = {
-	(pfi_t)udp_rput, NULL, udp_open, udp_close, NULL, &info
+static struct qinit udp_winit = {
+	(pfi_t)udp_wput, NULL, NULL, NULL, NULL,
+	&udp_info, NULL, NULL, NULL, STRUIOT_NONE
 };
 
-static struct qinit winit = {
-	(pfi_t)udp_wput, NULL, NULL, NULL, NULL, &info
+/* Support for just SNMP if UDP is not pushed directly over device IP */
+struct qinit udp_snmp_rinit = {
+	(pfi_t)putnext, NULL, udp_open, ip_snmpmod_close, NULL,
+	&udp_info, NULL, NULL, NULL, STRUIOT_NONE
+};
+
+struct qinit udp_snmp_winit = {
+	(pfi_t)ip_snmpmod_wput, NULL, udp_open, ip_snmpmod_close, NULL,
+	&udp_info, NULL, NULL, NULL, STRUIOT_NONE
 };
 
 struct streamtab udpinfo = {
-	&rinit, &winit
+	&udp_rinit, &udp_winit
 };
 
 static	sin_t	sin_null;	/* Zero address for quick clears */
 static	sin6_t	sin6_null;	/* Zero address for quick clears */
 
-/* Protected by udp_g_lock */
-static void	*udp_g_head;	/* Head for list of open udp streams. */
-kmutex_t	udp_g_lock;	/* Protects the above variable */
-
 /* Hint not protected by any lock */
 static in_port_t	udp_g_next_port_to_try;
 
 /*
- * Extra privileged ports. In host byte order. Protected by udp_g_lock.
+ * Extra privileged ports. In host byte order.
  */
 #define	UDP_NUM_EPRIV_PORTS	64
 static int	udp_g_num_epriv_ports = UDP_NUM_EPRIV_PORTS;
@@ -273,6 +393,7 @@
 static mib2_udp_t	udp_mib;	/* SNMP fixed size info */
 static kstat_t		*udp_mibkp;	/* kstat exporting udp_mib data */
 
+#define	UDP_MAXPACKET_IPV4 (IP_MAXPACKET - UDPH_SIZE - IP_SIMPLE_HDR_LENGTH)
 
 /* Default structure copied into T_INFO_ACK messages */
 static struct T_info_ack udp_g_t_info_ack_ipv4 = {
@@ -289,6 +410,8 @@
 	(XPG4_1|SENDZERO) /* PROVIDER_flag */
 };
 
+#define	UDP_MAXPACKET_IPV6 (IP_MAXPACKET - UDPH_SIZE - IPV6_HDR_LEN)
+
 static	struct T_info_ack udp_g_t_info_ack_ipv6 = {
 	T_INFO_ACK,
 	UDP_MAXPACKET_IPV6,	/* TSDU_size.  Excl. headers */
@@ -311,33 +434,23 @@
  * in udp_open.
  * All of these are alterable, within the min/max values given, at run time.
  */
-static udpparam_t	udp_param_arr[] = {
-	/* min	max		value		name */
-	{ 0L,	256,		32,		"udp_wroff_extra" },
-	{ 1L,	255,		255,		"udp_ipv4_ttl" },
-	{ 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS,	"udp_ipv6_hoplimit"},
-	{ 1024,	(32 * 1024),	1024,		"udp_smallest_nonpriv_port" },
-	{ 0,	1,		1,		"udp_do_checksum" },
-	{ 1024,	UDP_MAX_PORT,	(32 * 1024),	"udp_smallest_anon_port" },
-	{ 1024,	UDP_MAX_PORT,	UDP_MAX_PORT,	"udp_largest_anon_port" },
-	{ 4096,	1024*1024,	56*1024,	"udp_xmit_hiwat"},
-	{ 0,	1024*1024,	1024,		"udp_xmit_lowat"},
-	{ 4096,	1024*1024,	56*1024,	"udp_recv_hiwat"},
-	{ 65536, 1024*1024*1024, 2*1024*1024,	"udp_max_buf"},
-	{ 100,	60000,		1000,		"udp_ndd_get_info_interval"},
+/* BEGIN CSTYLED */
+udpparam_t udp_param_arr[] = {
+ /*min		max		value		name */
+ { 0L,		256,		32,		"udp_wroff_extra" },
+ { 1L,		255,		255,		"udp_ipv4_ttl" },
+ { 0,		IPV6_MAX_HOPS,	IPV6_DEFAULT_HOPS, "udp_ipv6_hoplimit"},
+ { 1024,	(32 * 1024),	1024,		"udp_smallest_nonpriv_port" },
+ { 0,		1,		1,		"udp_do_checksum" },
+ { 1024,	UDP_MAX_PORT,	(32 * 1024),	"udp_smallest_anon_port" },
+ { 1024,	UDP_MAX_PORT,	UDP_MAX_PORT,	"udp_largest_anon_port" },
+ { UDP_XMIT_LOWATER, (1<<30), UDP_XMIT_HIWATER,	"udp_xmit_hiwat"},
+ { 0,		     (1<<30), UDP_XMIT_LOWATER, "udp_xmit_lowat"},
+ { UDP_RECV_LOWATER, (1<<30), UDP_RECV_HIWATER,	"udp_recv_hiwat"},
+ { 65536,	(1<<30),	2*1024*1024,	"udp_max_buf"},
+ { 100,		60000,		1000,		"udp_ndd_get_info_interval"},
 };
-#define	udp_wroff_extra			udp_param_arr[0].udp_param_value
-#define	udp_ipv4_ttl			udp_param_arr[1].udp_param_value
-#define	udp_ipv6_hoplimit		udp_param_arr[2].udp_param_value
-#define	udp_smallest_nonpriv_port	udp_param_arr[3].udp_param_value
-#define	udp_do_checksum			udp_param_arr[4].udp_param_value
-#define	udp_smallest_anon_port		udp_param_arr[5].udp_param_value
-#define	udp_largest_anon_port		udp_param_arr[6].udp_param_value
-#define	udp_xmit_hiwat			udp_param_arr[7].udp_param_value
-#define	udp_xmit_lowat			udp_param_arr[8].udp_param_value
-#define	udp_recv_hiwat			udp_param_arr[9].udp_param_value
-#define	udp_max_buf			udp_param_arr[10].udp_param_value
-#define	udp_ndd_get_info_interval	udp_param_arr[11].udp_param_value
+/* END CSTYLED */
 
 /*
  * The smallest anonymous port in the privileged port range which UDP
@@ -354,9 +467,434 @@
  */
 
 void (*cl_inet_bind)(uchar_t protocol, sa_family_t addr_family,
-			uint8_t *laddrp, in_port_t lport) = NULL;
+    uint8_t *laddrp, in_port_t lport) = NULL;
 void (*cl_inet_unbind)(uint8_t protocol, sa_family_t addr_family,
-			uint8_t *laddrp, in_port_t lport) = NULL;
+    uint8_t *laddrp, in_port_t lport) = NULL;
+
+typedef union T_primitives *t_primp_t;
+
+#define	UDP_ENQUEUE_MP(udp, mp, proc, tag) {			\
+	ASSERT((mp)->b_prev == NULL && (mp)->b_queue == NULL);	\
+	ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock));	\
+	(mp)->b_queue = (queue_t *)((uintptr_t)tag);		\
+	(mp)->b_prev = (mblk_t *)proc;				\
+	if ((udp)->udp_mphead == NULL)				\
+		(udp)->udp_mphead = (mp);			\
+	else							\
+		(udp)->udp_mptail->b_next = (mp);		\
+	(udp)->udp_mptail = (mp);				\
+	(udp)->udp_mpcount++;					\
+}
+
+#define	UDP_READERS_INCREF(udp) {				\
+	ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock));	\
+	(udp)->udp_reader_count++;				\
+}
+
+#define	UDP_READERS_DECREF(udp) {				\
+	ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock));	\
+	(udp)->udp_reader_count--;				\
+	if ((udp)->udp_reader_count == 0)			\
+		cv_broadcast(&(udp)->udp_connp->conn_cv);	\
+}
+
+#define	UDP_SQUEUE_DECREF(udp) {				\
+	ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock));	\
+	(udp)->udp_squeue_count--;				\
+	if ((udp)->udp_squeue_count == 0)			\
+		cv_broadcast(&(udp)->udp_connp->conn_cv);	\
+}
+
+/*
+ * Notes on UDP endpoint synchronization:
+ *
+ * UDP needs exclusive operation on a per-endpoint basis when executing
+ * functions that modify the endpoint state.  udp_rput_other() deals with
+ * packets with IP options, and processing these packets ends up having
+ * to update the endpoint's option-related state.  udp_wput_other() deals
+ * with control operations from the top, e.g. a connect() that needs to
+ * update the endpoint state.  These could be synchronized using locks,
+ * but the current version uses squeues for this purpose.  squeues may
+ * give a performance improvement for certain cases such as connected
+ * UDP sockets; thus the framework allows for using squeues.
+ *
+ * The perimeter routines are described as follows:
+ *
+ * udp_enter():
+ *	Enter the UDP endpoint perimeter.
+ *
+ * udp_become_writer():
+ *	Become exclusive on the UDP endpoint.  Specifies a function
+ *	that will be called exclusively either immediately or later
+ *	when the perimeter is available exclusively.
+ *
+ * udp_exit():
+ *	Exit the UDP perimeter.
+ *
+ * Entering UDP from the top or from the bottom must be done using
+ * udp_enter().  No lock must be held while attempting to enter the UDP
+ * perimeter.  When finished, udp_exit() must be called to get out of
+ * the perimeter.
+ *
+ * UDP operates in either MT_HOT mode or in SQUEUE mode.  In MT_HOT mode,
+ * multiple threads may enter a UDP endpoint concurrently.  This is used
+ * for sending and/or receiving normal data.  Control operations and other
+ * special cases call udp_become_writer() to become exclusive on a
+ * per-endpoint basis, and this results in transitioning to SQUEUE
+ * mode.  squeue
+ * by definition serializes access to the conn_t.  When there are no more
+ * pending messages on the squeue for the UDP connection, the endpoint
+ * reverts to MT_HOT mode.  During the interregnum when not all MT threads
+ * of an endpoint have finished, messages are queued in the UDP endpoint
+ * and the UDP is in UDP_MT_QUEUED mode or UDP_QUEUED_SQUEUE mode.
+ *
+ * These modes have the following analogs:
+ *
+ *	UDP_MT_HOT/udp_reader_count==0		none
+ *	UDP_MT_HOT/udp_reader_count>0		RW_READ_LOCK
+ *	UDP_MT_QUEUED				RW_WRITE_WANTED
+ *	UDP_SQUEUE or UDP_QUEUED_SQUEUE		RW_WRITE_LOCKED
+ *
+ * Stable modes:	UDP_MT_HOT, UDP_SQUEUE
+ * Transient modes:	UDP_MT_QUEUED, UDP_QUEUED_SQUEUE
+ *
+ * While in stable modes, UDP keeps track of the number of threads
+ * operating on the endpoint.  The udp_reader_count variable represents
+ * the number of threads entering the endpoint as readers while it is
+ * in UDP_MT_HOT mode.  Transitioning to UDP_SQUEUE happens when there
+ * is only a single reader, i.e. when this counter drops to 1.  Likewise,
+ * udp_squeue_count represents the number of threads operating on the
+ * endpoint's squeue while it is in UDP_SQUEUE mode.  The mode transition
+ * to UDP_MT_HOT happens after the last thread exits the endpoint, i.e.
+ * when this counter drops to 0.
+ *
+ * The default mode is set to UDP_MT_HOT and UDP alternates between
+ * UDP_MT_HOT and UDP_SQUEUE as shown in the state transition below.
+ *
+ * Mode transition:
+ * ----------------------------------------------------------------
+ * old mode		Event				New mode
+ * ----------------------------------------------------------------
+ * UDP_MT_HOT		Call to udp_become_writer()	UDP_SQUEUE
+ *			and udp_reader_count == 1
+ *
+ * UDP_MT_HOT		Call to udp_become_writer()	UDP_MT_QUEUED
+ *			and udp_reader_count > 1
+ *
+ * UDP_MT_QUEUED	udp_reader_count drops to zero	UDP_QUEUED_SQUEUE
+ *
+ * UDP_QUEUED_SQUEUE	All messages enqueued on the	UDP_SQUEUE
+ *			internal UDP queue successfully
+ *			moved to squeue AND udp_squeue_count != 0
+ *
+ * UDP_QUEUED_SQUEUE	All messages enqueued on the	UDP_MT_HOT
+ *			internal UDP queue successfully
+ *			moved to squeue AND udp_squeue_count
+ *			drops to zero
+ *
+ * UDP_SQUEUE		udp_squeue_count drops to zero	UDP_MT_HOT
+ * ----------------------------------------------------------------
+ */
+
+static queue_t *
+UDP_WR(queue_t *q)
+{
+	ASSERT(q->q_ptr == NULL && _OTHERQ(q)->q_ptr == NULL);
+	ASSERT(WR(q)->q_next != NULL && WR(q)->q_next->q_ptr != NULL);
+	ASSERT(IPCL_IS_UDP(Q_TO_CONN(WR(q)->q_next)));
+
+	return (_WR(q)->q_next);
+}
+
+static queue_t *
+UDP_RD(queue_t *q)
+{
+	ASSERT(q->q_ptr != NULL && _OTHERQ(q)->q_ptr != NULL);
+	ASSERT(IPCL_IS_UDP(Q_TO_CONN(q)));
+	ASSERT(RD(q)->q_next != NULL && RD(q)->q_next->q_ptr == NULL);
+
+	return (_RD(q)->q_next);
+}
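
Rules 1-3 translate into a usage pattern like the following hedged sketch
(udp_reply_sketch is an illustrative name), given the /dev/ip read queue
q that owns the conn_t:

static void
udp_reply_sketch(queue_t *q, mblk_t *mp)
{
	udp_t *udp = Q_TO_UDP(q);	/* rule 1: state hangs off conn_t */

	/* ... update endpoint state in udp ... */

	/* rule 3: reply upstream via udp_rq, bypassing udp_rput() */
	putnext(UDP_RD(q), mp);
}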
+
+#ifdef DEBUG
+#define	UDP_MODE_ASSERTIONS(udp, caller) udp_mode_assertions(udp, caller)
+#else
+#define	UDP_MODE_ASSERTIONS(udp, caller)
+#endif
+
+/* Invariants */
+#ifdef DEBUG
+
+uint32_t udp_count[4];
+
+/* Context of udp_mode_assertions */
+#define	UDP_ENTER		1
+#define	UDP_BECOME_WRITER	2
+#define	UDP_EXIT		3
+
+static void
+udp_mode_assertions(udp_t *udp, int caller)
+{
+	ASSERT(MUTEX_HELD(&udp->udp_connp->conn_lock));
+
+	switch (udp->udp_mode) {
+	case UDP_MT_HOT:
+		/*
+		 * Messages have not yet been enqueued on the internal queue,
+		 * otherwise we would have switched to UDP_MT_QUEUED. Likewise
+		 * by definition, there can't be any messages enqueued on the
+		 * squeue. The UDP could be quiescent, so udp_reader_count
+		 * could be zero at entry.
+		 */
+		ASSERT(udp->udp_mphead == NULL && udp->udp_mpcount == 0 &&
+		    udp->udp_squeue_count == 0);
+		ASSERT(caller == UDP_ENTER || udp->udp_reader_count != 0);
+		udp_count[0]++;
+		break;
+
+	case UDP_MT_QUEUED:
+		/*
+		 * The last MT thread to exit the udp perimeter empties the
+		 * internal queue and then switches the UDP to
+		 * UDP_QUEUED_SQUEUE mode. Since we are still in UDP_MT_QUEUED
+		 * mode, it means there must be at least 1 MT thread still in
+		 * the perimeter and at least 1 message on the internal queue.
+		 */
+		ASSERT(udp->udp_reader_count >= 1 && udp->udp_mphead != NULL &&
+		    udp->udp_mpcount != 0 && udp->udp_squeue_count == 0);
+		udp_count[1]++;
+		break;
+
+	case UDP_QUEUED_SQUEUE:
+		/*
+		 * The switch has happened from MT to SQUEUE, so there can't
+		 * be any MT threads. Messages could still pile up on the
+		 * internal queue until the transition is complete and we move
+		 * to UDP_SQUEUE mode. We can't assert on a nonzero
+		 * udp_squeue_count since the squeue could drain at any time.
+		 */
+		ASSERT(udp->udp_reader_count == 0);
+		udp_count[2]++;
+		break;
+
+	case UDP_SQUEUE:
+		/*
+		 * The transition is complete. There can't be any messages on
+		 * the internal queue. The udp could be quiescent or the squeue
+		 * could drain at any time, so we can't assert on a nonzero
+		 * udp_squeue_count during entry. Nor can we assert that
+		 * udp_reader_count is zero, since a reader thread could have
+		 * become a writer directly by calling udp_become_writer()
+		 * without going through the queued states.
+		 */
+		ASSERT(udp->udp_mphead == NULL && udp->udp_mpcount == 0);
+		ASSERT(caller == UDP_ENTER || udp->udp_squeue_count != 0);
+		udp_count[3]++;
+		break;
+	}
+}
+#endif
+
+#define	_UDP_ENTER(connp, mp, proc, tag) {				\
+	udp_t *_udp = (connp)->conn_udp;				\
+									\
+	mutex_enter(&(connp)->conn_lock);				\
+	if ((connp)->conn_state_flags & CONN_CLOSING) {			\
+		mutex_exit(&(connp)->conn_lock);			\
+		freemsg(mp);						\
+	} else {							\
+		UDP_MODE_ASSERTIONS(_udp, UDP_ENTER);			\
+									\
+		switch (_udp->udp_mode) {				\
+		case UDP_MT_HOT:					\
+			/* We can execute as reader right away. */	\
+			UDP_READERS_INCREF(_udp);			\
+			mutex_exit(&(connp)->conn_lock);		\
+			(*(proc))(connp, mp, (connp)->conn_sqp);	\
+			break;						\
+									\
+		case UDP_SQUEUE:					\
+			/*						\
+			 * We are in squeue mode, send the		\
+			 * packet to the squeue				\
+			 */						\
+			_udp->udp_squeue_count++;			\
+			CONN_INC_REF_LOCKED(connp);			\
+			mutex_exit(&(connp)->conn_lock);		\
+			squeue_enter((connp)->conn_sqp, mp, proc,	\
+			    connp, tag);				\
+			break;						\
+									\
+		case UDP_MT_QUEUED:					\
+		case UDP_QUEUED_SQUEUE:					\
+			/*						\
+			 * Some messages may have been enqueued		\
+			 * ahead of us.  Enqueue the new message	\
+			 * at the tail of the internal queue to		\
+			 * preserve message ordering.			\
+			 */						\
+			UDP_ENQUEUE_MP(_udp, mp, proc, tag);		\
+			mutex_exit(&(connp)->conn_lock);		\
+			break;						\
+		}							\
+	}								\
+}
+
+static void
+udp_enter(conn_t *connp, mblk_t *mp, sqproc_t proc, uint8_t tag)
+{
+	_UDP_ENTER(connp, mp, proc, tag);
+}
+
+static void
+udp_become_writer(conn_t *connp, mblk_t *mp, sqproc_t proc, uint8_t tag)
+{
+	udp_t	*udp;
+
+	udp = connp->conn_udp;
+
+	mutex_enter(&connp->conn_lock);
+
+	UDP_MODE_ASSERTIONS(udp, UDP_BECOME_WRITER);
+
+	switch (udp->udp_mode) {
+	case UDP_MT_HOT:
+		if (udp->udp_reader_count == 1) {
+			/*
+			 * We are the only MT thread. Switch to squeue mode
+			 * immediately.
+			 */
+			udp->udp_mode = UDP_SQUEUE;
+			udp->udp_squeue_count = 1;
+			CONN_INC_REF_LOCKED(connp);
+			mutex_exit(&connp->conn_lock);
+			squeue_enter(connp->conn_sqp, mp, proc, connp, tag);
+			return;
+		}
+		/* FALLTHRU */
+
+	case UDP_MT_QUEUED:
+		/* Enqueue the packet internally in UDP */
+		udp->udp_mode = UDP_MT_QUEUED;
+		UDP_ENQUEUE_MP(udp, mp, proc, tag);
+		mutex_exit(&connp->conn_lock);
+		return;
+
+	case UDP_SQUEUE:
+	case UDP_QUEUED_SQUEUE:
+		/*
+		 * We are already exclusive. i.e. we are already
+		 * writer. Simply call the desired function.
+		 */
+		udp->udp_squeue_count++;
+		mutex_exit(&connp->conn_lock);
+		(*proc)(connp, mp, connp->conn_sqp);
+		return;
+	}
+}
+
+/*
+ * Transition from MT mode to SQUEUE mode, when the last MT thread
+ * is exiting the UDP perimeter. Move all messages from the internal
+ * udp queue to the squeue. A better way would be to move all the
+ * messages in one shot; this needs more support from the squeue
+ * framework.
+ */
+static void
+udp_switch_to_squeue(udp_t *udp)
+{
+	mblk_t *mp;
+	mblk_t	*mp_next;
+	sqproc_t proc;
+	uint8_t	tag;
+	conn_t	*connp = udp->udp_connp;
+
+	ASSERT(MUTEX_HELD(&connp->conn_lock));
+	ASSERT(udp->udp_mode == UDP_MT_QUEUED);
+	while (udp->udp_mphead != NULL) {
+		mp = udp->udp_mphead;
+		udp->udp_mphead = NULL;
+		udp->udp_mptail = NULL;
+		udp->udp_mpcount = 0;
+		udp->udp_mode = UDP_QUEUED_SQUEUE;
+		mutex_exit(&connp->conn_lock);
+		/*
+		 * It is best not to hold any locks across the calls
+		 * to squeue functions. Since we drop the lock, we
+		 * need to go back and check udp_mphead once again
+		 * after the squeue_fill(), hence the while loop at
+		 * the top of this function.
+		 */
+		for (; mp != NULL; mp = mp_next) {
+			mp_next = mp->b_next;
+			proc = (sqproc_t)mp->b_prev;
+			tag = (uint8_t)((uintptr_t)mp->b_queue);
+			mp->b_next = NULL;
+			mp->b_prev = NULL;
+			mp->b_queue = NULL;
+			CONN_INC_REF(connp);
+			udp->udp_squeue_count++;
+			squeue_fill(connp->conn_sqp, mp, proc, connp,
+			    tag);
+		}
+		mutex_enter(&connp->conn_lock);
+	}
+	/*
+	 * udp_squeue_count of zero implies that the squeue has drained
+	 * even before we arrived here (i.e. after the squeue_fill() above).
+	 */
+	udp->udp_mode = (udp->udp_squeue_count != 0) ?
+	    UDP_SQUEUE : UDP_MT_HOT;
+}
+
+#define	_UDP_EXIT(connp) {						\
+	udp_t *_udp = (connp)->conn_udp;				\
+									\
+	mutex_enter(&(connp)->conn_lock);				\
+	UDP_MODE_ASSERTIONS(_udp, UDP_EXIT);				\
+									\
+	switch (_udp->udp_mode) {					\
+	case UDP_MT_HOT:						\
+		UDP_READERS_DECREF(_udp);				\
+		mutex_exit(&(connp)->conn_lock);			\
+		break;							\
+									\
+	case UDP_SQUEUE:						\
+		UDP_SQUEUE_DECREF(_udp);				\
+		if (_udp->udp_squeue_count == 0)			\
+		    _udp->udp_mode = UDP_MT_HOT;			\
+		mutex_exit(&(connp)->conn_lock);			\
+		break;							\
+									\
+	case UDP_MT_QUEUED:						\
+		/*							\
+		 * If this is the last MT thread, we need to		\
+		 * switch to squeue mode				\
+		 */							\
+		UDP_READERS_DECREF(_udp);				\
+		if (_udp->udp_reader_count == 0)			\
+			udp_switch_to_squeue(_udp);			\
+		mutex_exit(&(connp)->conn_lock);			\
+		break;							\
+									\
+	case UDP_QUEUED_SQUEUE:						\
+		UDP_SQUEUE_DECREF(_udp);				\
+		/*							\
+		 * Even if the udp_squeue_count drops to zero, we	\
+		 * don't want to change udp_mode to UDP_MT_HOT here.	\
+		 * The thread in udp_switch_to_squeue will take care	\
+		 * of the transition to UDP_MT_HOT, after emptying	\
+		 * any more new messages that have been enqueued in	\
+		 * udp_mphead.						\
+		 */							\
+		mutex_exit(&(connp)->conn_lock);			\
+		break;							\
+	}								\
+}
+
+static void
+udp_exit(conn_t *connp)
+{
+	_UDP_EXIT(connp);
+}
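
Every excursion into the endpoint is bracketed by this pair.  A hedged
sketch of a bottom-up delivery path follows; udp_input_sketch and
SQTAG_UDP_INPUT are illustrative names, and the handler signature is the
sqproc_t one used throughout this file:

/* ARGSUSED */
static void
udp_input_sketch(void *arg, mblk_t *mp, void *arg2)
{
	conn_t *connp = arg;

	/* ... normal datagram processing under the perimeter ... */
	freemsg(mp);

	/* may trigger the UDP_MT_QUEUED -> UDP_QUEUED_SQUEUE switch */
	udp_exit(connp);
}

static void
udp_deliver_sketch(conn_t *connp, mblk_t *mp)
{
	/* runs the handler now if MT_HOT, else enqueues it in order */
	udp_enter(connp, mp, udp_input_sketch, SQTAG_UDP_INPUT);
}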
 
 /*
  * Return the next anonymous port in the privileged port range for
@@ -379,9 +917,13 @@
 udp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
 {
 	udp_fanout_t	*udpf;
-	udp_t		*udp;
 	int		i;
 	zoneid_t	zoneid;
+	conn_t		*connp;
+	udp_t		*udp;
+
+	connp = Q_TO_CONN(q);
+	udp = connp->conn_udp;
 
 	/* Refer to comments in udp_status_report(). */
 	if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) {
@@ -403,8 +945,7 @@
 	    " zone lport src addr        dest addr       port  state");
 	/*    1234 12345 xxx.xxx.xxx.xxx xxx.xxx.xxx.xxx 12345 UNBOUND */
 
-	udp = (udp_t *)q->q_ptr;
-	zoneid = udp->udp_zoneid;
+	zoneid = connp->conn_zoneid;
 
 	for (i = 0; i < udp_bind_fanout_size; i++) {
 		udpf = &udp_bind_fanout[i];
@@ -415,7 +956,7 @@
 		if (zoneid != GLOBAL_ZONEID) {
 			/* skip to first entry in this zone; might be none */
 			while (udp != NULL &&
-			    udp->udp_zoneid != zoneid)
+			    udp->udp_connp->conn_zoneid != zoneid)
 				udp = udp->udp_bind_hash;
 		}
 		if (udp != NULL) {
@@ -432,7 +973,7 @@
 			}
 			for (; udp != NULL; udp = udp->udp_bind_hash) {
 				if (zoneid == GLOBAL_ZONEID ||
-				    zoneid == udp->udp_zoneid)
+				    zoneid == udp->udp_connp->conn_zoneid)
 					udp_report_item(mp->b_cont, udp);
 			}
 		}
@@ -542,7 +1083,6 @@
 	in_port_t	port;		/* Host byte order */
 	in_port_t	requested_port;	/* Host byte order */
 	struct T_bind_req *tbr;
-	udp_t		*udp;
 	int		count;
 	in6_addr_t	v6src;
 	boolean_t	bind_to_req_port_only;
@@ -550,8 +1090,11 @@
 	udp_fanout_t	*udpf;
 	in_port_t	lport;		/* Network byte order */
 	zoneid_t	zoneid;
-
-	udp = (udp_t *)q->q_ptr;
+	conn_t		*connp;
+	udp_t		*udp;
+
+	connp = Q_TO_CONN(q);
+	udp = connp->conn_udp;
 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 		    "udp_bind: bad req, len %u",
@@ -559,6 +1102,7 @@
 		udp_err_ack(q, mp, TPROTO, 0);
 		return;
 	}
+
 	if (udp->udp_state != TS_UNBND) {
 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 		    "udp_bind: bad state, %u", udp->udp_state);
@@ -673,7 +1217,7 @@
 		}
 
 		if (priv) {
-			cred_t *cr = DB_CREDDEF(mp, udp->udp_credp);
+			cred_t *cr = DB_CREDDEF(mp, connp->conn_cred);
 
 			if (secpolicy_net_privaddr(cr, port) != 0) {
 				udp_err_ack(q, mp, TACCES, 0);
@@ -736,7 +1280,7 @@
 		loopmax = udp_largest_anon_port - udp_smallest_anon_port + 1;
 	}
 
-	zoneid = udp->udp_zoneid;
+	zoneid = connp->conn_zoneid;
 	for (;;) {
 		udp_t		*udp1;
 		boolean_t	is_inaddr_any;
@@ -753,7 +1297,7 @@
 		for (udp1 = udpf->uf_udp; udp1 != NULL;
 		    udp1 = udp1->udp_bind_hash) {
 			if (lport != udp1->udp_port ||
-			    zoneid != udp1->udp_zoneid)
+			    zoneid != udp1->udp_connp->conn_zoneid)
 				continue;
 
 			/*
@@ -933,7 +1477,39 @@
 		mp->b_cont->b_wptr += sizeof (ire_t);
 		mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
 	}
-	putnext(q, mp);
+	if (udp->udp_family == AF_INET6)
+		mp = ip_bind_v6(q, mp, connp, NULL);
+	else
+		mp = ip_bind_v4(q, mp, connp);
+
+	if (mp != NULL)
+		udp_rput_other(_RD(q), mp);
+	else
+		CONN_INC_REF(connp);
+}
+
+
+void
+udp_resume_bind(conn_t *connp, mblk_t *mp)
+{
+	udp_enter(connp, mp, udp_resume_bind_cb, SQTAG_BIND_RETRY);
+}
+
+/*
+ * This is called from ip_wput_nondata to resume a deferred UDP bind.
+ */
+/* ARGSUSED */
+static void
+udp_resume_bind_cb(void *arg, mblk_t *mp, void *arg2)
+{
+	conn_t *connp = arg;
+
+	ASSERT(connp != NULL && IPCL_IS_UDP(connp));
+
+	udp_rput_other(connp->conn_rq, mp);
+
+	CONN_OPER_PENDING_DONE(connp);
+	udp_exit(connp);
 }
 
 /*
@@ -958,15 +1534,16 @@
 	sin6_t	*sin6;
 	sin_t	*sin;
 	struct T_conn_req	*tcr;
-	udp_t	*udp, *udp1;
 	in6_addr_t v6dst;
 	ipaddr_t v4dst;
 	uint16_t dstport;
 	uint32_t flowinfo;
 	mblk_t	*mp1, *mp2;
 	udp_fanout_t	*udpf;
-
-	udp = (udp_t *)q->q_ptr;
+	udp_t	*udp, *udp1;
+
+	udp = Q_TO_UDP(q);
+
 	tcr = (struct T_conn_req *)mp->b_rptr;
 
 	/* A bit of sanity checking */
@@ -987,6 +1564,7 @@
 	ASSERT(udp->udp_port != 0 && udp->udp_ptpbhn != NULL);
 
 	udpf = &udp_bind_fanout[UDP_BIND_HASH(udp->udp_port)];
+
 	if (udp->udp_state == TS_DATA_XFER) {
 		/* Already connected - clear out state */
 		mutex_enter(&udpf->uf_lock);
@@ -1185,20 +1763,67 @@
 	linkb(mp1, mp);
 	linkb(mp1, mp2);
 
-	putnext(q, mp1);
+	if (udp->udp_family == AF_INET)
+		mp1 = ip_bind_v4(q, mp1, udp->udp_connp);
+	else
+		mp1 = ip_bind_v6(q, mp1, udp->udp_connp, NULL);
+
+	if (mp1 != NULL)
+		udp_rput_other(_RD(q), mp1);
+	else
+		CONN_INC_REF(udp->udp_connp);
 }
 
-/* This is the close routine for udp.  It frees the per-stream data. */
 static int
 udp_close(queue_t *q)
 {
-	udp_t	*udp = (udp_t *)q->q_ptr;
-
-	TRACE_1(TR_FAC_UDP, TR_UDP_CLOSE,
-		"udp_close: q %p", q);
+	conn_t	*connp = Q_TO_CONN(UDP_WR(q));
+	udp_t	*udp;
+	queue_t	*ip_rq = RD(UDP_WR(q));
+
+	ASSERT(connp != NULL && IPCL_IS_UDP(connp));
+	udp = connp->conn_udp;
+
+	ip_quiesce_conn(connp);
+	/*
+	 * Disable read-side synchronous stream
+	 * interface and drain any queued data.
+	 */
+	udp_rcv_drain(q, udp, B_TRUE);
+	ASSERT(!udp->udp_direct_sockfs);
 
 	qprocsoff(q);
 
+	/* restore IP module's high and low water marks to default values */
+	ip_rq->q_hiwat = ip_rq->q_qinfo->qi_minfo->mi_hiwat;
+	WR(ip_rq)->q_hiwat = WR(ip_rq)->q_qinfo->qi_minfo->mi_hiwat;
+	WR(ip_rq)->q_lowat = WR(ip_rq)->q_qinfo->qi_minfo->mi_lowat;
+
+	ASSERT(udp->udp_rcv_cnt == 0);
+	ASSERT(udp->udp_rcv_msgcnt == 0);
+	ASSERT(udp->udp_rcv_list_head == NULL);
+	ASSERT(udp->udp_rcv_list_tail == NULL);
+
+	/* connp is now single threaded. */
+	udp_close_free(connp);
+	/*
+	 * Restore connp as an IP endpoint.  We don't need
+	 * any locks since we are now single threaded
+	 */
+	connp->conn_flags &= ~IPCL_UDP;
+	connp->conn_state_flags &=
+	    ~(CONN_CLOSING | CONN_CONDEMNED | CONN_QUIESCED);
+	return (0);
+}
+
+/*
+ * Called in the close path from IP (ip_quiesce_conn) to quiesce the conn
+ */
+void
+udp_quiesce_conn(conn_t *connp)
+{
+	udp_t	*udp = connp->conn_udp;
+
 	if (cl_inet_unbind != NULL && udp->udp_state == TS_IDLE) {
 		/*
 		 * Running in cluster mode - register unbind information
@@ -1215,16 +1840,30 @@
 	}
 
 	udp_bind_hash_remove(udp, B_FALSE);
-	mutex_enter(&udp_g_lock);
-	/* Unlink the udp structure and release the minor device number. */
-	mi_close_unlink(&udp_g_head, (IDP)udp);
-	mutex_exit(&udp_g_lock);
+
+	mutex_enter(&connp->conn_lock);
+	while (udp->udp_reader_count != 0 || udp->udp_squeue_count != 0 ||
+	    udp->udp_mode != UDP_MT_HOT) {
+		cv_wait(&connp->conn_cv, &connp->conn_lock);
+	}
+	mutex_exit(&connp->conn_lock);
+}
+
+void
+udp_close_free(conn_t *connp)
+{
+	udp_t *udp = connp->conn_udp;
+
 	/* If there are any options associated with the stream, free them. */
-	if (udp->udp_ip_snd_options)
+	if (udp->udp_ip_snd_options) {
 		mi_free((char *)udp->udp_ip_snd_options);
-
-	if (udp->udp_ip_rcv_options)
+		udp->udp_ip_snd_options = NULL;
+	}
+
+	if (udp->udp_ip_rcv_options) {
 		mi_free((char *)udp->udp_ip_rcv_options);
+		udp->udp_ip_rcv_options = NULL;
+	}
 
 	/* Free memory associated with sticky options */
 	if (udp->udp_sticky_hdrs_len != 0) {
@@ -1233,30 +1872,33 @@
 		udp->udp_sticky_hdrs = NULL;
 		udp->udp_sticky_hdrs_len = 0;
 	}
+
 	if (udp->udp_sticky_ipp.ipp_fields & IPPF_HOPOPTS) {
 		kmem_free(udp->udp_sticky_ipp.ipp_hopopts,
 		    udp->udp_sticky_ipp.ipp_hopoptslen);
+		udp->udp_sticky_ipp.ipp_hopopts = NULL;
 	}
 	if (udp->udp_sticky_ipp.ipp_fields & IPPF_RTDSTOPTS) {
 		kmem_free(udp->udp_sticky_ipp.ipp_rtdstopts,
 		    udp->udp_sticky_ipp.ipp_rtdstoptslen);
+		udp->udp_sticky_ipp.ipp_rtdstopts = NULL;
 	}
 	if (udp->udp_sticky_ipp.ipp_fields & IPPF_RTHDR) {
 		kmem_free(udp->udp_sticky_ipp.ipp_rthdr,
 		    udp->udp_sticky_ipp.ipp_rthdrlen);
+		udp->udp_sticky_ipp.ipp_rthdr = NULL;
 	}
 	if (udp->udp_sticky_ipp.ipp_fields & IPPF_DSTOPTS) {
 		kmem_free(udp->udp_sticky_ipp.ipp_dstopts,
 		    udp->udp_sticky_ipp.ipp_dstoptslen);
+		udp->udp_sticky_ipp.ipp_dstopts = NULL;
 	}
 	udp->udp_sticky_ipp.ipp_fields &=
 	    ~(IPPF_HOPOPTS|IPPF_RTDSTOPTS|IPPF_RTHDR|IPPF_DSTOPTS);
 
-	crfree(udp->udp_credp);
-	/* Free the data structure */
-	mi_close_free((IDP)udp);
-	q->q_ptr = WR(q)->q_ptr = NULL;
-	return (0);
+	udp->udp_connp = NULL;
+	connp->conn_udp = NULL;
+	kmem_cache_free(udp_cache, udp);
 }
 
 /*
@@ -1277,12 +1919,10 @@
 static void
 udp_disconnect(queue_t *q, mblk_t *mp)
 {
-	udp_t	*udp;
+	udp_t	*udp = Q_TO_UDP(q);
 	mblk_t	*mp1;
 	udp_fanout_t *udpf;
 
-	udp = (udp_t *)q->q_ptr;
-
 	if (udp->udp_state != TS_DATA_XFER) {
 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 		    "udp_disconnect: bad state, %u", udp->udp_state);
@@ -1331,7 +1971,16 @@
 
 	/* Append the T_OK_ACK to the T_BIND_REQ for udp_rput */
 	linkb(mp1, mp);
-	putnext(q, mp1);
+
+	if (udp->udp_family == AF_INET6)
+		mp1 = ip_bind_v6(q, mp1, udp->udp_connp, NULL);
+	else
+		mp1 = ip_bind_v4(q, mp1, udp->udp_connp);
+
+	if (mp1 != NULL)
+		udp_rput_other(_RD(q), mp1);
+	else
+		CONN_INC_REF(udp->udp_connp);
 }
 
 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
@@ -1339,7 +1988,7 @@
 udp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
 {
 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
-		qreply(q, mp);
+		putnext(UDP_RD(q), mp);
 }
 
 /* Shorthand to generate and send TPI error acks to our client */
@@ -1355,7 +2004,7 @@
 		teackp->ERROR_prim = primitive;
 		teackp->TLI_error = t_error;
 		teackp->UNIX_error = sys_error;
-		qreply(q, mp);
+		putnext(UDP_RD(q), mp);
 	}
 }
 
@@ -1372,10 +2021,6 @@
 	return (0);
 }
 
-/*
- * Hold udp_g_lock to prevent multiple threads from changing udp_g_epriv_ports
- * at the same time.
- */
 /* ARGSUSED */
 static int
 udp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
@@ -1393,11 +2038,9 @@
 		return (EINVAL);
 	}
 
-	mutex_enter(&udp_g_lock);
 	/* Check if the value is already in the list */
 	for (i = 0; i < udp_g_num_epriv_ports; i++) {
 		if (new_value == udp_g_epriv_ports[i]) {
-			mutex_exit(&udp_g_lock);
 			return (EEXIST);
 		}
 	}
@@ -1407,20 +2050,14 @@
 			break;
 	}
 	if (i == udp_g_num_epriv_ports) {
-		mutex_exit(&udp_g_lock);
 		return (EOVERFLOW);
 	}
 
 	/* Set the new value */
 	udp_g_epriv_ports[i] = (in_port_t)new_value;
-	mutex_exit(&udp_g_lock);
 	return (0);
 }
 
-/*
- * Hold udp_g_lock to prevent multiple threads from changing udp_g_epriv_ports
- * at the same time.
- */
 /* ARGSUSED */
 static int
 udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
@@ -1438,20 +2075,17 @@
 		return (EINVAL);
 	}
 
-	mutex_enter(&udp_g_lock);
 	/* Check that the value is already in the list */
 	for (i = 0; i < udp_g_num_epriv_ports; i++) {
 		if (udp_g_epriv_ports[i] == new_value)
 			break;
 	}
 	if (i == udp_g_num_epriv_ports) {
-		mutex_exit(&udp_g_lock);
 		return (ESRCH);
 	}
 
 	/* Clear the value */
 	udp_g_epriv_ports[i] = 0;
-	mutex_exit(&udp_g_lock);
 	return (0);
 }
 
@@ -1478,8 +2112,8 @@
 	sin6_t	sin6;
 	mblk_t	*mp1;
 	int	error = 0;
-	udp_t	*udp = (udp_t *)q->q_ptr;
 	size_t	mp_size = MBLKL(mp);
+	udp_t	*udp = Q_TO_UDP(q);
 
 	/*
 	 * Assume IP provides aligned packets - otherwise toss
@@ -1495,7 +2129,7 @@
 	 */
 	if (!udp->udp_dgram_errind || mp_size < sizeof (ipha_t)) {
 noticmpv4:
-		putnext(q, mp);
+		putnext(UDP_RD(q), mp);
 		return;
 	}
 
@@ -1590,7 +2224,7 @@
 		break;
 	}
 	if (mp1)
-		putnext(q, mp1);
+		putnext(UDP_RD(q), mp1);
 	freemsg(mp);
 }
 
@@ -1609,7 +2243,6 @@
 static void
 udp_icmp_error_ipv6(queue_t *q, mblk_t *mp)
 {
-	udp_t		*udp = (udp_t *)q->q_ptr;
 	icmp6_t		*icmp6;
 	ip6_t		*ip6h, *outer_ip6h;
 	uint16_t	hdr_length;
@@ -1619,13 +2252,14 @@
 	mblk_t		*mp1;
 	int		error = 0;
 	size_t		mp_size = MBLKL(mp);
+	udp_t		*udp = Q_TO_UDP(q);
 
 	/*
 	 * Verify that we have a complete IP header. If not, send it upstream.
 	 */
 	if (mp_size < sizeof (ip6_t)) {
 noticmpv6:
-		putnext(q, mp);
+		putnext(UDP_RD(q), mp);
 		return;
 	}
 
@@ -1736,7 +2370,7 @@
 		 * message.  Free it, then send our empty message.
 		 */
 		freemsg(mp);
-		putnext(q, newmp);
+		putnext(UDP_RD(q), newmp);
 		return;
 	}
 	case ICMP6_TIME_EXCEEDED:
@@ -1766,7 +2400,7 @@
 	mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0,
 	    error);
 	if (mp1)
-		putnext(q, mp1);
+		putnext(UDP_RD(q), mp1);
 	freemsg(mp);
 }
 
@@ -1780,11 +2414,11 @@
 static void
 udp_addr_req(queue_t *q, mblk_t *mp)
 {
-	udp_t	*udp = (udp_t *)q->q_ptr;
 	sin_t	*sin;
 	sin6_t	*sin6;
 	mblk_t	*ackmp;
 	struct T_addr_ack *taa;
+	udp_t	*udp = Q_TO_UDP(q);
 
 	/* Make it large enough for worst case */
 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
@@ -1894,7 +2528,7 @@
 		}
 	}
 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
-	qreply(q, ackmp);
+	putnext(UDP_RD(q), ackmp);
 }
 
 static void
@@ -1918,9 +2552,9 @@
 static void
 udp_capability_req(queue_t *q, mblk_t *mp)
 {
-	udp_t			*udp = (udp_t *)q->q_ptr;
 	t_uscalar_t		cap_bits1;
 	struct T_capability_ack	*tcap;
+	udp_t	*udp = Q_TO_UDP(q);
 
 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
 
@@ -1937,7 +2571,7 @@
 		tcap->CAP_bits1 |= TC1_INFO;
 	}
 
-	qreply(q, mp);
+	putnext(UDP_RD(q), mp);
 }
 
 /*
@@ -1948,7 +2582,7 @@
 static void
 udp_info_req(queue_t *q, mblk_t *mp)
 {
-	udp_t	*udp = (udp_t *)q->q_ptr;
+	udp_t *udp = Q_TO_UDP(q);
 
 	/* Create a T_INFO_ACK message. */
 	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
@@ -1956,7 +2590,7 @@
 	if (!mp)
 		return;
 	udp_copy_info((struct T_info_ack *)mp->b_rptr, udp);
-	qreply(q, mp);
+	putnext(UDP_RD(q), mp);
 }
 
 /*
@@ -2102,20 +2736,19 @@
  * This is the open routine for udp.  It allocates a udp_t structure for
  * the stream and, on the first open of the module, creates an ND table.
  */
+/* ARGSUSED */
 static int
 udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
 {
 	int	err;
 	udp_t	*udp;
+	conn_t *connp;
+	zoneid_t zoneid = getzoneid();
+	queue_t	*ip_wq;
+	char	*name;
 
 	TRACE_1(TR_FAC_UDP, TR_UDP_OPEN, "udp_open: q %p", q);
 
-	/*
-	 * Defer the qprocson until everything is initialized since
-	 * we are D_MTPERQ and after qprocson the rput routine can
-	 * run.
-	 */
-
 	/* If the stream is already open, return immediately. */
 	if (q->q_ptr != NULL)
 		return (0);
@@ -2124,85 +2757,110 @@
 	if (sflag != MODOPEN)
 		return (EINVAL);
 
+	q->q_hiwat = udp_recv_hiwat;
+	WR(q)->q_hiwat = udp_xmit_hiwat;
+	WR(q)->q_lowat = udp_xmit_lowat;
+
+	/* Insert ourselves in the stream since we're about to walk q_next */
+	qprocson(q);
+
+	udp = kmem_cache_alloc(udp_cache, KM_SLEEP);
+	bzero(udp, sizeof (*udp));
+
 	/*
-	 * Create and initialize a udp_t structure for this stream.
+	 * UDP is supported only as a module and it has to be pushed directly
+	 * above the device instance of IP. If UDP is pushed anywhere else
+	 * on a stream, it will support just T_SVR4_OPTMGMT_REQ for the
+	 * sake of MIB browsers and fail everything else.
 	 */
-	udp = (udp_t *)mi_open_alloc_sleep(sizeof (udp_t));
+	ip_wq = WR(q)->q_next;
+	if (ip_wq->q_next != NULL ||
+	    (name = ip_wq->q_qinfo->qi_minfo->mi_idname) == NULL ||
+	    strcmp(name, IP_MOD_NAME) != 0 ||
+	    ip_wq->q_qinfo->qi_minfo->mi_idnum != IP_MOD_ID) {
+		/* Support just SNMP for MIB browsers */
+		connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP);
+		connp->conn_rq = q;
+		connp->conn_wq = WR(q);
+		connp->conn_flags |= IPCL_UDPMOD;
+		connp->conn_cred = credp;
+		connp->conn_zoneid = zoneid;
+		connp->conn_udp = udp;
+		udp->udp_connp = connp;
+		q->q_ptr = WR(q)->q_ptr = connp;
+		crhold(credp);
+		q->q_qinfo = &udp_snmp_rinit;
+		WR(q)->q_qinfo = &udp_snmp_winit;
+		return (0);
+	}
+
+	/*
+	 * Initialize the udp_t structure for this stream.
+	 */
+	q = RD(ip_wq);
+	connp = Q_TO_CONN(q);
+	mutex_enter(&connp->conn_lock);
+	connp->conn_proto = IPPROTO_UDP;
+	connp->conn_flags |= IPCL_UDP;
+	connp->conn_sqp = IP_SQUEUE_GET(lbolt);
+	connp->conn_udp = udp;
 
 	/* Set the initial state of the stream and the privilege status. */
-	q->q_ptr = WR(q)->q_ptr = udp;
+	udp->udp_connp = connp;
 	udp->udp_state = TS_UNBND;
+	udp->udp_mode = UDP_MT_HOT;
 	if (getmajor(*devp) == (major_t)UDP6_MAJ) {
 		udp->udp_family = AF_INET6;
 		udp->udp_ipversion = IPV6_VERSION;
 		udp->udp_max_hdr_len = IPV6_HDR_LEN + UDPH_SIZE;
 		udp->udp_ttl = udp_ipv6_hoplimit;
+		connp->conn_af_isv6 = B_TRUE;
+		connp->conn_flags |= IPCL_ISV6;
 	} else {
 		udp->udp_family = AF_INET;
 		udp->udp_ipversion = IPV4_VERSION;
 		udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE;
 		udp->udp_ttl = udp_ipv4_ttl;
-	}
-
-	/*
-	 * The receive hiwat is only looked at on the stream head queue.
-	 * Store in q_hiwat in order to return on SO_RCVBUF getsockopts.
-	 */
-	q->q_hiwat = udp_recv_hiwat;
+		connp->conn_af_isv6 = B_FALSE;
+		connp->conn_flags &= ~IPCL_ISV6;
+	}
 
 	udp->udp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
-	udp->udp_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
-	udp->udp_credp = credp;
-	crhold(credp);
-
-	udp->udp_zoneid = getzoneid();
-
-	/*
-	 * Acquire the lock and link it into the list of open streams.
-	 */
-	mutex_enter(&udp_g_lock);
-	err = mi_open_link(&udp_g_head, (IDP)udp, devp, flag, sflag, credp);
-	mutex_exit(&udp_g_lock);
-	if (err != 0)
-		goto error;
-
-	qprocson(q);
+	connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
+	connp->conn_zoneid = zoneid;
+
+	if (connp->conn_flags & IPCL_SOCKET) {
+		udp->udp_issocket = B_TRUE;
+		udp->udp_direct_sockfs = B_TRUE;
+	}
+	mutex_exit(&connp->conn_lock);
 
 	/*
 	 * The transmit hiwat/lowat is only looked at on IP's queue.
-	 * Store in q_hiwat in order to return on SO_SNDBUF
+	 * Store in q_hiwat in order to return on SO_SNDBUF/SO_RCVBUF
 	 * getsockopts.
 	 */
+	q->q_hiwat = udp_recv_hiwat;
 	WR(q)->q_hiwat = udp_xmit_hiwat;
-	WR(q)->q_next->q_hiwat = WR(q)->q_hiwat;
 	WR(q)->q_lowat = udp_xmit_lowat;
-	WR(q)->q_next->q_lowat = WR(q)->q_lowat;
 
 	if (udp->udp_family == AF_INET6) {
 		/* Build initial header template for transmit */
 		if ((err = udp_build_hdrs(q, udp)) != 0) {
-			qprocsoff(q);
-			/*
-			 * Unlink the udp structure and release
-			 * the minor device number.
-			 */
-			mutex_enter(&udp_g_lock);
-			mi_close_unlink(&udp_g_head, (IDP)udp);
-			mutex_exit(&udp_g_lock);
-			goto error;
-		}
-	}
-
-	/* Set the Stream head write offset. */
-	(void) mi_set_sth_wroff(q, udp->udp_max_hdr_len + udp_wroff_extra);
-	(void) mi_set_sth_hiwat(q, q->q_hiwat);
+			qprocsoff(UDP_RD(q));
+			udp->udp_connp = NULL;
+			connp->conn_udp = NULL;
+			kmem_cache_free(udp_cache, udp);
+			return (err);
+		}
+	}
+
+	/* Set the Stream head write offset and high watermark. */
+	(void) mi_set_sth_wroff(UDP_RD(q),
+	    udp->udp_max_hdr_len + udp_wroff_extra);
+	(void) mi_set_sth_hiwat(UDP_RD(q), udp_set_rcv_hiwat(udp, q->q_hiwat));
+
 	return (0);
-
-error:
-	q->q_ptr = WR(q)->q_ptr = NULL;
-	crfree(credp);
-	mi_close_free((IDP)udp);
-	return (err);
 }
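
[Editor's note] The placement test in udp_open above is written as an OR-of-negations guarding the SNMP-only fallback, which takes a moment to parse. Restated positively as a sketch (the helper name is invented; the logic mirrors the hunk above):

	/*
	 * Sketch: UDP is "directly above IP" when the next write queue
	 * is the last on the stream and identifies itself as the IP
	 * driver by both module name and id.  Hypothetical helper, not
	 * part of this changeset.
	 */
	static boolean_t
	udp_directly_above_ip(queue_t *wq)
	{
		queue_t *ip_wq = wq->q_next;
		struct module_info *mi = ip_wq->q_qinfo->qi_minfo;

		return (ip_wq->q_next == NULL && mi->mi_idname != NULL &&
		    strcmp(mi->mi_idname, IP_MOD_NAME) == 0 &&
		    mi->mi_idnum == IP_MOD_ID);
	}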
 
 /*
@@ -2212,7 +2870,6 @@
 static boolean_t
 udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
 {
-
 	return (B_TRUE);
 }
 
@@ -2255,15 +2912,22 @@
 }
 
 /*
- * This routine retrieves the current status of socket options.
- * It returns the size of the option retrieved.
+ * This routine retrieves the current status of socket options
+ * and expects the caller to pass in the queue pointer of the
+ * upper instance.  It returns the size of the option retrieved.
  */
 int
 udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
 {
 	int	*i1 = (int *)ptr;
-	udp_t	*udp = (udp_t *)q->q_ptr;
-	ip6_pkt_t	*ipp = &udp->udp_sticky_ipp;
+	conn_t	*connp;
+	udp_t	*udp;
+	ip6_pkt_t *ipp;
+
+	q = UDP_WR(q);
+	connp = Q_TO_CONN(q);
+	udp = connp->conn_udp;
+	ipp = &udp->udp_sticky_ipp;
 
 	switch (level) {
 	case SOL_SOCKET:
@@ -2333,7 +2997,7 @@
 			*(uchar_t *)ptr = udp->udp_multicast_ttl;
 			return (sizeof (uchar_t));
 		case IP_MULTICAST_LOOP:
-			*ptr = udp->udp_multicast_loop;
+			*ptr = connp->conn_multicast_loop;
 			return (sizeof (uint8_t));
 		case IP_RECVOPTS:
 			*i1 = udp->udp_recvopts;
@@ -2394,7 +3058,7 @@
 			*i1 = udp->udp_multicast_ttl;
 			break;	/* goto sizeof (int) option return */
 		case IPV6_MULTICAST_LOOP:
-			*i1 = udp->udp_multicast_loop;
+			*i1 = connp->conn_multicast_loop;
 			break;	/* goto sizeof (int) option return */
 		case IPV6_JOIN_GROUP:
 		case IPV6_LEAVE_GROUP:
@@ -2520,18 +3184,26 @@
 	return (sizeof (int));
 }
 
-/* This routine sets socket options. */
+/*
+ * This routine sets socket options; it expects the caller
+ * to pass in the queue pointer of the upper instance.
+ */
 /* ARGSUSED */
 int
 udp_opt_set(queue_t *q, uint_t optset_context, int level,
     int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
     uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
 {
-	udp_t	*udp = (udp_t *)q->q_ptr;
 	int	*i1 = (int *)invalp;
 	boolean_t onoff = (*i1 == 0) ? 0 : 1;
 	boolean_t checkonly;
 	int	error;
+	conn_t	*connp;
+	udp_t	*udp;
+
+	q = UDP_WR(q);
+	connp = Q_TO_CONN(q);
+	udp = connp->conn_udp;
 
 	switch (optset_context) {
 	case SETFN_OPTCOM_CHECKONLY:
@@ -2619,7 +3291,7 @@
 			}
 			if (!checkonly) {
 				q->q_hiwat = *i1;
-				q->q_next->q_hiwat = *i1;
+				WR(UDP_RD(q))->q_hiwat = *i1;
 			}
 			break;
 		case SO_RCVBUF:
@@ -2629,7 +3301,9 @@
 			}
 			if (!checkonly) {
 				RD(q)->q_hiwat = *i1;
-				(void) mi_set_sth_hiwat(RD(q), *i1);
+				UDP_RD(q)->q_hiwat = *i1;
+				(void) mi_set_sth_hiwat(UDP_RD(q),
+				    udp_set_rcv_hiwat(udp, *i1));
 			}
 			break;
 		case SO_DGRAM_ERRIND:
@@ -2709,7 +3383,7 @@
 			break;
 		case IP_MULTICAST_LOOP:
 			if (!checkonly)
-				udp->udp_multicast_loop = *invalp;
+				connp->conn_multicast_loop = *invalp;
 			break;
 		case IP_RECVOPTS:
 			if (!checkonly)
@@ -2847,7 +3521,7 @@
 				return (EINVAL);
 			}
 			if (!checkonly)
-				udp->udp_multicast_loop = *i1;
+				connp->conn_multicast_loop = *i1;
 			break;
 		case IPV6_JOIN_GROUP:
 		case IPV6_LEAVE_GROUP:
@@ -3093,6 +3767,7 @@
 					ipp->ipp_rtdstopts = NULL;
 					ipp->ipp_rtdstoptslen = 0;
 				}
+
 				ipp->ipp_fields &= ~IPPF_RTDSTOPTS;
 				ipp->ipp_sticky_ignored |= IPPF_RTDSTOPTS;
 			} else {
@@ -3447,12 +4122,13 @@
 }
 
 static void
-udp_rput(queue_t *q, mblk_t *mp)
+udp_input(conn_t *connp, mblk_t *mp)
 {
 	struct T_unitdata_ind	*tudi;
-	uchar_t			*rptr;
-	int			hdr_length;
+	uchar_t			*rptr;		/* Pointer to IP header */
+	int			hdr_length;	/* Length of IP+UDP headers */
 	int			udi_size;	/* Size of T_unitdata_ind */
+	int			mp_len;
 	udp_t			*udp;
 	udpha_t			*udpha;
 	int			ipversion;
@@ -3462,104 +4138,56 @@
 	mblk_t			*mp1;
 	mblk_t			*options_mp = NULL;
 	in_pktinfo_t		*pinfo = NULL;
-	size_t			mp_size = MBLKL(mp);
 	cred_t			*cr = NULL;
+	queue_t			*q = connp->conn_rq;
 	pid_t			cpid;
 
 	TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_START,
 	    "udp_rput_start: q %p mp %p", q, mp);
 
-	udp = (udp_t *)q->q_ptr;
+	udp = connp->conn_udp;
 	rptr = mp->b_rptr;
-
-	switch (mp->b_datap->db_type) {
-	case M_DATA:
-		/*
-		 * M_DATA messages contain IP datagrams.  They are handled
-		 * after this switch.
-		 */
-		break;
-	case M_PROTO:
-	case M_PCPROTO:
-		/* M_PROTO messages contain some type of TPI message. */
-		if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) {
-			freemsg(mp);
-			TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
-				"udp_rput_end: q %p (%S)", q, "protoshort");
-			return;
-		}
-		qwriter(q, mp, udp_rput_other, PERIM_INNER);
-		TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
-			"udp_rput_end: q %p (%S)", q, "proto");
-		return;
-	case M_FLUSH:
-		if (*mp->b_rptr & FLUSHR)
-			flushq(q, FLUSHDATA);
-		putnext(q, mp);
-		TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
-			"udp_rput_end: q %p (%S)", q, "flush");
-		return;
-	case M_CTL:
-		if (udp->udp_recvif || udp->udp_recvslla ||
-		    udp->udp_ipv6_recvpktinfo) {
-			/*
-			 * IP should have prepended the options data in an M_CTL
-			 * Check M_CTL "type" to make sure are not here bcos of
-			 * a valid ICMP message
-			 */
-			if (mp_size == sizeof (in_pktinfo_t) &&
-			    ((in_pktinfo_t *)mp->b_rptr)->in_pkt_ulp_type ==
-			    IN_PKTINFO) {
-				pinfo = (in_pktinfo_t *)mp->b_rptr;
-				/*
-				 * Jump to normal data processing, this is not
-				 * an ICMP message
-				 */
-				break;
-			}
-		}
-		/*
-		 * ICMP messages.
-		 */
-		udp_icmp_error(q, mp);
-		TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
-			"udp_rput_end: q %p (%S)", q, "m_ctl");
-		return;
-	default:
-		putnext(q, mp);
-		TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
-			"udp_rput_end: q %p (%S)", q, "default");
-		return;
-	}
+	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL);
+	ASSERT(OK_32PTR(rptr));
 
 	/*
-	 * If we are here bcos the IP_RECVIF or IP_RECVSLLA then we need to
-	 * extract the mblk and adjust the rptr
+	 * IP should have prepended the options data in an M_CTL.
+	 * Check the M_CTL "type" to make sure we are not here
+	 * because of a valid ICMP message.
 	 */
-	if (pinfo != NULL) {
-		ASSERT(mp->b_datap->db_type == M_CTL);
-		options_mp = mp;
-		mp = mp->b_cont;
-		rptr = mp->b_rptr;
-		mp_size = MBLKL(mp);
-	}
+	if (DB_TYPE(mp) == M_CTL) {
+		if (MBLKL(mp) == sizeof (in_pktinfo_t) &&
+		    ((in_pktinfo_t *)mp->b_rptr)->in_pkt_ulp_type ==
+		    IN_PKTINFO) {
+			/*
+			 * IP_RECVIF or IP_RECVSLLA information has been
+			 * prepended to the packet by IP; extract the
+			 * data mblk and adjust the rptr accordingly.
+			 */
+			pinfo = (in_pktinfo_t *)mp->b_rptr;
+			options_mp = mp;
+			mp = mp->b_cont;
+			rptr = mp->b_rptr;
+			UDP_STAT(udp_in_pktinfo);
+		} else {
+			/*
+			 * ICMP messages.
+			 */
+			udp_icmp_error(q, mp);
+			TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
+				"udp_rput_end: q %p (%S)", q, "m_ctl");
+			return;
+		}
+	}
+
+	mp_len = msgdsize(mp);
 	/*
 	 * This is the inbound data path.
 	 * First, we check to make sure the IP version number is correct,
 	 * and then pull the IP and UDP headers into the first mblk.
-	 */
-	/*
 	 * Assume IP provides aligned packets - otherwise toss.
 	 * Also, check if we have a complete IP header.
 	 */
-	if (!OK_32PTR(rptr) || (mp_size < sizeof (ipha_t))) {
-tossit:
-		freemsg(mp);
-		if (options_mp != NULL)
-			freeb(options_mp);
-		BUMP_MIB(&udp_mib, udpInErrors);
-		return;
-	}
 
 	/* Initialize regardless if ipversion is IPv4 or IPv6 */
 	ipp.ipp_fields = 0;
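
[Editor's note] For reference, the message shape the M_CTL unwrapping a few lines up expects is an in_pktinfo_t control block with the datagram chained off b_cont. A hypothetical sketch of how such a message could be composed -- this is not how ip.c actually builds it, but the field names are exactly the ones udp_input() consumes:

	static mblk_t *
	udp_wrap_pktinfo_sketch(mblk_t *data_mp, uint_t ifindex)
	{
		mblk_t *ctl_mp;
		in_pktinfo_t *pinfo;

		if ((ctl_mp = allocb(sizeof (in_pktinfo_t), BPRI_MED)) == NULL)
			return (data_mp);	/* degrade: no ancillary data */
		DB_TYPE(ctl_mp) = M_CTL;
		pinfo = (in_pktinfo_t *)ctl_mp->b_rptr;
		pinfo->in_pkt_ulp_type = IN_PKTINFO;	/* checked by udp_input */
		pinfo->in_pkt_flags = IPF_RECVIF;
		pinfo->in_pkt_ifindex = ifindex;
		ctl_mp->b_wptr += sizeof (in_pktinfo_t);
		ctl_mp->b_cont = data_mp;	/* IP+UDP datagram follows */
		return (ctl_mp);
	}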
@@ -3567,10 +4195,9 @@
 	ipversion = IPH_HDR_VERSION(rptr);
 	switch (ipversion) {
 	case IPV4_VERSION:
+		ASSERT(MBLKL(mp) >= sizeof (ipha_t));
+		ASSERT(((ipha_t *)rptr)->ipha_protocol == IPPROTO_UDP);
 		hdr_length = IPH_HDR_LENGTH(rptr) + UDPH_SIZE;
-		/* Verify this is a UDP packet */
-		if (((ipha_t *)rptr)->ipha_protocol != IPPROTO_UDP)
-			goto tossit;
 		if ((hdr_length > IP_SIMPLE_HDR_LENGTH + UDPH_SIZE) ||
 		    (udp->udp_ip_rcv_options_len)) {
 			/*
@@ -3587,7 +4214,7 @@
 			 * the packet.
 			 */
 			udpha = (udpha_t *)(rptr + (hdr_length - UDPH_SIZE));
-			if (msgdsize(mp) != (ntohs(udpha->uha_length) +
+			if (mp_len != (ntohs(udpha->uha_length) +
 			    hdr_length - UDPH_SIZE)) {
 				goto tossit;
 			}
@@ -3597,14 +4224,16 @@
 			 */
 			if (pinfo != NULL)
 				mp = options_mp;
-			qwriter(q, mp, udp_rput_other, PERIM_INNER);
+			udp_become_writer(connp, mp, udp_rput_other_wrapper,
+			    SQTAG_UDP_INPUT);
 			TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
 				"udp_rput_end: q %p (%S)", q, "end");
 			return;
 		}
 
 		/* Handle IPV6_RECVHOPLIMIT. */
-		if ((udp->udp_family == AF_INET6) && (pinfo != NULL)) {
+		if ((udp->udp_family == AF_INET6) && (pinfo != NULL) &&
+		    udp->udp_ipv6_recvpktinfo) {
 			if (pinfo->in_pkt_flags & IPF_RECVIF) {
 				ipp.ipp_fields |= IPPF_IFINDEX;
 				ipp.ipp_ifindex = pinfo->in_pkt_ifindex;
@@ -3620,8 +4249,7 @@
 		ASSERT(udp->udp_family == AF_INET6);
 
 		ip6h = (ip6_t *)rptr;
-		if ((uchar_t *)&ip6h[1] > mp->b_wptr)
-			goto tossit;
+		ASSERT((uchar_t *)&ip6h[1] <= mp->b_wptr);
 
 		if (ip6h->ip6_nxt != IPPROTO_UDP) {
 			uint8_t nexthdrp;
@@ -3647,6 +4275,7 @@
 				if (MBLKL(mp) < (IPV6_HDR_LEN + UDPH_SIZE))
 					goto tossit;
 				ip6h = (ip6_t *)rptr;
+				mp_len = msgdsize(mp);
 			}
 			/*
 			 * Find any potentially interesting extension headers
@@ -3655,18 +4284,14 @@
 			 */
 			hdr_length = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdrp) +
 			    UDPH_SIZE;
-			/* Verify this is a UDP packet */
-			if (nexthdrp != IPPROTO_UDP)
-				goto tossit;
+			ASSERT(nexthdrp == IPPROTO_UDP);
 		} else {
 			hdr_length = IPV6_HDR_LEN + UDPH_SIZE;
 			ip6i = NULL;
 		}
 		break;
 	default:
-		TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
-			"udp_rput_end: q %p (%S)", q, "Unknown IP version");
-		goto tossit;
+		ASSERT(0);
 	}
 
 	/*
@@ -3677,14 +4302,15 @@
 	 */
 	udpha = (udpha_t *)(rptr + (hdr_length - UDPH_SIZE));
 	if ((MBLKL(mp) < hdr_length) ||
-	    (msgdsize(mp) != (ntohs(udpha->uha_length) +
-	    hdr_length - UDPH_SIZE))) {
+	    (mp_len != (ntohs(udpha->uha_length) + hdr_length - UDPH_SIZE))) {
 		goto tossit;
 	}
 
 	/* Walk past the headers. */
-	if (!udp->udp_rcvhdr)
+	if (!udp->udp_rcvhdr) {
 		mp->b_rptr = rptr + hdr_length;
+		mp_len -= hdr_length;
+	}
 
 	/*
 	 * This is the inbound data path.  Packets are passed upstream as
@@ -3706,6 +4332,7 @@
 		if (udp->udp_recvdstaddr) {
 			udi_size += sizeof (struct T_opthdr) +
 			    sizeof (struct in_addr);
+			UDP_STAT(udp_in_recvdstaddr);
 		}
 
 		/*
@@ -3714,25 +4341,28 @@
 		 */
 		if (udp->udp_recvif && (pinfo != NULL) &&
 		    (pinfo->in_pkt_flags & IPF_RECVIF)) {
-			udi_size += sizeof (struct T_opthdr) +
-				sizeof (uint_t);
+			udi_size += sizeof (struct T_opthdr) + sizeof (uint_t);
+			UDP_STAT(udp_in_recvif);
 		}
 
 		if (udp->udp_recvslla && (pinfo != NULL) &&
 		    (pinfo->in_pkt_flags & IPF_RECVSLLA)) {
 			udi_size += sizeof (struct T_opthdr) +
-				sizeof (struct sockaddr_dl);
+			    sizeof (struct sockaddr_dl);
+			UDP_STAT(udp_in_recvslla);
 		}
 
 		if (udp->udp_recvucred && (cr = DB_CRED(mp)) != NULL) {
 			udi_size += sizeof (struct T_opthdr) + ucredsize;
 			cpid = DB_CPID(mp);
+			UDP_STAT(udp_in_recvucred);
 		}
 		/*
 		 * If IP_RECVTTL is set allocate the appropriate sized buffer
 		 */
 		if (udp->udp_recvttl) {
 			udi_size += sizeof (struct T_opthdr) + sizeof (uint8_t);
+			UDP_STAT(udp_in_recvttl);
 		}
 
 		ASSERT(IPH_HDR_LENGTH((ipha_t *)rptr) == IP_SIMPLE_HDR_LENGTH);
@@ -3889,12 +4519,14 @@
 			    (ipp.ipp_fields & IPPF_HOPOPTS)) {
 				udi_size += sizeof (struct T_opthdr) +
 				    ipp.ipp_hopoptslen;
+				UDP_STAT(udp_in_recvhopopts);
 			}
 			if ((udp->udp_ipv6_recvdstopts ||
 				udp->udp_old_ipv6_recvdstopts) &&
 			    (ipp.ipp_fields & IPPF_DSTOPTS)) {
 				udi_size += sizeof (struct T_opthdr) +
 				    ipp.ipp_dstoptslen;
+				UDP_STAT(udp_in_recvdstopts);
 			}
 			if (((udp->udp_ipv6_recvdstopts &&
 			    udp->udp_ipv6_recvrthdr &&
@@ -3903,29 +4535,37 @@
 			    (ipp.ipp_fields & IPPF_RTDSTOPTS)) {
 				udi_size += sizeof (struct T_opthdr) +
 				    ipp.ipp_rtdstoptslen;
+				UDP_STAT(udp_in_recvrtdstopts);
 			}
 			if (udp->udp_ipv6_recvrthdr &&
 			    (ipp.ipp_fields & IPPF_RTHDR)) {
 				udi_size += sizeof (struct T_opthdr) +
 				    ipp.ipp_rthdrlen;
+				UDP_STAT(udp_in_recvrthdr);
 			}
 			if (udp->udp_ipv6_recvpktinfo &&
 			    (ipp.ipp_fields & IPPF_IFINDEX)) {
 				udi_size += sizeof (struct T_opthdr) +
 				    sizeof (struct in6_pktinfo);
+				UDP_STAT(udp_in_recvpktinfo);
 			}
 
 		}
 		if (udp->udp_recvucred && (cr = DB_CRED(mp)) != NULL) {
 			udi_size += sizeof (struct T_opthdr) + ucredsize;
 			cpid = DB_CPID(mp);
-		}
-
-		if (udp->udp_ipv6_recvhoplimit)
+			UDP_STAT(udp_in_recvucred);
+		}
+
+		if (udp->udp_ipv6_recvhoplimit) {
 			udi_size += sizeof (struct T_opthdr) + sizeof (int);
-
-		if (udp->udp_ipv6_recvtclass)
+			UDP_STAT(udp_in_recvhoplimit);
+		}
+
+		if (udp->udp_ipv6_recvtclass) {
 			udi_size += sizeof (struct T_opthdr) + sizeof (int);
+			UDP_STAT(udp_in_recvtclass);
+		}
 
 		mp1 = allocb(udi_size, BPRI_MED);
 		if (mp1 == NULL) {
@@ -3960,7 +4600,7 @@
 			sin6->sin6_flowinfo = 0;
 			sin6->sin6_scope_id = 0;
 			sin6->__sin6_src_id = ip_srcid_find_addr(&v6dst,
-			    udp->udp_zoneid);
+			    connp->conn_zoneid);
 		} else {
 			sin6->sin6_addr = ip6h->ip6_src;
 			/* No sin6_flowinfo per API */
@@ -3971,8 +4611,8 @@
 				sin6->sin6_scope_id = ipp.ipp_ifindex;
 			else
 				sin6->sin6_scope_id = 0;
-			sin6->__sin6_src_id =
-			    ip_srcid_find_addr(&ip6h->ip6_dst, udp->udp_zoneid);
+			sin6->__sin6_src_id = ip_srcid_find_addr(
+			    &ip6h->ip6_dst, connp->conn_zoneid);
 		}
 		sin6->sin6_port = udpha->uha_src_port;
 		sin6->sin6_family = udp->udp_family;
@@ -4133,7 +4773,45 @@
 		"udp_rput_end: q %p (%S)", q, "end");
 	if (options_mp != NULL)
 		freeb(options_mp);
-	putnext(q, mp);
+
+	if (udp->udp_direct_sockfs) {
+		/*
+		 * There is nothing above us except for the stream head;
+		 * use the read-side synchronous stream interface in
+	 * order to reduce the time spent in the interrupt thread.
+		 */
+		ASSERT(udp->udp_issocket);
+		udp_rcv_enqueue(UDP_RD(q), udp, mp, mp_len);
+	} else {
+		/*
+		 * Use regular STREAMS interface to pass data upstream
+		 * if this is not a socket endpoint, or if we have
+		 * switched over to the slow mode due to sockmod being
+		 * popped or a module being pushed on top of us.
+		 */
+		putnext(UDP_RD(q), mp);
+	}
+	return;
+
+tossit:
+	freemsg(mp);
+	if (options_mp != NULL)
+		freeb(options_mp);
+	BUMP_MIB(&udp_mib, udpInErrors);
+}
+
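+
[Editor's note] udp_rcv_enqueue(), used on the direct-sockfs path above, is defined elsewhere in this changeset. Conceptually it is a tail-insert onto a per-endpoint receive list with byte accounting, so the synchronous-stream read side can drain datagrams without a STREAMS service routine. A sketch under that assumption -- the lock and field names here are illustrative, not the actual udp_t layout:

	static void
	udp_rcv_enqueue_sketch(queue_t *q, udp_t *udp, mblk_t *mp, uint_t pkt_len)
	{
		mutex_enter(&udp->udp_recv_lock);	/* illustrative lock */
		if (udp->udp_rcv_list_tail == NULL)
			udp->udp_rcv_list_head = mp;
		else
			udp->udp_rcv_list_tail->b_next = mp;
		udp->udp_rcv_list_tail = mp;
		udp->udp_rcv_cnt += pkt_len;		/* bytes now queued */
		mutex_exit(&udp->udp_recv_lock);
		/* wake any thread blocked in the synchronous-stream read */
	}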
+void
+udp_conn_recv(conn_t *connp, mblk_t *mp)
+{
+	_UDP_ENTER(connp, mp, udp_input_wrapper, SQTAG_UDP_FANOUT);
+}
+
+/* ARGSUSED */
+static void
+udp_input_wrapper(void *arg, mblk_t *mp, void *arg2)
+{
+	udp_input((conn_t *)arg, mp);
+	_UDP_EXIT((conn_t *)arg);
 }
 
 /*
@@ -4152,18 +4830,17 @@
 	int			opt_len;	/* Length of IP options */
 	sin_t			*sin;
 	struct T_error_ack	*tea;
-	udp_t			*udp;
 	mblk_t			*options_mp = NULL;
 	in_pktinfo_t		*pinfo;
 	boolean_t		recv_on = B_FALSE;
 	cred_t			*cr = NULL;
+	udp_t			*udp = Q_TO_UDP(q);
 	pid_t			cpid;
 
 	TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_START,
 	    "udp_rput_other: q %p mp %p", q, mp);
 
 	ASSERT(OK_32PTR(mp->b_rptr));
-	udp = (udp_t *)q->q_ptr;
 	rptr = mp->b_rptr;
 
 	switch (mp->b_datap->db_type) {
@@ -4258,7 +4935,7 @@
 			freemsg(mp);
 			return;
 		}
-		putnext(q, mp);
+		putnext(UDP_RD(q), mp);
 		return;
 	}
 
@@ -4323,9 +5000,12 @@
 	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
 	if (udp->udp_recvdstaddr) {
 		udi_size += sizeof (struct T_opthdr) + sizeof (struct in_addr);
-	}
-	if (udp->udp_recvopts && opt_len > 0)
+		UDP_STAT(udp_in_recvdstaddr);
+	}
+	if (udp->udp_recvopts && opt_len > 0) {
 		udi_size += sizeof (struct T_opthdr) + opt_len;
+		UDP_STAT(udp_in_recvopts);
+	}
 
 	/*
 	 * If the IP_RECVSLLA or the IP_RECVIF is set then allocate
@@ -4333,25 +5013,28 @@
 	 */
 	if (udp->udp_recvif && recv_on &&
 	    (pinfo->in_pkt_flags & IPF_RECVIF)) {
-		udi_size += sizeof (struct T_opthdr) +
-		    sizeof (uint_t);
+		udi_size += sizeof (struct T_opthdr) + sizeof (uint_t);
+		UDP_STAT(udp_in_recvif);
 	}
 
 	if (udp->udp_recvslla && recv_on &&
 	    (pinfo->in_pkt_flags & IPF_RECVSLLA)) {
 		udi_size += sizeof (struct T_opthdr) +
 		    sizeof (struct sockaddr_dl);
+		UDP_STAT(udp_in_recvslla);
 	}
 
 	if (udp->udp_recvucred && (cr = DB_CRED(mp)) != NULL) {
 		udi_size += sizeof (struct T_opthdr) + ucredsize;
 		cpid = DB_CPID(mp);
+		UDP_STAT(udp_in_recvucred);
 	}
 	/*
 	 * If IP_RECVTTL is set allocate the appropriate sized buffer
 	 */
 	if (udp->udp_recvttl) {
 		udi_size += sizeof (struct T_opthdr) + sizeof (uint8_t);
+		UDP_STAT(udp_in_recvttl);
 	}
 
 	/* Allocate a message block for the T_UNITDATA_IND structure. */
@@ -4502,7 +5185,34 @@
 	    "udp_rput_other_end: q %p (%S)", q, "end");
 	if (options_mp != NULL)
 		freeb(options_mp);
-	putnext(q, mp);
+
+	if (udp->udp_direct_sockfs) {
+		/*
+		 * There is nothing above us except for the stream head;
+		 * use the read-side synchronous stream interface in
+	 * order to reduce the time spent in the interrupt thread.
+		 */
+		ASSERT(udp->udp_issocket);
+		udp_rcv_enqueue(UDP_RD(q), udp, mp, msgdsize(mp));
+	} else {
+		/*
+		 * Use regular STREAMS interface to pass data upstream
+		 * if this is not a socket endpoint, or if we have
+		 * switched over to the slow mode due to sockmod being
+		 * popped or a module being pushed on top of us.
+		 */
+		putnext(UDP_RD(q), mp);
+	}
+}
+
+/* ARGSUSED */
+static void
+udp_rput_other_wrapper(void *arg, mblk_t *mp, void *arg2)
+{
+	conn_t *connp = arg;
+
+	udp_rput_other(connp->conn_rq, mp);
+	udp_exit(connp);
 }
 
 /*
@@ -4511,7 +5221,7 @@
 static void
 udp_rput_bind_ack(queue_t *q, mblk_t *mp)
 {
-	udp_t	*udp = (udp_t *)q->q_ptr;
+	udp_t	*udp = Q_TO_UDP(q);
 	mblk_t	*mp1;
 	ire_t	*ire;
 	struct T_bind_ack *tba;
@@ -4602,20 +5312,20 @@
 		while (mp != NULL) {
 			mp1 = mp->b_cont;
 			mp->b_cont = NULL;
-			putnext(q, mp);
+			putnext(UDP_RD(q), mp);
 			mp = mp1;
 		}
 		return;
 	}
 	freemsg(mp->b_cont);
 	mp->b_cont = NULL;
-	putnext(q, mp);
+	putnext(UDP_RD(q), mp);
 }
 
 /*
  * return SNMP stuff in buffer in mpdata
  */
-static int
+int
 udp_snmp_get(queue_t *q, mblk_t *mpctl)
 {
 	mblk_t			*mpdata;
@@ -4626,12 +5336,14 @@
 	mblk_t			*mp_conn_tail = NULL;
 	mblk_t			*mp6_conn_tail = NULL;
 	struct opthdr		*optp;
-	IDP			idp;
-	udp_t			*udp;
 	mib2_udpEntry_t		ude;
 	mib2_udp6Entry_t	ude6;
 	int			state;
 	zoneid_t		zoneid;
+	int			i;
+	connf_t			*connfp;
+	conn_t			*connp = Q_TO_CONN(q);
+	udp_t			*udp = connp->conn_udp;
 
 	if (mpctl == NULL ||
 	    (mpdata = mpctl->b_cont) == NULL ||
@@ -4644,8 +5356,7 @@
 	mp_conn_data = mp_conn_ctl->b_cont;
 	mp6_conn_data = mp6_conn_ctl->b_cont;
 
-	udp = (udp_t *)q->q_ptr;
-	zoneid = udp->udp_zoneid;
+	zoneid = connp->conn_zoneid;
 
 	/* fixed length structure for IPv4 and IPv6 counters */
 	SET_MIB(udp_mib.udpEntrySize, sizeof (mib2_udpEntry_t));
@@ -4657,76 +5368,88 @@
 	optp->len = msgdsize(mpdata);
 	qreply(q, mpctl);
 
-	mutex_enter(&udp_g_lock);
-	for (idp = mi_first_ptr(&udp_g_head);
-	    (udp = (udp_t *)idp) != 0;
-	    idp = mi_next_ptr(&udp_g_head, idp)) {
-
-		if (zoneid != udp->udp_zoneid)
-			continue;
-
-		/* Note that the port numbers are sent in host byte order */
-
-		if (udp->udp_state == TS_UNBND)
-			state = MIB2_UDP_unbound;
-		else if (udp->udp_state == TS_IDLE)
-			state = MIB2_UDP_idle;
-		else if (udp->udp_state == TS_DATA_XFER)
-			state = MIB2_UDP_connected;
-		else
-			state = MIB2_UDP_unknown;
-
-		/*
-		 * Create an IPv4 table entry for IPv4 entries and also
-		 * any IPv6 entries which are bound to in6addr_any
-		 * (i.e. anything a IPv4 peer could connect/send to).
-		 */
-		if (udp->udp_ipversion == IPV4_VERSION ||
-		    (udp->udp_state <= TS_IDLE &&
-		    IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src))) {
-			ude.udpEntryInfo.ue_state = state;
-			/* If in6addr_any this will set it to INADDR_ANY */
-			ude.udpLocalAddress = V4_PART_OF_V6(udp->udp_v6src);
-			ude.udpLocalPort = ntohs(udp->udp_port);
-			if (udp->udp_state == TS_DATA_XFER) {
+	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
+		connfp = &ipcl_globalhash_fanout[i];
+		connp = NULL;
+
+		while ((connp = ipcl_get_next_conn(connfp, connp,
+		    IPCL_UDP))) {
+			udp = connp->conn_udp;
+			if (zoneid != connp->conn_zoneid)
+				continue;
+
+			/*
+			 * Note that the port numbers are sent in
+			 * host byte order
+			 */
+
+			if (udp->udp_state == TS_UNBND)
+				state = MIB2_UDP_unbound;
+			else if (udp->udp_state == TS_IDLE)
+				state = MIB2_UDP_idle;
+			else if (udp->udp_state == TS_DATA_XFER)
+				state = MIB2_UDP_connected;
+			else
+				state = MIB2_UDP_unknown;
+
+			/*
+			 * Create an IPv4 table entry for IPv4 entries and also
+			 * any IPv6 entries which are bound to in6addr_any
+			 * (i.e. anything an IPv4 peer could connect/send to).
+			 */
+			if (udp->udp_ipversion == IPV4_VERSION ||
+			    (udp->udp_state <= TS_IDLE &&
+			    IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src))) {
+				ude.udpEntryInfo.ue_state = state;
 				/*
-				 * Can potentially get here for v6 socket
-				 * if another process (say, ping) has just
-				 * done a sendto(), changing the state
-				 * from the TS_IDLE above to TS_DATA_XFER
-				 * by the time we hit this part of the code.
+				 * If in6addr_any this will set it to
+				 * INADDR_ANY
 				 */
-				ude.udpEntryInfo.ue_RemoteAddress =
-				    V4_PART_OF_V6(udp->udp_v6dst);
-				ude.udpEntryInfo.ue_RemotePort =
-				    ntohs(udp->udp_dstport);
-			} else {
-				ude.udpEntryInfo.ue_RemoteAddress = 0;
-				ude.udpEntryInfo.ue_RemotePort = 0;
+				ude.udpLocalAddress =
+				    V4_PART_OF_V6(udp->udp_v6src);
+				ude.udpLocalPort = ntohs(udp->udp_port);
+				if (udp->udp_state == TS_DATA_XFER) {
+					/*
+					 * Can potentially get here for
+					 * v6 socket if another process
+					 * (say, ping) has just done a
+					 * sendto(), changing the state
+					 * from the TS_IDLE above to
+					 * TS_DATA_XFER by the time we hit
+					 * this part of the code.
+					 */
+					ude.udpEntryInfo.ue_RemoteAddress =
+					    V4_PART_OF_V6(udp->udp_v6dst);
+					ude.udpEntryInfo.ue_RemotePort =
+					    ntohs(udp->udp_dstport);
+				} else {
+					ude.udpEntryInfo.ue_RemoteAddress = 0;
+					ude.udpEntryInfo.ue_RemotePort = 0;
+				}
+				(void) snmp_append_data2(mp_conn_data,
+				    &mp_conn_tail, (char *)&ude, sizeof (ude));
 			}
-			(void) snmp_append_data2(mp_conn_data, &mp_conn_tail,
-			    (char *)&ude, sizeof (ude));
-		}
-		if (udp->udp_ipversion == IPV6_VERSION) {
-			ude6.udp6EntryInfo.ue_state  = state;
-			ude6.udp6LocalAddress = udp->udp_v6src;
-			ude6.udp6LocalPort = ntohs(udp->udp_port);
-			ude6.udp6IfIndex = udp->udp_bound_if;
-			if (udp->udp_state == TS_DATA_XFER) {
-				ude6.udp6EntryInfo.ue_RemoteAddress =
-				    udp->udp_v6dst;
-				ude6.udp6EntryInfo.ue_RemotePort =
-				    ntohs(udp->udp_dstport);
-			} else {
-				ude6.udp6EntryInfo.ue_RemoteAddress =
-				    sin6_null.sin6_addr;
-				ude6.udp6EntryInfo.ue_RemotePort = 0;
+			if (udp->udp_ipversion == IPV6_VERSION) {
+				ude6.udp6EntryInfo.ue_state  = state;
+				ude6.udp6LocalAddress = udp->udp_v6src;
+				ude6.udp6LocalPort = ntohs(udp->udp_port);
+				ude6.udp6IfIndex = udp->udp_bound_if;
+				if (udp->udp_state == TS_DATA_XFER) {
+					ude6.udp6EntryInfo.ue_RemoteAddress =
+					    udp->udp_v6dst;
+					ude6.udp6EntryInfo.ue_RemotePort =
+					    ntohs(udp->udp_dstport);
+				} else {
+					ude6.udp6EntryInfo.ue_RemoteAddress =
+					    sin6_null.sin6_addr;
+					ude6.udp6EntryInfo.ue_RemotePort = 0;
+				}
+				(void) snmp_append_data2(mp6_conn_data,
+				    &mp6_conn_tail, (char *)&ude6,
+				    sizeof (ude6));
 			}
-			(void) snmp_append_data2(mp6_conn_data, &mp6_conn_tail,
-			    (char *)&ude6, sizeof (ude6));
-		}
-	}
-	mutex_exit(&udp_g_lock);
+		}
+	}
 
 	/* IPv4 UDP endpoints */
 	optp = (struct opthdr *)&mp_conn_ctl->b_rptr[
@@ -4754,7 +5477,7 @@
  * to do the appropriate locking.
  */
 /* ARGSUSED */
-static int
+int
 udp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
     uchar_t *ptr, int len)
 {
@@ -4789,7 +5512,7 @@
 		state = "UnkState";
 	print_len = snprintf((char *)mp->b_wptr, buf_len,
 	    MI_COL_PTRFMT_STR "%4d %5u %s %s %5u %s\n",
-	    (void *)udp, udp->udp_zoneid, ntohs(udp->udp_port),
+	    (void *)udp, udp->udp_connp->conn_zoneid, ntohs(udp->udp_port),
 	    inet_ntop(AF_INET6, &udp->udp_v6src,
 		addrbuf1, sizeof (addrbuf1)),
 	    inet_ntop(AF_INET6, &udp->udp_v6dst,
@@ -4807,9 +5530,11 @@
 static int
 udp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
 {
-	IDP	idp;
-	udp_t	*udp;
 	zoneid_t zoneid;
+	connf_t	*connfp;
+	conn_t	*connp = Q_TO_CONN(q);
+	udp_t	*udp = connp->conn_udp;
+	int	i;
 
 	/*
 	 * Because of the ndd constraint, at most we can have 64K buffer
@@ -4837,21 +5562,22 @@
 	    " zone lport src addr        dest addr       port  state");
 	/*    1234 12345 xxx.xxx.xxx.xxx xxx.xxx.xxx.xxx 12345 UNBOUND */
 
-	udp = (udp_t *)q->q_ptr;
-	zoneid = udp->udp_zoneid;
-
-	mutex_enter(&udp_g_lock);
-	for (idp = mi_first_ptr(&udp_g_head);
-	    (udp = (udp_t *)idp) != 0;
-	    idp = mi_next_ptr(&udp_g_head, idp)) {
-
-		if (zoneid != GLOBAL_ZONEID &&
-		    zoneid != udp->udp_zoneid)
-			continue;
-
-		udp_report_item(mp->b_cont, udp);
-	}
-	mutex_exit(&udp_g_lock);
+	zoneid = connp->conn_zoneid;
+
+	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
+		connfp = &ipcl_globalhash_fanout[i];
+		connp = NULL;
+
+		while ((connp = ipcl_get_next_conn(connfp, connp,
+		    IPCL_UDP))) {
+			udp = connp->conn_udp;
+			if (zoneid != GLOBAL_ZONEID &&
+			    zoneid != connp->conn_zoneid)
+				continue;
+
+			udp_report_item(mp->b_cont, udp);
+		}
+	}
 	udp_last_ndd_get_info_time = ddi_get_lbolt();
 	return (0);
 }
@@ -4862,32 +5588,44 @@
  * passed in mp.  This message is freed.
  */
 static void
-udp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
+udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr, t_scalar_t destlen,
+    t_scalar_t err)
 {
+	struct T_unitdata_req *tudr;
 	mblk_t	*mp1;
-	struct T_unitdata_req	*tudr = (struct T_unitdata_req *)mp->b_rptr;
-	uchar_t	*destaddr, *optaddr;
-
-	if ((mp->b_wptr < mp->b_rptr) ||
-	    (mp->b_wptr - mp->b_rptr) < sizeof (struct T_unitdata_req)) {
-		goto done;
-	}
-	destaddr = mp->b_rptr + tudr->DEST_offset;
-	if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
-	    destaddr + tudr->DEST_length < mp->b_rptr ||
-	    destaddr + tudr->DEST_length > mp->b_wptr) {
-		goto done;
-	}
-	optaddr = mp->b_rptr + tudr->OPT_offset;
-	if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
-	    optaddr + tudr->OPT_length < mp->b_rptr ||
-	    optaddr + tudr->OPT_length > mp->b_wptr) {
-		goto done;
-	}
-	mp1 = mi_tpi_uderror_ind((char *)destaddr, tudr->DEST_length,
-	    (char *)optaddr, tudr->OPT_length, err);
-	if (mp1)
-		qreply(q, mp1);
+	uchar_t	*optaddr;
+	t_scalar_t optlen;
+
+	if (DB_TYPE(mp) == M_DATA) {
+		ASSERT(destaddr != NULL && destlen != 0);
+		optaddr = NULL;
+		optlen = 0;
+	} else {
+		if ((mp->b_wptr < mp->b_rptr) ||
+		    (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
+			goto done;
+		}
+		tudr = (struct T_unitdata_req *)mp->b_rptr;
+		destaddr = mp->b_rptr + tudr->DEST_offset;
+		if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
+		    destaddr + tudr->DEST_length < mp->b_rptr ||
+		    destaddr + tudr->DEST_length > mp->b_wptr) {
+			goto done;
+		}
+		optaddr = mp->b_rptr + tudr->OPT_offset;
+		if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
+		    optaddr + tudr->OPT_length < mp->b_rptr ||
+		    optaddr + tudr->OPT_length > mp->b_wptr) {
+			goto done;
+		}
+		destlen = tudr->DEST_length;
+		optlen = tudr->OPT_length;
+	}
+
+	mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
+	    (char *)optaddr, optlen, err);
+	if (mp1 != NULL)
+		putnext(UDP_RD(q), mp1);
 
 done:
 	freemsg(mp);
@@ -4900,9 +5638,8 @@
 static void
 udp_unbind(queue_t *q, mblk_t *mp)
 {
-	udp_t	*udp;
-
-	udp = (udp_t *)q->q_ptr;
+	udp_t *udp = Q_TO_UDP(q);
+
 	/* If a bind has not been done, we can't unbind. */
 	if (udp->udp_state == TS_UNBND) {
 		udp_err_ack(q, mp, TOUTSTATE, 0);
@@ -4939,8 +5676,13 @@
 			return;
 		}
 	}
-	/* Pass the unbind to IP */
-	putnext(q, mp);
+	/*
+	 * Pass the unbind to IP; T_UNBIND_REQ is larger than T_OK_ACK
+	 * and therefore ip_unbind must never return NULL.
+	 */
+	mp = ip_unbind(q, mp);
+	ASSERT(mp != NULL);
+	putnext(UDP_RD(q), mp);
 }
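
[Editor's note] The "must never return NULL" claim above rests on the TPI convention of building acks in place inside the request mblk, so no allocation is needed when the buffer already has room. A hedged sketch of that in-place construction (the actual ip_unbind() body is not part of this section):

	/* Build the T_OK_ACK in place when the request buffer has room */
	if (mp->b_rptr + sizeof (struct T_ok_ack) <= DB_LIM(mp)) {
		struct T_ok_ack *oka = (struct T_ok_ack *)mp->b_rptr;

		mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack);
		oka->PRIM_type = T_OK_ACK;
		oka->CORRECT_prim = T_UNBIND_REQ;
		DB_TYPE(mp) = M_PCPROTO;
	}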
 
 /*
@@ -4994,193 +5736,47 @@
 	return (port);
 }
 
-/*
- * This routine handles all messages passed downstream.  It either
- * consumes the message or passes it downstream; it never queues a
- * a message.
- */
-static void
-udp_wput(queue_t *q, mblk_t *mp)
+static mblk_t *
+udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port,
+    uint_t srcid, int *error)
 {
-	uchar_t		*rptr = mp->b_rptr;
-	struct 		datab *db;
-	ipha_t		*ipha;
-	udpha_t		*udpha;
-	mblk_t		*mp1;
-	int		ip_hdr_length;
-#define	tudr ((struct T_unitdata_req *)rptr)
-	uint32_t	ip_len;
-	udp_t		*udp;
-	sin6_t		*sin6;
-	sin_t		*sin;
-	ipaddr_t	v4dst;
-	uint16_t	port;
-	uint_t		srcid;
-
-	TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_START,
-		"udp_wput_start: q %p mp %p", q, mp);
-
-	db = mp->b_datap;
-	switch (db->db_type) {
-	case M_PROTO:
-	case M_PCPROTO:
-		ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
-		if (mp->b_wptr - rptr >= sizeof (struct T_unitdata_req)) {
-			/* Detect valid T_UNITDATA_REQ here */
-			if (((union T_primitives *)rptr)->type
-			    == T_UNITDATA_REQ)
-				break;
-		}
-		/* FALLTHRU */
-	default:
-		qwriter(q, mp, udp_wput_other, PERIM_INNER);
-		return;
-	}
-
-	udp = (udp_t *)q->q_ptr;
-
-	/* Handle UNITDATA_REQ messages here */
-	if (udp->udp_state == TS_UNBND) {
-		/* If a port has not been bound to the stream, fail. */
-		BUMP_MIB(&udp_mib, udpOutErrors);
-		udp_ud_err(q, mp, EPROTO);
-		TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-			"udp_wput_end: q %p (%S)", q, "outstate");
-		return;
-	}
-	mp1 = mp->b_cont;
-	if (mp1 == NULL) {
-		BUMP_MIB(&udp_mib, udpOutErrors);
-		udp_ud_err(q, mp, EPROTO);
-		TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-			"udp_wput_end: q %p (%S)", q, "badaddr");
-		return;
-	}
-
-	if ((rptr + tudr->DEST_offset + tudr->DEST_length) > mp->b_wptr) {
-		BUMP_MIB(&udp_mib, udpOutErrors);
-		udp_ud_err(q, mp, EADDRNOTAVAIL);
-		TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-			"udp_wput_end: q %p (%S)", q, "badaddr");
-		return;
-	}
-
-	switch (udp->udp_family) {
-	case AF_INET6:
-		sin6 = (sin6_t *)&rptr[tudr->DEST_offset];
-		if (!OK_32PTR((char *)sin6) ||
-		    tudr->DEST_length != sizeof (sin6_t) ||
-		    sin6->sin6_family != AF_INET6) {
-			BUMP_MIB(&udp_mib, udpOutErrors);
-			udp_ud_err(q, mp, EADDRNOTAVAIL);
-			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-				"udp_wput_end: q %p (%S)", q, "badaddr");
-			return;
-		}
-
-		if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
-			/*
-			 * Destination is a non-IPv4-compatible IPv6 address.
-			 * Send out an IPv6 format packet.
-			 */
-			udp_wput_ipv6(q, mp, sin6, tudr->OPT_length);
-			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-				"udp_wput_end: q %p (%S)", q, "udp_wput_ipv6");
-			return;
-		}
-		/*
-		 * If the local address is not zero or a mapped address return
-		 * an error.
-		 * I would be possible to send an IPv4 packet but the
-		 * response would never make it back to the application
-		 * since it is bound to a non-mapped address.
-		 */
-		if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src) &&
-		    !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) {
-			BUMP_MIB(&udp_mib, udpOutErrors);
-			udp_ud_err(q, mp, EADDRNOTAVAIL);
-			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-				"udp_wput_end: q %p (%S)", q, "badaddr");
-			return;
-		}
-		/* Send IPv4 packet without modifying udp_ipversion */
-		/* Extract port and ipaddr */
-		port = sin6->sin6_port;
-		IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4dst);
-		srcid = sin6->__sin6_src_id;
-		break;
-
-	case AF_INET:
-		sin = (sin_t *)&rptr[tudr->DEST_offset];
-		if (!OK_32PTR((char *)sin) ||
-		    tudr->DEST_length != sizeof (sin_t) ||
-		    sin->sin_family != AF_INET) {
-			BUMP_MIB(&udp_mib, udpOutErrors);
-			udp_ud_err(q, mp, EADDRNOTAVAIL);
-			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-				"udp_wput_end: q %p (%S)", q, "badaddr");
-			return;
-		}
-		/* Extract port and ipaddr */
-		port = sin->sin_port;
-		v4dst = sin->sin_addr.s_addr;
-		srcid = 0;
-		break;
-	}
-
-
-	/*
-	 * If options passed in, feed it for verification and handling
-	 */
-	if (tudr->OPT_length != 0) {
-		int error;
-
-		if (udp_unitdata_opt_process(q, mp, &error, NULL) < 0) {
-			/* failure */
-			BUMP_MIB(&udp_mib, udpOutErrors);
-			udp_ud_err(q, mp, error);
-			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-			    "udp_wput_end: q %p (%S)", q,
-			    "udp_unitdata_opt_process");
-			return;
-		}
-		ASSERT(error == 0);
-		/*
-		 * Note: success in processing options.
-		 * mp option buffer represented by
-		 * OPT_length/offset now potentially modified
-		 * and contain option setting results
-		 */
-	}
+	udp_t	*udp = connp->conn_udp;
+	queue_t	*q = connp->conn_wq;
+	mblk_t	*mp1 = (DB_TYPE(mp) == M_DATA ? mp : mp->b_cont);
+	mblk_t	*mp2;
+	ipha_t	*ipha;
+	int	ip_hdr_length;
+	uint32_t ip_len;
+	udpha_t	*udpha;
+
+	*error = 0;
+
+	/* mp1 points to the M_DATA mblk carrying the packet */
+	ASSERT(mp1 != NULL && DB_TYPE(mp1) == M_DATA);
 
 	/* Add an IP header */
 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE +
 	    udp->udp_ip_snd_options_len;
 	ipha = (ipha_t *)&mp1->b_rptr[-ip_hdr_length];
-	if ((mp1->b_datap->db_ref != 1) ||
-	    ((uchar_t *)ipha < mp1->b_datap->db_base) ||
+	if (DB_REF(mp1) != 1 || (uchar_t *)ipha < DB_BASE(mp1) ||
 	    !OK_32PTR(ipha)) {
-		uchar_t *wptr;
-
-		mp1 = allocb(ip_hdr_length + udp_wroff_extra, BPRI_LO);
-		if (!mp1) {
-			BUMP_MIB(&udp_mib, udpOutErrors);
-			udp_ud_err(q, mp, ENOMEM);
+		mp2 = allocb(ip_hdr_length + udp_wroff_extra, BPRI_LO);
+		if (mp2 == NULL) {
 			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-				"udp_wput_end: q %p (%S)", q, "allocbfail2");
-			return;
-		}
-		mp1->b_cont = mp->b_cont;
-		mp->b_cont = mp1;
-		wptr = mp1->b_datap->db_lim;
-		mp1->b_wptr = wptr;
-		ipha = (ipha_t *)(wptr - ip_hdr_length);
-	}
-	mp1->b_rptr = (uchar_t *)ipha;
-
-	ASSERT((uintptr_t)(mp1->b_wptr - (uchar_t *)ipha) <=
-	    (uintptr_t)UINT_MAX);
-
+			    "udp_wput_end: q %p (%S)", q, "allocbfail2");
+			*error = ENOMEM;
+			goto done;
+		}
+		mp2->b_wptr = DB_LIM(mp2);
+		mp2->b_cont = mp1;
+		mp1 = mp2;
+		if (DB_TYPE(mp) != M_DATA)
+			mp->b_cont = mp1;
+		else
+			mp = mp1;
+
+		ipha = (ipha_t *)(mp1->b_wptr - ip_hdr_length);
+	}
 	ip_hdr_length -= UDPH_SIZE;
 #ifdef	_BIG_ENDIAN
 	/* Set version, header length, and tos */
@@ -5206,24 +5802,25 @@
 	if (srcid != 0 && ipha->ipha_src == INADDR_ANY) {
 		in6_addr_t v6src;
 
-		ip_srcid_find_id(srcid, &v6src, udp->udp_zoneid);
+		ip_srcid_find_id(srcid, &v6src, connp->conn_zoneid);
 		IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src);
 	}
 
 	ipha->ipha_fragment_offset_and_flags = 0;
 	ipha->ipha_ident = 0;
 
+	mp1->b_rptr = (uchar_t *)ipha;
+
+	ASSERT((uintptr_t)(mp1->b_wptr - (uchar_t *)ipha) <=
+	    (uintptr_t)UINT_MAX);
+
 	/* Determine length of packet */
 	ip_len = (uint32_t)(mp1->b_wptr - (uchar_t *)ipha);
-	{
-		mblk_t	*mp2;
-		if ((mp2 = mp1->b_cont) != NULL) {
-			do {
-				ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr)
-				    <= (uintptr_t)UINT_MAX);
-				ip_len += (uint32_t)(mp2->b_wptr - mp2->b_rptr);
-			} while ((mp2 = mp2->b_cont) != NULL);
-		}
+	if ((mp2 = mp1->b_cont) != NULL) {
+		do {
+			ASSERT((uintptr_t)MBLKL(mp2) <= (uintptr_t)UINT_MAX);
+			ip_len += (uint32_t)MBLKL(mp2);
+		} while ((mp2 = mp2->b_cont) != NULL);
 	}
 	/*
 	 * If the size of the packet is greater than the maximum allowed by
@@ -5231,19 +5828,18 @@
 	 * the size will have wrapped and be inconsistent with the msg size.
 	 */
 	if (ip_len > IP_MAXPACKET) {
-		BUMP_MIB(&udp_mib, udpOutErrors);
-		udp_ud_err(q, mp, EMSGSIZE);
 		TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
 		    "udp_wput_end: q %p (%S)", q, "IP length exceeded");
-		return;
+		*error = EMSGSIZE;
+		goto done;
 	}
 	ipha->ipha_length = htons((uint16_t)ip_len);
 	ip_len -= ip_hdr_length;
 	ip_len = htons((uint16_t)ip_len);
 	udpha = (udpha_t *)(((uchar_t *)ipha) + ip_hdr_length);
+
 	/*
-	 * Copy in the destination address and port from the T_UNITDATA
-	 * request
+	 * Copy in the destination address
 	 */
 	if (v4dst == INADDR_ANY)
 		ipha->ipha_dst = htonl(INADDR_LOOPBACK);
@@ -5310,41 +5906,648 @@
 	/* Set UDP length and checksum */
 	*((uint32_t *)&udpha->uha_length) = ip_len;
 
-	freeb(mp);
+	if (DB_TYPE(mp) != M_DATA) {
+		ASSERT(mp != mp1);
+		freeb(mp);
+	}
+
+	/* mp has been consumed and we'll return success */
+	ASSERT(*error == 0);
+	mp = NULL;
 
 	/* We're done.  Pass the packet to ip. */
 	BUMP_MIB(&udp_mib, udpOutDatagrams);
 	TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
 		"udp_wput_end: q %p (%S)", q, "end");
-	putnext(q, mp1);
-#undef tudr
+
+	if ((connp->conn_flags & IPCL_CHECK_POLICY) != 0 ||
+	    CONN_OUTBOUND_POLICY_PRESENT(connp) ||
+	    connp->conn_dontroute || connp->conn_xmit_if_ill != NULL ||
+	    connp->conn_nofailover_ill != NULL ||
+	    connp->conn_outgoing_ill != NULL ||
+	    ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION ||
+	    IPP_ENABLED(IPP_LOCAL_OUT) || ip_g_mrouter != NULL) {
+		UDP_STAT(udp_ip_send);
+		ip_output(connp, mp1, connp->conn_wq, IP_WPUT);
+	} else {
+		udp_send_data(udp, connp->conn_wq, mp1, ipha);
+	}
+
+done:
+	if (*error != 0) {
+		ASSERT(mp != NULL);
+		BUMP_MIB(&udp_mib, udpOutErrors);
+	}
+	return (mp);
+}
+
+static void
+udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha)
+{
+	conn_t	*connp = udp->udp_connp;
+	ipaddr_t src, dst;
+	ill_t	*ill;
+	ire_t	*ire;
+	ipif_t	*ipif = NULL;
+	mblk_t	*ire_fp_mp;
+	uint_t	ire_fp_mp_len;
+	uint16_t *up;
+	uint32_t cksum, hcksum_txflags;
+	queue_t	*dev_q;
+	boolean_t retry_caching;
+
+	dst = ipha->ipha_dst;
+	src = ipha->ipha_src;
+	ASSERT(ipha->ipha_ident == 0);
+
+	if (CLASSD(dst)) {
+		int err;
+
+		ipif = conn_get_held_ipif(connp,
+		    &connp->conn_multicast_ipif, &err);
+
+		if (ipif == NULL || ipif->ipif_isv6 ||
+		    (ipif->ipif_ill->ill_phyint->phyint_flags &
+		    PHYI_LOOPBACK)) {
+			if (ipif != NULL)
+				ipif_refrele(ipif);
+			UDP_STAT(udp_ip_send);
+			ip_output(connp, mp, q, IP_WPUT);
+			return;
+		}
+	}
+
+	retry_caching = B_FALSE;
+	mutex_enter(&connp->conn_lock);
+	ire = connp->conn_ire_cache;
+	ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT));
+
+	if (ire == NULL || ire->ire_addr != dst ||
+	    (ire->ire_marks & IRE_MARK_CONDEMNED)) {
+		retry_caching = B_TRUE;
+	} else if (CLASSD(dst) && (ire->ire_type & IRE_CACHE)) {
+		ill_t *stq_ill = (ill_t *)ire->ire_stq->q_ptr;
+
+		ASSERT(ipif != NULL);
+		if (stq_ill != ipif->ipif_ill && (stq_ill->ill_group == NULL ||
+		    stq_ill->ill_group != ipif->ipif_ill->ill_group))
+			retry_caching = B_TRUE;
+	}
+
+	if (!retry_caching) {
+		ASSERT(ire != NULL);
+		IRE_REFHOLD(ire);
+		mutex_exit(&connp->conn_lock);
+	} else {
+		boolean_t cached = B_FALSE;
+
+		connp->conn_ire_cache = NULL;
+		mutex_exit(&connp->conn_lock);
+
+		/* Release the old ire */
+		if (ire != NULL) {
+			IRE_REFRELE_NOTR(ire);
+			ire = NULL;
+		}
+
+		if (CLASSD(dst)) {
+			ASSERT(ipif != NULL);
+			ire = ire_ctable_lookup(dst, 0, 0, ipif,
+			    connp->conn_zoneid, MATCH_IRE_ILL_GROUP);
+		} else {
+			ASSERT(ipif == NULL);
+			ire = ire_cache_lookup(dst, connp->conn_zoneid);
+		}
+
+		if (ire == NULL) {
+			if (ipif != NULL)
+				ipif_refrele(ipif);
+			UDP_STAT(udp_ire_null);
+			ip_output(connp, mp, q, IP_WPUT);
+			return;
+		}
+		IRE_REFHOLD_NOTR(ire);
+
+		mutex_enter(&connp->conn_lock);
+		if (!(connp->conn_state_flags & CONN_CLOSING) &&
+		    connp->conn_ire_cache == NULL) {
+			rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
+			if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
+				connp->conn_ire_cache = ire;
+				cached = B_TRUE;
+			}
+			rw_exit(&ire->ire_bucket->irb_lock);
+		}
+		mutex_exit(&connp->conn_lock);
+
+		/*
+		 * We can continue to use the ire but since it was not
+		 * cached, we should drop the extra reference.
+		 */
+		if (!cached)
+			IRE_REFRELE_NOTR(ire);
+	}
+	ASSERT(ire != NULL && ire->ire_ipversion == IPV4_VERSION);
+	ASSERT(!CLASSD(dst) || ipif != NULL);
+
+	if ((ire->ire_type & (IRE_BROADCAST|IRE_LOCAL|IRE_LOOPBACK)) ||
+	    (ire->ire_flags & RTF_MULTIRT) || ire->ire_stq == NULL ||
+	    ire->ire_max_frag < ntohs(ipha->ipha_length) ||
+	    (ire_fp_mp = ire->ire_fp_mp) == NULL ||
+	    (ire_fp_mp_len = MBLKL(ire_fp_mp)) > MBLKHEAD(mp)) {
+		if (ipif != NULL)
+			ipif_refrele(ipif);
+		UDP_STAT(udp_ip_ire_send);
+		IRE_REFRELE(ire);
+		ip_output(connp, mp, q, IP_WPUT);
+		return;
+	}
+
+	BUMP_MIB(&ip_mib, ipOutRequests);
+
+	ill = ire_to_ill(ire);
+	ASSERT(ill != NULL);
+
+	dev_q = ire->ire_stq->q_next;
+	ASSERT(dev_q != NULL);
+	/*
+	 * If the service thread is already running, or if the driver
+	 * queue is currently flow-controlled, queue this packet.
+	 */
+	if ((q->q_first != NULL || connp->conn_draining) ||
+	    ((dev_q->q_next || dev_q->q_first) && !canput(dev_q))) {
+		if (ip_output_queue) {
+			(void) putq(q, mp);
+		} else {
+			BUMP_MIB(&ip_mib, ipOutDiscards);
+			freemsg(mp);
+		}
+		if (ipif != NULL)
+			ipif_refrele(ipif);
+		IRE_REFRELE(ire);
+		return;
+	}
+
+	ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1);
+#ifndef _BIG_ENDIAN
+	ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8);
+#endif
+
+	if (src == INADDR_ANY && !connp->conn_unspec_src) {
+		if (CLASSD(dst) && !(ire->ire_flags & RTF_SETSRC))
+			src = ipha->ipha_src = ipif->ipif_src_addr;
+		else
+			src = ipha->ipha_src = ire->ire_src_addr;
+	}
+
+	if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
+		ASSERT(ill->ill_hcksum_capab != NULL);
+		hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags;
+	} else {
+		hcksum_txflags = 0;
+	}
+
+	/* pseudo-header checksum (do it in parts for IP header checksum) */
+	cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
+
+	ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION);
+	up = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
+	if (*up != 0) {
+		IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags,
+		    mp, ipha, up, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH,
+		    ntohs(ipha->ipha_length), cksum);
+
+		/* Software checksum? */
+		if (DB_CKSUMFLAGS(mp) == 0) {
+			UDP_STAT(udp_out_sw_cksum);
+			UDP_STAT_UPDATE(udp_out_sw_cksum_bytes,
+			    ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH);
+		}
+	}
+
+	ipha->ipha_fragment_offset_and_flags |=
+	    (uint32_t)htons(ire->ire_frag_flag);
+
+	/* Calculate IP header checksum if hardware isn't capable */
+	if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
+		IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0],
+		    ((uint16_t *)ipha)[4]);
+	}
+
+	if (CLASSD(dst)) {
+		ilm_t *ilm;
+
+		ILM_WALKER_HOLD(ill);
+		ilm = ilm_lookup_ill(ill, dst, ALL_ZONES);
+		ILM_WALKER_RELE(ill);
+		if (ilm != NULL) {
+			ip_multicast_loopback(q, ill, mp,
+			    connp->conn_multicast_loop ? 0 :
+			    IP_FF_NO_MCAST_LOOP, connp->conn_zoneid);
+		}
+
+		/* If multicast TTL is 0 then we are done */
+		if (ipha->ipha_ttl == 0) {
+			if (ipif != NULL)
+				ipif_refrele(ipif);
+			freemsg(mp);
+			IRE_REFRELE(ire);
+			return;
+		}
+	}
+
+	ASSERT(DB_TYPE(ire_fp_mp) == M_DATA);
+	mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len;
+	bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len);
+
+	UPDATE_OB_PKT_COUNT(ire);
+	ire->ire_last_used_time = lbolt;
+
+	if (ILL_POLL_CAPABLE(ill)) {
+		/*
+		 * Send the packet directly to DLD, where it may be queued
+		 * depending on the availability of transmit resources at
+		 * the media layer.
+		 */
+		IP_POLL_ILL_TX(ill, mp);
+	} else {
+		putnext(ire->ire_stq, mp);
+	}
+
+	if (ipif != NULL)
+		ipif_refrele(ipif);
+	IRE_REFRELE(ire);
 }
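
[Editor's note] When the NIC cannot checksum the packet (DB_CKSUMFLAGS(mp) == 0 above), the sum seeded from the pseudo-header is finished in software. For readers new to the idiom, here is a self-contained RFC 1071-style sketch of the computation the fast path is arranging -- host-order arithmetic over big-endian 16-bit words; this is illustrative, not the kernel's IP_CKSUM_XMIT_FAST implementation:

	/* Ones-complement UDP checksum over the IPv4 pseudo-header + payload. */
	static uint16_t
	udp_cksum_sketch(const uint8_t *udp_hdr, size_t len,
	    uint32_t src, uint32_t dst)
	{
		uint32_t sum;

		/* Pseudo-header: src, dst, protocol, and UDP length (= len) */
		sum = (src >> 16) + (src & 0xffff) +
		    (dst >> 16) + (dst & 0xffff) +
		    IPPROTO_UDP + (uint32_t)len;

		/* Sum the UDP header and payload as big-endian 16-bit words */
		while (len > 1) {
			sum += (udp_hdr[0] << 8) | udp_hdr[1];
			udp_hdr += 2;
			len -= 2;
		}
		if (len == 1)
			sum += udp_hdr[0] << 8;	/* pad the odd final byte */

		/* Fold the carries back in and complement */
		while (sum >> 16)
			sum = (sum & 0xffff) + (sum >> 16);
		return ((uint16_t)~sum);
	}

(On the wire a computed checksum of zero is transmitted as 0xffff, since zero means "no checksum" for UDP over IPv4.)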
 
 /*
- * udp_wput_ipv6():
+ * This routine handles all messages passed downstream.  It either
+ * consumes the message or passes it downstream; it never queues
+ * a message.
+ */
+static void
+udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr, socklen_t addrlen)
+{
+	sin6_t		*sin6;
+	sin_t		*sin;
+	ipaddr_t	v4dst;
+	uint16_t	port;
+	uint_t		srcid;
+	queue_t		*q = connp->conn_wq;
+	udp_t		*udp = connp->conn_udp;
+	t_scalar_t	optlen;
+	int		error = 0;
+	struct sockaddr_storage ss;
+
+	TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_START,
+	    "udp_wput_start: connp %p mp %p", connp, mp);
+
+	 * We directly handle several cases here: T_UNITDATA_REQ messages
+	 * coming down as M_PROTO/M_PCPROTO, and M_DATA messages for both
+	 * connected and non-connected sockets.  In the latter case the
+	 * address structure is passed along when this routine gets called.
+	 * address structure along when this routine gets called.
+	 */
+	switch (DB_TYPE(mp)) {
+	case M_DATA:
+		if (!udp->udp_direct_sockfs || udp->udp_state != TS_DATA_XFER) {
+			if (!udp->udp_direct_sockfs ||
+			    addr == NULL || addrlen == 0) {
+				/* Not connected; address is required */
+				BUMP_MIB(&udp_mib, udpOutErrors);
+				UDP_STAT(udp_out_err_notconn);
+				freemsg(mp);
+				TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+				    "udp_wput_end: connp %p (%S)", connp,
+				    "not-connected; address required");
+				return;
+			}
+			ASSERT(udp->udp_issocket);
+			UDP_DBGSTAT(udp_data_notconn);
+			/* Not connected; do some more checks below */
+			optlen = 0;
+			break;
+		}
+		/* M_DATA for connected socket */
+		UDP_DBGSTAT(udp_data_conn);
+		IN6_V4MAPPED_TO_IPADDR(&udp->udp_v6dst, v4dst);
+
+		/* Initialize addr and addrlen as if they're passed in */
+		if (udp->udp_family == AF_INET) {
+			sin = (sin_t *)&ss;
+			sin->sin_family = AF_INET;
+			sin->sin_port = udp->udp_dstport;
+			sin->sin_addr.s_addr = v4dst;
+			addr = (struct sockaddr *)sin;
+			addrlen = sizeof (*sin);
+		} else {
+			sin6 = (sin6_t *)&ss;
+			sin6->sin6_family = AF_INET6;
+			sin6->sin6_port = udp->udp_dstport;
+			sin6->sin6_flowinfo = udp->udp_flowinfo;
+			sin6->sin6_addr = udp->udp_v6dst;
+			sin6->sin6_scope_id = 0;
+			sin6->__sin6_src_id = 0;
+			addr = (struct sockaddr *)sin6;
+			addrlen = sizeof (*sin6);
+		}
+
+		if (udp->udp_family == AF_INET ||
+		    IN6_IS_ADDR_V4MAPPED(&udp->udp_v6dst)) {
+			/*
+			 * Handle both AF_INET and AF_INET6; the latter
+			 * is for IPv4-mapped destination addresses.  Note
+			 * that addr and addrlen describe the corresponding
+			 * structure for the address family of the socket.
+			 */
+			mp = udp_output_v4(connp, mp, v4dst,
+			    udp->udp_dstport, 0, &error);
+		} else {
+			mp = udp_output_v6(connp, mp, sin6, 0, &error);
+		}
+		if (error != 0) {
+			ASSERT(addr != NULL && addrlen != 0);
+			goto ud_error;
+		}
+		return;
+	case M_PROTO:
+	case M_PCPROTO: {
+		struct T_unitdata_req *tudr;
+
+		ASSERT((uintptr_t)MBLKL(mp) <= (uintptr_t)INT_MAX);
+		tudr = (struct T_unitdata_req *)mp->b_rptr;
+
+		/* Handle valid T_UNITDATA_REQ here */
+		if (MBLKL(mp) >= sizeof (*tudr) &&
+		    ((t_primp_t)mp->b_rptr)->type == T_UNITDATA_REQ) {
+			if (mp->b_cont == NULL) {
+				TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+				    "udp_wput_end: q %p (%S)", q, "badaddr");
+				error = EPROTO;
+				goto ud_error;
+			}
+
+			if (!MBLKIN(mp, 0, tudr->DEST_offset +
+			    tudr->DEST_length)) {
+				TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+				    "udp_wput_end: q %p (%S)", q, "badaddr");
+				error = EADDRNOTAVAIL;
+				goto ud_error;
+			}
+			/*
+			 * If a port has not been bound to the stream, fail.
+			 * This is not a problem when sockfs is directly
+			 * above us, because it will ensure that the socket
+			 * is first bound before allowing data to be sent.
+			 */
+			if (udp->udp_state == TS_UNBND) {
+				TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+				    "udp_wput_end: q %p (%S)", q, "outstate");
+				error = EPROTO;
+				goto ud_error;
+			}
+			addr = (struct sockaddr *)
+			    &mp->b_rptr[tudr->DEST_offset];
+			addrlen = tudr->DEST_length;
+			optlen = tudr->OPT_length;
+			if (optlen != 0)
+				UDP_STAT(udp_out_opt);
+			break;
+		}
+		/* FALLTHRU */
+	}
+	default:
+		udp_become_writer(connp, mp, udp_wput_other_wrapper,
+		    SQTAG_UDP_OUTPUT);
+		return;
+	}
+	ASSERT(addr != NULL);
+
+	switch (udp->udp_family) {
+	case AF_INET6:
+		sin6 = (sin6_t *)addr;
+		if (!OK_32PTR((char *)sin6) || addrlen != sizeof (sin6_t) ||
+		    sin6->sin6_family != AF_INET6) {
+			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+			    "udp_wput_end: q %p (%S)", q, "badaddr");
+			error = EADDRNOTAVAIL;
+			goto ud_error;
+		}
+
+		if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+			/*
+			 * Destination is a non-IPv4-compatible IPv6 address.
+			 * Send out an IPv6 format packet.
+			 */
+			mp = udp_output_v6(connp, mp, sin6, optlen, &error);
+			if (error != 0)
+				goto ud_error;
+
+			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+			    "udp_wput_end: q %p (%S)", q, "udp_output_v6");
+			return;
+		}
+		/*
+		 * If the local address is not zero or a mapped address,
+		 * return an error.  It would be possible to send an IPv4
+		 * packet but the response would never make it back to the
+		 * application since it is bound to a non-mapped address.
+		 */
+		if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src) &&
+		    !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) {
+			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+			    "udp_wput_end: q %p (%S)", q, "badaddr");
+			error = EADDRNOTAVAIL;
+			goto ud_error;
+		}
+		/* Send IPv4 packet without modifying udp_ipversion */
+		/* Extract port and ipaddr */
+		port = sin6->sin6_port;
+		IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4dst);
+		srcid = sin6->__sin6_src_id;
+		break;
+
+	case AF_INET:
+		sin = (sin_t *)addr;
+		if (!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t) ||
+		    sin->sin_family != AF_INET) {
+			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+			    "udp_wput_end: q %p (%S)", q, "badaddr");
+			error = EADDRNOTAVAIL;
+			goto ud_error;
+		}
+		/* Extract port and ipaddr */
+		port = sin->sin_port;
+		v4dst = sin->sin_addr.s_addr;
+		srcid = 0;
+		break;
+	}
+
+	/*
+	 * If options passed in, feed it for verification and handling
+	 */
+	if (optlen != 0) {
+		ASSERT(DB_TYPE(mp) != M_DATA);
+		if (udp_unitdata_opt_process(q, mp, &error, NULL) < 0) {
+			/* failure */
+			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+			    "udp_wput_end: q %p (%S)", q,
+			    "udp_unitdata_opt_process");
+			goto ud_error;
+		}
+		/*
+		 * Note: success in processing options.
+		 * mp option buffer represented by
+		 * OPT_length/offset now potentially modified
+		 * and contain option setting results
+		 */
+	}
+	ASSERT(error == 0);
+	mp = udp_output_v4(connp, mp, v4dst, port, srcid, &error);
+	if (error != 0) {
+ud_error:
+		UDP_STAT(udp_out_err_output);
+		ASSERT(mp != NULL);
+		/* mp is freed by the following routine */
+		udp_ud_err(q, mp, (uchar_t *)addr, (t_scalar_t)addrlen,
+		    (t_scalar_t)error);
+	}
+}
+
+/* ARGSUSED */
+static void
+udp_output_wrapper(void *arg, mblk_t *mp, void *arg2)
+{
+	udp_output((conn_t *)arg, mp, NULL, 0);
+	_UDP_EXIT((conn_t *)arg);
+}
+
+static void
+udp_wput(queue_t *q, mblk_t *mp)
+{
+	_UDP_ENTER(Q_TO_CONN(UDP_WR(q)), mp, udp_output_wrapper,
+	    SQTAG_UDP_WPUT);
+}
+
+/*
+ * Allocate and prepare a T_UNITDATA_REQ message.
+ */
+static mblk_t *
+udp_tudr_alloc(struct sockaddr *addr, socklen_t addrlen)
+{
+	struct T_unitdata_req *tudr;
+	mblk_t *mp;
+
+	mp = allocb(sizeof (*tudr) + addrlen, BPRI_MED);
+	if (mp != NULL) {
+		mp->b_wptr += sizeof (*tudr) + addrlen;
+		DB_TYPE(mp) = M_PROTO;
+
+		tudr = (struct T_unitdata_req *)mp->b_rptr;
+		tudr->PRIM_type = T_UNITDATA_REQ;
+		tudr->DEST_length = addrlen;
+		tudr->DEST_offset = (t_scalar_t)sizeof (*tudr);
+		tudr->OPT_length = 0;
+		tudr->OPT_offset = 0;
+		bcopy(addr, tudr+1, addrlen);
+	}
+	return (mp);
+}
+
+/*
+ * Entry point for sockfs when udp is in "direct sockfs" mode.  This mode
+ * is valid when we are directly beneath the stream head, and thus sockfs
+ * is able to bypass STREAMS and directly call us, passing along the sockaddr
+ * structure without the cumbersome T_UNITDATA_REQ interface.  Note that
+ * this is done for both connected and non-connected endpoints.
+ */
+void
+udp_wput_data(queue_t *q, mblk_t *mp, struct sockaddr *addr, socklen_t addrlen)
+{
+	conn_t	*connp;
+	udp_t	*udp;
+
+	q = UDP_WR(q);
+	connp = Q_TO_CONN(q);
+	udp = connp->conn_udp;
+
+	/* udpsockfs should only send down M_DATA for this entry point */
+	ASSERT(DB_TYPE(mp) == M_DATA);
+
+	mutex_enter(&connp->conn_lock);
+	UDP_MODE_ASSERTIONS(udp, UDP_ENTER);
+
+	if (udp->udp_mode != UDP_MT_HOT) {
+		/*
+		 * We can't enter this conn right away because another
+		 * thread is currently executing as writer; therefore we
+		 * need to deposit the message into the squeue to be
+		 * drained later.  If a socket address is present, we
+		 * need to create a T_UNITDATA_REQ message as a placeholder.
+		 */
+		if (addr != NULL && addrlen != 0) {
+			mblk_t *tudr_mp = udp_tudr_alloc(addr, addrlen);
+
+			if (tudr_mp == NULL) {
+				mutex_exit(&connp->conn_lock);
+				BUMP_MIB(&udp_mib, udpOutErrors);
+				UDP_STAT(udp_out_err_tudr);
+				freemsg(mp);
+				return;
+			}
+			/* Tag the packet with T_UNITDATA_REQ */
+			tudr_mp->b_cont = mp;
+			mp = tudr_mp;
+		}
+		mutex_exit(&connp->conn_lock);
+		udp_enter(connp, mp, udp_output_wrapper, SQTAG_UDP_WPUT);
+		return;
+	}
+
+	/* We can execute as reader right away. */
+	UDP_READERS_INCREF(udp);
+	mutex_exit(&connp->conn_lock);
+
+	udp_output(connp, mp, addr, addrlen);
+
+	mutex_enter(&connp->conn_lock);
+	UDP_MODE_ASSERTIONS(udp, UDP_EXIT);
+	UDP_READERS_DECREF(udp);
+	mutex_exit(&connp->conn_lock);
+}
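The UDP_READERS_INCREF()/UDP_READERS_DECREF() pair used above is defined
elsewhere in this changeset; a minimal sketch of the accounting it is
assumed to perform on udp_reader_count (hypothetical reconstruction, not
the actual definitions):

	/* Both assume conn_lock is held, matching the callers above. */
	#define	UDP_READERS_INCREF(udp) {				\
		ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock));	\
		(udp)->udp_reader_count++;				\
	}

	#define	UDP_READERS_DECREF(udp) {				\
		ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock));	\
		ASSERT((udp)->udp_reader_count > 0);			\
		(udp)->udp_reader_count--;				\
	}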
+
+/*
+ * udp_output_v6():
  * Assumes that udp_wput did some sanity checking on the destination
  * address.
  */
-static void
-udp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen)
+static mblk_t *
+udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen,
+    int *error)
 {
-	ip6_t			*ip6h;
-	ip6i_t			*ip6i;	/* mp1->b_rptr even if no ip6i_t */
-	mblk_t			*mp1;
-	int			udp_ip_hdr_len = IPV6_HDR_LEN + UDPH_SIZE;
-	size_t			ip_len;
-	udpha_t			*udph;
-	udp_t			*udp;
-	ip6_pkt_t		ipp_s;	/* For ancillary data options */
-	ip6_pkt_t		*ipp = &ipp_s;
-	ip6_pkt_t		*tipp;	/* temporary ipp */
-	uint32_t		csum = 0;
-	uint_t			ignore = 0;
-	uint_t			option_exists = 0, is_sticky = 0;
-	uint8_t			*cp;
-	uint8_t			*nxthdr_ptr;
-
-	udp = (udp_t *)q->q_ptr;
+	ip6_t		*ip6h;
+	ip6i_t		*ip6i;	/* mp1->b_rptr even if no ip6i_t */
+	mblk_t		*mp1 = (DB_TYPE(mp) == M_DATA ? mp : mp->b_cont);
+	mblk_t		*mp2;
+	int		udp_ip_hdr_len = IPV6_HDR_LEN + UDPH_SIZE;
+	size_t		ip_len;
+	udpha_t		*udph;
+	udp_t		*udp = connp->conn_udp;
+	queue_t		*q = connp->conn_wq;
+	ip6_pkt_t	ipp_s;	/* For ancillary data options */
+	ip6_pkt_t	*ipp = &ipp_s;
+	ip6_pkt_t	*tipp;	/* temporary ipp */
+	uint32_t	csum = 0;
+	uint_t		ignore = 0;
+	uint_t		option_exists = 0, is_sticky = 0;
+	uint8_t		*cp;
+	uint8_t		*nxthdr_ptr;
+
+	*error = 0;
+
+	/* mp1 points to the M_DATA mblk carrying the packet */
+	ASSERT(mp1 != NULL && DB_TYPE(mp1) == M_DATA);
+	ASSERT(tudr_optlen == 0 || DB_TYPE(mp) != M_DATA);
 
 	/*
 	 * If the local address is a mapped address return
@@ -5354,9 +6557,8 @@
 	 * since it is bound to a mapped address.
 	 */
 	if (IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src)) {
-		BUMP_MIB(&udp_mib, udpOutErrors);
-		udp_ud_err(q, mp, EADDRNOTAVAIL);
-		return;
+		*error = EADDRNOTAVAIL;
+		goto done;
 	}
 
 	ipp->ipp_fields = 0;
@@ -5366,17 +6568,12 @@
 	 * If TPI options passed in, feed it for verification and handling
 	 */
 	if (tudr_optlen != 0) {
-		int 		error;
-
-		if (udp_unitdata_opt_process(q, mp, &error,
-		    (void *)ipp) < 0) {
+		if (udp_unitdata_opt_process(q, mp, error, (void *)ipp) < 0) {
 			/* failure */
-			BUMP_MIB(&udp_mib, udpOutErrors);
-			udp_ud_err(q, mp, error);
-			return;
+			goto done;
 		}
 		ignore = ipp->ipp_sticky_ignored;
-		ASSERT(error == 0);
+		ASSERT(*error == 0);
 	}
 
 	if (sin6->sin6_scope_id != 0 &&
@@ -5389,8 +6586,7 @@
 		option_exists |= IPPF_SCOPE_ID;
 	}
 
-	if ((udp->udp_sticky_ipp.ipp_fields == 0) &&
-	    (ipp->ipp_fields == 0)) {
+	if ((udp->udp_sticky_ipp.ipp_fields == 0) && (ipp->ipp_fields == 0)) {
 		/* No sticky options nor ancillary data. */
 		goto no_options;
 	}
@@ -5475,7 +6671,8 @@
 	if (!(ignore & IPPF_USE_MIN_MTU)) {
 		if (ipp->ipp_fields & IPPF_USE_MIN_MTU) {
 			option_exists |= IPPF_USE_MIN_MTU;
-		} else if (udp->udp_sticky_ipp.ipp_fields & IPPF_USE_MIN_MTU) {
+		} else if (udp->udp_sticky_ipp.ipp_fields &
+		    IPPF_USE_MIN_MTU) {
 			option_exists |= IPPF_USE_MIN_MTU;
 			is_sticky |= IPPF_USE_MIN_MTU;
 		}
@@ -5518,26 +6715,28 @@
 		udp_ip_hdr_len += sizeof (ip6i_t);
 
 	/* check/fix buffer config, setup pointers into it */
-	mp1 = mp->b_cont;
 	ip6h = (ip6_t *)&mp1->b_rptr[-udp_ip_hdr_len];
-	if ((mp1->b_datap->db_ref != 1) ||
-	    ((unsigned char *)ip6h < mp1->b_datap->db_base) ||
+	if (DB_REF(mp1) != 1 || ((unsigned char *)ip6h < DB_BASE(mp1)) ||
 	    !OK_32PTR(ip6h)) {
 		/* Try to get everything in a single mblk next time */
 		if (udp_ip_hdr_len > udp->udp_max_hdr_len) {
 			udp->udp_max_hdr_len = udp_ip_hdr_len;
-			(void) mi_set_sth_wroff(RD(q),
+			(void) mi_set_sth_wroff(UDP_RD(q),
 			    udp->udp_max_hdr_len + udp_wroff_extra);
 		}
-		mp1 = allocb(udp_ip_hdr_len + udp_wroff_extra, BPRI_LO);
-		if (!mp1) {
-			BUMP_MIB(&udp_mib, udpOutErrors);
-			udp_ud_err(q, mp, ENOMEM);
-			return;
-		}
-		mp1->b_cont = mp->b_cont;
-		mp->b_cont = mp1;
-		mp1->b_wptr = mp1->b_datap->db_lim;
+		mp2 = allocb(udp_ip_hdr_len + udp_wroff_extra, BPRI_LO);
+		if (mp2 == NULL) {
+			*error = ENOMEM;
+			goto done;
+		}
+		mp2->b_wptr = DB_LIM(mp2);
+		mp2->b_cont = mp1;
+		mp1 = mp2;
+		if (DB_TYPE(mp) != M_DATA)
+			mp->b_cont = mp1;
+		else
+			mp = mp1;
+
 		ip6h = (ip6_t *)(mp1->b_wptr - udp_ip_hdr_len);
 	}
 	mp1->b_rptr = (unsigned char *)ip6h;
@@ -5624,7 +6823,7 @@
 		if (sin6->__sin6_src_id != 0 &&
 		    IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
 			ip_srcid_find_id(sin6->__sin6_src_id,
-			    &ip6h->ip6_src, udp->udp_zoneid);
+			    &ip6h->ip6_src, connp->conn_zoneid);
 		}
 	}
 
@@ -5731,9 +6930,8 @@
 				 * Drop packet - only support Type 0 routing.
 				 * Notify the application as well.
 				 */
-				udp_ud_err(q, mp, EPROTO);
-				BUMP_MIB(&udp_mib, udpOutErrors);
-				return;
+				*error = EPROTO;
+				goto done;
 			}
 
 			/*
@@ -5741,9 +6939,8 @@
 			 * addresses in the header. Thus it must be even.
 			 */
 			if (rth->ip6r_len & 0x1) {
-				udp_ud_err(q, mp, EPROTO);
-				BUMP_MIB(&udp_mib, udpOutErrors);
-				return;
+				*error = EPROTO;
+				goto done;
 			}
 			/*
 			 * Shuffle the routing header and ip6_dst
@@ -5758,9 +6955,8 @@
 			 * for subsequent hops.
 			 */
 			if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) {
-				udp_ud_err(q, mp, EADDRNOTAVAIL);
-				BUMP_MIB(&udp_mib, udpOutErrors);
-				return;
+				*error = EADDRNOTAVAIL;
+				goto done;
 			}
 
 			cp += (rth->ip6r_len + 1)*8;
@@ -5769,14 +6965,11 @@
 
 	/* count up length of UDP packet */
 	ip_len = (mp1->b_wptr - (unsigned char *)ip6h) - IPV6_HDR_LEN;
-	{
-		mblk_t *mp2;
-
-		if ((mp2 = mp1->b_cont) != NULL) {
-			do {
-				ip_len += mp2->b_wptr - mp2->b_rptr;
-			} while ((mp2 = mp2->b_cont) != NULL);
-		}
+	if ((mp2 = mp1->b_cont) != NULL) {
+		do {
+			ASSERT((uintptr_t)MBLKL(mp2) <= (uintptr_t)UINT_MAX);
+			ip_len += (uint32_t)MBLKL(mp2);
+		} while ((mp2 = mp2->b_cont) != NULL);
 	}
 
 	/*
@@ -5785,9 +6978,8 @@
 	 * the size will have wrapped and be inconsistent with the msg size.
 	 */
 	if (ip_len > IP_MAXPACKET) {
-		BUMP_MIB(&udp_mib, udpOutErrors);
-		udp_ud_err(q, mp, EMSGSIZE);
-		return;
+		*error = EMSGSIZE;
+		goto done;
 	}
 
 	/* Store the UDP length. Subtract length of extension hdrs */
@@ -5810,11 +7002,25 @@
 #endif
 	ip6h->ip6_plen = ip_len;
 
-	freeb(mp);
+	if (DB_TYPE(mp) != M_DATA) {
+		ASSERT(mp != mp1);
+		freeb(mp);
+	}
+
+	/* mp has been consumed and we'll return success */
+	ASSERT(*error == 0);
+	mp = NULL;
 
 	/* We're done. Pass the packet to IP */
 	BUMP_MIB(&udp_mib, udpOutDatagrams);
-	putnext(q, mp1);
+	ip_output_v6(connp, mp1, q, IP_WPUT);
+
+done:
+	if (*error != 0) {
+		ASSERT(mp != NULL);
+		BUMP_MIB(&udp_mib, udpOutErrors);
+	}
+	return (mp);
 }
 
 static void
@@ -5823,26 +7029,18 @@
 	uchar_t	*rptr = mp->b_rptr;
 	struct datab *db;
 	struct iocblk *iocp;
-	udp_t	*udp;
 	cred_t	*cr;
+	conn_t	*connp = Q_TO_CONN(q);
+	udp_t	*udp = connp->conn_udp;
 
 	TRACE_1(TR_FAC_UDP, TR_UDP_WPUT_OTHER_START,
 		"udp_wput_other_start: q %p", q);
 
-	udp = (udp_t *)q->q_ptr;
 	db = mp->b_datap;
 
-	cr = DB_CREDDEF(mp, udp->udp_credp);
+	cr = DB_CREDDEF(mp, connp->conn_cred);
 
 	switch (db->db_type) {
-	case M_DATA:
-		/* Not connected */
-		BUMP_MIB(&udp_mib, udpOutErrors);
-		freemsg(mp);
-		TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
-			"udp_wput_other_end: q %p (%S)",
-			q, "not-connected");
-		return;
 	case M_PROTO:
 	case M_PCPROTO:
 		if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
@@ -5852,7 +7050,7 @@
 				q, "protoshort");
 			return;
 		}
-		switch (((union T_primitives *)rptr)->type) {
+		switch (((t_primp_t)rptr)->type) {
 		case T_ADDR_REQ:
 			udp_addr_req(q, mp);
 			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
@@ -5885,7 +7083,7 @@
 			 * be bad.  Valid T_UNITDATA_REQs are handled
 			 * in udp_wput.
 			 */
-			udp_ud_err(q, mp, EADDRNOTAVAIL);
+			udp_ud_err(q, mp, NULL, 0, EADDRNOTAVAIL);
 			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
 				"udp_wput_other_end: q %p (%S)",
 				q, "unitdatareq");
@@ -5897,14 +7095,26 @@
 			return;
 		case T_SVR4_OPTMGMT_REQ:
 			if (!snmpcom_req(q, mp, udp_snmp_set, udp_snmp_get, cr))
-				(void) svr4_optcom_req(q, mp, cr, &udp_opt_obj);
+				/*
+				 * Use upper queue for option processing in
+				 * case the request is not handled at this
+				 * level and needs to be passed down to IP.
+				 */
+				(void) svr4_optcom_req(_WR(UDP_RD(q)),
+				    mp, cr, &udp_opt_obj);
 			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
 			    "udp_wput_other_end: q %p (%S)",
 			    q, "optmgmtreq");
 			return;
 
 		case T_OPTMGMT_REQ:
-			(void) tpi_optcom_req(q, mp, cr, &udp_opt_obj);
+			/*
+			 * Use upper queue for option processing in
+			 * case the request is not handled at this
+			 * level and needs to be passed down to IP.
+			 */
+			(void) tpi_optcom_req(_WR(UDP_RD(q)),
+			    mp, cr, &udp_opt_obj);
 			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
 				"udp_wput_other_end: q %p (%S)",
 				q, "optmgmtreq");
@@ -5954,10 +7164,9 @@
 				 * don't know the peer's name.
 				 */
 				iocp->ioc_error = ENOTCONN;
-err_ret:;
 				iocp->ioc_count = 0;
 				mp->b_datap->db_type = M_IOCACK;
-				qreply(q, mp);
+				putnext(UDP_RD(q), mp);
 				TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
 					"udp_wput_other_end: q %p (%S)",
 					q, "getpeername");
@@ -5982,13 +7191,45 @@
 			/* nd_getset performs the necessary checking */
 		case ND_GET:
 			if (nd_getset(q, udp_g_nd, mp)) {
-				qreply(q, mp);
+				putnext(UDP_RD(q), mp);
 				TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
 					"udp_wput_other_end: q %p (%S)",
 					q, "get");
 				return;
 			}
 			break;
+		case _SIOCSOCKFALLBACK:
+			/*
+			 * Either sockmod is about to be popped and the
+			 * socket would now be treated as a plain stream,
+			 * or a module is about to be pushed, so we can
+			 * no longer use the read-side synchronous stream.
+			 * Drain any queued data and disable the direct
+			 * sockfs interface from now on.
+			 */
+			if (!udp->udp_issocket) {
+				DB_TYPE(mp) = M_IOCNAK;
+				iocp->ioc_error = EINVAL;
+			} else {
+				udp->udp_issocket = B_FALSE;
+				if (udp->udp_direct_sockfs) {
+					/*
+					 * Disable read-side synchronous
+					 * stream interface and drain any
+					 * queued data.
+					 */
+					udp_rcv_drain(UDP_RD(q), udp,
+					    B_FALSE);
+					ASSERT(!udp->udp_direct_sockfs);
+					UDP_STAT(udp_sock_fallback);
+				}
+				DB_TYPE(mp) = M_IOCACK;
+				iocp->ioc_error = 0;
+			}
+			iocp->ioc_count = 0;
+			iocp->ioc_rval = 0;
+			putnext(UDP_RD(q), mp);
+			return;
 		default:
 			break;
 		}
@@ -6004,7 +7245,15 @@
 	}
 	TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
 		"udp_wput_other_end: q %p (%S)", q, "end");
-	putnext(q, mp);
+	ip_output(connp, mp, q, IP_WPUT);
+}
+
+/* ARGSUSED */
+static void
+udp_wput_other_wrapper(void *arg, mblk_t *mp, void *arg2)
+{
+	udp_wput_other(((conn_t *)arg)->conn_wq, mp);
+	udp_exit((conn_t *)arg);
 }
 
 /*
@@ -6017,11 +7266,11 @@
 	mblk_t	*mp1;
 	STRUCT_HANDLE(strbuf, sb);
 	uint16_t port;
-	udp_t	*udp;
 	in6_addr_t	v6addr;
 	ipaddr_t	v4addr;
 	uint32_t	flowinfo = 0;
 	int		addrlen;
+	udp_t		*udp = Q_TO_UDP(q);
 
 	/* Make sure it is one of ours. */
 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
@@ -6029,9 +7278,11 @@
 	case TI_GETPEERNAME:
 		break;
 	default:
-		putnext(q, mp);
+		ip_output(Q_TO_CONN(q), mp, q, IP_WPUT);
 		return;
 	}
+
+	q = WR(UDP_RD(q));
 	switch (mi_copy_state(q, mp, &mp1)) {
 	case -1:
 		return;
@@ -6068,7 +7319,6 @@
 	 */
 	STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
 	    (void *)mp1->b_rptr);
-	udp = (udp_t *)q->q_ptr;
 	if (udp->udp_family == AF_INET)
 		addrlen = sizeof (sin_t);
 	else
@@ -6113,6 +7363,10 @@
 		port = udp->udp_port;
 		break;
 	case TI_GETPEERNAME:
+		if (udp->udp_state != TS_DATA_XFER) {
+			mi_copy_done(q, mp, ENOTCONN);
+			return;
+		}
 		if (udp->udp_family == AF_INET) {
 			ASSERT(udp->udp_ipversion == IPV4_VERSION);
 			v4addr = V4_PART_OF_V6(udp->udp_v6dst);
@@ -6163,21 +7417,23 @@
 udp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp,
     void *thisdg_attrs)
 {
-	udp_t	*udp;
 	struct T_unitdata_req *udreqp;
 	int is_absreq_failure;
 	cred_t *cr;
-
-	ASSERT(((union T_primitives *)mp->b_rptr)->type);
-
-	udp = (udp_t *)q->q_ptr;
-
-	cr = DB_CREDDEF(mp, udp->udp_credp);
+	conn_t	*connp = Q_TO_CONN(q);
+
+	ASSERT(((t_primp_t)mp->b_rptr)->type);
+
+	cr = DB_CREDDEF(mp, connp->conn_cred);
 
 	udreqp = (struct T_unitdata_req *)mp->b_rptr;
 	*errorp = 0;
 
-	*errorp = tpi_optcom_buf(q, mp, &udreqp->OPT_length,
+	/*
+	 * Use upper queue for option processing since the callback
+	 * routines expect to be called on the UDP instance instead of IP.
+	 */
+	*errorp = tpi_optcom_buf(_WR(UDP_RD(q)), mp, &udreqp->OPT_length,
 	    udreqp->OPT_offset, cr, &udp_opt_obj,
 	    thisdg_attrs, &is_absreq_failure);
 
@@ -6198,7 +7454,6 @@
 	int i;
 
 	UDP6_MAJ = ddi_name_to_major(UDP6);
-	mutex_init(&udp_g_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	udp_max_optsize = optcom_max_optsize(udp_opt_obj.odb_opt_des_arr,
 	    udp_opt_obj.odb_opt_arr_cnt);
@@ -6218,7 +7473,11 @@
 		    NULL);
 	}
 	(void) udp_param_register(udp_param_arr, A_CNT(udp_param_arr));
+
 	udp_kstat_init();
+
+	udp_cache = kmem_cache_create("udp_cache", sizeof (udp_t),
+	    CACHE_ALIGN_SIZE, NULL, NULL, NULL, NULL, NULL, 0);
 }
 
 void
@@ -6228,14 +7487,16 @@
 
 	nd_free(&udp_g_nd);
 
-	mutex_destroy(&udp_g_lock);
 	for (i = 0; i < udp_bind_fanout_size; i++) {
 		mutex_destroy(&udp_bind_fanout[i].uf_lock);
 	}
+
 	kmem_free(udp_bind_fanout, udp_bind_fanout_size *
 	    sizeof (udp_fanout_t));
+
 	udp_kstat_fini();
 
+	kmem_cache_destroy(udp_cache);
 }
 
 static void
@@ -6250,9 +7511,9 @@
 		{ "outErrors",		KSTAT_DATA_UINT32, 0 },
 	};
 
-	udp_mibkp = kstat_create("udp", 0, "udp", "mib2", KSTAT_TYPE_NAMED,
-					NUM_OF_FIELDS(udp_named_kstat_t),
-					0);
+	udp_mibkp = kstat_create(UDP_MOD_NAME, 0, UDP_MOD_NAME,
+	    "mib2", KSTAT_TYPE_NAMED, NUM_OF_FIELDS(udp_named_kstat_t), 0);
+
 	if (udp_mibkp == NULL)
 		return;
 
@@ -6264,12 +7525,24 @@
 	udp_mibkp->ks_update = udp_kstat_update;
 
 	kstat_install(udp_mibkp);
+
+	if ((udp_ksp = kstat_create(UDP_MOD_NAME, 0, "udpstat",
+	    "net", KSTAT_TYPE_NAMED,
+	    sizeof (udp_statistics) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL)) != NULL) {
+		udp_ksp->ks_data = &udp_statistics;
+		kstat_install(udp_ksp);
+	}
 }
 
 static void
 udp_kstat_fini(void)
 {
-	if (udp_mibkp) {
+	if (udp_ksp != NULL) {
+		kstat_delete(udp_ksp);
+		udp_ksp = NULL;
+	}
+	if (udp_mibkp != NULL) {
 		kstat_delete(udp_mibkp);
 		udp_mibkp = NULL;
 	}
@@ -6296,6 +7569,269 @@
 	return (0);
 }
 
+/* ARGSUSED */
+static void
+udp_rput(queue_t *q, mblk_t *mp)
+{
+	/*
+	 * We get here whenever we do qreply() from IP,
+	 * i.e. as part of handling ioctls, etc.
+	 */
+	putnext(q, mp);
+}
+
+/*
+ * Read-side synchronous stream info entry point, called as a
+ * result of handling certain STREAMS ioctl operations.
+ */
+static int
+udp_rinfop(queue_t *q, infod_t *dp)
+{
+	mblk_t	*mp;
+	uint_t	cmd = dp->d_cmd;
+	int	res = 0;
+	int	error = 0;
+	udp_t	*udp = Q_TO_UDP(RD(UDP_WR(q)));
+	struct stdata *stp = STREAM(q);
+
+	mutex_enter(&udp->udp_drain_lock);
+	/* If shutdown on read has happened, return nothing */
+	mutex_enter(&stp->sd_lock);
+	if (stp->sd_flag & STREOF) {
+		mutex_exit(&stp->sd_lock);
+		goto done;
+	}
+	mutex_exit(&stp->sd_lock);
+
+	if ((mp = udp->udp_rcv_list_head) == NULL)
+		goto done;
+
+	ASSERT(DB_TYPE(mp) != M_DATA && mp->b_cont != NULL);
+
+	if (cmd & INFOD_COUNT) {
+		/*
+		 * Return the number of messages.
+		 */
+		dp->d_count += udp->udp_rcv_msgcnt;
+		res |= INFOD_COUNT;
+	}
+	if (cmd & INFOD_BYTES) {
+		/*
+		 * Return size of all data messages.
+		 */
+		dp->d_bytes += udp->udp_rcv_cnt;
+		res |= INFOD_BYTES;
+	}
+	if (cmd & INFOD_FIRSTBYTES) {
+		/*
+		 * Return size of first data message.
+		 */
+		dp->d_bytes = msgdsize(mp);
+		res |= INFOD_FIRSTBYTES;
+		dp->d_cmd &= ~INFOD_FIRSTBYTES;
+	}
+	if (cmd & INFOD_COPYOUT) {
+		mblk_t *mp1 = mp->b_cont;
+		int n;
+		/*
+		 * Return data contents of first message.
+		 */
+		ASSERT(DB_TYPE(mp1) == M_DATA);
+		while (mp1 != NULL && dp->d_uiop->uio_resid > 0) {
+			n = MIN(dp->d_uiop->uio_resid, MBLKL(mp1));
+			if (n != 0 && (error = uiomove((char *)mp1->b_rptr, n,
+			    UIO_READ, dp->d_uiop)) != 0) {
+				goto done;
+			}
+			mp1 = mp1->b_cont;
+		}
+		res |= INFOD_COPYOUT;
+		dp->d_cmd &= ~INFOD_COPYOUT;
+	}
+done:
+	mutex_exit(&udp->udp_drain_lock);
+
+	dp->d_res |= res;
+
+	return (error);
+}
+
+/*
+ * Read-side synchronous stream entry point.  This is called as a result
+ * of a recv/read operation done at sockfs, and is guaranteed to execute
+ * outside of the interrupt thread context.  It returns a single datagram
+ * (b_cont chain of T_UNITDATA_IND plus data) to the upper layer.
+ */
+static int
+udp_rrw(queue_t *q, struiod_t *dp)
+{
+	mblk_t	*mp;
+	udp_t	*udp = Q_TO_UDP(_RD(UDP_WR(q)));
+
+	/* We should never get here when we're in SNMP mode */
+	ASSERT(!(udp->udp_connp->conn_flags & IPCL_UDPMOD));
+
+	/*
+	 * Dequeue datagram from the head of the list and return
+	 * it to caller; also ensure that RSLEEP sd_wakeq flag is
+	 * set/cleared depending on whether or not there's data
+	 * remaining in the list.
+	 */
+	mutex_enter(&udp->udp_drain_lock);
+	if (!udp->udp_direct_sockfs) {
+		mutex_exit(&udp->udp_drain_lock);
+		UDP_STAT(udp_rrw_busy);
+		return (EBUSY);
+	}
+	if ((mp = udp->udp_rcv_list_head) != NULL) {
+		uint_t size = msgdsize(mp);
+
+		/* Last datagram in the list? */
+		if ((udp->udp_rcv_list_head = mp->b_next) == NULL)
+			udp->udp_rcv_list_tail = NULL;
+		mp->b_next = NULL;
+
+		udp->udp_rcv_cnt -= size;
+		udp->udp_rcv_msgcnt--;
+		UDP_STAT(udp_rrw_msgcnt);
+
+		/* No longer flow-controlling? */
+		if (udp->udp_rcv_cnt < udp->udp_rcv_hiwat &&
+		    udp->udp_rcv_msgcnt < udp->udp_rcv_hiwat)
+			udp->udp_drain_qfull = B_FALSE;
+	}
+	if (udp->udp_rcv_list_head == NULL) {
+		/*
+		 * Either we just dequeued the last datagram or
+		 * we get here from sockfs and have nothing to
+		 * return; in this case clear RSLEEP.
+		 */
+		ASSERT(udp->udp_rcv_cnt == 0);
+		ASSERT(udp->udp_rcv_msgcnt == 0);
+		ASSERT(udp->udp_rcv_list_tail == NULL);
+		STR_WAKEUP_CLEAR(STREAM(q));
+	} else {
+		/*
+		 * More data follows; we need udp_rrw() to be
+		 * called in future to pick up the rest.
+		 */
+		STR_WAKEUP_SET(STREAM(q));
+	}
+	mutex_exit(&udp->udp_drain_lock);
+	dp->d_mp = mp;
+	return (0);
+}
+
+/*
+ * Enqueue a completely built T_UNITDATA_IND message into the receive
+ * list; this is typically executed within the interrupt thread context
+ * and so we do things as quickly as possible.
+ */
+static void
+udp_rcv_enqueue(queue_t *q, udp_t *udp, mblk_t *mp, uint_t pkt_len)
+{
+	ASSERT(q == RD(q));
+	ASSERT(pkt_len == msgdsize(mp));
+	ASSERT(mp->b_next == NULL && mp->b_cont != NULL);
+	ASSERT(DB_TYPE(mp) == M_PROTO && DB_TYPE(mp->b_cont) == M_DATA);
+	ASSERT(MBLKL(mp) >= sizeof (struct T_unitdata_ind));
+
+	mutex_enter(&udp->udp_drain_lock);
+	/*
+	 * Wake up and signal the receiving app; it is okay to do this
+	 * before enqueueing the mp because we are holding the drain lock.
+	 * One of the advantages of synchronous stream is the ability for
+	 * us to find out when the application performs a read on the
+	 * socket by way of udp_rrw() entry point being called.  We need
+	 * to generate SIGPOLL/SIGIO for each received data in the case
+	 * of asynchronous socket just as in the strrput() case.  However,
+	 * we only wake the application up when necessary, i.e. during the
+	 * first enqueue.  When udp_rrw() is called, we send up a single
+	 * datagram upstream and call STR_WAKEUP_SET() again when there
+	 * are still data remaining in our receive queue.
+	 */
+	if (udp->udp_rcv_list_head == NULL) {
+		STR_WAKEUP_SET(STREAM(q));
+		udp->udp_rcv_list_head = mp;
+	} else {
+		udp->udp_rcv_list_tail->b_next = mp;
+	}
+	udp->udp_rcv_list_tail = mp;
+	udp->udp_rcv_cnt += pkt_len;
+	udp->udp_rcv_msgcnt++;
+
+	/* Need to flow-control? */
+	if (udp->udp_rcv_cnt >= udp->udp_rcv_hiwat ||
+	    udp->udp_rcv_msgcnt >= udp->udp_rcv_hiwat)
+		udp->udp_drain_qfull = B_TRUE;
+
+	/* Update poll events and send SIGPOLL/SIGIO if necessary */
+	STR_SENDSIG(STREAM(q));
+	mutex_exit(&udp->udp_drain_lock);
+}
+
+/*
+ * Drain the contents of the receive list to the module upstream; we
+ * do this during close or when we fall back to the slow mode because
+ * sockmod has been popped or a module has been pushed on top of us.
+ */
+static void
+udp_rcv_drain(queue_t *q, udp_t *udp, boolean_t closing)
+{
+	mblk_t *mp;
+
+	ASSERT(q == RD(q));
+
+	mutex_enter(&udp->udp_drain_lock);
+	/*
+	 * There is no race with a concurrent udp_input() sending
+	 * up packets using putnext() after we have cleared the
+	 * udp_direct_sockfs flag but before we have completed
+	 * sending up the packets in udp_rcv_list, since we are
+	 * either a writer or we have quiesced the conn.
+	 */
+	udp->udp_direct_sockfs = B_FALSE;
+	mutex_exit(&udp->udp_drain_lock);
+
+	if (udp->udp_rcv_list_head != NULL)
+		UDP_STAT(udp_drain);
+
+	/*
+	 * Send up everything via putnext(); note here that we
+	 * don't need the udp_drain_lock to protect us since
+	 * nothing can enter udp_rrw() and we currently
+	 * have exclusive access to this udp.
+	 */
+	while ((mp = udp->udp_rcv_list_head) != NULL) {
+		udp->udp_rcv_list_head = mp->b_next;
+		mp->b_next = NULL;
+		udp->udp_rcv_cnt -= msgdsize(mp);
+		udp->udp_rcv_msgcnt--;
+		if (closing) {
+			freemsg(mp);
+		} else {
+			putnext(q, mp);
+		}
+	}
+	ASSERT(udp->udp_rcv_cnt == 0);
+	ASSERT(udp->udp_rcv_msgcnt == 0);
+	ASSERT(udp->udp_rcv_list_head == NULL);
+	udp->udp_rcv_list_tail = NULL;
+	udp->udp_drain_qfull = B_FALSE;
+}
+
+static size_t
+udp_set_rcv_hiwat(udp_t *udp, size_t size)
+{
+	/* We add a bit of extra buffering */
+	size += size >> 1;
+	if (size > udp_max_buf)
+		size = udp_max_buf;
+
+	udp->udp_rcv_hiwat = size;
+	return (size);
+}
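A worked example of the sizing above: a 64 KB SO_RCVBUF yields a high
watermark of 65536 + (65536 >> 1) = 98304 bytes, i.e. 1.5x the requested
size, capped by the udp_max_buf tunable:

	/* Assuming udp_max_buf is at least 98304. */
	size_t hiwat = udp_set_rcv_hiwat(udp, 65536);	/* hiwat == 98304 */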
+
 /*
  * Little helper for IPsec's NAT-T processing.
  */
--- a/usr/src/uts/common/inet/udp/udp6ddi.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/udp/udp6ddi.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 1992,1997-2002 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -36,7 +36,13 @@
 #define	INET_DEVMINOR	IPV6_MINOR
 #define	INET_DEVDESC	"UDP6 STREAMS driver %I%"
 #define	INET_STRTAB	udpinfo
-#define	INET_DEVMTFLAGS	IP_DEVMTFLAGS	/* since we're really ip */
+#define	INET_DEVMTFLAGS	IP_DEVMTFLAGS
+/*
+ * We define both synchronous STREAMS and sockfs direct-access
+ * mode for the UDP module instance, because it is autopushed on
+ * top of /dev/ip for the sockets case.
+ */
+#define	INET_MODMTFLAGS	(D_MP|D_SYNCSTR|_D_DIRECT)
 
 #include "../inetddi.c"
 
--- a/usr/src/uts/common/inet/udp/udpddi.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/udp/udpddi.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -32,20 +32,23 @@
 #include <sys/modctl.h>
 #include <inet/common.h>
 #include <inet/ip.h>
+#include <inet/udp_impl.h>
 
 #define	INET_NAME	"udp"
 #define	INET_MODDESC	"UDP STREAMS module %I%"
 #define	INET_DEVDESC	"UDP STREAMS driver %I%"
 #define	INET_DEVMINOR	IPV4_MINOR
 #define	INET_STRTAB	udpinfo
-#define	INET_DEVMTFLAGS	IP_DEVMTFLAGS	/* since as a driver we're ip */
-#define	INET_MODMTFLAGS (D_MP | D_MTQPAIR | D_MTPUTSHARED | _D_MTOCSHARED)
+#define	INET_DEVMTFLAGS	IP_DEVMTFLAGS
+/*
+ * We define both synchronous STREAMS and sockfs direct-access
+ * mode for the UDP module instance, because it is autopushed on
+ * top of /dev/ip for the sockets case.
+ */
+#define	INET_MODMTFLAGS (D_MP|D_SYNCSTR|_D_DIRECT)
 
 #include "../inetddi.c"
 
-extern void udp_ddi_init(void);
-extern void udp_ddi_destroy(void);
-
 int
 _init(void)
 {
--- a/usr/src/uts/common/inet/udp_impl.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/udp_impl.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -29,6 +29,13 @@
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
+/*
+ * UDP implementation private declarations.  These interfaces are
+ * used to build the IP module and are not meant to be accessed
+ * by any modules except IP itself.  They are undocumented and are
+ * subject to change without notice.
+ */
+
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -43,32 +50,42 @@
 #include <inet/common.h>
 #include <inet/ip.h>
 
+#define	UDP_MOD_ID		5607
+
+/*
+ * udp_mode: UDP_MT_HOT and UDP_SQUEUE are the stable modes; the
+ * others are transient.
+ */
+typedef enum {
+	UDP_MT_HOT = 0,			/* UDP endpoint is MT HOT */
+	UDP_MT_QUEUED = 1,		/* Messages enqueued in udp_mphead */
+	UDP_QUEUED_SQUEUE = 2,		/* Messages enqueued in conn_sqp */
+	UDP_SQUEUE = 3			/* Single threaded using squeues */
+} udp_mode_t;
+
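The entry points earlier in this patch (udp_wput() and udp_wput_data())
key off this mode; a sketch of the dispatch pattern they follow,
mirroring the udp_wput_data() code above (proc and tag stand in for the
caller's handler and squeue tag):

	mutex_enter(&connp->conn_lock);
	if (udp->udp_mode == UDP_MT_HOT) {
		UDP_READERS_INCREF(udp);	/* run as a concurrent reader */
		mutex_exit(&connp->conn_lock);
		/* ... process mp inline ... */
	} else {
		mutex_exit(&connp->conn_lock);
		udp_enter(connp, mp, proc, tag);	/* drain later */
	}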
 /* Internal udp control structure, one per open stream */
 typedef	struct udp_s {
-	uint32_t 	udp_state;	/* TPI state */
-	in_port_t 	udp_port;	/* Port bound to this stream */
-	in_port_t 	udp_dstport;	/* Connected port */
-	in6_addr_t 	udp_v6src;	/* Source address of this stream */
-	in6_addr_t 	udp_bound_v6src; /* Explicitly bound address */
-	in6_addr_t 	udp_v6dst;	/* Connected destination */
+	uint32_t	udp_state;	/* TPI state */
+	in_port_t	udp_port;	/* Port bound to this stream */
+	in_port_t	udp_dstport;	/* Connected port */
+	in6_addr_t	udp_v6src;	/* Source address of this stream */
+	in6_addr_t	udp_bound_v6src; /* Explicitly bound address */
+	in6_addr_t	udp_v6dst;	/* Connected destination */
 	uint32_t	udp_flowinfo;	/* Connected flow id and tclass */
-	uint32_t 	udp_max_hdr_len; /* For write offset in stream head */
+	uint32_t	udp_max_hdr_len; /* For write offset in stream head */
 	sa_family_t	udp_family;	/* Family from socket() call */
 	/*
 	 * IP format that packets transmitted from this struct should use.
 	 * Value can be IP4_VERSION or IPV6_VERSION.
 	 */
 	ushort_t	udp_ipversion;
-	uint32_t 	udp_ip_snd_options_len; /* Len of IPv4 options */
+	uint32_t	udp_ip_snd_options_len; /* Len of IPv4 options */
 	uchar_t		*udp_ip_snd_options;    /* Ptr to IPv4 options */
-	uint32_t 	udp_ip_rcv_options_len; /* Len of IPv4 options recvd */
+	uint32_t	udp_ip_rcv_options_len; /* Len of IPv4 options recvd */
 	uchar_t		*udp_ip_rcv_options;    /* Ptr to IPv4 options recvd */
-	cred_t		*udp_credp;		/* Credentials at open */
 	uchar_t		udp_multicast_ttl;	/* IP*_MULTICAST_TTL/HOPS */
-	ipaddr_t 	udp_multicast_if_addr;  /* IP_MULTICAST_IF option */
+	ipaddr_t	udp_multicast_if_addr;  /* IP_MULTICAST_IF option */
 	uint_t		udp_multicast_if_index;	/* IPV6_MULTICAST_IF option */
 	int		udp_bound_if;		/* IP*_BOUND_IF option */
 	int		udp_xmit_if;		/* IP_XMIT_IF option */
+	conn_t		*udp_connp;
 	uint32_t
 		udp_debug : 1,		/* SO_DEBUG "socket" option. */
 		udp_dontroute : 1,	/* SO_DONTROUTE "socket" option. */
@@ -76,35 +93,36 @@
 		udp_useloopback : 1,	/* SO_USELOOPBACK "socket" option */
 
 		udp_reuseaddr : 1,	/* SO_REUSEADDR "socket" option. */
-		udp_multicast_loop : 1,	/* IP_MULTICAST_LOOP option */
 		udp_dgram_errind : 1,	/* SO_DGRAM_ERRIND option */
 		udp_recvdstaddr : 1,	/* IP_RECVDSTADDR option */
+		udp_recvopts : 1,	/* IP_RECVOPTS option */
 
-		udp_recvopts : 1,	/* IP_RECVOPTS option */
 		udp_discon_pending : 1,	/* T_DISCON_REQ in progress */
 		udp_unspec_source : 1,	/* IP*_UNSPEC_SRC option */
 		udp_ipv6_recvpktinfo : 1,	/* IPV6_RECVPKTINFO option  */
+		udp_ipv6_recvhoplimit : 1,	/* IPV6_RECVHOPLIMIT option */
 
-		udp_ipv6_recvhoplimit : 1,	/* IPV6_RECVHOPLIMIT option */
 		udp_ipv6_recvhopopts : 1,	/* IPV6_RECVHOPOPTS option */
 		udp_ipv6_recvdstopts : 1,	/* IPV6_RECVDSTOPTS option */
 		udp_ipv6_recvrthdr : 1,		/* IPV6_RECVRTHDR option */
+		udp_ipv6_recvtclass : 1,	/* IPV6_RECVTCLASS */
 
-		udp_ipv6_recvtclass : 1,	/* IPV6_RECVTCLASS */
 		udp_ipv6_recvpathmtu : 1,	/* IPV6_RECVPATHMTU */
 		udp_anon_priv_bind : 1,
 		udp_exclbind : 1,	/* ``exclusive'' binding */
+		udp_recvif : 1,		/* IP_RECVIF option */
 
-		udp_recvif : 1,		/* IP_RECVIF option */
 		udp_recvslla : 1,	/* IP_RECVSLLA option */
 		udp_recvttl : 1,	/* IP_RECVTTL option */
 		udp_recvucred : 1,	/* IP_RECVUCRED option */
+		udp_old_ipv6_recvdstopts : 1,	/* old form of IPV6_DSTOPTS */
 
-		udp_old_ipv6_recvdstopts : 1,	/* old form of IPV6_DSTOPTS */
 		udp_ipv6_recvrthdrdstopts : 1,	/* IPV6_RECVRTHDRDSTOPTS */
+		udp_rcvhdr : 1,		/* UDP_RCVHDR option */
+		udp_issocket : 1,	/* socket mode */
+		udp_direct_sockfs : 1,	/* direct calls to/from sockfs */
 
-		udp_rcvhdr : 1,		/* UDP_RCVHDR option */
-		udp_pad_to_bit_31 : 7;
+		udp_pad_to_bit_31 : 4;
 
 	uint8_t		udp_type_of_service;	/* IP_TOS option */
 	uint8_t		udp_ttl;		/* TTL or hoplimit */
@@ -114,7 +132,20 @@
 	uint_t		udp_sticky_hdrs_len;	/* Incl. ip6h and any ip6i */
 	struct udp_s	*udp_bind_hash; /* Bind hash chain */
 	struct udp_s	**udp_ptpbhn; /* Pointer to previous bind hash next. */
-	zoneid_t	udp_zoneid;	/* ID of owning zone */
+	udp_mode_t	udp_mode;	/* Current mode of operation */
+	mblk_t		*udp_mphead;	/* Head of the queued operations */
+	mblk_t		*udp_mptail;	/* Tail of the queued operations */
+	uint_t		udp_mpcount;	/* Number of messages in the queue */
+	uint_t		udp_reader_count; /* Number of reader threads */
+	uint_t		udp_squeue_count; /* Number of messages in conn_sqp */
+
+	kmutex_t	udp_drain_lock;		/* lock for udp_rcv_list */
+	boolean_t	udp_drain_qfull;	/* drain queue is full */
+	mblk_t		*udp_rcv_list_head;	/* b_next chain of mblks */
+	mblk_t		*udp_rcv_list_tail;	/* last mblk in chain */
+	uint_t		udp_rcv_cnt;		/* total data in rcv_list */
+	uint_t		udp_rcv_msgcnt;		/* total messages in rcv_list */
+	size_t		udp_rcv_hiwat;		/* receive high watermark */
 } udp_t;
 
 /* UDP Protocol header */
@@ -127,6 +158,92 @@
 } udpha_t;
 #define	UDPH_SIZE	8
 
+/* Named Dispatch Parameter Management Structure */
+typedef struct udpparam_s {
+	uint32_t udp_param_min;
+	uint32_t udp_param_max;
+	uint32_t udp_param_value;
+	char	*udp_param_name;
+} udpparam_t;
+
+extern udpparam_t udp_param_arr[];
+
+#define	udp_wroff_extra			udp_param_arr[0].udp_param_value
+#define	udp_ipv4_ttl			udp_param_arr[1].udp_param_value
+#define	udp_ipv6_hoplimit		udp_param_arr[2].udp_param_value
+#define	udp_smallest_nonpriv_port	udp_param_arr[3].udp_param_value
+#define	udp_do_checksum			udp_param_arr[4].udp_param_value
+#define	udp_smallest_anon_port		udp_param_arr[5].udp_param_value
+#define	udp_largest_anon_port		udp_param_arr[6].udp_param_value
+#define	udp_xmit_hiwat			udp_param_arr[7].udp_param_value
+#define	udp_xmit_lowat			udp_param_arr[8].udp_param_value
+#define	udp_recv_hiwat			udp_param_arr[9].udp_param_value
+#define	udp_max_buf			udp_param_arr[10].udp_param_value
+#define	udp_ndd_get_info_interval	udp_param_arr[11].udp_param_value
+
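These remain visible as Named Dispatch variables on /dev/udp; for
example (assuming the stock registrations in udp_param_arr):

	/*
	 * From a shell:
	 *
	 *	# ndd -get /dev/udp udp_xmit_hiwat
	 *	# ndd -set /dev/udp udp_max_buf 4194304
	 */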
+/* Kstats */
+typedef struct {				/* Class "net" kstats */
+	kstat_named_t	udp_ip_send;
+	kstat_named_t	udp_ip_ire_send;
+	kstat_named_t	udp_ire_null;
+	kstat_named_t	udp_drain;
+	kstat_named_t	udp_sock_fallback;
+	kstat_named_t	udp_rrw_busy;
+	kstat_named_t	udp_rrw_msgcnt;
+	kstat_named_t	udp_out_sw_cksum;
+	kstat_named_t	udp_out_sw_cksum_bytes;
+	kstat_named_t	udp_out_opt;
+	kstat_named_t	udp_out_err_notconn;
+	kstat_named_t	udp_out_err_output;
+	kstat_named_t	udp_out_err_tudr;
+	kstat_named_t	udp_in_pktinfo;
+	kstat_named_t	udp_in_recvdstaddr;
+	kstat_named_t	udp_in_recvopts;
+	kstat_named_t	udp_in_recvif;
+	kstat_named_t	udp_in_recvslla;
+	kstat_named_t	udp_in_recvucred;
+	kstat_named_t	udp_in_recvttl;
+	kstat_named_t	udp_in_recvhopopts;
+	kstat_named_t	udp_in_recvhoplimit;
+	kstat_named_t	udp_in_recvdstopts;
+	kstat_named_t	udp_in_recvrtdstopts;
+	kstat_named_t	udp_in_recvrthdr;
+	kstat_named_t	udp_in_recvpktinfo;
+	kstat_named_t	udp_in_recvtclass;
+#ifdef DEBUG
+	kstat_named_t	udp_data_conn;
+	kstat_named_t	udp_data_notconn;
+#endif
+} udp_stat_t;
+
+extern udp_stat_t	udp_statistics;
+
+#define	UDP_STAT(x)		(udp_statistics.x.value.ui64++)
+#define	UDP_STAT_UPDATE(x, n)	(udp_statistics.x.value.ui64 += (n))
+#ifdef DEBUG
+#define	UDP_DBGSTAT(x)		UDP_STAT(x)
+#else
+#define	UDP_DBGSTAT(x)
+#endif /* DEBUG */
+
+extern major_t	UDP6_MAJ;
+
+extern int	udp_opt_default(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
+extern int	udp_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
+extern int	udp_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
+		    uint_t *, uchar_t *, void *, cred_t *, mblk_t *);
+extern int	udp_snmp_get(queue_t *, mblk_t *);
+extern int	udp_snmp_set(queue_t *, t_scalar_t, t_scalar_t, uchar_t *, int);
+extern void	udp_close_free(conn_t *);
+extern void	udp_quiesce_conn(conn_t *);
+extern void	udp_ddi_init(void);
+extern void	udp_ddi_destroy(void);
+extern void	udp_resume_bind(conn_t *, mblk_t *);
+extern void	udp_conn_recv(conn_t *, mblk_t *);
+extern boolean_t udp_compute_checksum(void);
+extern void	udp_wput_data(queue_t *, mblk_t *, struct sockaddr *,
+		    socklen_t);
+
 #endif	/*  _KERNEL */
 
 #ifdef	__cplusplus
--- a/usr/src/uts/common/io/gld.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/io/gld.c	Sat Oct 22 22:50:14 2005 -0700
@@ -3415,6 +3415,8 @@
 			dlhp->hcksum_txflags |= HCKSUM_INET_PARTIAL;
 		if (macinfo->gldm_capabilities & GLD_CAP_CKSUM_FULL_V4)
 			dlhp->hcksum_txflags |= HCKSUM_INET_FULL_V4;
+		if (macinfo->gldm_capabilities & GLD_CAP_CKSUM_FULL_V6)
+			dlhp->hcksum_txflags |= HCKSUM_INET_FULL_V6;
 		if (macinfo->gldm_capabilities & GLD_CAP_CKSUM_IPHDR)
 			dlhp->hcksum_txflags |= HCKSUM_IPHDRCKSUM;
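A GLD driver whose hardware can compute full IPv6 TCP/UDP checksums
would advertise the new capability at attach time; a hypothetical driver
fragment (the combination shown is illustrative):

	macinfo->gldm_capabilities |= GLD_CAP_CKSUM_IPHDR |
	    GLD_CAP_CKSUM_FULL_V4 | GLD_CAP_CKSUM_FULL_V6;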
 
--- a/usr/src/uts/common/io/stream.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/io/stream.c	Sat Oct 22 22:50:14 2005 -0700
@@ -1690,6 +1690,21 @@
 }
 
 /*
+ * Calculate the number of data bytes in a single message block, taking
+ * multidata messages into account.
+ */
+
+#define	ADD_MBLK_SIZE(mp, size) 					\
+	if (DB_TYPE(mp) != M_MULTIDATA) {				\
+		(size) += MBLKL(mp);					\
+	} else {							\
+		uint_t	pinuse;						\
+									\
+		mmd_getsize(mmd_getmultidata(mp), NULL, &pinuse);	\
+		(size) += pinuse;					\
+	}
+
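For M_MULTIDATA the payload lives in attached buffers rather than
between b_rptr and b_wptr, so MBLKL() would undercount; the macro
instead asks the multidata for its in-use payload byte count via
mmd_getsize().  The q_count accounting loops below all follow this
pattern:

	size_t	bytecnt = 0;
	int	mblkcnt = 0;
	mblk_t	*tmp;

	for (tmp = bp; tmp != NULL; tmp = tmp->b_cont) {
		ADD_MBLK_SIZE(tmp, bytecnt);
		mblkcnt++;
	}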
+/*
  * Like getq() but does not backenable.  This is used by the stream
  * head when a putback() is likely.  The caller must call qbackenable()
  * after it is done with accessing the queue.
@@ -1721,7 +1736,7 @@
 
 		/* Get message byte count for q_count accounting */
 		for (tmp = bp; tmp; tmp = tmp->b_cont) {
-			bytecnt += (tmp->b_wptr - tmp->b_rptr);
+			ADD_MBLK_SIZE(tmp, bytecnt);
 			mblkcnt++;
 		}
 
@@ -1941,7 +1956,7 @@
 
 	/* Get the size of the message for q_count accounting */
 	for (tmp = mp; tmp; tmp = tmp->b_cont) {
-		bytecnt += (tmp->b_wptr - tmp->b_rptr);
+		ADD_MBLK_SIZE(tmp, bytecnt);
 		mblkcnt++;
 	}
 
@@ -2433,9 +2448,10 @@
 
 	/* Get message byte count for q_count accounting */
 	for (tmp = bp; tmp; tmp = tmp->b_cont) {
-		bytecnt += (tmp->b_wptr - tmp->b_rptr);
+		ADD_MBLK_SIZE(tmp, bytecnt);
 		mblkcnt++;
 	}
+
 	if (qbp) {
 		qbp->qb_count += bytecnt;
 		qbp->qb_mblkcnt += mblkcnt;
@@ -2617,7 +2633,7 @@
 
 	/* Get message byte count for q_count accounting */
 	for (tmp = bp; tmp; tmp = tmp->b_cont) {
-		bytecnt += (tmp->b_wptr - tmp->b_rptr);
+		ADD_MBLK_SIZE(tmp, bytecnt);
 		mblkcnt++;
 	}
 	if (qbp) {
@@ -2748,7 +2764,7 @@
 
 	/* Get mblk and byte count for q_count accounting */
 	for (tmp = mp; tmp; tmp = tmp->b_cont) {
-		bytecnt += (tmp->b_wptr - tmp->b_rptr);
+		ADD_MBLK_SIZE(tmp, bytecnt);
 		mblkcnt++;
 	}
 
--- a/usr/src/uts/common/io/strsun.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/io/strsun.c	Sat Oct 22 22:50:14 2005 -0700
@@ -37,7 +37,9 @@
 #include <sys/errno.h>
 #include <sys/stream.h>
 #include <sys/stropts.h>
+#include <sys/strsubr.h>
 #include <sys/strsun.h>
+#include <sys/sysmacros.h>
 #include <sys/cmn_err.h>
 
 void
@@ -243,3 +245,63 @@
 	freemsg(datamp);
 	return (0);
 }
+
+/* Copy userdata into a new mblk_t */
+mblk_t *
+mcopyinuio(struct stdata *stp, uio_t *uiop, ssize_t iosize,
+    ssize_t maxblk, int *errorp)
+{
+	mblk_t	*head = NULL, **tail = &head;
+	size_t	offset = stp->sd_wroff;
+
+	if (iosize == INFPSZ || iosize > uiop->uio_resid)
+		iosize = uiop->uio_resid;
+
+	if (maxblk == INFPSZ)
+		maxblk = iosize;
+
+	/* Nothing to do in these cases, so we're done */
+	if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0))
+		goto done;
+
+	if (stp->sd_flag & STRCOPYCACHED)
+		uiop->uio_extflg |= UIO_COPY_CACHED;
+
+	/*
+	 * We will enter the loop below if iosize is 0; it will allocate an
+	 * empty message block and call uiomove(9F), which will just return.
+	 * We could avoid that with an extra check, but it would only
+	 * slow down the much more likely case where iosize is larger
+	 * than 0.
+	 */
+	do {
+		ssize_t blocksize;
+		mblk_t  *mp;
+
+		blocksize = MIN(iosize, maxblk);
+		ASSERT(blocksize >= 0);
+		if ((mp = allocb_cred(offset + blocksize, CRED())) == NULL) {
+			*errorp = ENOMEM;
+			return (head);
+		}
+		mp->b_rptr += offset;
+		mp->b_wptr = mp->b_rptr + blocksize;
+		DB_CPID(mp) = curproc->p_pid;
+
+		*tail = mp;
+		tail = &mp->b_cont;
+
+		/* uiomove(9F) either returns 0 or EFAULT */
+		if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize,
+		    UIO_WRITE, uiop)) != 0) {
+			ASSERT(*errorp != ENOMEM);
+			freemsg(head);
+			return (NULL);
+		}
+
+		iosize -= blocksize;
+	} while (iosize > 0);
+
+done:
+	*errorp = 0;
+	return (head);
+}
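A sketch of the intended calling pattern (hypothetical caller; in the
sendfile changes below, maxblk comes from the peer TCP's MSS).  Note
the asymmetric error handling visible above: EFAULT frees the chain and
returns NULL, while ENOMEM returns the partially built chain:

	int	error;
	mblk_t	*mp;

	mp = mcopyinuio(stp, uiop, INFPSZ, maxblk, &error);
	if (mp == NULL)
		return (error);	/* EFAULT, or ENOMEM on the first block */
	if (error != 0) {
		freemsg(mp);	/* ENOMEM: partial chain was returned */
		return (error);
	}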
--- a/usr/src/uts/common/os/streamio.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/os/streamio.c	Sat Oct 22 22:50:14 2005 -0700
@@ -2642,11 +2642,18 @@
 int
 strwrite(struct vnode *vp, struct uio *uiop, cred_t *crp)
 {
+	return (strwrite_common(vp, uiop, crp, 0));
+}
+
+/* ARGSUSED2 */
+int
+strwrite_common(struct vnode *vp, struct uio *uiop, cred_t *crp, int wflag)
+{
 	struct stdata *stp;
 	struct queue *wqp;
 	ssize_t rmin, rmax;
 	ssize_t iosize;
-	char waitflag;
+	int waitflag;
 	int tempmode;
 	int error = 0;
 	int b_flag;
@@ -2701,7 +2708,7 @@
 	/*
 	 * Do until count satisfied or error.
 	 */
-	waitflag = WRITEWAIT;
+	waitflag = WRITEWAIT | wflag;
 	if (stp->sd_flag & OLDNDELAY)
 		tempmode = uiop->uio_fmode & ~FNDELAY;
 	else
@@ -2803,79 +2810,6 @@
 }
 
 /*
- * kstrwritemp() has very similar semantics as that of strwrite().
- * The main difference is it obtains mblks from the caller and also
- * does not do any copy as done in strwrite() from user buffers to
- * kernel buffers.
- *
- *
- * Currently, this routine is used by sendfile to send data allocated
- * within the kernel without any copying. This interface does not use the
- * synchronous stream interface as synch. stream interface implies
- * copying.
- */
-int
-kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
-{
-	struct stdata *stp;
-	struct queue *wqp;
-	char waitflag;
-	int tempmode;
-	int error;
-	int done = 0;
-
-	ASSERT(vp->v_stream);
-	stp = vp->v_stream;
-
-	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
-		mutex_enter(&stp->sd_lock);
-		error = strwriteable(stp, B_FALSE, B_TRUE);
-		mutex_exit(&stp->sd_lock);
-		if (error != 0)
-			return (error);
-	}
-
-	/*
-	 * First, check for flow control without grabbing the sd_lock.
-	 * If we would block, re-check with the lock. This is similar
-	 * to the logic used by strwrite().
-	 */
-	wqp = stp->sd_wrq;
-	if (canputnext(wqp)) {
-		putnext(wqp, mp);
-		return (0);
-	}
-
-	waitflag = WRITEWAIT;
-	if (stp->sd_flag & OLDNDELAY)
-		tempmode = fmode & ~FNDELAY;
-	else
-		tempmode = fmode;
-
-	mutex_enter(&stp->sd_lock);
-	do {
-		if (canputnext(wqp)) {
-			mutex_exit(&stp->sd_lock);
-			putnext(wqp, mp);
-			return (0);
-		}
-		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
-		    &done);
-	} while (error == 0 && !done);
-
-	mutex_exit(&stp->sd_lock);
-	/*
-	 * EAGAIN tells the application to try again. ENOMEM
-	 * is returned only if the memory allocation size
-	 * exceeds the physical limits of the system. ENOMEM
-	 * can't be true here.
-	 */
-	if (error == ENOMEM)
-		error = EAGAIN;
-	return (error);
-}
-
-/*
  * Stream head write service routine.
  * Its job is to wake up any sleeping writers when a queue
  * downstream needs data (part of the flow control in putq and getq).
--- a/usr/src/uts/common/os/strsubr.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/os/strsubr.c	Sat Oct 22 22:50:14 2005 -0700
@@ -2437,6 +2437,18 @@
 	if (devflag & D_SYNCSTR)
 		qflag |= QSYNCSTR;
 
+	/*
+	 * Private flag used by a transport module to indicate
+	 * to sockfs that it supports direct-access mode without
+	 * having to go through STREAMS.
+	 */
+	if (devflag & _D_DIRECT) {
+		/* Reject unless the module is fully-MT (no perimeter) */
+		if ((qflag & QMT_TYPEMASK) != QMTSAFE)
+			goto bad;
+		qflag |= _QDIRECT;
+	}
+
 	*qflagp = qflag;
 	*sqtypep = sqtype;
 	return (0);
@@ -8236,11 +8248,11 @@
 	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_MULTIDATA);
 	if (mp->b_datap->db_type == M_DATA) {
 		/* Associate values for M_DATA type */
-		mp->b_datap->db_cksumstart = (intptr_t)start;
-		mp->b_datap->db_cksumstuff = (intptr_t)stuff;
-		mp->b_datap->db_cksumend = (intptr_t)end;
-		mp->b_datap->db_struioun.cksum.flags = flags;
-		mp->b_datap->db_cksum16 = (uint16_t)value;
+		DB_CKSUMSTART(mp) = (intptr_t)start;
+		DB_CKSUMSTUFF(mp) = (intptr_t)stuff;
+		DB_CKSUMEND(mp) = (intptr_t)end;
+		DB_CKSUMFLAGS(mp) = flags;
+		DB_CKSUM16(mp) = (uint16_t)value;
 
 	} else {
 		pattrinfo_t pa_info;
@@ -8258,6 +8270,8 @@
 			hck->hcksum_end_offset = end;
 			hck->hcksum_cksum_val.inet_cksum = (uint16_t)value;
 			hck->hcksum_flags = flags;
+		} else {
+			rc = -1;
 		}
 	}
 	return (rc);
@@ -8271,20 +8285,16 @@
 	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_MULTIDATA);
 	if (mp->b_datap->db_type == M_DATA) {
 		if (flags != NULL) {
-			*flags = mp->b_datap->db_struioun.cksum.flags;
+			*flags = DB_CKSUMFLAGS(mp);
 			if (*flags & HCK_PARTIALCKSUM) {
 				if (start != NULL)
-					*start = (uint32_t)
-					    mp->b_datap->db_cksumstart;
+					*start = (uint32_t)DB_CKSUMSTART(mp);
 				if (stuff != NULL)
-					*stuff = (uint32_t)
-					    mp->b_datap->db_cksumstuff;
+					*stuff = (uint32_t)DB_CKSUMSTUFF(mp);
 				if (end != NULL)
-					*end =
-					    (uint32_t)mp->b_datap->db_cksumend;
+					*end = (uint32_t)DB_CKSUMEND(mp);
 				if (value != NULL)
-					*value =
-					    (uint32_t)mp->b_datap->db_cksum16;
+					*value = (uint32_t)DB_CKSUM16(mp);
 			}
 		}
 	} else {
--- a/usr/src/uts/common/sys/conf.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/conf.h	Sat Oct 22 22:50:14 2005 -0700
@@ -24,7 +24,7 @@
 
 
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -219,6 +219,8 @@
 
 #define	D_U64BIT	0x40000	/* Driver supports unsigned 64-bit uio offset */
 
+#define	_D_DIRECT	0x80000	/* Private flag for transport modules */
+
 #endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */
 
 #ifdef	__cplusplus
--- a/usr/src/uts/common/sys/dlpi.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/dlpi.h	Sat Oct 22 22:50:14 2005 -0700
@@ -689,6 +689,8 @@
 					/* ability */
 #define	HCKSUM_INET_FULL_V4	0x04	/* Full 1's complement checksum */
 					/* ability for IPv4 packets. */
+#define	HCKSUM_INET_FULL_V6	0x08	/* Full 1's complement checksum */
+					/* ability for IPv6 packets. */
 #define	HCKSUM_IPHDRCKSUM	0x10	/* IPv4 Header checksum offload */
 					/* capability */
 #ifdef _KERNEL
--- a/usr/src/uts/common/sys/gld.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/gld.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -240,9 +240,12 @@
 #define	GLD_CAP_LINKSTATE	0x00000001 /* will call gld_linkstate() */
 #define	GLD_CAP_CKSUM_IPHDR	0x00000008 /* IP checksum offload	*/
 #define	GLD_CAP_CKSUM_PARTIAL	0x00000010 /* TCP/UDP partial		*/
-#define	GLD_CAP_CKSUM_FULL_V4	0x00000020 /* TCP/UDP full		*/
-#define	GLD_CAP_CKSUM_ANY	0x00000038 /* any or all of the above	*/
+#define	GLD_CAP_CKSUM_FULL_V4	0x00000020 /* TCP/UDP full for IPv4	*/
 #define	GLD_CAP_ZEROCOPY	0x00000040 /* zerocopy */
+#define	GLD_CAP_CKSUM_FULL_V6	0x00000080 /* TCP/UDP full for IPv6	*/
+#define	GLD_CAP_CKSUM_ANY				\
+	(GLD_CAP_CKSUM_IPHDR|GLD_CAP_CKSUM_PARTIAL|	\
+	GLD_CAP_CKSUM_FULL_V4|GLD_CAP_CKSUM_FULL_V6)
 
 /* values of gldm_linkstate, as passed to gld_linkstate() */
 #define	GLD_LINKSTATE_DOWN	-1
--- a/usr/src/uts/common/sys/multidata.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/multidata.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -68,19 +68,24 @@
 /*
  * Multidata packet descriptor information.
  */
-typedef struct pdescinfo_s {
-	uint_t	flags;		/* misc. flags */
-	uchar_t	*hdr_base;	/* start address of header area */
-	uchar_t *hdr_rptr;	/* start address of header data */
-	uchar_t *hdr_wptr;	/* end address of header data */
-	uchar_t	*hdr_lim;	/* end address of header area */
-	uint_t	pld_cnt;	/* number of payload area */
-	struct pld_ary_s {
-		int pld_pbuf_idx;	/* payload buffer index */
-		uchar_t *pld_rptr;	/* start address of payload data */
-		uchar_t *pld_wptr;	/* pointer to end of payload data */
-	} pld_ary[MULTIDATA_MAX_PBUFS];
-} pdescinfo_t;
+struct pld_ary_s {
+	int pld_pbuf_idx;	/* payload buffer index */
+	uchar_t *pld_rptr;	/* start address of payload data */
+	uchar_t *pld_wptr;	/* pointer to end of payload data */
+};
+
+#define	PDESCINFO_STRUCT(elems) 					\
+{									\
+	uint_t	flags;		/* misc. flags */			\
+	uchar_t	*hdr_base;	/* start address of header area */	\
+	uchar_t *hdr_rptr;	/* start address of header data */	\
+	uchar_t *hdr_wptr;	/* end address of header data */	\
+	uchar_t	*hdr_lim;	/* end address of header area */	\
+	uint_t	pld_cnt;	/* number of payload area */		\
+	struct pld_ary_s	pld_ary[(elems)];			\
+}
+
+typedef struct pdescinfo_s PDESCINFO_STRUCT(MULTIDATA_MAX_PBUFS) pdescinfo_t;
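The macro-ized layout lets a consumer declare a layout-compatible
descriptor with fewer payload spans, which is presumably how tcp now
replaces the hand-rolled tcp_pdescinfo_t removed from multidata_impl.h
below; for example:

	/* A two-span variant, layout-compatible with pdescinfo_t. */
	typedef struct tcp_pdescinfo_s PDESCINFO_STRUCT(2) tcp_pdescinfo_t;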
 
 /*
  * Possible values for flags
--- a/usr/src/uts/common/sys/multidata_impl.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/multidata_impl.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -191,21 +191,6 @@
 	uint_t	mmd_pbuf_ref;	/* descriptors referring to payload buffer(s) */
 };
 
-/*
- * Smaller and private version of pdescinfo_t used specifically for tcp,
- * which allows for only two payload spans per packet.  Any changes made
- * to the pdescinfo_t structure must be reflected here as well.
- */
-typedef struct tcp_pdescinfo_s {
-	uint_t	flags;		/* misc. flags */
-	uchar_t	*hdr_base;	/* start address of header area */
-	uchar_t *hdr_rptr;	/* start address of header data */
-	uchar_t *hdr_wptr;	/* end address of header data */
-	uchar_t	*hdr_lim;	/* end address of header area */
-	uint_t	pld_cnt;	/* number of payload area */
-	struct pld_ary_s pld_ary[2];
-} tcp_pdescinfo_t;
-
 #ifdef _KERNEL
 
 extern void mmd_init(void);
--- a/usr/src/uts/common/sys/socketvar.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/socketvar.h	Sat Oct 22 22:50:14 2005 -0700
@@ -100,6 +100,7 @@
 };
 
 typedef struct sonodeops sonodeops_t;
+typedef struct sonode sonode_t;
 
 /*
  * The sonode represents a socket. A sonode never exist in the file system
@@ -364,7 +365,7 @@
 #define	SS_DONEREAD		0x00080000 /* NCAfs: all data read */
 #define	SS_MOREDATA		0x00100000 /* NCAfs: NCA has more data */
 
-#define	SS_TCP_FAST_ACCEPT	0x00200000 /* Use TCP's accept fast-path */
+#define	SS_DIRECT		0x00200000 /* transport is directly below */
 
 #define	SS_LADDR_VALID		0x01000000	/* so_laddr valid for user */
 #define	SS_FADDR_VALID		0x02000000	/* so_faddr valid for user */
@@ -769,8 +770,10 @@
 extern void	so_flush_discon_ind(struct sonode *);
 extern int	sowaitconnected(struct sonode *, int, int);
 
+extern int	sostream_direct(struct sonode *, struct uio *,
+		    mblk_t *, cred_t *);
 extern int	sosend_dgram(struct sonode *, struct sockaddr *,
-			socklen_t, struct uio *, int);
+		    socklen_t, struct uio *, int);
 extern int	sosend_svc(struct sonode *, struct uio *, t_scalar_t, int, int);
 extern void	so_installhooks(struct sonode *);
 extern int	so_strinit(struct sonode *, struct sonode *);
--- a/usr/src/uts/common/sys/sockio.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/sockio.h	Sat Oct 22 22:50:14 2005 -0700
@@ -265,9 +265,9 @@
 #define	SIOCDXARP	_IOW('i', 168, struct xarpreq)	/* delete ARP entry */
 
 /*
- * IOCTL to indicate to the transport that the sockmod is being popped
+ * IOCTL private to sockfs.
  */
-#define	SIOCPOPSOCKFS	_IOW('i', 169, 0)
+#define	_SIOCSOCKFALLBACK _IOW('i', 169, 0)
 
 /*
  * IOCTLs for getting and setting zone associated with an interface, and
--- a/usr/src/uts/common/sys/stream.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/stream.h	Sat Oct 22 22:50:14 2005 -0700
@@ -171,6 +171,8 @@
 #define	_QINSERTING	0x04000000	/* Private, module is being inserted */
 #define	_QREMOVING	0x08000000	/* Private, module is being removed */
 #define	_QASSOCIATED	0x10000000	/* queue is associated with a device */
+#define	_QDIRECT	0x20000000	/* Private; transport module uses */
+					/* direct interface to/from sockfs */
 
 /* queue sqflags (protected by SQLOCK). */
 #define	Q_SQQUEUED	0x01		/* Queue is in the syncq list */
--- a/usr/src/uts/common/sys/strsubr.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/strsubr.h	Sat Oct 22 22:50:14 2005 -0700
@@ -1096,6 +1096,8 @@
 extern void strclean(struct vnode *);
 extern void str_cn_clean();	/* XXX hook for consoles signal cleanup */
 extern int strwrite(struct vnode *, struct uio *, cred_t *);
+extern int strwrite_common(struct vnode *, struct uio *, cred_t *, int);
+extern int kstrwritemp(struct vnode *, mblk_t *, ushort_t);
 extern int strread(struct vnode *, struct uio *, cred_t *);
 extern int strioctl(struct vnode *, int, intptr_t, int, int, cred_t *, int *);
 extern int strrput(queue_t *, mblk_t *);
@@ -1180,6 +1182,7 @@
 extern mblk_t *allocb_cred(size_t, cred_t *);
 extern mblk_t *allocb_cred_wait(size_t, uint_t, int *, cred_t *);
 extern mblk_t *allocb_tmpl(size_t, const mblk_t *);
+extern mblk_t *allocb_tryhard(size_t);
 extern void mblk_setcred(mblk_t *, cred_t *);
 extern void strpollwakeup(vnode_t *, short);
 extern int putnextctl_wait(queue_t *, int);
@@ -1188,7 +1191,6 @@
     unsigned char, int, int);
 extern int kstrgetmsg(struct vnode *, mblk_t **, struct uio *,
     unsigned char *, int *, clock_t, rval_t *);
-extern int kstrwritemp(struct vnode *, mblk_t *, ushort_t);
 
 extern void strsetrerror(vnode_t *, int, int, errfunc_t);
 extern void strsetwerror(vnode_t *, int, int, errfunc_t);
@@ -1217,6 +1219,8 @@
 extern void freemsgchain(mblk_t *);
 extern mblk_t *copymsgchain(mblk_t *);
 
+extern mblk_t *mcopyinuio(struct stdata *, uio_t *, ssize_t, ssize_t, int *);
+
 /*
  * shared or externally configured data structures
  */
@@ -1263,6 +1267,19 @@
 extern struct queue *WR(queue_t *);
 extern int SAMESTR(queue_t *);
 
+/*
+ * The following hardware checksum related macros are private
+ * interfaces that are subject to change without notice.
+ */
+#ifdef _KERNEL
+#define	DB_CKSUMSTART(mp)	((mp)->b_datap->db_cksumstart)
+#define	DB_CKSUMEND(mp)		((mp)->b_datap->db_cksumend)
+#define	DB_CKSUMSTUFF(mp)	((mp)->b_datap->db_cksumstuff)
+#define	DB_CKSUMFLAGS(mp)	((mp)->b_datap->db_struioun.cksum.flags)
+#define	DB_CKSUM16(mp)		((mp)->b_datap->db_cksum16)
+#define	DB_CKSUM32(mp)		((mp)->b_datap->db_cksum32)
+#endif	/* _KERNEL */
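Usage mirrors the strsubr.c changes above; e.g. a transport stamping
partial-checksum offload information onto an M_DATA block might do the
following (sketch; the offset variables are illustrative):

	DB_CKSUMSTART(mp) = ip_hdr_len;		/* start summing here */
	DB_CKSUMSTUFF(mp) = ip_hdr_len + cksum_off; /* store result here */
	DB_CKSUMEND(mp) = msg_len;		/* stop summing here */
	DB_CKSUMFLAGS(mp) = HCK_PARTIALCKSUM;
	DB_CKSUM16(mp) = pseudo_cksum;		/* pseudo-header seed */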
+
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/common/syscall/sendfile.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/syscall/sendfile.c	Sat Oct 22 22:50:14 2005 -0700
@@ -73,6 +73,89 @@
 extern void nl7c_sendfilev(struct sonode *, u_offset_t, struct sendfilevec *,
 		int);
 
+/*
+ * kstrwritemp() has semantics very similar to those of strwrite().
+ * The main difference is that it obtains its mblks from the caller
+ * and does none of the copying from user buffers to kernel buffers
+ * that strwrite() performs.
+ *
+ * Currently, this routine is used by sendfile to send data allocated
+ * within the kernel without any copying.  This interface does not use
+ * the synchronous stream interface, as that would imply copying.
+ */
+int
+kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
+{
+	struct stdata *stp;
+	struct queue *wqp;
+	char waitflag;
+	int tempmode;
+	int error = 0;
+	int done = 0;
+	struct sonode *so;
+	boolean_t direct;
+
+	ASSERT(vp->v_stream);
+	stp = vp->v_stream;
+
+	so = VTOSO(vp);
+	direct = (so->so_state & SS_DIRECT);
+
+	/*
+	 * This is the sockfs direct fast path. canputnext() need
+	 * not be accurate so we don't grab the sd_lock here. If
+	 * we get flow-controlled, we grab sd_lock just before the
+	 * do..while loop below to emulate what strwrite() does.
+	 */
+	wqp = stp->sd_wrq;
+	if (canputnext(wqp) && direct &&
+	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
+		return (sostream_direct(so, NULL, mp, CRED()));
+	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
+		/* Fast check of flags before acquiring the lock */
+		mutex_enter(&stp->sd_lock);
+		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
+		mutex_exit(&stp->sd_lock);
+		if (error != 0) {
+			if (!(stp->sd_flag & STPLEX) &&
+			    (stp->sd_wput_opt & SW_SIGPIPE)) {
+				tsignal(curthread, SIGPIPE);
+				error = EPIPE;
+			}
+			return (error);
+		}
+	}
+
+	waitflag = WRITEWAIT;
+	if (stp->sd_flag & OLDNDELAY)
+		tempmode = fmode & ~FNDELAY;
+	else
+		tempmode = fmode;
+
+	mutex_enter(&stp->sd_lock);
+	do {
+		if (canputnext(wqp)) {
+			mutex_exit(&stp->sd_lock);
+			putnext(wqp, mp);
+			return (0);
+		}
+		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
+		    &done);
+	} while (error == 0 && !done);
+
+	mutex_exit(&stp->sd_lock);
+	/*
+	 * EAGAIN tells the application to try again. ENOMEM
+	 * is returned only if the memory allocation size
+	 * exceeds the physical limits of the system, so ENOMEM
+	 * cannot happen here.
+	 */
+	if (error == ENOMEM)
+		error = EAGAIN;
+	return (error);
+}
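As the flow-controlled path above shows, mp is not consumed when
strwaitq() fails, so a caller must free it on error; a hypothetical
caller fragment (assuming the direct path behaves the same way):

	error = kstrwritemp(vp, mp, fflag);
	if (error != 0)
		freemsg(mp);	/* not consumed on failure */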
+
 #define	SEND_MAX_CHUNK	16
 
 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
@@ -1045,7 +1128,7 @@
 				goto err;
 			}
 
-			if ((so->so_state & SS_TCP_FAST_ACCEPT) &&
+			if ((so->so_state & SS_DIRECT) &&
 			    (so->so_priv != NULL)) {
 				maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
 			} else {
--- a/usr/src/uts/intel/ia32/ml/modstubs.s	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/intel/ia32/ml/modstubs.s	Sat Oct 22 22:50:14 2005 -0700
@@ -482,6 +482,7 @@
 	NO_UNLOAD_STUB(sockfs, sosendfile64,  	nomod_zero);
 	NO_UNLOAD_STUB(sockfs, sock_getfasync,  nomod_zero);
 	NO_UNLOAD_STUB(sockfs, nl7c_sendfilev,  nomod_zero);
+	NO_UNLOAD_STUB(sockfs, sostream_direct,	nomod_zero);
 	END_MODULE(sockfs);
 #endif
 
@@ -529,12 +530,6 @@
 	END_MODULE(spdsock);
 #endif
 
-#ifndef UDP_MODULE
-	MODULE(udp,drv);
-	WSTUB(udp, udp_compute_checksum, nomod_zero);
-	END_MODULE(udp);
-#endif
-
 #ifndef NATTYMOD_MODULE
 	MODULE(nattymod, strmod);
 	WSTUB(nattymod, nattymod_clean_ipif, nomod_zero);
--- a/usr/src/uts/sparc/ml/modstubs.s	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/sparc/ml/modstubs.s	Sat Oct 22 22:50:14 2005 -0700
@@ -368,6 +368,7 @@
 	NO_UNLOAD_STUB(sockfs, sosendfile64,  	nomod_zero);
 	NO_UNLOAD_STUB(sockfs, sock_getfasync,  nomod_zero);
 	NO_UNLOAD_STUB(sockfs, nl7c_sendfilev,  nomod_zero);
+	NO_UNLOAD_STUB(sockfs, sostream_direct,	nomod_zero);
 	END_MODULE(sockfs);
 #endif
 
@@ -415,12 +416,6 @@
 	END_MODULE(spdsock);
 #endif
 
-#ifndef UDP_MODULE
-	MODULE(udp,drv);
-	WSTUB(udp, udp_compute_checksum, nomod_zero);
-	END_MODULE(udp);
-#endif
-
 #ifndef NATTYMOD_MODULE
 	MODULE(nattymod, strmod);
 	WSTUB(nattymod, nattymod_clean_ipif, nomod_zero);