PSARC 2005/082 Yosemite: UDP Performance Enhancement
author	masputra
date	Sat, 22 Oct 2005 22:50:14 -0700
changeset 741 40027a3621ac
parent 740 70e4862c9a1a
child 742 588610e3e562
PSARC 2005/082 Yosemite: UDP Performance Enhancement
4796051 Solaris needs a more complete HW checksumming support
4905227 duplicate macros in ipclassifier.h and ip.h
4915681 need hardware checksum offload for the case of IP/UDP reassembly
6201076 outbound flow-control dysfunctional, ip to ce using mdt
6223331 ipv6 flow control may corrupt UDP packets
6223809 16-bit aligned IP header should be allowed for all x86 platforms
6275398 Galaxy hangs when running lmbench
6281836 Yosemite project integration into Solaris
6281885 xge needs to support IPv6 checksum offload
6282776 IPv6 NCE fast path is not created for incoming solicitation
6304890 IP transmit-side checksum logic needs to be tightened
6304902 IP6_IN_NOCKSUM is obsolete and should be torched
6304904 UDP should reject TI_GETPEERNAME for non-connected endpoint
6306768 IP and UDP device and module definitions need to be centralized
usr/src/cmd/cmd-inet/usr.bin/pppd/sys-solaris.c
usr/src/cmd/cmd-inet/usr.lib/ncaconfd/ncaconfd.c
usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
usr/src/cmd/mdb/common/modules/genunix/genunix.c
usr/src/cmd/mdb/common/modules/genunix/net.c
usr/src/cmd/mdb/common/modules/genunix/net.h
usr/src/cmd/rcm_daemon/common/ip_rcm.c
usr/src/pkgdefs/etc/exception_list_i386
usr/src/pkgdefs/etc/exception_list_sparc
usr/src/tools/scripts/bfu.sh
usr/src/uts/common/Makefile.files
usr/src/uts/common/fs/sockfs/sockstr.c
usr/src/uts/common/fs/sockfs/socktpi.c
usr/src/uts/common/fs/sockfs/sockvnops.c
usr/src/uts/common/inet/Makefile
usr/src/uts/common/inet/arp/arp.c
usr/src/uts/common/inet/common.h
usr/src/uts/common/inet/inet_common.c
usr/src/uts/common/inet/ip.h
usr/src/uts/common/inet/ip/igmp.c
usr/src/uts/common/inet/ip/ip.c
usr/src/uts/common/inet/ip/ip6.c
usr/src/uts/common/inet/ip/ip_if.c
usr/src/uts/common/inet/ip/ip_multi.c
usr/src/uts/common/inet/ip/ip_ndp.c
usr/src/uts/common/inet/ip/ipclassifier.c
usr/src/uts/common/inet/ip/tun.c
usr/src/uts/common/inet/ip6.h
usr/src/uts/common/inet/ip_impl.h
usr/src/uts/common/inet/ipclassifier.h
usr/src/uts/common/inet/ipp_common.h
usr/src/uts/common/inet/led.h
usr/src/uts/common/inet/optcom.c
usr/src/uts/common/inet/optcom.h
usr/src/uts/common/inet/snmpcom.c
usr/src/uts/common/inet/squeue.c
usr/src/uts/common/inet/tcp.h
usr/src/uts/common/inet/tcp/tcp.c
usr/src/uts/common/inet/tcp/tcp6ddi.c
usr/src/uts/common/inet/tcp/tcp_fusion.c
usr/src/uts/common/inet/tcp/tcpddi.c
usr/src/uts/common/inet/tcp_impl.h
usr/src/uts/common/inet/udp/udp.c
usr/src/uts/common/inet/udp/udp6ddi.c
usr/src/uts/common/inet/udp/udpddi.c
usr/src/uts/common/inet/udp_impl.h
usr/src/uts/common/io/gld.c
usr/src/uts/common/io/stream.c
usr/src/uts/common/io/strsun.c
usr/src/uts/common/os/streamio.c
usr/src/uts/common/os/strsubr.c
usr/src/uts/common/sys/conf.h
usr/src/uts/common/sys/dlpi.h
usr/src/uts/common/sys/gld.h
usr/src/uts/common/sys/multidata.h
usr/src/uts/common/sys/multidata_impl.h
usr/src/uts/common/sys/socketvar.h
usr/src/uts/common/sys/sockio.h
usr/src/uts/common/sys/stream.h
usr/src/uts/common/sys/strsubr.h
usr/src/uts/common/syscall/sendfile.c
usr/src/uts/intel/ia32/ml/modstubs.s
usr/src/uts/sparc/ml/modstubs.s
--- a/usr/src/cmd/cmd-inet/usr.bin/pppd/sys-solaris.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/cmd/cmd-inet/usr.bin/pppd/sys-solaris.c	Sat Oct 22 22:50:14 2005 -0700
@@ -76,6 +76,7 @@
 #include <netinet/in.h>
 #include <sys/tihdr.h>
 #include <inet/mib2.h>
+#include <inet/ip.h>
 #include <sys/ethernet.h>
 #include <sys/ser_sync.h>
 
@@ -92,27 +93,6 @@
 static const char rcsid[] = RCSID;
 #endif
 
-/* Need to use UDP for ifconfig compatibility */
-#if !defined(UDP_DEV_NAME)
-#define	UDP_DEV_NAME		"/dev/udp"
-#endif /* UDP_DEV_NAME */
-
-#if !defined(IP_DEV_NAME)
-#define	IP_DEV_NAME		"/dev/ip"
-#endif /* IP_DEV_NAME */
-
-#if !defined(UDP6_DEV_NAME)
-#define	UDP6_DEV_NAME		"/dev/udp6"
-#endif /* UDP6_DEV_NAME */
-
-#if !defined(IP6_DEV_NAME)
-#define	IP6_DEV_NAME		"/dev/ip6"
-#endif /* IP6_DEV_NAME */
-
-#if !defined(IP_MOD_NAME)
-#define	IP_MOD_NAME		"ip"
-#endif /* IP_MOD_NAME */
-
 #define	PPPSTRTIMOUT	1	/* Timeout in seconds for ioctl */
 #define	MAX_POLLFDS	32
 #define	NMODULES	32
--- a/usr/src/cmd/cmd-inet/usr.lib/ncaconfd/ncaconfd.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/cmd/cmd-inet/usr.lib/ncaconfd/ncaconfd.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -55,18 +55,6 @@
 #include "ncaconf.h"
 
 /* NCA does not support IPv6... */
-#ifndef	IP_DEV_NAME
-#define	IP_DEV_NAME	"/dev/ip"
-#endif
-
-#ifndef	IP_MOD_NAME
-#define	IP_MOD_NAME	"ip"
-#endif
-
-#ifndef	UDP_DEV_NAME
-#define	UDP_DEV_NAME	"/dev/udp"
-#endif
-
 #ifndef	NCA_MOD_NAME
 #define	NCA_MOD_NAME	"nca"
 #endif
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c	Sat Oct 22 22:50:14 2005 -0700
@@ -18,6 +18,8 @@
 #include <sys/dlpi.h>
 #include <libdlpi.h>
 
+#include <inet/ip.h>
+
 #define	LOOPBACK_IF	"lo0"
 
 #define	NONE_STR	"none"
@@ -26,26 +28,6 @@
 #define	ARP_MOD_NAME	"arp"
 #endif
 
-#ifndef	IP_DEV_NAME
-#define	IP_DEV_NAME	"/dev/ip"
-#endif
-
-#ifndef	IP_MOD_NAME
-#define	IP_MOD_NAME	"ip"
-#endif
-
-#ifndef	IP6_DEV_NAME
-#define	IP6_DEV_NAME	"/dev/ip6"
-#endif
-
-#ifndef	UDP_DEV_NAME
-#define	UDP_DEV_NAME	"/dev/udp"
-#endif
-
-#ifndef	UDP6_DEV_NAME
-#define	UDP6_DEV_NAME	"/dev/udp6"
-#endif
-
 #define	ADDRBITS_V4	32	/* number of bits in IPv4 address */
 #define	ADDRBITS_V6	128	/* number of bits in IPv6 address */
 
--- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c	Sat Oct 22 22:50:14 2005 -0700
@@ -3883,9 +3883,6 @@
 		mi_walk_init, mi_walk_step, mi_walk_fini, NULL },
 	{ "sonode", "given a sonode, walk its children",
 		sonode_walk_init, sonode_walk_step, sonode_walk_fini, NULL },
-	{ "udp", "walk UDP connections using MI",
-		mi_payload_walk_init, mi_payload_walk_step,
-		mi_payload_walk_fini, &mi_udp_arg },
 
 	/* from nvpair.c */
 	{ NVPAIR_WALKER_NAME, NVPAIR_WALKER_DESCR,
--- a/usr/src/cmd/mdb/common/modules/genunix/net.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/cmd/mdb/common/modules/genunix/net.c	Sat Oct 22 22:50:14 2005 -0700
@@ -107,7 +107,8 @@
 static int
 net_udp_active(const udp_t *udp)
 {
-	return ((udp->udp_state != TS_UNBND) && (udp->udp_state != TS_IDLE));
+	return ((udp->udp_state == TS_IDLE) ||
+	    (udp->udp_state == TS_DATA_XFER));
 }
 
 static int
@@ -355,11 +356,6 @@
 	delete_mi_payload_walk_data(wsp->walk_data, arg->mi_pwa_size);
 }
 
-const mi_payload_walk_arg_t mi_udp_arg = {
-	"udp", "udp_g_head", sizeof (udp_t),
-	MI_PAYLOAD_DEVICE | MI_PAYLOAD_MODULE
-};
-
 const mi_payload_walk_arg_t mi_ar_arg = {
 	"arp", "ar_g_head", sizeof (ar_t),
 	MI_PAYLOAD_DEVICE | MI_PAYLOAD_MODULE
@@ -595,7 +591,7 @@
 	tcp = (tcp_t *)((uintptr_t)connp + (tcp_kaddr - kaddr));
 
 	if ((uintptr_t)tcp < (uintptr_t)connp ||
-	    (uintptr_t)&tcp->tcp_connp > (uintptr_t)connp + itc_size ||
+	    (uintptr_t)(tcp + 1) > (uintptr_t)connp + itc_size ||
 	    (uintptr_t)tcp->tcp_connp != kaddr) {
 		mdb_warn("conn_tcp %p is invalid", tcp_kaddr);
 		return (WALK_NEXT);
@@ -603,7 +599,7 @@
 	connp->conn_tcp = tcp;
 	tcp->tcp_connp = connp;
 
-	if (!(opts & NETSTAT_ALL || net_tcp_active(tcp)) ||
+	if (!((opts & NETSTAT_ALL) || net_tcp_active(tcp)) ||
 	    (af == AF_INET && !net_tcp_ipv4(tcp)) ||
 	    (af == AF_INET6 && !net_tcp_ipv6(tcp))) {
 		return (WALK_NEXT);
@@ -639,45 +635,57 @@
 	return (netstat_tcp_cb(kaddr, walk_data, cb_data, AF_INET6));
 }
 
+/*ARGSUSED*/
 static int
-netstat_udpv4_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
+netstat_udp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data, int af)
 {
-	const udp_t *udp = walk_data;
 	const uintptr_t opts = (uintptr_t)cb_data;
+	udp_t udp;
+	conn_t connp;
+
+	if (mdb_vread(&udp, sizeof (udp_t), kaddr) == -1) {
+		mdb_warn("failed to read udp at %p", kaddr);
+		return (WALK_ERR);
+	}
 
-	if (!((opts & NETSTAT_ALL || net_udp_active(udp)) && net_udp_ipv4(udp)))
-		return (WALK_NEXT);
+	if (mdb_vread(&connp, sizeof (conn_t),
+	    (uintptr_t)udp.udp_connp) == -1) {
+		mdb_warn("failed to read udp_connp at %p",
+		    (uintptr_t)udp.udp_connp);
+		return (WALK_ERR);
+	}
 
-	mdb_printf("%0?p %2i ", kaddr, udp->udp_state);
-	net_ipv4addrport_pr(&udp->udp_v6src, udp->udp_port);
-	mdb_printf(" ");
-	net_ipv4addrport_pr(&udp->udp_v6dst, udp->udp_dstport);
-	mdb_printf(" %4i\n", udp->udp_zoneid);
+	if (!((opts & NETSTAT_ALL) || net_udp_active(&udp)) ||
+	    (af == AF_INET && !net_udp_ipv4(&udp)) ||
+	    (af == AF_INET6 && !net_udp_ipv6(&udp))) {
+		return (WALK_NEXT);
+	}
+
+	mdb_printf("%0?p %2i ", kaddr, udp.udp_state);
+	if (af == AF_INET) {
+		net_ipv4addrport_pr(&udp.udp_v6src, udp.udp_port);
+		mdb_printf(" ");
+		net_ipv4addrport_pr(&udp.udp_v6dst, udp.udp_dstport);
+	} else if (af == AF_INET6) {
+		net_ipv6addrport_pr(&udp.udp_v6src, udp.udp_port);
+		mdb_printf(" ");
+		net_ipv6addrport_pr(&udp.udp_v6dst, udp.udp_dstport);
+	}
+	mdb_printf(" %4i\n", connp.conn_zoneid);
 
 	return (WALK_NEXT);
 }
 
 static int
+netstat_udpv4_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
+{
+	return (netstat_udp_cb(kaddr, walk_data, cb_data, AF_INET));
+}
+
+static int
 netstat_udpv6_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
 {
-	const udp_t *udp = walk_data;
-	const uintptr_t opts = (uintptr_t)cb_data;
-
-	if (!((opts & NETSTAT_ALL || net_udp_active(udp)) && net_udp_ipv6(udp)))
-		return (WALK_NEXT);
-
-	mdb_printf("%0?p %2i ", kaddr, udp->udp_state);
-	net_ipv6addrport_pr(&udp->udp_v6src, udp->udp_port);
-	mdb_printf(" ");
-
-	/* Remote */
-	if (udp->udp_state == TS_DATA_XFER)
-		net_ipv6addrport_pr(&udp->udp_v6dst, udp->udp_dstport);
-	else
-		mdb_printf("%*s.0    ", ADDR_V6_WIDTH, "0:0:0:0:0:0:0:0");
-	mdb_printf(" %4i\n", udp->udp_zoneid);
-
-	return (WALK_NEXT);
+	return (netstat_udp_cb(kaddr, walk_data, cb_data, AF_INET6));
 }
 
 /*
@@ -855,7 +863,7 @@
 			    "UDPv4", ADDR_V4_WIDTH, "Local Address",
 			    ADDR_V4_WIDTH, "Remote Address", "Zone");
 
-			if (mdb_walk("genunix`udp", netstat_udpv4_cb,
+			if (mdb_walk("udp_cache", netstat_udpv4_cb,
 			    (void *)(uintptr_t)opts) == -1) {
 				mdb_warn("failed to walk genunix`udp");
 				return (DCMD_ERR);
@@ -870,12 +878,11 @@
 			    "UDPv6", ADDR_V6_WIDTH, "Local Address",
 			    ADDR_V6_WIDTH, "Remote Address", "Zone");
 
-			if (mdb_walk("genunix`udp", netstat_udpv6_cb,
+			if (mdb_walk("udp_cache", netstat_udpv6_cb,
 			    (void *)(uintptr_t)opts) == -1) {
 				mdb_warn("failed to walk genunix`udp");
 				return (DCMD_ERR);
 			}
-
 		}
 	}
 
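
With the MI-based "udp" walker gone (see the genunix.c and net.h hunks),
UDP endpoints are enumerated through the udp_cache kmem cache, which is
what the mdb_walk("udp_cache", ...) calls above rely on.  A hedged usage
sketch, assuming mdb's automatically generated walker for that kmem
cache:

    # echo '::walk udp_cache | ::print udp_t udp_state udp_port' | mdb -k
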
--- a/usr/src/cmd/mdb/common/modules/genunix/net.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/cmd/mdb/common/modules/genunix/net.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2000, 2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -33,7 +33,6 @@
 extern "C" {
 #endif
 
-extern struct mi_payload_walk_arg_s mi_udp_arg;
 extern struct mi_payload_walk_arg_s mi_ar_arg;
 extern struct mi_payload_walk_arg_s mi_icmp_arg;
 extern struct mi_payload_walk_arg_s mi_ill_arg;
--- a/usr/src/cmd/rcm_daemon/common/ip_rcm.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/cmd/rcm_daemon/common/ip_rcm.c	Sat Oct 22 22:50:14 2005 -0700
@@ -54,6 +54,7 @@
 #include <libdevinfo.h>
 #include <sys/systeminfo.h>
 #include <netdb.h>
+#include <inet/ip.h>
 
 #include <ipmp_mpathd.h>
 #include "rcm_module.h"
@@ -70,12 +71,7 @@
 /* Some generic well-knowns and defaults used in this module */
 #define	SLASH_DEV		"/dev"		/* /dev directory */
 
-#define	IP_DEV_NAME		"/dev/ip"	/* IPV4 ip device */
-#define	IP6_DEV_NAME		"/dev/ip6"	/* IPV6 ip device */
-#define	IP_MOD_NAME		"ip"		/* ip module */
 #define	ARP_MOD_NAME		"arp"		/* arp module */
-#define	UDP_DEV_NAME		"/dev/udp"	/* IPv4 udp device */
-#define	UDP6_DEV_NAME		"/dev/udp6"	/* IPv6 udp device */
 #define	IP_MAX_MODS		9		/* max modules pushed on intr */
 #define	MAX_RECONFIG_SIZE	1024		/* Max. reconfig string size */
 
--- a/usr/src/pkgdefs/etc/exception_list_i386	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/pkgdefs/etc/exception_list_i386	Sat Oct 22 22:50:14 2005 -0700
@@ -347,6 +347,8 @@
 usr/include/inet/arp_impl.h	i386
 usr/include/inet/rawip_impl.h	i386
 usr/include/inet/udp_impl.h	i386
+usr/include/inet/tcp_impl.h	i386
+usr/include/inet/ip_impl.h	i386
 usr/include/inet/ip_ndp.h	i386
 usr/include/inet/ipdrop.h	i386
 usr/include/inet/tun.h		i386
--- a/usr/src/pkgdefs/etc/exception_list_sparc	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/pkgdefs/etc/exception_list_sparc	Sat Oct 22 22:50:14 2005 -0700
@@ -336,6 +336,8 @@
 usr/include/inet/arp_impl.h	sparc
 usr/include/inet/rawip_impl.h	sparc
 usr/include/inet/udp_impl.h	sparc
+usr/include/inet/tcp_impl.h	sparc
+usr/include/inet/ip_impl.h	sparc
 usr/include/inet/ip_ndp.h	sparc
 usr/include/inet/ipdrop.h	sparc
 usr/include/inet/tun.h		sparc
--- a/usr/src/tools/scripts/bfu.sh	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/tools/scripts/bfu.sh	Sat Oct 22 22:50:14 2005 -0700
@@ -2002,11 +2002,10 @@
 fi
 
 update_script="/ws/onnv-gate/public/bin/update_ce"
-if [ "$plat" = "SUNW,Sun-Fire-15000" ] && ifconfig -a | egrep '^ce' \
-	    >/dev/null 2>/dev/null; then
-	# Sun Fire 12K/15K/20K/25K requires CE version 1.146 or later.
+if ifconfig -a | egrep '^ce' >/dev/null 2>/dev/null; then
+	# CE version 1.148 or later is required
 	cever=`modinfo | grep 'CE Ethernet' | sed 's/.*v1\.//' | tr -d ')' | \
-	    nawk '{ if ($1 < 146) print "BAD"; else print $1 }'`
+	    nawk '{ if ($1 < 148) print "BAD"; else print $1 }'`
 	if [ "$cever" = "BAD" ]; then
 		fail "You must run $update_script to upgrade your ce driver."
 	fi
--- a/usr/src/uts/common/Makefile.files	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/Makefile.files	Sat Oct 22 22:50:14 2005 -0700
@@ -416,13 +416,9 @@
 
 RTS_OBJS +=	rtsddi.o rts.o rts_opt_data.o
 
-IP_OBJS +=	igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \
-		ip_cksum.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \
-		ip_multi.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \
-		ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \
-		spd.o ipclassifier.o inet_common.o ip_squeue.o tcp.o \
-		tcp_trace.o tcp_opt_data.o tcp_sack.o squeue.o ip_sadb.o \
-		sctp_crc32.o sctp.o sctp_opt_data.o sctp_output.o \
+IP_TCP_OBJS =	tcp.o tcp_trace.o tcp_opt_data.o tcp_sack.o tcp_fusion.o
+IP_UDP_OBJS =	udp.o udp_opt_data.o
+IP_SCTP_OBJS =	sctp_crc32.o sctp.o sctp_opt_data.o sctp_output.o \
 		sctp_init.o sctp_input.o sctp_cookie.o \
 		sctp_conn.o sctp_error.o sctp_snmp.o \
 		sctp_param.o sctp_shutdown.o sctp_common.o \
@@ -430,6 +426,16 @@
 		sctp_ioc.o sctp_bind.o sctp_notify.o sctp_asconf.o \
 		sctp_addr.o
 
+IP_OBJS +=	igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \
+		ip_cksum.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \
+		ip_multi.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \
+		ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \
+		spd.o ipclassifier.o inet_common.o ip_squeue.o squeue.o \
+		ip_sadb.o \
+		$(IP_TCP_OBJS) \
+		$(IP_UDP_OBJS) \
+		$(IP_SCTP_OBJS)
+
 IP6_OBJS +=	ip6ddi.o
 
 KEYSOCK_OBJS +=	keysockddi.o keysock.o keysock_opt_data.o
@@ -467,7 +473,7 @@
 
 6TO4TUN_OBJS +=	6to4tun.o
 
-UDP_OBJS +=	udpddi.o udp.o udp_opt_data.o
+UDP_OBJS +=	udpddi.o
 
 UDP6_OBJS +=	udp6ddi.o
 
--- a/usr/src/uts/common/fs/sockfs/sockstr.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/fs/sockfs/sockstr.c	Sat Oct 22 22:50:14 2005 -0700
@@ -137,21 +137,23 @@
 
 	ASSERT(so->so_version != SOV_STREAM);
 
-	/* tell the transport below that sockmod is being popped */
-	if ((so->so_state & SS_TCP_FAST_ACCEPT) != 0) {
-		int	rval;
-		mblk_t	**mpp;
+	if (so->so_state & SS_DIRECT) {
+		mblk_t **mpp;
+		int rval;
 
+		/*
+		 * Tell the transport below that sockmod is being popped
+		 */
 		mutex_exit(&so->so_lock);
-		error = strioctl(vp, SIOCPOPSOCKFS, NULL, 0, K_TO_K, CRED(),
+		error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K, CRED(),
 		    &rval);
 		mutex_enter(&so->so_lock);
 		if (error != 0) {
-			dprintso(so, 0,
-			    ("so_sock2stream(%p): SIOCPOPSOCKFS failed\n", so));
+			dprintso(so, 0, ("so_sock2stream(%p): "
+			    "_SIOCSOCKFALLBACK failed\n", so));
 			goto exit;
 		}
-		so->so_state &= ~SS_TCP_FAST_ACCEPT;
+		so->so_state &= ~SS_DIRECT;
 
 		for (mpp = &so->so_conn_ind_head; (mp = *mpp) != NULL;
 		    mpp = &mp->b_next) {
@@ -412,7 +414,7 @@
 
 		/* the following do_tcapability may update so->so_mode */
 		if ((tso->so_serv_type != T_CLTS) &&
-		    ((so->so_state & SS_TCP_FAST_ACCEPT) == 0)) {
+		    !(so->so_state & SS_DIRECT)) {
 			error = do_tcapability(so, TC1_ACCEPTOR_ID);
 			if (error)
 				return (error);
--- a/usr/src/uts/common/fs/sockfs/socktpi.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/fs/sockfs/socktpi.c	Sat Oct 22 22:50:14 2005 -0700
@@ -57,6 +57,7 @@
 
 #include <sys/socket.h>
 #include <sys/socketvar.h>
+#include <sys/sockio.h>
 #include <netinet/in.h>
 #include <sys/un.h>
 #include <sys/strsun.h>
@@ -72,6 +73,7 @@
 #include <inet/ip.h>
 #include <inet/ip6.h>
 #include <inet/tcp.h>
+#include <inet/udp_impl.h>
 
 #include <fs/sockfs/nl7c.h>
 #include <sys/zone.h>
@@ -185,6 +187,10 @@
 		    struct uio *);
 static int	sotpi_shutdown(struct sonode *, int);
 static int	sotpi_getsockname(struct sonode *);
+static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
+		    struct uio *, void *, t_uscalar_t, int);
+static int	sodgram_direct(struct sonode *, struct sockaddr *,
+		    socklen_t, struct uio *, int);
 
 sonodeops_t sotpi_sonodeops = {
 	sotpi_accept,		/* sop_accept		*/
@@ -222,16 +228,40 @@
 	so = VTOSO(vp);
 
 	flags = FREAD|FWRITE;
-	if (tso != NULL) {
-		if ((tso->so_state & (SS_TCP_FAST_ACCEPT)) != 0) {
-			flags |= SO_ACCEPTOR|SO_SOCKSTR;
-			so->so_state |= SS_TCP_FAST_ACCEPT;
-		}
-	} else {
-		if ((so->so_type == SOCK_STREAM) &&
-		    (so->so_family == AF_INET || so->so_family == AF_INET6)) {
-			flags |= SO_SOCKSTR;
-			so->so_state |= SS_TCP_FAST_ACCEPT;
+
+	if ((type == SOCK_STREAM || type == SOCK_DGRAM) &&
+	    (domain == AF_INET || domain == AF_INET6) &&
+	    (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP ||
+	    protocol == IPPROTO_IP)) {
+		/* Tell tcp or udp that it's talking to sockets */
+		flags |= SO_SOCKSTR;
+
+		/*
+		 * Here we indicate to socktpi_open() our attempt to
+		 * make direct calls between sockfs and transport.
+		 * The final decision is left to socktpi_open().
+		 */
+		so->so_state |= SS_DIRECT;
+
+		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
+		if (so->so_type == SOCK_STREAM && tso != NULL) {
+			if (tso->so_state & SS_DIRECT) {
+				/*
+				 * Inherit SS_DIRECT from listener and pass
+				 * SO_ACCEPTOR open flag to tcp, indicating
+				 * that this is an accept fast-path instance.
+				 */
+				flags |= SO_ACCEPTOR;
+			} else {
+				/*
+				 * SS_DIRECT is not set on listener, meaning
+				 * that the listener has been converted from
+				 * a socket to a stream.  Ensure that the
+				 * acceptor inherits these settings.
+				 */
+				so->so_state &= ~SS_DIRECT;
+				flags &= ~SO_SOCKSTR;
+			}
 		}
 	}
 
@@ -1052,7 +1082,7 @@
 }
 
 /* bind the socket */
-int
+static int
 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
     int flags)
 {
@@ -1372,7 +1402,7 @@
 	case AF_INET:
 	case AF_INET6:
 		if ((optlen == sizeof (intptr_t)) &&
-		    ((so->so_state & SS_TCP_FAST_ACCEPT) != 0)) {
+		    ((so->so_state & SS_DIRECT) != 0)) {
 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
 			    &opt, conn_ind->OPT_length);
 		} else {
@@ -1385,7 +1415,19 @@
 			 * problems when sockfs sends a normal T_CONN_RES
 			 * message down the new stream.
 			 */
-			so->so_state &= ~SS_TCP_FAST_ACCEPT;
+			if (so->so_state & SS_DIRECT) {
+				int rval;
+				/*
+				 * For consistency we inform tcp to disable
+				 * direct interface on the listener, though
+				 * we can certainly live without doing this
+				 * because no data will ever travel upstream
+				 * on the listening socket.
+				 */
+				so->so_state &= ~SS_DIRECT;
+				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
+				    0, 0, K_TO_K, CRED(), &rval);
+			}
 			opt = NULL;
 			optlen = 0;
 		}
@@ -1554,9 +1596,10 @@
 	if (nso->so_options & SO_LINGER)
 		nso->so_linger = so->so_linger;
 
-	if ((so->so_state & SS_TCP_FAST_ACCEPT) != 0) {
+	if ((so->so_state & SS_DIRECT) != 0) {
 		mblk_t *ack_mp;
 
+		ASSERT(nso->so_state & SS_DIRECT);
 		ASSERT(opt != NULL);
 
 		conn_res->OPT_length = optlen;
@@ -3308,13 +3351,8 @@
  * Assumes caller has verified that SS_ISBOUND etc. are set.
  */
 static int
-sosend_dgramcmsg(struct sonode *so,
-		struct sockaddr *name,
-		t_uscalar_t namelen,
-		struct uio *uiop,
-		void *control,
-		t_uscalar_t controllen,
-		int flags)
+sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
+    struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
 {
 	struct T_unitdata_req	tudr;
 	mblk_t			*mp;
@@ -3636,11 +3674,8 @@
  * name and the source address is passed as an option.
  */
 int
-sosend_dgram(struct sonode	*so,
-		struct sockaddr	*name,
-		socklen_t	namelen,
-		struct uio	*uiop,
-		int		flags)
+sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
+    struct uio *uiop, int flags)
 {
 	struct T_unitdata_req	tudr;
 	mblk_t			*mp;
@@ -3651,7 +3686,7 @@
 	socklen_t		srclen;
 	ssize_t			len;
 
-	ASSERT(name && namelen);
+	ASSERT(name != NULL && namelen != 0);
 
 	len = uiop->uio_resid;
 	if (len > so->so_tidu_size) {
@@ -3659,14 +3694,14 @@
 		goto done;
 	}
 
-	/*
-	 * Length and family checks.
-	 */
+	/* Length and family checks */
 	error = so_addr_verify(so, name, namelen);
-	if (error) {
-		eprintsoline(so, error);
+	if (error != 0)
 		goto done;
-	}
+
+	if (so->so_state & SS_DIRECT)
+		return (sodgram_direct(so, name, namelen, uiop, flags));
+
 	if (so->so_family == AF_UNIX) {
 		if (so->so_state & SS_FADDR_NOXLATE) {
 			/*
@@ -4061,8 +4096,7 @@
 	if (msg->msg_controllen != 0) {
 		if (!(so_mode & SM_CONNREQUIRED)) {
 			error = sosend_dgramcmsg(so, name, namelen, uiop,
-				msg->msg_control, msg->msg_controllen,
-				flags);
+			    msg->msg_control, msg->msg_controllen, flags);
 		} else {
 			if (flags & MSG_OOB) {
 				/* Can't generate T_EXDATA_REQ with options */
@@ -4080,7 +4114,7 @@
 	if (!(so_mode & SM_CONNREQUIRED)) {
 		/*
 		 * If there is no SO_DONTROUTE to turn off return immediately
-		 * from sosend_dgram. This can allow tail-call optimizations.
+		 * from send_dgram. This can allow tail-call optimizations.
 		 */
 		if (!dontroute) {
 			return (sosend_dgram(so, name, namelen, uiop, flags));
@@ -4104,13 +4138,16 @@
 
 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
 				/*
-				 * If there is no SO_DONTROUTE to turn off
-				 * return immediately from strwrite. This can
-				 * allow tail-call optimizations.
+				 * If there is no SO_DONTROUTE to turn off,
+				 * SS_DIRECT is on, and there is no flow
+				 * control, we can take the fast path.
 				 */
-				if (!dontroute)
-					return (strwrite(SOTOV(so), uiop,
-							CRED()));
+				if (!dontroute &&
+				    (so_state & SS_DIRECT) &&
+				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
+					return (sostream_direct(so, uiop,
+					    NULL, CRED()));
+				}
 				error = strwrite(SOTOV(so), uiop, CRED());
 				goto done;
 			}
@@ -4140,6 +4177,206 @@
 }
 
 /*
+ * Sending data on a datagram socket.
+ * Assumes caller has verified that SS_ISBOUND etc. are set.
+ */
+/* ARGSUSED */
+static int
+sodgram_direct(struct sonode *so, struct sockaddr *name,
+    socklen_t namelen, struct uio *uiop, int flags)
+{
+	struct T_unitdata_req	tudr;
+	mblk_t			*mp;
+	int			error = 0;
+	void			*addr;
+	socklen_t		addrlen;
+	ssize_t			len;
+	struct stdata		*stp = SOTOV(so)->v_stream;
+	int			so_state;
+	queue_t			*udp_wq;
+
+	ASSERT(name != NULL && namelen != 0);
+	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
+	ASSERT(!(so->so_mode & SM_EXDATA));
+	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
+	ASSERT(SOTOV(so)->v_type == VSOCK);
+
+	/* Caller checked for proper length */
+	len = uiop->uio_resid;
+	ASSERT(len <= so->so_tidu_size);
+
+	/* Length and family checks have been done by caller */
+	ASSERT(name->sa_family == so->so_family);
+	ASSERT(so->so_family == AF_INET ||
+	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
+	ASSERT(so->so_family == AF_INET6 ||
+	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
+
+	addr = name;
+	addrlen = namelen;
+
+	if (stp->sd_sidp != NULL &&
+	    (error = straccess(stp, JCWRITE)) != 0)
+		goto done;
+
+	so_state = so->so_state;
+
+	/*
+	 * For UDP we don't break up the copyin into smaller pieces
+	 * as in the TCP case.  That means if ENOMEM is returned by
+	 * mcopyinuio() then the uio vector has not been modified at
+	 * all and we fallback to either strwrite() or kstrputmsg()
+	 * below.  Note also that we never generate priority messages
+	 * from here.
+	 */
+	udp_wq = stp->sd_wrq->q_next;
+	if (canput(udp_wq) &&
+	    (mp = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
+		ASSERT(DB_TYPE(mp) == M_DATA);
+		ASSERT(uiop->uio_resid == 0);
+#ifdef C2_AUDIT
+		if (audit_active)
+			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
+#endif /* C2_AUDIT */
+		udp_wput_data(udp_wq, mp, addr, addrlen);
+		return (0);
+	}
+	if (error != 0 && error != ENOMEM)
+		return (error);
+
+	/*
+	 * For connected, let strwrite() handle the blocking case.
+	 * Otherwise we fall thru and use kstrputmsg().
+	 */
+	if (so_state & SS_ISCONNECTED)
+		return (strwrite(SOTOV(so), uiop, CRED()));
+
+	tudr.PRIM_type = T_UNITDATA_REQ;
+	tudr.DEST_length = addrlen;
+	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
+	tudr.OPT_length = 0;
+	tudr.OPT_offset = 0;
+
+	mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0, _ALLOC_INTR);
+	if (mp == NULL) {
+		/*
+		 * Caught a signal waiting for memory.
+		 * Let send* return EINTR.
+		 */
+		error = EINTR;
+		goto done;
+	}
+
+#ifdef C2_AUDIT
+	if (audit_active)
+		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
+#endif /* C2_AUDIT */
+
+	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
+done:
+#ifdef SOCK_DEBUG
+	if (error != 0) {
+		eprintsoline(so, error);
+	}
+#endif /* SOCK_DEBUG */
+	return (error);
+}
+
+int
+sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
+{
+	struct stdata *stp = SOTOV(so)->v_stream;
+	ssize_t iosize, rmax, maxblk;
+	queue_t *tcp_wq = stp->sd_wrq->q_next;
+	int error = 0, wflag = 0;
+
+	ASSERT(so->so_mode & SM_BYTESTREAM);
+	ASSERT(SOTOV(so)->v_type == VSOCK);
+
+	if (stp->sd_sidp != NULL &&
+	    (error = straccess(stp, JCWRITE)) != 0)
+		return (error);
+
+	if (uiop == NULL) {
+		/*
+		 * kstrwritemp() should have checked sd_flag and
+		 * flow-control before coming here.  If we end up
+		 * here it means that we can simply pass down the
+		 * data to tcp.
+		 */
+		ASSERT(mp != NULL);
+		tcp_wput(tcp_wq, mp);
+		return (0);
+	}
+
+	/* Fallback to strwrite() to do proper error handling */
+	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
+		return (strwrite(SOTOV(so), uiop, cr));
+
+	rmax = stp->sd_qn_maxpsz;
+	ASSERT(rmax >= 0 || rmax == INFPSZ);
+	if (rmax == 0 || uiop->uio_resid <= 0)
+		return (0);
+
+	if (rmax == INFPSZ)
+		rmax = uiop->uio_resid;
+
+	maxblk = stp->sd_maxblk;
+
+	for (;;) {
+		iosize = MIN(uiop->uio_resid, rmax);
+
+		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
+		if (mp == NULL) {
+			/*
+			 * Fallback to strwrite() for ENOMEM; if this
+			 * is our first time in this routine and the uio
+			 * vector has not been modified, we will end up
+			 * calling strwrite() without any flag set.
+			 */
+			if (error == ENOMEM)
+				goto slow_send;
+			else
+				return (error);
+		}
+		ASSERT(uiop->uio_resid >= 0);
+		/*
+		 * If mp is non-NULL and ENOMEM is set, it means that
+		 * mcopyinuio() was able to break down some of the user
+		 * data into one or more mblks.  Send the partial data
+		 * to tcp and let the rest be handled in strwrite().
+		 */
+		ASSERT(error == 0 || error == ENOMEM);
+		tcp_wput(tcp_wq, mp);
+
+		wflag |= NOINTR;
+
+		if (uiop->uio_resid == 0) {	/* No more data; we're done */
+			ASSERT(error == 0);
+			break;
+		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
+		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
+slow_send:
+			/*
+			 * We were able to send down partial data using
+			 * the direct call interface, but are now relying
+			 * on strwrite() to handle the non-fastpath cases.
+			 * If the socket is blocking we will sleep in
+			 * strwaitq() until write is permitted; otherwise,
+			 * we will need to return the number of bytes
+			 * written so far back to the app.  This is why
+			 * we pass the NOINTR flag to strwrite() for a
+			 * non-blocking socket: we don't want to return
+			 * EAGAIN when a portion of the user data has
+			 * already been sent down.
+			 */
+			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
+		}
+	}
+	return (0);
+}
+
+/*
  * Update so_faddr by asking the transport (unless AF_UNIX).
  */
 int
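
The send paths added above (sodgram_direct() and sostream_direct())
share one shape: while STREAMS flow control permits, build fully-formed
mblks and hand them straight to the transport's put routine, and fall
back to strwrite()/kstrputmsg() when allocation or flow control gets in
the way.  A minimal user-level model of that control flow follows;
every identifier in it is an illustrative stand-in, not a kernel
interface:

    #include <stdio.h>
    #include <string.h>

    static int budget = 3;	/* chunks the mock "queue" will accept */

    /* stand-in for canput(): does the transport queue have room? */
    static int can_put(void) { return (budget-- > 0); }

    /* stand-in for tcp_wput()/udp_wput_data(): the direct call */
    static void transport_put(const char *p, size_t n)
    {
            printf("direct: %zu bytes: %.*s\n", n, (int)n, p);
    }

    /* stand-in for strwrite_common(): the slow, fully general path */
    static int slow_send(const char *p, size_t n)
    {
            printf("fallback: %zu bytes via slow path\n", n);
            return (0);
    }

    int main(void)
    {
            const char *data = "0123456789abcdef";
            size_t resid = strlen(data), off = 0, chunk = 4;

            while (resid > 0) {
                    size_t n = resid < chunk ? resid : chunk;

                    if (!can_put())         /* flow controlled: bail */
                            return (slow_send(data + off, resid));
                    transport_put(data + off, n);
                    off += n;
                    resid -= n;
            }
            return (0);
    }

The point of the NOINTR bookkeeping in the real code is visible even in
this toy: once any chunk has gone down the fast path, the fallback must
not report EAGAIN for bytes that were already sent.
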
--- a/usr/src/uts/common/fs/sockfs/sockvnops.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/fs/sockfs/sockvnops.c	Sat Oct 22 22:50:14 2005 -0700
@@ -53,6 +53,7 @@
 #include <sys/stropts.h>
 #include <sys/stream.h>
 #include <sys/strsubr.h>
+#include <sys/strsun.h>
 #include <sys/suntpi.h>
 #include <sys/ioctl.h>
 #include <sys/sockio.h>
@@ -87,6 +88,9 @@
 
 #include <fs/sockfs/nl7c.h>
 
+#include <inet/udp_impl.h>
+#include <inet/tcp_impl.h>
+
 static int socktpi_close(struct vnode *, int, int, offset_t, struct cred *);
 static int socktpi_read(struct vnode *, struct uio *, int, struct cred *,
 	struct caller_context *);
@@ -140,6 +144,15 @@
 };
 
 /*
+ * Do direct function calls to the transport layer below; this also
+ * allows the transport to utilize the read-side synchronous stream
+ * interface if necessary.  This is an /etc/system tunable that must
+ * not be modified on a running system.  By default this is enabled
+ * for performance reasons and may be disabled for debugging purposes.
+ */
+boolean_t socktpi_direct = B_TRUE;
+
+/*
  * Open routine used by socket() call. Note that vn_open checks for
  * VSOCK and fails the open (and VOP_OPEN is fs_nosys). The VSOCK check is
  * needed since VSOCK type vnodes exist in various underlying filesystems as
@@ -205,6 +218,56 @@
 
 		ASSERT(stp->sd_wrq != NULL);
 		so->so_provinfo = tpi_findprov(stp->sd_wrq);
+
+		/*
+		 * If the caller wants the direct function call interface
+		 * to/from the transport module, probe the module directly
+		 * beneath the stream head to see if it qualifies.
+		 *
+		 * We turn off direct interface when qualifications fail;
+		 * note that we do these checks for everything other than
+		 * the tcp acceptor case, because the acceptor inherits
+		 * the capabilities of the listener and we've already done
+		 * the checks against the listening socket.
+		 */
+		if (!(flag & SO_ACCEPTOR) && (so->so_state & SS_DIRECT)) {
+			queue_t *tq = stp->sd_wrq->q_next;
+
+			/*
+			 * SS_DIRECT is currently supported and tested
+			 * only for tcp/udp; this is the main reason to
+			 * have the following assertions.
+			 */
+			ASSERT(so->so_family == AF_INET ||
+			    so->so_family == AF_INET6);
+			ASSERT(so->so_protocol == IPPROTO_UDP ||
+			    so->so_protocol == IPPROTO_TCP ||
+			    so->so_protocol == IPPROTO_IP);
+			ASSERT(so->so_type == SOCK_DGRAM ||
+			    so->so_type == SOCK_STREAM);
+
+			/*
+			 * Abort direct call interface if the module directly
+			 * underneath the stream head is not defined with the
+			 * _D_DIRECT flag.  This could happen in the tcp or
+			 * udp case, when some other module is autopushed
+			 * above it, or for some reason the expected module
+			 * isn't purely D_MP (which is the main requirement).
+			 */
+			if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
+			    !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
+				int rval;
+
+				/* Continue on without direct calls */
+				so->so_state &= ~SS_DIRECT;
+				if ((error = strioctl(vp, _SIOCSOCKFALLBACK,
+				    0, 0, K_TO_K, CRED(), &rval)) != 0) {
+					(void) socktpi_close(vp, flag, 1,
+					    (offset_t)0, cr);
+					return (error);
+				}
+			}
+		}
 	} else {
 		/*
 		 * While the same socket can not be reopened (unlike specfs)
@@ -436,6 +499,11 @@
 			/* Give NL7C some data */
 			nl7c_data(so, uiop);
 		}
+
+		if ((so_state & SS_DIRECT) &&
+		    canputnext(vp->v_stream->sd_wrq)) {
+			return (sostream_direct(so, uiop, NULL, cr));
+		}
 		return (strwrite(vp, uiop, cr));
 	} else {
 		/* Send T_DATA_REQ messages without MORE_flag set */
@@ -631,7 +699,7 @@
 	case I_SENDFD:
 	case I_RECVFD:
 	case I_ATMARK:
-	case SIOCPOPSOCKFS:
+	case _SIOCSOCKFALLBACK:
 		/*
 		 * These ioctls do not apply to sockets. I_FDINSERT can be
 		 * used to send M_PROTO messages without modifying the socket
@@ -639,8 +707,9 @@
 		 * descriptor passing since they assume a twisted stream.
 		 * SIOCATMARK must be used instead of I_ATMARK.
 		 *
-		 * SIOCPOPSOCKFS from an application should never be
-		 * processed. It is always generated in response to I_POP.
+		 * _SIOCSOCKFALLBACK from an application should never be
+		 * processed.  It is only generated by socktpi_open() or
+		 * in response to I_POP or I_PUSH.
 		 */
 #ifdef DEBUG
 		cmn_err(CE_WARN, "Unsupported STREAMS ioctl 0x%x on socket. "
@@ -724,6 +793,24 @@
 
 	switch (cmd) {
 	case I_PUSH:
+		if (so->so_state & SS_DIRECT) {
+			mutex_enter(&so->so_lock);
+			so_lock_single(so);
+			mutex_exit(&so->so_lock);
+
+			error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
+			    CRED(), rvalp);
+
+			mutex_enter(&so->so_lock);
+			if (error == 0)
+				so->so_state &= ~SS_DIRECT;
+			so_unlock_single(so, SOLOCKED);
+			mutex_exit(&so->so_lock);
+
+			if (error != 0)
+				return (error);
+		}
+
 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
 		if (error == 0)
 			so->so_pushcnt++;
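
The socktpi_direct variable introduced above is described as an
/etc/system tunable.  Since it lives in the sockfs module, disabling
it would use the standard module:symbol form (a sketch of the usual
/etc/system syntax, not text from this changeset):

    set sockfs:socktpi_direct = 0

and takes effect at the next boot; as the comment warns, it must not
be flipped on a running system.
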
--- a/usr/src/uts/common/inet/Makefile	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/Makefile	Sat Oct 22 22:50:14 2005 -0700
@@ -34,7 +34,7 @@
 	ipsec_info.h ip6_asp.h ip_if.h ip_ire.h ip_multi.h ip_ndp.h ip_rts.h \
 	ipsec_impl.h keysock.h led.h mi.h mib2.h nd.h optcom.h sadb.h \
 	sctp_itf.h snmpcom.h tcp.h tcp_sack.h tun.h udp_impl.h arp_impl.h \
-	rawip_impl.h ipp_common.h
+	rawip_impl.h ipp_common.h ip_impl.h tcp_impl.h
 
 ROOTDIRS= $(ROOT)/usr/include/inet
 
--- a/usr/src/uts/common/inet/arp/arp.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/arp/arp.c	Sat Oct 22 22:50:14 2005 -0700
@@ -194,7 +194,6 @@
 static int	ar_entry_delete(queue_t *q, mblk_t *mp);
 static int	ar_entry_query(queue_t *q, mblk_t *mp);
 static int	ar_entry_squery(queue_t *q, mblk_t *mp);
-static void	ar_freemsg(mblk_t *mp);
 static int	ar_interface_up(queue_t *q, mblk_t *mp);
 static int	ar_interface_down(queue_t *q, mblk_t *mp);
 static int	ar_interface_on(queue_t *q, mblk_t *mp);
@@ -1231,7 +1230,7 @@
 				ar_ip->ar_arl_ip_assoc = ar_arl;
 			}
 		}
-		ar_freemsg(mp);
+		inet_freemsg(mp);
 	}
 
 	/*
@@ -1745,19 +1744,6 @@
 	return (0);
 }
 
-/* Make sure b_next and b_prev are null and then free the message */
-static void
-ar_freemsg(mblk_t *mp)
-{
-	mblk_t *mp1;
-
-	for (mp1 = mp; mp1; mp1 = mp1->b_cont) {
-		mp1->b_prev = mp1->b_next = NULL;
-		mp1->b_queue = NULL;
-	}
-	freemsg(mp);
-}
-
 /* Process an interface down causing us to detach and unbind. */
 /* ARGSUSED */
 static int
@@ -1936,7 +1922,7 @@
 					BUMP_IRE_STATS(ire_stats_v4,
 					    ire_stats_freed);
 				}
-				ar_freemsg(mp);
+				inet_freemsg(mp);
 			} else {
 				prev = mp;
 			}
@@ -2587,7 +2573,7 @@
 			    *(uint32_t *)mp->b_rptr == AR_ENTRY_QUERY) {
 				BUMP_IRE_STATS(ire_stats_v4, ire_stats_freed);
 			}
-			ar_freemsg(mp);
+			inet_freemsg(mp);
 		} else {
 			mpp = &mp->b_next;
 		}
@@ -2657,7 +2643,7 @@
 		} else {
 			if (ret_val != 0) {
 				/* TODO: find some way to let the guy know? */
-				ar_freemsg(mp);
+				inet_freemsg(mp);
 				BUMP_IRE_STATS(ire_stats_v4, ire_stats_freed);
 				continue;
 			}
@@ -2849,7 +2835,7 @@
 			    "arp_rput_end: q %p (%S)", q, "proto");
 			return;
 		default:
-			ar_freemsg(mp);
+			inet_freemsg(mp);
 			return;
 		}
 		if ((mp->b_wptr - mp->b_rptr) < sizeof (dl_unitdata_ind_t) ||
--- a/usr/src/uts/common/inet/common.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/common.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 1992-2001, 2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -97,13 +97,13 @@
 #define	INET_MAXMINOR		MAXMIN	/* maximum device minor number */
 
 #ifdef _KERNEL
+#include <sys/stream.h>
 
-extern void inet_init(void);
-extern void inet_destroy(void);
 extern void *inet_minor_create(char *, dev_t, int);
 extern void inet_minor_destroy(void *);
 extern dev_t inet_minor_alloc(void *);
 extern void inet_minor_free(void *, dev_t);
+extern void inet_freemsg(mblk_t *);
 
 #endif	/* _KERNEL */
 
--- a/usr/src/uts/common/inet/inet_common.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/inet_common.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -103,3 +103,21 @@
 	ASSERT((dev != OPENFAIL) && (dev != 0) && (dev <= inet_maxminor));
 	vmem_free(((inet_arena_t *)a)->ineta_arena, (void *)dev, 1);
 }
+
+/*
+ * This function is used to free a message that has gone through
+ * mi_copyin processing which modifies the M_IOCTL mblk's b_next
+ * and b_prev pointers.  We use this function to set b_next/b_prev
+ * to NULL before freeing the message.
+ */
+void
+inet_freemsg(mblk_t *mp)
+{
+	mblk_t	*bp = mp;
+
+	for (; bp != NULL; bp = bp->b_cont) {
+		bp->b_prev = NULL;
+		bp->b_next = NULL;
+	}
+	freemsg(mp);
+}
--- a/usr/src/uts/common/inet/ip.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip.h	Sat Oct 22 22:50:14 2005 -0700
@@ -52,6 +52,7 @@
 #include <sys/vmem.h>
 #include <sys/squeue.h>
 #include <sys/systm.h>
+#include <sys/multidata.h>
 
 #ifdef DEBUG
 #define	ILL_DEBUG
@@ -67,7 +68,19 @@
  * of flags.
  */
 #define	IP_DEVMTFLAGS D_MP
-#endif
+#endif	/* _KERNEL */
+
+#define	IP_MOD_NAME	"ip"
+#define	IP_DEV_NAME	"/dev/ip"
+#define	IP6_DEV_NAME	"/dev/ip6"
+
+#define	UDP_MOD_NAME	"udp"
+#define	UDP_DEV_NAME	"/dev/udp"
+#define	UDP6_DEV_NAME	"/dev/udp6"
+
+#define	TCP_MOD_NAME	"tcp"
+#define	TCP_DEV_NAME	"/dev/tcp"
+#define	TCP6_DEV_NAME	"/dev/tcp6"
 
 /* Minor numbers */
 #define	IPV4_MINOR	0
@@ -101,8 +114,6 @@
 #define	ILL_FRAG_HASH_TBL_COUNT	((unsigned int)64)
 #define	ILL_FRAG_HASH_TBL_SIZE	(ILL_FRAG_HASH_TBL_COUNT * sizeof (ipfb_t))
 
-#define	IP_DEV_NAME			"/dev/ip"
-#define	IP_MOD_NAME			"ip"
 #define	IPV4_ADDR_LEN			4
 #define	IP_ADDR_LEN			IPV4_ADDR_LEN
 #define	IP_ARP_PROTO_TYPE		0x0800
@@ -236,6 +247,7 @@
 
 #define	Q_TO_CONN(q)	((conn_t *)(q)->q_ptr)
 #define	Q_TO_TCP(q)	(Q_TO_CONN((q))->conn_tcp)
+#define	Q_TO_UDP(q)	(Q_TO_CONN((q))->conn_udp)
 
 /*
  * The following two macros are used by IP to get the appropriate
@@ -244,13 +256,10 @@
  * from a conn directly if it knows that the conn is not TCP.
  */
 #define	CONNP_TO_WQ(connp)	\
-	(((connp)->conn_tcp == NULL) ? (connp)->conn_wq :	\
-	(connp)->conn_tcp->tcp_wq)
+	(IPCL_IS_TCP(connp) ? (connp)->conn_tcp->tcp_wq : (connp)->conn_wq)
 
 #define	CONNP_TO_RQ(connp)	RD(CONNP_TO_WQ(connp))
 
-#define	IS_TCP_CONN(connp)	(((connp)->conn_flags & IPCL_TCP) != 0)
-
 #define	GRAB_CONN_LOCK(q)	{				\
 	if (q != NULL && CONN_Q(q))				\
 		mutex_enter(&(Q_TO_CONN(q))->conn_lock);	\
@@ -302,9 +311,8 @@
  */
 #define	IP6_NO_IPPOLICY		0x800	/* Don't do IPQoS processing */
 #define	IP6_IN_LLMCAST		0x1000	/* Multicast */
-#define	IP6_IN_NOCKSUM		0x2000	/* Don't compute checksum */
-
-#define	IP_FF_LOOPBACK		0x4000	/* Loopback fanout */
+
+#define	IP_FF_LOOPBACK		0x2000	/* Loopback fanout */
 
 #ifndef	IRE_DB_TYPE
 #define	IRE_DB_TYPE	M_SIG
@@ -357,6 +365,8 @@
 	uint_t		ipf_prev_nexthdr_offset; /* Offset for nexthdr value */
 	uint8_t		ipf_ecn;	/* ECN info for the fragments */
 	uint8_t		ipf_num_dups;	/* Number of times dup frags recvd */
+	uint16_t	ipf_checksum_flags; /* Hardware checksum flags */
+	uint32_t	ipf_checksum;	/* Partial checksum of fragment data */
 } ipf_t;
 
 #define	ipf_src	V4_PART_OF_V6(ipf_v6src)
@@ -623,9 +633,10 @@
  * depends on the atomic 32 bit access to that field.
  */
 #define	CONN_CLOSING		0x01	/* ip_close waiting for ip_wsrv */
-#define	CONN_IPSEC_LOAD_WAIT	0x10	/* waiting for load */
-#define	CONN_CONDEMNED		0x40	/* conn is closing, no more refs */
-#define	CONN_INCIPIENT		0x80	/* conn not yet visible, no refs */
+#define	CONN_IPSEC_LOAD_WAIT	0x02	/* waiting for load */
+#define	CONN_CONDEMNED		0x04	/* conn is closing, no more refs */
+#define	CONN_INCIPIENT		0x08	/* conn not yet visible, no refs */
+#define	CONN_QUIESCED		0x10	/* conn is now quiescent */
 
 /*
  * Parameter to ip_output giving the identity of the caller.
@@ -2593,6 +2604,7 @@
 
 extern int ip_g_forward;
 extern int ipv6_forward;
+extern vmem_t *ip_minor_arena;
 
 #define	ip_respond_to_address_mask_broadcast ip_param_arr[0].ip_param_value
 #define	ip_g_send_redirects		ip_param_arr[5].ip_param_value
@@ -2697,18 +2709,11 @@
 #define	ip1dbg(a)	if (ip_debug > 2) printf a
 #define	ip2dbg(a)	if (ip_debug > 3) printf a
 #define	ip3dbg(a)	if (ip_debug > 4) printf a
-
-#define	ipcsumdbg(a, b) \
-	if (ip_debug == 1) \
-		prom_printf(a); \
-	else if (ip_debug > 1) \
-		{ prom_printf("%smp=%p\n", a, (void *)b); }
 #else
 #define	ip0dbg(a)	/* */
 #define	ip1dbg(a)	/* */
 #define	ip2dbg(a)	/* */
 #define	ip3dbg(a)	/* */
-#define	ipcsumdbg(a, b)	/* */
 #endif	/* IP_DEBUG */
 
 extern const char *dlpi_prim_str(int);
@@ -2717,7 +2722,6 @@
 extern ill_t	*ill_first(int, int, ill_walk_context_t *);
 extern ill_t	*ill_next(ill_walk_context_t *, ill_t *);
 extern void	ill_frag_timer_start(ill_t *);
-extern void	ip_ioctl_freemsg(mblk_t *);
 extern mblk_t	*ip_carve_mp(mblk_t **, ssize_t);
 extern mblk_t	*ip_dlpi_alloc(size_t, t_uscalar_t);
 extern char	*ip_dot_addr(ipaddr_t, char *);
@@ -2749,6 +2753,9 @@
 extern void	ip_rput_dlpi(queue_t *, mblk_t *);
 extern void	ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *);
 extern void	ip_rput_forward_multicast(ipaddr_t, mblk_t *, ipif_t *);
+
+extern int	ip_snmpmod_close(queue_t *);
+extern void	ip_snmpmod_wput(queue_t *, mblk_t *);
 extern void	ip_udp_input(queue_t *, mblk_t *, ipha_t *, ire_t *, ill_t *);
 extern void	ip_proto_input(queue_t *, mblk_t *, ipha_t *, ire_t *, ill_t *);
 extern void	ip_rput_other(ipsq_t *, queue_t *, mblk_t *, void *);
@@ -2821,6 +2828,7 @@
 extern int	ip_snmp_get(queue_t *q, mblk_t *mctl);
 extern int	ip_snmp_set(queue_t *q, int, int, uchar_t *, int);
 extern void	ip_process_ioctl(ipsq_t *, queue_t *, mblk_t *, void *);
+extern void	ip_quiesce_conn(conn_t *);
 extern  void    ip_reprocess_ioctl(ipsq_t *, queue_t *, mblk_t *, void *);
 extern void	ip_restart_optmgmt(ipsq_t *, queue_t *, mblk_t *, void *);
 extern void	ip_ioctl_finish(queue_t *, mblk_t *, int, int, ipif_t *,
@@ -2842,6 +2850,7 @@
 			uint32_t, uint32_t, uint32_t, uint32_t);
 extern boolean_t ip_md_zcopy_attr(struct multidata_s *, struct pdesc_s *,
 			uint_t);
+extern mblk_t	*ip_unbind(queue_t *, mblk_t *);
 
 /* Hooks for CGTP (multirt routes) filtering module */
 #define	CGTP_FILTER_REV_1	1
@@ -2925,17 +2934,6 @@
 	uint_t ill_mdt_span_limit; /* maximum payload span per packet */
 };
 
-/*
- * ioctl identifier and structure for Multidata Transmit update
- * private M_CTL communication from IP to ULP.
- */
-#define	MDT_IOC_INFO_UPDATE	(('M' << 8) + 1020)
-
-typedef struct ip_mdt_info_s {
-	uint_t	mdt_info_id;	/* MDT_IOC_INFO_UPDATE */
-	ill_mdt_capab_t	mdt_capab; /* ILL MDT capabilities */
-} ip_mdt_info_t;
-
 struct ill_hcksum_capab_s {
 	uint_t	ill_hcksum_version;	/* interface version */
 	uint_t	ill_hcksum_txflags;	/* capabilities on transmit */
@@ -2991,35 +2989,6 @@
 };
 
 /*
- * Macro that determines whether or not a given ILL is allowed for MDT.
- */
-#define	ILL_MDT_USABLE(ill)	\
-	((ill->ill_capabilities & ILL_CAPAB_MDT) != 0 &&		\
-	ill->ill_mdt_capab != NULL &&					\
-	ill->ill_mdt_capab->ill_mdt_version == MDT_VERSION_2 &&		\
-	ill->ill_mdt_capab->ill_mdt_on != 0)
-
-/*
- * Macro that determines whether or not a given CONN may be considered
- * for fast path prior to proceeding further with Multidata.
- */
-#define	CONN_IS_MD_FASTPATH(connp)	\
-	((connp)->conn_dontroute == 0 &&	/* SO_DONTROUTE */	\
-	(connp)->conn_nofailover_ill == NULL &&	/* IPIF_NOFAILOVER */	\
-	(connp)->conn_xmit_if_ill == NULL &&	/* IP_XMIT_IF */	\
-	(connp)->conn_outgoing_pill == NULL &&	/* IP{V6}_BOUND_PIF */	\
-	(connp)->conn_outgoing_ill == NULL)	/* IP{V6}_BOUND_IF */
-
-/*
- * Macro that determines whether or not a given IPC requires
- * outbound IPSEC processing.
- */
-#define	CONN_IPSEC_OUT_ENCAPSULATED(connp)	\
-	((connp)->conn_out_enforce_policy ||	\
-	((connp)->conn_latch != NULL &&		\
-	(connp)->conn_latch->ipl_out_policy != NULL))
-
-/*
  * IP squeues exports
  */
 extern int 		ip_squeue_profile;
@@ -3049,12 +3018,15 @@
 extern int ip_squeue_bind_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
 extern int ip_squeue_bind_get(queue_t *, mblk_t *, caddr_t, cred_t *);
 extern void ip_squeue_clean(void *, mblk_t *, void *);
-
-extern	void	ip_resume_tcp_bind(void *, mblk_t *mp, void *);
+extern void ip_resume_tcp_bind(void *, mblk_t *, void *);
+
+extern void tcp_wput(queue_t *, mblk_t *);
+
 extern int	ip_fill_mtuinfo(struct in6_addr *, in_port_t,
 	struct ip6_mtuinfo *);
-
-typedef	void	(*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *);
+extern	ipif_t *conn_get_held_ipif(conn_t *, ipif_t **, int *);
+
+typedef void    (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *);
 
 /*
  * Squeue tags. Tags only need to be unique when the callback function is the
@@ -3091,6 +3063,11 @@
 #define	SQTAG_TCP_WPUT_OTHER		28
 #define	SQTAG_TCP_CONN_REQ_UNBOUND	29
 #define	SQTAG_TCP_SEND_PENDING		30
+#define	SQTAG_BIND_RETRY		31
+#define	SQTAG_UDP_FANOUT		32
+#define	SQTAG_UDP_INPUT			33
+#define	SQTAG_UDP_WPUT			34
+#define	SQTAG_UDP_OUTPUT		35
 
 #endif	/* _KERNEL */
 
--- a/usr/src/uts/common/inet/ip/igmp.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip/igmp.c	Sat Oct 22 22:50:14 2005 -0700
@@ -1925,6 +1925,8 @@
 	igmpa->igmpa_group  = ilm->ilm_addr;
 	igmpa->igmpa_cksum  = 0;
 	igmpa->igmpa_cksum  = IP_CSUM(mp, hdrlen, 0);
+	if (igmpa->igmpa_cksum == 0)
+		igmpa->igmpa_cksum = 0xffff;
 
 	rtralert[0] = IPOPT_COPY & IPOPT_RTRALERT;
 	rtralert[1] = RTRALERT_LEN;
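
The guard added above relies on one's-complement arithmetic: 0x0000
and 0xffff are the same value (+0 and -0), so folding a computed
checksum of 0 to 0xffff changes nothing for a correct verifier, while
avoiding a 0 on the wire (which for UDP would mean "no checksum was
computed", and which some checksum-offload engines treat specially).
A minimal, runnable user-level sketch of the RFC 1071-style sum and
the folding (illustrative only, not the kernel's IP_CSUM()):

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    /* One's-complement checksum over 16-bit words (RFC 1071). */
    static uint16_t
    inet_cksum(const uint16_t *buf, size_t len)
    {
            uint32_t sum = 0;

            while (len > 1) {
                    sum += *buf++;
                    len -= 2;
            }
            if (len == 1)                   /* odd trailing byte */
                    sum += *(const uint8_t *)buf;
            while (sum >> 16)               /* fold carries back in */
                    sum = (sum & 0xffff) + (sum >> 16);
            return ((uint16_t)~sum);
    }

    int main(void)
    {
            /* This payload's complemented sum comes out to 0. */
            uint16_t words[] = { 0xffff, 0x0000 };
            uint16_t ck = inet_cksum(words, sizeof (words));

            printf("computed 0x%04x -> wire value 0x%04x\n",
                (unsigned)ck, (unsigned)(ck == 0 ? 0xffff : ck));
            return (0);
    }
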
--- a/usr/src/uts/common/inet/ip/ip.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip/ip.c	Sat Oct 22 22:50:14 2005 -0700
@@ -75,9 +75,11 @@
 #include <netinet/sctp.h>
 
 #include <inet/ip.h>
+#include <inet/ip_impl.h>
 #include <inet/ip6.h>
 #include <inet/ip6_asp.h>
 #include <inet/tcp.h>
+#include <inet/tcp_impl.h>
 #include <inet/ip_multi.h>
 #include <inet/ip_if.h>
 #include <inet/ip_ire.h>
@@ -110,6 +112,7 @@
 
 #include <inet/ipclassifier.h>
 #include <inet/sctp_ip.h>
+#include <inet/udp_impl.h>
 
 /*
  * Values for squeue switch:
@@ -122,7 +125,8 @@
 /*
  * IP statistics.
  */
-#define	IP_STAT(x)	(ip_statistics.x.value.ui64++)
+#define	IP_STAT(x)		(ip_statistics.x.value.ui64++)
+#define	IP_STAT_UPDATE(x, n)	(ip_statistics.x.value.ui64 += (n))
 
 typedef struct ip_stat {
 	kstat_named_t	ipsec_fanout_proto;
@@ -158,42 +162,68 @@
 	kstat_named_t   ip_ire_redirect_timer_expired;
 	kstat_named_t	ip_ire_pmtu_timer_expired;
 	kstat_named_t	ip_input_multi_squeue;
+	kstat_named_t	ip_tcp_in_full_hw_cksum_err;
+	kstat_named_t	ip_tcp_in_part_hw_cksum_err;
+	kstat_named_t	ip_tcp_in_sw_cksum_err;
+	kstat_named_t	ip_tcp_out_sw_cksum_bytes;
+	kstat_named_t	ip_udp_in_full_hw_cksum_err;
+	kstat_named_t	ip_udp_in_part_hw_cksum_err;
+	kstat_named_t	ip_udp_in_sw_cksum_err;
+	kstat_named_t	ip_udp_out_sw_cksum_bytes;
+	kstat_named_t	ip_frag_mdt_pkt_out;
+	kstat_named_t	ip_frag_mdt_discarded;
+	kstat_named_t	ip_frag_mdt_allocfail;
+	kstat_named_t	ip_frag_mdt_addpdescfail;
+	kstat_named_t	ip_frag_mdt_allocd;
 } ip_stat_t;
 
 static ip_stat_t ip_statistics = {
-	{ "ipsec_fanout_proto", 	KSTAT_DATA_UINT64 },
-	{ "ip_udp_fannorm", 		KSTAT_DATA_UINT64 },
-	{ "ip_udp_fanmb", 		KSTAT_DATA_UINT64 },
-	{ "ip_udp_fanothers", 		KSTAT_DATA_UINT64 },
-	{ "ip_udp_fast_path", 		KSTAT_DATA_UINT64 },
-	{ "ip_udp_slow_path", 		KSTAT_DATA_UINT64 },
-	{ "ip_udp_input_err", 		KSTAT_DATA_UINT64 },
-	{ "ip_tcppullup", 		KSTAT_DATA_UINT64 },
-	{ "ip_tcpoptions", 		KSTAT_DATA_UINT64 },
-	{ "ip_multipkttcp", 		KSTAT_DATA_UINT64 },
-	{ "ip_tcp_fast_path",		KSTAT_DATA_UINT64 },
-	{ "ip_tcp_slow_path",		KSTAT_DATA_UINT64 },
-	{ "ip_tcp_input_error",		KSTAT_DATA_UINT64 },
-	{ "ip_db_ref",			KSTAT_DATA_UINT64 },
-	{ "ip_notaligned1",		KSTAT_DATA_UINT64 },
-	{ "ip_notaligned2",		KSTAT_DATA_UINT64 },
-	{ "ip_multimblk3",		KSTAT_DATA_UINT64 },
-	{ "ip_multimblk4",		KSTAT_DATA_UINT64 },
-	{ "ip_ipoptions",		KSTAT_DATA_UINT64 },
-	{ "ip_classify_fail",		KSTAT_DATA_UINT64 },
-	{ "ip_opt",			KSTAT_DATA_UINT64 },
-	{ "ip_udp_rput_local",		KSTAT_DATA_UINT64 },
-	{ "ipsec_proto_ahesp",		KSTAT_DATA_UINT64 },
-	{ "ip_conn_flputbq",		KSTAT_DATA_UINT64 },
-	{ "ip_conn_walk_drain",		KSTAT_DATA_UINT64 },
-	{ "ip_out_sw_cksum",		KSTAT_DATA_UINT64 },
-	{ "ip_in_sw_cksum",		KSTAT_DATA_UINT64 },
-	{ "ip_trash_ire_reclaim_calls",	KSTAT_DATA_UINT64 },
+	{ "ipsec_fanout_proto",			KSTAT_DATA_UINT64 },
+	{ "ip_udp_fannorm",			KSTAT_DATA_UINT64 },
+	{ "ip_udp_fanmb",			KSTAT_DATA_UINT64 },
+	{ "ip_udp_fanothers",			KSTAT_DATA_UINT64 },
+	{ "ip_udp_fast_path",			KSTAT_DATA_UINT64 },
+	{ "ip_udp_slow_path",			KSTAT_DATA_UINT64 },
+	{ "ip_udp_input_err",			KSTAT_DATA_UINT64 },
+	{ "ip_tcppullup",			KSTAT_DATA_UINT64 },
+	{ "ip_tcpoptions",			KSTAT_DATA_UINT64 },
+	{ "ip_multipkttcp",			KSTAT_DATA_UINT64 },
+	{ "ip_tcp_fast_path",			KSTAT_DATA_UINT64 },
+	{ "ip_tcp_slow_path",			KSTAT_DATA_UINT64 },
+	{ "ip_tcp_input_error",			KSTAT_DATA_UINT64 },
+	{ "ip_db_ref",				KSTAT_DATA_UINT64 },
+	{ "ip_notaligned1",			KSTAT_DATA_UINT64 },
+	{ "ip_notaligned2",			KSTAT_DATA_UINT64 },
+	{ "ip_multimblk3",			KSTAT_DATA_UINT64 },
+	{ "ip_multimblk4",			KSTAT_DATA_UINT64 },
+	{ "ip_ipoptions",			KSTAT_DATA_UINT64 },
+	{ "ip_classify_fail",			KSTAT_DATA_UINT64 },
+	{ "ip_opt",				KSTAT_DATA_UINT64 },
+	{ "ip_udp_rput_local",			KSTAT_DATA_UINT64 },
+	{ "ipsec_proto_ahesp",			KSTAT_DATA_UINT64 },
+	{ "ip_conn_flputbq",			KSTAT_DATA_UINT64 },
+	{ "ip_conn_walk_drain",			KSTAT_DATA_UINT64 },
+	{ "ip_out_sw_cksum",			KSTAT_DATA_UINT64 },
+	{ "ip_in_sw_cksum",			KSTAT_DATA_UINT64 },
+	{ "ip_trash_ire_reclaim_calls",		KSTAT_DATA_UINT64 },
 	{ "ip_trash_ire_reclaim_success",	KSTAT_DATA_UINT64 },
-	{ "ip_ire_arp_timer_expired",	KSTAT_DATA_UINT64 },
+	{ "ip_ire_arp_timer_expired",		KSTAT_DATA_UINT64 },
 	{ "ip_ire_redirect_timer_expired",	KSTAT_DATA_UINT64 },
-	{ "ip_ire_pmtu_timer_expired",	KSTAT_DATA_UINT64 },
-	{ "ip_input_multi_squeue",	KSTAT_DATA_UINT64 },
+	{ "ip_ire_pmtu_timer_expired",		KSTAT_DATA_UINT64 },
+	{ "ip_input_multi_squeue",		KSTAT_DATA_UINT64 },
+	{ "ip_tcp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
+	{ "ip_tcp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
+	{ "ip_tcp_in_sw_cksum_err",		KSTAT_DATA_UINT64 },
+	{ "ip_tcp_out_sw_cksum_bytes",		KSTAT_DATA_UINT64 },
+	{ "ip_udp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
+	{ "ip_udp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
+	{ "ip_udp_in_sw_cksum_err",		KSTAT_DATA_UINT64 },
+	{ "ip_udp_out_sw_cksum_bytes",		KSTAT_DATA_UINT64 },
+	{ "ip_frag_mdt_pkt_out",		KSTAT_DATA_UINT64 },
+	{ "ip_frag_mdt_discarded",		KSTAT_DATA_UINT64 },
+	{ "ip_frag_mdt_allocfail",		KSTAT_DATA_UINT64 },
+	{ "ip_frag_mdt_addpdescfail",		KSTAT_DATA_UINT64 },
+	{ "ip_frag_mdt_allocd",			KSTAT_DATA_UINT64 },
 };
 
 static kstat_t *ip_kstat;
@@ -591,28 +621,12 @@
 /* RFC1122 Conformance */
 #define	IP_FORWARD_DEFAULT	IP_FORWARD_NEVER
 
-#ifdef	_BIG_ENDIAN
-#define	IP_HDR_CSUM_TTL_ADJUST	256
-#define	IP_TCP_CSUM_COMP	IPPROTO_TCP
-#define	IP_UDP_CSUM_COMP	IPPROTO_UDP
-#else
-#define	IP_HDR_CSUM_TTL_ADJUST	1
-#define	IP_TCP_CSUM_COMP	(IPPROTO_TCP << 8)
-#define	IP_UDP_CSUM_COMP	(IPPROTO_UDP << 8)
-#endif
-
-#define	TCP_CHECKSUM_OFFSET		16
-#define	UDP_CHECKSUM_OFFSET		6
-
 #define	ILL_MAX_NAMELEN			LIFNAMSIZ
 
-#define	UDPH_SIZE	8
-
 /* Leave room for ip_newroute to tack on the src and target addresses */
 #define	OK_RESOLVER_MP(mp)						\
 	((mp) && ((mp)->b_wptr - (mp)->b_rptr) >= (2 * IP_ADDR_LEN))
 
-static ipif_t	*conn_get_held_ipif(conn_t *, ipif_t **, int *);
 static int	conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *);
 
 static mblk_t	*ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t);
@@ -668,6 +682,8 @@
 static boolean_t	ip_rput_local_options(queue_t *, mblk_t *, ipha_t *,
 			    ire_t *);
 static int	ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *);
+static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *,
+		    uint16_t *);
 int		ip_snmp_get(queue_t *, mblk_t *);
 static mblk_t	*ip_snmp_get_mib2_ip(queue_t *, mblk_t *);
 static mblk_t	*ip_snmp_get_mib2_ip6(queue_t *, mblk_t *);
@@ -692,7 +708,6 @@
 static boolean_t	ip_source_routed(ipha_t *);
 static boolean_t	ip_source_route_included(ipha_t *);
 
-static void	ip_unbind(queue_t *, mblk_t *);
 static void	ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t);
 static mblk_t	*ip_wput_frag_copyhdr(uchar_t *, int, int);
 static void	ip_wput_local_options(ipha_t *);
@@ -767,6 +782,15 @@
 time_t	ip_g_frag_timeout = IP_FRAG_TIMEOUT;
 clock_t	ip_g_frag_timo_ms = IP_FRAG_TIMEOUT * 1000;
 
+/*
+ * Threshold which determines whether MDT should be used when
+ * generating IP fragments; payload size must be greater than
+ * this threshold for MDT to take place.
+ */
+#define	IP_WPUT_FRAG_MDT_MIN	32768
+
+int	ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN;
+
 /* Protected by ip_mi_lock */
 static void	*ip_g_head;		/* Instance Data List Head */
 kmutex_t	ip_mi_lock;		/* Lock for list of instances */
@@ -1431,7 +1455,7 @@
 };
 
 struct module_info ip_mod_info = {
-	5701, "ip", 1, INFPSZ, 65536, 1024
+	IP_MOD_ID, IP_MOD_NAME, 1, INFPSZ, 65536, 1024
 };
 
 static struct qinit rinit = {
@@ -1930,6 +1954,8 @@
 	/* Send out an ICMP packet */
 	icmph->icmph_checksum = 0;
 	icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0);
+	if (icmph->icmph_checksum == 0)
+		icmph->icmph_checksum = 0xFFFF;
 	if (broadcast || CLASSD(ipha->ipha_dst)) {
 		ipif_t	*ipif_chosen;
 		/*
@@ -3204,6 +3230,8 @@
 	bcopy(stuff, icmph, len);
 	icmph->icmph_checksum = 0;
 	icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
+	if (icmph->icmph_checksum == 0)
+		icmph->icmph_checksum = 0xFFFF;
 	BUMP_MIB(&icmp_mib, icmpOutMsgs);
 	put(q, ipsec_mp);
 }
@@ -3704,7 +3732,7 @@
 	ASSERT(!connp->conn_af_isv6);
 	connp->conn_pkt_isv6 = B_FALSE;
 
-	len = mp->b_wptr - mp->b_rptr;
+	len = MBLKL(mp);
 	if (len < (sizeof (*tbr) + 1)) {
 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 		    "ip_bind: bogus msg, len %ld", len);
@@ -3716,7 +3744,7 @@
 	protocol = *mp->b_wptr & 0xFF;
 	tbr = (struct T_bind_req *)mp->b_rptr;
 	/* Reset the message type in preparation for shipping it back. */
-	mp->b_datap->db_type = M_PCPROTO;
+	DB_TYPE(mp) = M_PCPROTO;
 
 	connp->conn_ulp = (uint8_t)protocol;
 
@@ -3762,8 +3790,8 @@
 	 */
 
 	mp1 = mp->b_cont;
-	ire_requested = (mp1 && mp1->b_datap->db_type == IRE_DB_REQ_TYPE);
-	ipsec_policy_set = (mp1 && mp1->b_datap->db_type == IPSEC_POLICY_SET);
+	ire_requested = (mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE);
+	ipsec_policy_set = (mp1 != NULL && DB_TYPE(mp1) == IPSEC_POLICY_SET);
 
 	switch (tbr->ADDR_length) {
 	default:
@@ -4169,7 +4197,7 @@
 	if (ip_multidata_outbound && !ipsec_policy_set && dst_ire != NULL &&
 	    !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) &&
 	    (md_ill = ire_to_ill(dst_ire), md_ill != NULL) &&
-	    (md_ill->ill_capabilities & ILL_CAPAB_MDT)) {
+	    ILL_MDT_CAPABLE(md_ill)) {
 		md_dst_ire = dst_ire;
 		IRE_REFHOLD(md_dst_ire);
 	}
@@ -4689,43 +4717,19 @@
 }
 
 /*
- * IP has been configured as _D_QNEXTLESS for the client side i.e the driver
- * instance. This implies that
- * 1. IP cannot access the read side q_next pointer directly - it must
- *    use routines like putnext and canputnext.
- * 2. ip_close must ensure that all sources of messages being putnext upstream
- *    are gone before qprocsoff is called.
- *
- * #2 is handled by having ip_close do the ipcl_hash_remove and wait for
- * conn_ref to drop to zero before calling qprocsoff.
- */
-
-/* ARGSUSED */
-int
-ip_close(queue_t *q, int flags)
-{
-	conn_t		*connp;
+ * This is called as part of close() for both IP and UDP
+ * in order to quiesce the conn.
+ */
+void
+ip_quiesce_conn(conn_t *connp)
+{
 	boolean_t	drain_cleanup_reqd = B_FALSE;
 	boolean_t	conn_ioctl_cleanup_reqd = B_FALSE;
 	boolean_t	ilg_cleanup_reqd = B_FALSE;
 
-	TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q);
-
-	/*
-	 * Call the appropriate delete routine depending on whether this is
-	 * a module or device.
-	 */
-	if (WR(q)->q_next != NULL) {
-		/* This is a module close */
-		return (ip_modclose((ill_t *)q->q_ptr));
-	}
-
-	connp = Q_TO_CONN(q);
-	ASSERT(connp->conn_tcp == NULL);
-
-	/*
-	 * We are being closed as /dev/ip or /dev/ip6.
-	 *
+	ASSERT(!IPCL_IS_TCP(connp));
+
+	/*
 	 * Mark the conn as closing; once marked, this conn must not be
 	 * inserted into any list in future.  E.g. conn_drain_insert()
 	 * won't insert this conn into the conn_drain_list.
@@ -4736,6 +4740,7 @@
 	 * cannot get set henceforth.
 	 */
 	mutex_enter(&connp->conn_lock);
+	ASSERT(!(connp->conn_state_flags & CONN_QUIESCED));
 	connp->conn_state_flags |= CONN_CLOSING;
 	if (connp->conn_idl != NULL)
 		drain_cleanup_reqd = B_TRUE;
@@ -4745,17 +4750,17 @@
 		ilg_cleanup_reqd = B_TRUE;
 	mutex_exit(&connp->conn_lock);
 
+	if (IPCL_IS_UDP(connp))
+		udp_quiesce_conn(connp);
+
 	if (conn_ioctl_cleanup_reqd)
 		conn_ioctl_cleanup(connp);
 
 	/*
 	 * Remove this conn from any fanout list it is on.
-	 * Then wait until the number of pending putnexts from
-	 * the fanout code drops to zero, before calling qprocsoff.
-	 * This is the guarantee a QNEXTLESS driver provides to
-	 * STREAMS, and is mentioned at the top of this function.
-	 */
-
+	 * Then wait for any threads currently operating
+	 * on this endpoint to finish.
+	 */
 	ipcl_hash_remove(connp);
 
 	/*
@@ -4776,7 +4781,6 @@
 
 	conn_delete_ire(connp, NULL);
 
-
 	/*
 	 * Now conn refcnt can increase only thru CONN_INC_REF_LOCKED.
 	 * callers from write side can't be there now because close
@@ -4787,7 +4791,29 @@
 	connp->conn_state_flags |= CONN_CONDEMNED;
 	while (connp->conn_ref != 1)
 		cv_wait(&connp->conn_cv, &connp->conn_lock);
+	connp->conn_state_flags |= CONN_QUIESCED;
 	mutex_exit(&connp->conn_lock);
+}
+
+/* ARGSUSED */
+int
+ip_close(queue_t *q, int flags)
+{
+	conn_t		*connp;
+
+	TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q);
+
+	/*
+	 * Call the appropriate delete routine depending on whether this is
+	 * a module or device.
+	 */
+	if (WR(q)->q_next != NULL) {
+		/* This is a module close */
+		return (ip_modclose((ill_t *)q->q_ptr));
+	}
+
+	connp = q->q_ptr;
+	ip_quiesce_conn(connp);
 
 	qprocsoff(q);
 
@@ -4801,6 +4827,15 @@
 	 * has completed, and service has completed or won't run in
 	 * future.
 	 */
+	ASSERT(connp->conn_ref == 1);
+
+	/*
+	 * A conn which was previously marked as IPCL_UDP cannot
+	 * retain the flag because it would have been cleared by
+	 * udp_close().
+	 */
+	ASSERT(!IPCL_IS_UDP(connp));
+
 	if (connp->conn_latch != NULL) {
 		IPLATCH_REFRELE(connp->conn_latch);
 		connp->conn_latch = NULL;
@@ -4827,6 +4862,83 @@
 	return (0);
 }
 
+int
+ip_snmpmod_close(queue_t *q)
+{
+	conn_t *connp = Q_TO_CONN(q);
+	ASSERT(connp->conn_flags & (IPCL_TCPMOD | IPCL_UDPMOD));
+
+	qprocsoff(q);
+
+	if (connp->conn_flags & IPCL_UDPMOD)
+		udp_close_free(connp);
+
+	if (connp->conn_cred != NULL) {
+		crfree(connp->conn_cred);
+		connp->conn_cred = NULL;
+	}
+	CONN_DEC_REF(connp);
+	q->q_ptr = WR(q)->q_ptr = NULL;
+	return (0);
+}
+
+/*
+ * Write side put procedure for TCP module or UDP module instance.  TCP/UDP
+ * as a module is only used for MIB browsers that push TCP/UDP over IP or ARP.
+ * The only supported primitives are T_SVR4_OPTMGMT_REQ and T_OPTMGMT_REQ.
+ * M_FLUSH messages and ioctls are only passed downstream; we don't flush our
+ * queues as we never enqueue messages there and we don't handle any ioctls.
+ * Everything else is freed.
+ */
+void
+ip_snmpmod_wput(queue_t *q, mblk_t *mp)
+{
+	conn_t	*connp = q->q_ptr;
+	pfi_t	setfn;
+	pfi_t	getfn;
+
+	ASSERT(connp->conn_flags & (IPCL_TCPMOD | IPCL_UDPMOD));
+
+	switch (DB_TYPE(mp)) {
+	case M_PROTO:
+	case M_PCPROTO:
+		if ((MBLKL(mp) >= sizeof (t_scalar_t)) &&
+		    ((((union T_primitives *)mp->b_rptr)->type ==
+			T_SVR4_OPTMGMT_REQ) ||
+		    (((union T_primitives *)mp->b_rptr)->type ==
+			T_OPTMGMT_REQ))) {
+			/*
+			 * These are the only TPI primitives supported.  Their
+			 * handling does not require tcp_t, but it does require
+			 * conn_t to check permissions.
+			 */
+			cred_t	*cr = DB_CREDDEF(mp, connp->conn_cred);
+
+			if (connp->conn_flags & IPCL_TCPMOD) {
+				setfn = tcp_snmp_set;
+				getfn = tcp_snmp_get;
+			} else {
+				setfn = udp_snmp_set;
+				getfn = udp_snmp_get;
+			}
+			if (!snmpcom_req(q, mp, setfn, getfn, cr)) {
+				freemsg(mp);
+				return;
+			}
+		} else if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, ENOTSUP))
+		    != NULL)
+			qreply(q, mp);
+		break;
+	case M_FLUSH:
+	case M_IOCTL:
+		putnext(q, mp);
+		break;
+	default:
+		freemsg(mp);
+		break;
+	}
+}
+
 /* Return the IP checksum for the IP header at "iph". */
 uint16_t
 ip_csum_hdr(ipha_t *ipha)
@@ -5081,7 +5193,7 @@
  * Send an ICMP error after patching up the packet appropriately.  Returns
  * non-zero if the appropriate MIB should be bumped; zero otherwise.
  */
-static int
+static boolean_t
 ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags,
     uint_t icmp_type, uint_t icmp_code, boolean_t mctl_present, zoneid_t zoneid)
 {
@@ -5103,8 +5215,8 @@
 		 * ipsec_check_global_policy() assumes M_DATA as clear
 		 * and M_CTL as secure.
 		 */
-		db_type = mp->b_datap->db_type;
-		mp->b_datap->db_type = M_DATA;
+		db_type = DB_TYPE(mp);
+		DB_TYPE(mp) = M_DATA;
 		secure = B_FALSE;
 	}
 	/*
@@ -5119,17 +5231,17 @@
 		first_mp = ipsec_check_global_policy(first_mp, NULL,
 		    ipha, NULL, mctl_present);
 		if (first_mp == NULL)
-			return (0);
+			return (B_FALSE);
 	}
 
 	if (!mctl_present)
-		mp->b_datap->db_type = db_type;
+		DB_TYPE(mp) = db_type;
 
 	if (flags & IP_FF_SEND_ICMP) {
 		if (flags & IP_FF_HDR_COMPLETE) {
 			if (ip_hdr_complete(ipha, zoneid)) {
 				freemsg(first_mp);
-				return (1);
+				return (B_TRUE);
 			}
 		}
 		if (flags & IP_FF_CKSUM) {
@@ -5152,10 +5264,10 @@
 		}
 	} else {
 		freemsg(first_mp);
-		return (0);
-	}
-
-	return (1);
+		return (B_FALSE);
+	}
+
+	return (B_TRUE);
 }
 
 #ifdef DEBUG
@@ -5592,7 +5704,7 @@
 			}
 
 			mp->b_datap->db_struioflag |= STRUIO_EAGER;
-			mp->b_datap->db_cksumstart = (intptr_t)sqp;
+			DB_CKSUMSTART(mp) = (intptr_t)sqp;
 			syn_present = B_TRUE;
 		}
 	}
@@ -5720,7 +5832,6 @@
     boolean_t secure, ipha_t *ipha, uint_t flags, ill_t *recv_ill,
     boolean_t ip_policy)
 {
-	queue_t		*rq = connp->conn_rq;
 	boolean_t	mctl_present = (first_mp != NULL);
 	uint32_t	in_flags = 0; /* set to IP_RECVSLLA and/or IP_RECVIF */
 	uint32_t	ill_index;
@@ -5730,7 +5841,7 @@
 	else
 		first_mp = mp;
 
-	if (!canputnext(rq)) {
+	if (CONN_UDP_FLOWCTLD(connp)) {
 		BUMP_MIB(&ip_mib, udpInOverflows);
 		freemsg(first_mp);
 		return;
@@ -5776,7 +5887,9 @@
 		mp = ip_add_info(mp, recv_ill, in_flags);
 	}
 	BUMP_MIB(&ip_mib, ipInDelivers);
-	putnext(rq, mp);
+
+	/* Send it upstream */
+	CONN_UDP_RECV(connp, mp);
 }
 
 /*
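(Note: the hunk above swaps the STREAMS pair canputnext()/putnext() for
the CONN_UDP_FLOWCTLD()/CONN_UDP_RECV() macros, letting a fused UDP
endpoint be checked and fed by direct function call instead of a queue
traversal.  The order of operations, in a hypothetical wrapper; the
macros come from this changeset, the wrapper function does not:

static void
udp_deliver_sketch(conn_t *connp, mblk_t *mp)
{
	if (CONN_UDP_FLOWCTLD(connp)) {		/* receiver backed up */
		BUMP_MIB(&ip_mib, udpInOverflows);
		freemsg(mp);
		return;
	}
	CONN_UDP_RECV(connp, mp);		/* deliver upstream */
})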
@@ -8454,7 +8567,6 @@
 		return (ip_modopen(q, devp, flag, sflag, credp));
 	}
 
-
 	/*
 	 * We are opening as a device. This is an IP client stream, and we
 	 * allocate an conn_t as the instance data.
@@ -8463,6 +8575,9 @@
 	connp->conn_upq = q;
 	q->q_ptr = WR(q)->q_ptr = connp;
 
+	if (flag & SO_SOCKSTR)
+		connp->conn_flags |= IPCL_SOCKET;
+
 	/* Minor tells us which /dev entry was opened */
 	if (geteminor(*devp) == IPV6_MINOR) {
 		connp->conn_flags |= IPCL_ISV6;
@@ -8474,9 +8589,7 @@
 		connp->conn_pkt_isv6 = B_FALSE;
 	}
 
-
-	if ((connp->conn_dev =
-	    inet_minor_alloc(ip_minor_arena)) == 0) {
+	if ((connp->conn_dev = inet_minor_alloc(ip_minor_arena)) == 0) {
 		q->q_ptr = WR(q)->q_ptr = NULL;
 		CONN_DEC_REF(connp);
 		return (EBUSY);
@@ -10734,381 +10847,455 @@
 }
 
 /*
- * Do fragmentation reassembly.
- * returns B_TRUE if successful else B_FALSE.
+ * Fragmentation reassembly.  Each ILL has a hash table for
+ * queuing packets undergoing reassembly for all IPIFs
+ * associated with the ILL.  The hash is based on the packet
+ * IP ident field.  The ILL frag hash table was allocated
+ * as a timer block at the time the ILL was created.  Whenever
+ * there is anything on the reassembly queue, the timer will
+ * be running.  Returns B_TRUE if successful, else B_FALSE;
  * frees mp on failure.
  */
 static boolean_t
-ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha)
+ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha,
+    uint32_t *cksum_val, uint16_t *cksum_flags)
 {
 	uint32_t	frag_offset_flags;
-	ill_t   *ill = (ill_t *)q->q_ptr;
-	mblk_t *mp = *mpp;
-	mblk_t *t_mp;
+	ill_t		*ill = (ill_t *)q->q_ptr;
+	mblk_t		*mp = *mpp;
+	mblk_t		*t_mp;
 	ipaddr_t	dst;
+	uint8_t		proto = ipha->ipha_protocol;
+	uint32_t	sum_val;
+	uint16_t	sum_flags;
+	ipf_t		*ipf;
+	ipf_t		**ipfp;
+	ipfb_t		*ipfb;
+	uint16_t	ident;
+	uint32_t	offset;
+	ipaddr_t	src;
+	uint_t		hdr_length;
+	uint32_t	end;
+	mblk_t		*mp1;
+	mblk_t		*tail_mp;
+	size_t		count;
+	size_t		msg_len;
+	uint8_t		ecn_info = 0;
+	uint32_t	packet_size;
+	boolean_t	pruned = B_FALSE;
+
+	if (cksum_val != NULL)
+		*cksum_val = 0;
+	if (cksum_flags != NULL)
+		*cksum_flags = 0;
 
 	/*
 	 * Drop the fragment as early as possible, if
 	 * we don't have the resources to reassemble it.
 	 */
-
 	if (ip_reass_queue_bytes == 0) {
 		freemsg(mp);
 		return (B_FALSE);
 	}
 
+	/* Check for fragmentation offset; return if there's none */
+	if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
+	    (IPH_MF | IPH_OFFSET)) == 0)
+		return (B_TRUE);
+
+	/*
+	 * We utilize hardware-computed checksum info only for UDP, since
+	 * IP fragmentation is a normal occurrence for the protocol.  In
+	 * addition, checksum offload support for IP fragments carrying
+	 * UDP payload is commonly implemented across network adapters.
+	 */
+	ASSERT(ill != NULL);
+	if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) &&
+	    (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
+		mblk_t *mp1 = mp->b_cont;
+		int32_t len;
+
+		/* Record checksum information from the packet */
+		sum_val = (uint32_t)DB_CKSUM16(mp);
+		sum_flags = DB_CKSUMFLAGS(mp);
+
+		/* IP payload offset from beginning of mblk */
+		offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr;
+
+		if ((sum_flags & HCK_PARTIALCKSUM) &&
+		    (mp1 == NULL || mp1->b_cont == NULL) &&
+		    offset >= DB_CKSUMSTART(mp) &&
+		    ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
+			uint32_t adj;
+			/*
+			 * Partial checksum has been calculated by hardware
+			 * and attached to the packet; in addition, any
+			 * prepended extraneous data is even byte aligned.
+			 * If any such data exists, we adjust the checksum;
+			 * this would also handle any postpended data.
+			 */
+			IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
+			    mp, mp1, len, adj);
+
+			/* One's complement subtract extraneous checksum */
+			if (adj >= sum_val)
+				sum_val = ~(adj - sum_val) & 0xFFFF;
+			else
+				sum_val -= adj;
+		}
+	} else {
+		sum_val = 0;
+		sum_flags = 0;
+	}
+
+	/* Clear hardware checksumming flag */
+	DB_CKSUMFLAGS(mp) = 0;
+
+	ident = ipha->ipha_ident;
+	offset = (frag_offset_flags << 3) & 0xFFFF;
+	src = ipha->ipha_src;
 	dst = ipha->ipha_dst;
-
-	/* Clear hardware checksumming flag if set */
-	mp->b_datap->db_struioun.cksum.flags = 0;
-
-	/* Check for fragmentation offset. */
-	frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
-	    (IPH_MF | IPH_OFFSET);
-	if (frag_offset_flags) {
-		ipf_t		*ipf;
-		ipf_t		**ipfp;
-		ipfb_t		*ipfb;
-		uint16_t	ident;
-		uint32_t	offset;
-		ipaddr_t	src;
-		uint_t		hdr_length;
-		uint32_t	end;
-		uint8_t		proto;
-		mblk_t		*mp1;
-		mblk_t		*tail_mp;
-		size_t		count;
-		size_t		msg_len;
-		uint8_t		ecn_info = 0;
-		uint32_t	packet_size;
-		boolean_t 	pruned = B_FALSE;
-
-		ident = ipha->ipha_ident;
-		offset = (frag_offset_flags << 3) & 0xFFFF;
-		src = ipha->ipha_src;
-		hdr_length = IPH_HDR_LENGTH(ipha);
-		end = ntohs(ipha->ipha_length) - hdr_length;
-
-		/*
-		 * if end == 0 then we have a packet with no data, so just
-		 * free it.
-		 */
-		if (end == 0) {
+	hdr_length = IPH_HDR_LENGTH(ipha);
+	end = ntohs(ipha->ipha_length) - hdr_length;
+
+	/* If end == 0 then we have a packet with no data, so just free it */
+	if (end == 0) {
+		freemsg(mp);
+		return (B_FALSE);
+	}
+
+	/* Record the ECN field info. */
+	ecn_info = (ipha->ipha_type_of_service & 0x3);
+	if (offset != 0) {
+		/*
+		 * If this isn't the first piece, strip the header, and
+		 * add the offset to the end value.
+		 */
+		mp->b_rptr += hdr_length;
+		end += offset;
+	}
+
+	msg_len = MBLKSIZE(mp);
+	tail_mp = mp;
+	while (tail_mp->b_cont != NULL) {
+		tail_mp = tail_mp->b_cont;
+		msg_len += MBLKSIZE(tail_mp);
+	}
+
+	/* If the reassembly list for this ILL will get too big, prune it */
+	if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
+	    ip_reass_queue_bytes) {
+		ill_frag_prune(ill,
+		    (ip_reass_queue_bytes < msg_len) ? 0 :
+		    (ip_reass_queue_bytes - msg_len));
+		pruned = B_TRUE;
+	}
+
+	ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
+	mutex_enter(&ipfb->ipfb_lock);
+
+	ipfp = &ipfb->ipfb_ipf;
+	/* Try to find an existing fragment queue for this packet. */
+	for (;;) {
+		ipf = ipfp[0];
+		if (ipf != NULL) {
+			/*
+			 * It has to match on ident and src/dst address.
+			 */
+			if (ipf->ipf_ident == ident &&
+			    ipf->ipf_src == src &&
+			    ipf->ipf_dst == dst &&
+			    ipf->ipf_protocol == proto) {
+				/*
+				 * If we have received too many
+				 * duplicate fragments for this packet,
+				 * free it.
+				 */
+				if (ipf->ipf_num_dups > ip_max_frag_dups) {
+					ill_frag_free_pkts(ill, ipfb, ipf, 1);
+					freemsg(mp);
+					mutex_exit(&ipfb->ipfb_lock);
+					return (B_FALSE);
+				}
+				/* Found it. */
+				break;
+			}
+			ipfp = &ipf->ipf_hash_next;
+			continue;
+		}
+
+		/*
+		 * If we pruned the list, do we want to store this new
+		 * fragment?  We apply an optimization here based on the
+		 * fact that most fragments will be received in order.
+		 * So if the offset of this incoming fragment is zero,
+		 * it is the first fragment of a new packet. We will
+		 * keep it.  Otherwise drop the fragment, as we have
+		 * probably pruned the packet already (since the
+		 * packet cannot be found).
+		 */
+		if (pruned && offset != 0) {
+			mutex_exit(&ipfb->ipfb_lock);
 			freemsg(mp);
 			return (B_FALSE);
 		}
-		proto = ipha->ipha_protocol;
-
-		/*
-		 * Fragmentation reassembly.  Each ILL has a hash table for
-		 * queuing packets undergoing reassembly for all IPIFs
-		 * associated with the ILL.  The hash is based on the packet
-		 * IP ident field.  The ILL frag hash table was allocated
-		 * as a timer block at the time the ILL was created.  Whenever
-		 * there is anything on the reassembly queue, the timer will
-		 * be running.
-		 */
-		ASSERT(ill != NULL);
-
-		/* Record the ECN field info. */
-		ecn_info = (ipha->ipha_type_of_service & 0x3);
-		if (offset != 0) {
-			/*
-			 * If this isn't the first piece, strip the header, and
-			 * add the offset to the end value.
-			 */
-			mp->b_rptr += hdr_length;
-			end += offset;
-		}
-
-		msg_len = mp->b_datap->db_lim - mp->b_datap->db_base;
-		tail_mp = mp;
-		while (tail_mp->b_cont != NULL) {
-			tail_mp = tail_mp->b_cont;
-			msg_len += tail_mp->b_datap->db_lim -
-			    tail_mp->b_datap->db_base;
-		}
-
-		/*
-		 * If the reassembly list for this ILL will get too big
-		 * prune it.
-		 */
-		if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
-		    ip_reass_queue_bytes) {
-			ill_frag_prune(ill,
-			    (ip_reass_queue_bytes < msg_len) ? 0 :
-			    (ip_reass_queue_bytes - msg_len));
-			pruned = B_TRUE;
-		}
-
-		ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
-		mutex_enter(&ipfb->ipfb_lock);
-
-		ipfp = &ipfb->ipfb_ipf;
-		/* Try to find an existing fragment queue for this packet. */
-		for (;;) {
-			ipf = ipfp[0];
-			if (ipf != NULL) {
-				/*
-				 * It has to match on ident and src/dst address.
-				 */
-				if (ipf->ipf_ident == ident &&
-				    ipf->ipf_src == src &&
-				    ipf->ipf_dst == dst &&
-				    ipf->ipf_protocol == proto) {
-					/*
-					 * If we have received too many
-					 * duplicate fragments for this packet
-					 * free it.
-					 */
-					if (ipf->ipf_num_dups >
-					    ip_max_frag_dups) {
-						ill_frag_free_pkts(ill, ipfb,
-						    ipf, 1);
-						freemsg(mp);
-						mutex_exit(&ipfb->ipfb_lock);
-						return (B_FALSE);
-					}
-					/* Found it. */
-					break;
-				}
-				ipfp = &ipf->ipf_hash_next;
-				continue;
-			}
-
-			/*
-			 * If we pruned the list, do we want to store this new
-			 * fragment?. We apply an optimization here based on the
-			 * fact that most fragments will be received in order.
-			 * So if the offset of this incoming fragment is zero,
-			 * it is the first fragment of a new packet. We will
-			 * keep it.  Otherwise drop the fragment, as we have
-			 * probably pruned the packet already (since the
-			 * packet cannot be found).
-			 */
-			if (pruned && offset != 0) {
-				mutex_exit(&ipfb->ipfb_lock);
-				freemsg(mp);
-				return (B_FALSE);
-			}
-
-			if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS)  {
-				/*
-				 * Too many fragmented packets in this hash
-				 * bucket. Free the oldest.
-				 */
-				ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
-				    1);
-			}
-
-			/* New guy.  Allocate a frag message. */
-			mp1 = allocb(sizeof (*ipf), BPRI_MED);
-			if (mp1 == NULL) {
-				BUMP_MIB(&ip_mib, ipInDiscards);
-				freemsg(mp);
+
+		if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS)  {
+			/*
+			 * Too many fragmented packets in this hash
+			 * bucket. Free the oldest.
+			 */
+			ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
+		}
+
+		/* New guy.  Allocate a frag message. */
+		mp1 = allocb(sizeof (*ipf), BPRI_MED);
+		if (mp1 == NULL) {
+			BUMP_MIB(&ip_mib, ipInDiscards);
+			freemsg(mp);
 reass_done:
-				mutex_exit(&ipfb->ipfb_lock);
-				return (B_FALSE);
-			}
-
-
-			BUMP_MIB(&ip_mib, ipReasmReqds);
-			mp1->b_cont = mp;
-
-			/* Initialize the fragment header. */
-			ipf = (ipf_t *)mp1->b_rptr;
-			ipf->ipf_mp = mp1;
-			ipf->ipf_ptphn = ipfp;
-			ipfp[0] = ipf;
-			ipf->ipf_hash_next = NULL;
-			ipf->ipf_ident = ident;
-			ipf->ipf_protocol = proto;
-			ipf->ipf_src = src;
-			ipf->ipf_dst = dst;
-			ipf->ipf_nf_hdr_len = 0;
-			/* Record reassembly start time. */
-			ipf->ipf_timestamp = gethrestime_sec();
-			/* Record ipf generation and account for frag header */
-			ipf->ipf_gen = ill->ill_ipf_gen++;
-			ipf->ipf_count = mp1->b_datap->db_lim -
-			    mp1->b_datap->db_base;
-			ipf->ipf_last_frag_seen = B_FALSE;
-			ipf->ipf_ecn = ecn_info;
-			ipf->ipf_num_dups = 0;
-			ipfb->ipfb_frag_pkts++;
-
-			/*
-			 * We handle reassembly two ways.  In the easy case,
-			 * where all the fragments show up in order, we do
-			 * minimal bookkeeping, and just clip new pieces on
-			 * the end.  If we ever see a hole, then we go off
-			 * to ip_reassemble which has to mark the pieces and
-			 * keep track of the number of holes, etc.  Obviously,
-			 * the point of having both mechanisms is so we can
-			 * handle the easy case as efficiently as possible.
-			 */
-			if (offset == 0) {
-				/* Easy case, in-order reassembly so far. */
-				ipf->ipf_count += msg_len;
-				ipf->ipf_tail_mp = tail_mp;
-				/*
-				 * Keep track of next expected offset in
-				 * ipf_end.
-				 */
-				ipf->ipf_end = end;
-				ipf->ipf_nf_hdr_len = hdr_length;
-			} else {
-				/* Hard case, hole at the beginning. */
-				ipf->ipf_tail_mp = NULL;
-				/*
-				 * ipf_end == 0 means that we have given up
-				 * on easy reassembly.
-				 */
-				ipf->ipf_end = 0;
-				/*
-				 * ipf_hole_cnt is set by ip_reassemble.
-				 * ipf_count is updated by ip_reassemble.
-				 * No need to check for return value here
-				 * as we don't expect reassembly to complete
-				 * or fail for the first fragment itself.
-				 */
-				(void) ip_reassemble(mp, ipf,
-				    (frag_offset_flags & IPH_OFFSET) << 3,
-				    (frag_offset_flags & IPH_MF), ill, msg_len);
-			}
-			/* Update per ipfb and ill byte counts */
-			ipfb->ipfb_count += ipf->ipf_count;
-			ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
-			ill->ill_frag_count += ipf->ipf_count;
-			ASSERT(ill->ill_frag_count > 0); /* Wraparound */
-			/* If the frag timer wasn't already going, start it. */
-			mutex_enter(&ill->ill_lock);
-			ill_frag_timer_start(ill);
-			mutex_exit(&ill->ill_lock);
-			goto reass_done;
-		}
-
-		/*
-		 * We have a new piece of a datagram which is already being
-		 * reassembled.  Update the ECN info if all IP fragments
-		 * are ECN capable.  If there is one which is not, clear
-		 * all the info.  If there is at least one which has CE
-		 * code point, IP needs to report that up to transport.
-		 */
-		if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
-			if (ecn_info == IPH_ECN_CE)
-				ipf->ipf_ecn = IPH_ECN_CE;
-		} else {
-			ipf->ipf_ecn = IPH_ECN_NECT;
-		}
-		if (offset && ipf->ipf_end == offset) {
-			/* The new fragment fits at the end */
-			ipf->ipf_tail_mp->b_cont = mp;
-			/* Update the byte count */
+			mutex_exit(&ipfb->ipfb_lock);
+			return (B_FALSE);
+		}
+
+		BUMP_MIB(&ip_mib, ipReasmReqds);
+		mp1->b_cont = mp;
+
+		/* Initialize the fragment header. */
+		ipf = (ipf_t *)mp1->b_rptr;
+		ipf->ipf_mp = mp1;
+		ipf->ipf_ptphn = ipfp;
+		ipfp[0] = ipf;
+		ipf->ipf_hash_next = NULL;
+		ipf->ipf_ident = ident;
+		ipf->ipf_protocol = proto;
+		ipf->ipf_src = src;
+		ipf->ipf_dst = dst;
+		ipf->ipf_nf_hdr_len = 0;
+		/* Record reassembly start time. */
+		ipf->ipf_timestamp = gethrestime_sec();
+		/* Record ipf generation and account for frag header */
+		ipf->ipf_gen = ill->ill_ipf_gen++;
+		ipf->ipf_count = MBLKSIZE(mp1);
+		ipf->ipf_last_frag_seen = B_FALSE;
+		ipf->ipf_ecn = ecn_info;
+		ipf->ipf_num_dups = 0;
+		ipfb->ipfb_frag_pkts++;
+		ipf->ipf_checksum = 0;
+		ipf->ipf_checksum_flags = 0;
+
+		/* Store checksum value in fragment header */
+		if (sum_flags != 0) {
+			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+			ipf->ipf_checksum = sum_val;
+			ipf->ipf_checksum_flags = sum_flags;
+		}
+
+		/*
+		 * We handle reassembly two ways.  In the easy case,
+		 * where all the fragments show up in order, we do
+		 * minimal bookkeeping, and just clip new pieces on
+		 * the end.  If we ever see a hole, then we go off
+		 * to ip_reassemble which has to mark the pieces and
+		 * keep track of the number of holes, etc.  Obviously,
+		 * the point of having both mechanisms is so we can
+		 * handle the easy case as efficiently as possible.
+		 */
+		if (offset == 0) {
+			/* Easy case, in-order reassembly so far. */
 			ipf->ipf_count += msg_len;
-			/* Update per ipfb and ill byte counts */
-			ipfb->ipfb_count += msg_len;
-			ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
-			ill->ill_frag_count += msg_len;
-			ASSERT(ill->ill_frag_count > 0); /* Wraparound */
-			if (frag_offset_flags & IPH_MF) {
-				/* More to come. */
-				ipf->ipf_end = end;
-				ipf->ipf_tail_mp = tail_mp;
-				goto reass_done;
-			}
-		} else {
-			/* Go do the hard cases. */
-			int ret;
-
-			if (offset == 0)
-				ipf->ipf_nf_hdr_len = hdr_length;
-
-			/* Save current byte count */
-			count = ipf->ipf_count;
-			ret = ip_reassemble(mp, ipf,
+			ipf->ipf_tail_mp = tail_mp;
+			/*
+			 * Keep track of next expected offset in
+			 * ipf_end.
+			 */
+			ipf->ipf_end = end;
+			ipf->ipf_nf_hdr_len = hdr_length;
+		} else {
+			/* Hard case, hole at the beginning. */
+			ipf->ipf_tail_mp = NULL;
+			/*
+			 * ipf_end == 0 means that we have given up
+			 * on easy reassembly.
+			 */
+			ipf->ipf_end = 0;
+
+			/* Forget checksum offload from now on */
+			ipf->ipf_checksum_flags = 0;
+
+			/*
+			 * ipf_hole_cnt is set by ip_reassemble.
+			 * ipf_count is updated by ip_reassemble.
+			 * No need to check the return value here
+			 * as we don't expect reassembly to complete
+			 * or fail for the first fragment itself.
+			 */
+			(void) ip_reassemble(mp, ipf,
 			    (frag_offset_flags & IPH_OFFSET) << 3,
 			    (frag_offset_flags & IPH_MF), ill, msg_len);
-			/* Count of bytes added and subtracted (freeb()ed) */
-			count = ipf->ipf_count - count;
-			if (count) {
-				/* Update per ipfb and ill byte counts */
-				ipfb->ipfb_count += count;
-				ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
-				ill->ill_frag_count += count;
-				ASSERT(ill->ill_frag_count > 0);
-			}
-			if (ret == IP_REASS_PARTIAL) {
-				goto reass_done;
-			} else if (ret == IP_REASS_FAILED) {
-				/* Reassembly failed. Free up all resources */
-				ill_frag_free_pkts(ill, ipfb, ipf, 1);
-				for (t_mp = mp; t_mp != NULL;
-				    t_mp = t_mp->b_cont) {
-					IP_REASS_SET_START(t_mp, 0);
-					IP_REASS_SET_END(t_mp, 0);
-				}
-				freemsg(mp);
-				goto reass_done;
-			}
-			/* We will reach here iff 'ret' is IP_REASS_COMPLETE */
-		}
-		/*
-		 * We have completed reassembly.  Unhook the frag header from
-		 * the reassembly list.
-		 *
-		 * Before we free the frag header, record the ECN info
-		 * to report back to the transport.
-		 */
-		ecn_info = ipf->ipf_ecn;
-		BUMP_MIB(&ip_mib, ipReasmOKs);
-		ipfp = ipf->ipf_ptphn;
-		mp1 = ipf->ipf_mp;
+		}
+		/* Update per ipfb and ill byte counts */
+		ipfb->ipfb_count += ipf->ipf_count;
+		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
+		ill->ill_frag_count += ipf->ipf_count;
+		ASSERT(ill->ill_frag_count > 0); /* Wraparound */
+		/* If the frag timer wasn't already going, start it. */
+		mutex_enter(&ill->ill_lock);
+		ill_frag_timer_start(ill);
+		mutex_exit(&ill->ill_lock);
+		goto reass_done;
+	}
+
+	/*
+	 * If the packet's checksum flags have changed (it could be
+	 * coming up from an interface different from the previous one,
+	 * and hence with different checksum capabilities), then forget
+	 * about any stored checksum state.  Otherwise add the value to
+	 * the existing one stored in the fragment header.
+	 */
+	if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
+		sum_val += ipf->ipf_checksum;
+		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+		ipf->ipf_checksum = sum_val;
+	} else if (ipf->ipf_checksum_flags != 0) {
+		/* Forget checksum offload from now on */
+		ipf->ipf_checksum_flags = 0;
+	}
+
+	/*
+	 * We have a new piece of a datagram which is already being
+	 * reassembled.  Update the ECN info if all IP fragments
+	 * are ECN capable.  If there is one which is not, clear
+	 * all the info.  If there is at least one which has the CE
+	 * code point, IP needs to report that up to the transport.
+	 */
+	if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
+		if (ecn_info == IPH_ECN_CE)
+			ipf->ipf_ecn = IPH_ECN_CE;
+	} else {
+		ipf->ipf_ecn = IPH_ECN_NECT;
+	}
+	if (offset && ipf->ipf_end == offset) {
+		/* The new fragment fits at the end */
+		ipf->ipf_tail_mp->b_cont = mp;
+		/* Update the byte count */
+		ipf->ipf_count += msg_len;
+		/* Update per ipfb and ill byte counts */
+		ipfb->ipfb_count += msg_len;
+		ASSERT(ipfb->ipfb_count > 0);	/* Wraparound */
+		ill->ill_frag_count += msg_len;
+		ASSERT(ill->ill_frag_count > 0); /* Wraparound */
+		if (frag_offset_flags & IPH_MF) {
+			/* More to come. */
+			ipf->ipf_end = end;
+			ipf->ipf_tail_mp = tail_mp;
+			goto reass_done;
+		}
+	} else {
+		/* Go do the hard cases. */
+		int ret;
+
+		if (offset == 0)
+			ipf->ipf_nf_hdr_len = hdr_length;
+
+		/* Save current byte count */
 		count = ipf->ipf_count;
-		ipf = ipf->ipf_hash_next;
-		if (ipf)
-			ipf->ipf_ptphn = ipfp;
-		ipfp[0] = ipf;
-		ill->ill_frag_count -= count;
-		ASSERT(ipfb->ipfb_count >= count);
-		ipfb->ipfb_count -= count;
-		ipfb->ipfb_frag_pkts--;
-		mutex_exit(&ipfb->ipfb_lock);
-		/* Ditch the frag header. */
-		mp = mp1->b_cont;
-
-		freeb(mp1);
-
-		/* Restore original IP length in header. */
-		packet_size = (uint32_t)msgdsize(mp);
-		if (packet_size > IP_MAXPACKET) {
+		ret = ip_reassemble(mp, ipf,
+		    (frag_offset_flags & IPH_OFFSET) << 3,
+		    (frag_offset_flags & IPH_MF), ill, msg_len);
+		/* Count of bytes added and subtracted (freeb()ed) */
+		count = ipf->ipf_count - count;
+		if (count) {
+			/* Update per ipfb and ill byte counts */
+			ipfb->ipfb_count += count;
+			ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
+			ill->ill_frag_count += count;
+			ASSERT(ill->ill_frag_count > 0);
+		}
+		if (ret == IP_REASS_PARTIAL) {
+			goto reass_done;
+		} else if (ret == IP_REASS_FAILED) {
+			/* Reassembly failed. Free up all resources */
+			ill_frag_free_pkts(ill, ipfb, ipf, 1);
+			for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
+				IP_REASS_SET_START(t_mp, 0);
+				IP_REASS_SET_END(t_mp, 0);
+			}
 			freemsg(mp);
-			BUMP_MIB(&ip_mib, ipInHdrErrors);
+			goto reass_done;
+		}
+		/* We will reach here iff 'ret' is IP_REASS_COMPLETE */
+	}
+	/*
+	 * We have completed reassembly.  Unhook the frag header from
+	 * the reassembly list.
+	 *
+	 * Before we free the frag header, record the ECN info
+	 * to report back to the transport.
+	 */
+	ecn_info = ipf->ipf_ecn;
+	BUMP_MIB(&ip_mib, ipReasmOKs);
+	ipfp = ipf->ipf_ptphn;
+
+	/* We need to supply these to the caller */
+	if ((sum_flags = ipf->ipf_checksum_flags) != 0)
+		sum_val = ipf->ipf_checksum;
+	else
+		sum_val = 0;
+
+	mp1 = ipf->ipf_mp;
+	count = ipf->ipf_count;
+	ipf = ipf->ipf_hash_next;
+	if (ipf != NULL)
+		ipf->ipf_ptphn = ipfp;
+	ipfp[0] = ipf;
+	ill->ill_frag_count -= count;
+	ASSERT(ipfb->ipfb_count >= count);
+	ipfb->ipfb_count -= count;
+	ipfb->ipfb_frag_pkts--;
+	mutex_exit(&ipfb->ipfb_lock);
+	/* Ditch the frag header. */
+	mp = mp1->b_cont;
+
+	freeb(mp1);
+
+	/* Restore original IP length in header. */
+	packet_size = (uint32_t)msgdsize(mp);
+	if (packet_size > IP_MAXPACKET) {
+		freemsg(mp);
+		BUMP_MIB(&ip_mib, ipInHdrErrors);
+		return (B_FALSE);
+	}
+
+	if (DB_REF(mp) > 1) {
+		mblk_t *mp2 = copymsg(mp);
+
+		freemsg(mp);
+		if (mp2 == NULL) {
+			BUMP_MIB(&ip_mib, ipInDiscards);
 			return (B_FALSE);
 		}
-
-		if (mp->b_datap->db_ref > 1) {
-			mblk_t *mp2;
-
-			mp2 = copymsg(mp);
-			freemsg(mp);
-			if (!mp2) {
-				BUMP_MIB(&ip_mib, ipInDiscards);
-				return (B_FALSE);
-			}
-			mp = mp2;
-		}
-		ipha = (ipha_t *)mp->b_rptr;
-
-		ipha->ipha_length = htons((uint16_t)packet_size);
-		/* We're now complete, zip the frag state */
-		ipha->ipha_fragment_offset_and_flags = 0;
-		/* Record the ECN info. */
-		ipha->ipha_type_of_service &= 0xFC;
-		ipha->ipha_type_of_service |= ecn_info;
-		*mpp = mp;
-
-	}
+		mp = mp2;
+	}
+	ipha = (ipha_t *)mp->b_rptr;
+
+	ipha->ipha_length = htons((uint16_t)packet_size);
+	/* We're now complete, zip the frag state */
+	ipha->ipha_fragment_offset_and_flags = 0;
+	/* Record the ECN info. */
+	ipha->ipha_type_of_service &= 0xFC;
+	ipha->ipha_type_of_service |= ecn_info;
+	*mpp = mp;
+
+	/* Reassembly is successful; return checksum information if needed */
+	if (cksum_val != NULL)
+		*cksum_val = sum_val;
+	if (cksum_flags != NULL)
+		*cksum_flags = sum_flags;
+
 	return (B_TRUE);
 }
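(Note: the checksum bookkeeping threaded through the rewritten function
works because one's complement addition is associative and commutative;
each fragment's hardware-computed sum can be folded into a running
16-bit total regardless of arrival order.  The accumulation step in
isolation, mirroring the fold used above; the helper name is
illustrative:

#include <stdint.h>

static uint32_t
cksum_aggregate_sketch(uint32_t acc, uint32_t frag_sum)
{
	uint32_t sum = acc + frag_sum;

	sum = (sum & 0xFFFF) + (sum >> 16);	/* fold carries twice */
	sum = (sum & 0xFFFF) + (sum >> 16);
	return (sum);
})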
 
@@ -11156,16 +11343,12 @@
 {
 	uint32_t	sum;
 	uint32_t	u1;
-	uint32_t	u2;
 	boolean_t	mctl_present;
 	conn_t		*connp;
 	mblk_t		*first_mp;
-	mblk_t		*mp1;
-	dblk_t		*dp;
 	uint16_t	*up;
 	ill_t		*ill = (ill_t *)q->q_ptr;
-	uint32_t	ports;
-	boolean_t	cksum_computed = B_FALSE;
+	uint16_t	reass_hck_flags = 0;
 
 #define	rptr    ((uchar_t *)ipha)
 
@@ -11182,19 +11365,13 @@
 	    IP_SIMPLE_HDR_LENGTH_IN_WORDS);
 
 	/* IP options present */
-	if (u1)
+	if (u1 != 0)
 		goto ipoptions;
 
-#define	IS_IPHDR_HWCKSUM(mctl_present, mp, ill)				\
-	((!mctl_present) && (mp->b_datap->db_struioun.cksum.flags &	\
-	HCK_IPV4_HDRCKSUM) && (ill->ill_capabilities &			\
-	ILL_CAPAB_HCKSUM) && dohwcksum)
-
 	/* Check the IP header checksum.  */
-	if (IS_IPHDR_HWCKSUM(mctl_present, mp, ill)) {
+	if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
 		/* Clear the IP header h/w cksum flag */
-		mp->b_datap->db_struioun.cksum.flags &=
-		    ~HCK_IPV4_HDRCKSUM;
+		DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
 	} else {
 #define	uph	((uint16_t *)ipha)
 		sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] +
@@ -11207,7 +11384,7 @@
 		 * Don't verify header checksum if this packet is coming
 		 * back from AH/ESP as we already did it.
 		 */
-		if (!mctl_present && (sum && sum != 0xFFFF)) {
+		if (!mctl_present && sum != 0 && sum != 0xFFFF) {
 			BUMP_MIB(&ip_mib, ipInCksumErrs);
 			freemsg(first_mp);
 			return;
@@ -11236,133 +11413,52 @@
 	/* packet does not contain complete IP & UDP headers */
 	if ((mp->b_wptr - rptr) < (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE))
 		goto udppullup;
+
 	/* up points to UDP header */
 	up = (uint16_t *)((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH);
 #define	iphs    ((uint16_t *)ipha)
 
-#define	IP_CKSUM_RECV(len, u1, u2, mp, mp1, error, dp) {		\
-	boolean_t	doswcksum = B_TRUE;				\
-	uint_t		hcksumflags = 0;				\
-									\
-	hcksumflags = dp->db_struioun.cksum.flags;			\
-									\
-	/* Clear the hardware checksum flags; they have been consumed */\
-	dp->db_struioun.cksum.flags = 0;				\
-	if (hcksumflags && (ill->ill_capabilities & ILL_CAPAB_HCKSUM) &&\
-		dohwcksum) {						\
-		if (hcksumflags & HCK_FULLCKSUM) {			\
-			/* 						\
-			 * Full checksum has been computed by the	\
-			 * hardware and has been attached. 		\
-			 */						\
-			doswcksum = B_FALSE;				\
-			if (!(hcksumflags & HCK_FULLCKSUM_OK) &&	\
-			    (dp->db_cksum16 != 0xffff)) {		\
-				ipcsumdbg("full hwcksumerr\n", mp);	\
-				goto error;				\
-			}						\
-		} else if ((hcksumflags & HCK_PARTIALCKSUM) &&		\
-		    (((len = (IP_SIMPLE_HDR_LENGTH - dp->db_cksumstart))\
-		    & 1) == 0)) {					\
-			uint32_t	tot_len = 0;			\
-									\
-			doswcksum = B_FALSE;				\
-			/* Partial checksum computed */			\
-			u1 += dp->db_cksum16;				\
-			tot_len = mp->b_wptr - mp->b_rptr;		\
-			if (!mp1)					\
-				mp1 = mp;				\
-			else						\
-				tot_len += mp1->b_wptr - mp1->b_rptr;	\
-			if (len > 0) {					\
-				/* 					\
-				 * Prepended extraneous data. Adjust	\
-				 * checksum.				\
-				 */					\
-				u2 = IP_BCSUM_PARTIAL((uchar_t *)(rptr +\
-				    dp->db_cksumstart),	(int32_t)len, 	\
-				    0);					\
-			} else						\
-				u2 = 0;					\
-			if ((len = (dp->db_cksumend - tot_len)) > 0) {	\
-				/* 					\
-				 * Postpended extraneous data. Adjust	\
-				 * checksum.				\
-				 */					\
-				uint32_t	u3;			\
-									\
-				u3 = IP_BCSUM_PARTIAL(mp1->b_wptr, 	\
-				    (int32_t)len, 0);			\
-				if ((uintptr_t)mp1->b_wptr & 1)		\
-					/*				\
-					 * Postpended extraneous data	\
-					 * was odd byte aligned, so 	\
-					 * swap resulting checksum 	\
-					 * bytes.			\
-					 */				\
-					u2 += ((u3 << 8) & 0xffff) | 	\
-					    (u3 >> 8);			\
-				else					\
-					u2 += u3;			\
-				u2 = (u2 & 0xFFFF) + ((int)(u2) >> 16);	\
-			}						\
-			/*						\
-			 * One's complement subtract extraneous checksum\
-			 */						\
-			if (u2 >= u1)					\
-				u1 = ~(u2 - u1) & 0xFFFF;		\
-			else						\
-				u1 -= u2;				\
-			u1 = (u1 & 0xFFFF) + ((int)u1 >> 16);		\
-			if (~(u1) & 0xFFFF) {				\
-				ipcsumdbg("partial hwcksumerr\n", mp);	\
-				goto error;				\
-			}						\
-		} 							\
-	} 								\
-	if (doswcksum) {						\
-		IP_STAT(ip_in_sw_cksum);				\
-		if ((IP_CSUM(mp, (int32_t)((uchar_t *)up -		\
-		    (uchar_t *)ipha), u1)) != 0) {			\
-			ipcsumdbg("swcksumerr\n", mp);			\
-			goto error;					\
-		}							\
-	}								\
-}
-
-	dp = mp->b_datap;
 	/* if udp hdr cksum != 0, then need to checksum udp packet */
-	if (up[3]) {
-		cksum_computed = B_TRUE;
-		/* multiple mblks of udp data? */
-		if ((mp1 = mp->b_cont) != NULL) {
-			/* more than two? */
-			if (mp1->b_cont)
-				goto multipktudp;
-		}
+	if (up[3] != 0) {
+		mblk_t *mp1 = mp->b_cont;
+		boolean_t cksum_err;
+		uint16_t hck_flags = 0;
 
 		/* Pseudo-header checksum */
 		u1 = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
 		    iphs[9] + up[2];
-		if (!mctl_present) {
-			ssize_t len = 0;
-
-			IP_CKSUM_RECV(len, u1, u2, mp, mp1, udpcksumerr, dp);
-		} else {
-multipktudp:
+
+		/*
+		 * Revert to software checksum calculation if the interface
+		 * isn't capable of checksum offload or if IPsec is present.
+		 */
+		if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
+			hck_flags = DB_CKSUMFLAGS(mp);
+
+		if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
 			IP_STAT(ip_in_sw_cksum);
-			if ((IP_CSUM(mp, (int32_t)((uchar_t *)up -
-			    (uchar_t *)ipha), u1)) != 0) {
-udpcksumerr:
-				ip1dbg(("ip_udp_input: bad udp checksum\n"));
-				BUMP_MIB(&ip_mib, udpInCksumErrs);
-				freemsg(first_mp);
-				return;
-			}
-		}
-	}
-
-	/* broadcast IP packet? */
+
+		IP_CKSUM_RECV(hck_flags, u1,
+		    (uchar_t *)(rptr + DB_CKSUMSTART(mp)),
+		    (int32_t)((uchar_t *)up - rptr),
+		    mp, mp1, cksum_err);
+
+		if (cksum_err) {
+			BUMP_MIB(&ip_mib, udpInCksumErrs);
+
+			if (hck_flags & HCK_FULLCKSUM)
+				IP_STAT(ip_udp_in_full_hw_cksum_err);
+			else if (hck_flags & HCK_PARTIALCKSUM)
+				IP_STAT(ip_udp_in_part_hw_cksum_err);
+			else
+				IP_STAT(ip_udp_in_sw_cksum_err);
+
+			freemsg(first_mp);
+			return;
+		}
+	}
+
+	/* Non-fragmented broadcast or multicast packet? */
 	if (ire->ire_type == IRE_BROADCAST)
 		goto udpslowpath;
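(Note: the pseudo-header seed "u1" above, IP_UDP_CSUM_COMP plus
iphs[6..9] and up[2], is the RFC 768 pseudo header: source address,
destination address, protocol, and UDP length.  Expressed over 32-bit
addresses instead of raw header halfwords; byte-order folding is elided
and the helper is illustrative, not the kernel's:

#include <stdint.h>
#include <netinet/in.h>

static uint32_t
udp_pseudo_sum_sketch(uint32_t src, uint32_t dst, uint16_t udp_len)
{
	uint32_t sum;

	sum = (src >> 16) + (src & 0xFFFF);	/* iphs[6] + iphs[7] */
	sum += (dst >> 16) + (dst & 0xFFFF);	/* iphs[8] + iphs[9] */
	sum += IPPROTO_UDP;			/* IP_UDP_CSUM_COMP */
	sum += udp_len;				/* up[2] */
	return (sum);
})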
 
@@ -11371,7 +11467,7 @@
 		ASSERT(connp->conn_upq != NULL);
 		IP_STAT(ip_udp_fast_path);
 
-		if (!canputnext(connp->conn_upq)) {
+		if (CONN_UDP_FLOWCTLD(connp)) {
 			freemsg(mp);
 			BUMP_MIB(&ip_mib, udpInOverflows);
 		} else {
@@ -11383,7 +11479,8 @@
 			 */
 			if (ip_udp_check(q, connp, recv_ill,
 			    ipha, &mp, &first_mp, mctl_present)) {
-				putnext(connp->conn_upq, mp);
+				/* Send it upstream */
+				CONN_UDP_RECV(connp, mp);
 			}
 		}
 		/*
@@ -11416,9 +11513,13 @@
 	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
 	if (u1 & (IPH_MF | IPH_OFFSET)) {
 fragmented:
-		if (!ip_rput_fragment(q, &mp, ipha)) {
+		/*
+		 * "sum" and "reass_hck_flags" are non-zero if the
+		 * reassembled packet has valid hardware-computed
+		 * checksum information associated with it.
+		 */
+		if (!ip_rput_fragment(q, &mp, ipha, &sum, &reass_hck_flags))
 			goto slow_done;
-		}
 		/*
 		 * Make sure that first_mp points back to mp as
 		 * the mp we came in with could have changed in
@@ -11432,7 +11533,7 @@
 	/* Now we have a complete datagram, destined for this machine. */
 	u1 = IPH_HDR_LENGTH(ipha);
 	/* Pull up the UDP header, if necessary. */
-	if ((mp->b_wptr - mp->b_rptr) < (u1 + UDPH_SIZE)) {
+	if ((MBLKL(mp)) < (u1 + UDPH_SIZE)) {
 udppullup:
 		if (!pullupmsg(mp, u1 + UDPH_SIZE)) {
 			BUMP_MIB(&ip_mib, ipInDiscards);
@@ -11441,30 +11542,43 @@
 		}
 		ipha = (ipha_t *)mp->b_rptr;
 	}
-	/*
-	 * Validate the checksum.  This code is a bit funny looking
-	 * but may help out the compiler in this crucial spot.
+
+	/*
+	 * Validate the checksum for the reassembled packet; for the
+	 * pullup case we calculate the payload checksum in software.
 	 */
 	up = (uint16_t *)((uchar_t *)ipha + u1 + UDP_PORTS_OFFSET);
-	if (!cksum_computed && up[3]) {
-		IP_STAT(ip_in_sw_cksum);
-		sum = IP_CSUM(mp, (int32_t)((uchar_t *)up - (uchar_t *)ipha),
-		    IP_UDP_CSUM_COMP + iphs[6] +
-		    iphs[7] + iphs[8] +
-		    iphs[9] + up[2]);
-		if (sum != 0) {
-			ip1dbg(("ip_udp_input: bad udp checksum\n"));
-				BUMP_MIB(&ip_mib, udpInCksumErrs);
-				freemsg(first_mp);
-				goto slow_done;
+	if (up[3] != 0) {
+		boolean_t cksum_err;
+
+		if ((reass_hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
+			IP_STAT(ip_in_sw_cksum);
+
+		IP_CKSUM_RECV_REASS(reass_hck_flags,
+		    (int32_t)((uchar_t *)up - (uchar_t *)ipha),
+		    IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
+		    iphs[9] + up[2], sum, cksum_err);
+
+		if (cksum_err) {
+			BUMP_MIB(&ip_mib, udpInCksumErrs);
+
+			if (reass_hck_flags & HCK_FULLCKSUM)
+				IP_STAT(ip_udp_in_full_hw_cksum_err);
+			else if (reass_hck_flags & HCK_PARTIALCKSUM)
+				IP_STAT(ip_udp_in_part_hw_cksum_err);
+			else
+				IP_STAT(ip_udp_in_sw_cksum_err);
+
+			freemsg(first_mp);
+			goto slow_done;
 		}
 	}
 udpslowpath:
 
-	ports = *(uint32_t *)up;
-	/* Clear hardware checksum flag */
-	mp->b_datap->db_struioun.cksum.flags = 0;
-	ip_fanout_udp(q, first_mp, ill, ipha, ports,
+	/* Clear hardware checksum flag to be safe */
+	DB_CKSUMFLAGS(mp) = 0;
+
+	ip_fanout_udp(q, first_mp, ill, ipha, *(uint32_t *)up,
 	    (ire->ire_type == IRE_BROADCAST),
 	    IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_IP6INFO,
 	    mctl_present, B_TRUE, recv_ill, ire->ire_zoneid);
@@ -11473,6 +11587,7 @@
 	IP_STAT(ip_udp_slow_path);
 	return;
 
+#undef  iphs
 #undef  rptr
 }
 
@@ -11485,17 +11600,17 @@
 	conn_t		*connp;
 	uint32_t	sum;
 	uint32_t	u1;
-	uint32_t	u2;
 	uint16_t	*up;
 	int		offset;
 	ssize_t		len;
 	mblk_t		*mp1;
-	dblk_t		*dp;
 	boolean_t	syn_present = B_FALSE;
 	tcph_t		*tcph;
 	uint_t		ip_hdr_len;
 	ill_t		*ill = (ill_t *)q->q_ptr;
 	zoneid_t	zoneid = ire->ire_zoneid;
+	boolean_t	cksum_err;
+	uint16_t	hck_flags = 0;
 
 #define	rptr	((uchar_t *)ipha)
 
@@ -11514,10 +11629,9 @@
 		goto ipoptions;
 	} else {
 		/* Check the IP header checksum.  */
-		if (IS_IPHDR_HWCKSUM(mctl_present, mp, ill)) {
+		if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
 			/* Clear the IP header h/w cksum flag */
-			mp->b_datap->db_struioun.cksum.flags &=
-			    ~HCK_IPV4_HDRCKSUM;
+			DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
 		} else {
 #define	uph	((uint16_t *)ipha)
 			sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
@@ -11596,30 +11710,32 @@
 #endif
 	u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9];
 
-
-	/*
-	 * If the packet has gone through AH/ESP, do the checksum here
-	 * itself.
-	 *
-	 * If it has not gone through IPSEC processing and not a duped
-	 * mblk, then look for driver checksummed mblk. We validate or
-	 * postpone the checksum to TCP for single copy checksum.
-	 *
-	 * Note that we only honor HW cksum in the fastpath.
-	 */
-	dp = mp->b_datap;
-	if (!mctl_present) {
-		IP_CKSUM_RECV(len, u1, u2, mp, mp1, tcpcksumerr, dp);
-	} else {
+	/*
+	 * Revert to software checksum calculation if the interface
+	 * isn't capable of checksum offload or if IPsec is present.
+	 */
+	if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
+		hck_flags = DB_CKSUMFLAGS(mp);
+
+	if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
 		IP_STAT(ip_in_sw_cksum);
-		if ((IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr),
-		    u1)) != 0) {
-tcpcksumerr:
-			BUMP_MIB(&ip_mib, tcpInErrs);
-			ip1dbg(("ip_tcp_input: bad tcp checksum \n"));
-			freemsg(first_mp);
-			goto slow_done;
-		}
+
+	IP_CKSUM_RECV(hck_flags, u1,
+	    (uchar_t *)(rptr + DB_CKSUMSTART(mp)),
+	    (int32_t)((uchar_t *)up - rptr),
+	    mp, mp1, cksum_err);
+
+	if (cksum_err) {
+		BUMP_MIB(&ip_mib, tcpInErrs);
+
+		if (hck_flags & HCK_FULLCKSUM)
+			IP_STAT(ip_tcp_in_full_hw_cksum_err);
+		else if (hck_flags & HCK_PARTIALCKSUM)
+			IP_STAT(ip_tcp_in_part_hw_cksum_err);
+		else
+			IP_STAT(ip_tcp_in_sw_cksum_err);
+
+		goto error;
 	}
 
 try_again:
@@ -11654,7 +11770,7 @@
 	if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) {
 		if (IPCL_IS_TCP(connp)) {
 			mp->b_datap->db_struioflag |= STRUIO_EAGER;
-			mp->b_datap->db_cksumstart =
+			DB_CKSUMSTART(mp) =
 			    (intptr_t)ip_squeue_get(ill_ring);
 			if (IPCL_IS_FULLY_BOUND(connp) && !mctl_present &&
 			    !CONN_INBOUND_POLICY_PRESENT(connp)) {
@@ -11800,7 +11916,7 @@
 	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
 	if (u1 & (IPH_MF | IPH_OFFSET)) {
 fragmented:
-		if (!ip_rput_fragment(q, &mp, ipha)) {
+		if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) {
 			if (mctl_present)
 				freeb(first_mp);
 			goto slow_done;
@@ -11876,9 +11992,10 @@
 	 * ICMP's back, then this flag may need to be cleared in
 	 * other places as well.
 	 */
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	DB_CKSUMFLAGS(mp) = 0;
 
 	up = (uint16_t *)(rptr + u1 + TCP_PORTS_OFFSET);
+
 	u1 = (uint32_t)(len - u1);	/* TCP datagram length. */
 #ifdef	_BIG_ENDIAN
 	u1 += IPPROTO_TCP;
@@ -11890,7 +12007,7 @@
 	 * Not M_DATA mblk or it's a dup, so do the checksum now.
 	 */
 	IP_STAT(ip_in_sw_cksum);
-	if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1)) {
+	if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1) != 0) {
 		BUMP_MIB(&ip_mib, tcpInErrs);
 		goto error;
 	}
@@ -11937,12 +12054,12 @@
 		goto ipoptions;
 	} else {
 		/* Check the IP header checksum.  */
-		if (IS_IPHDR_HWCKSUM(mctl_present, mp, ill)) {
+		if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
 			/*
 			 * Since there is no SCTP h/w cksum support yet, just
 			 * clear the flag.
 			 */
-			mp->b_datap->db_struioun.cksum.flags = 0;
+			DB_CKSUMFLAGS(mp) = 0;
 		} else {
 #define	uph	((uint16_t *)ipha)
 			sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
@@ -12031,7 +12148,7 @@
 	return;
 
 ipoptions:
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	DB_CKSUMFLAGS(mp) = 0;
 	if (!ip_options_cksum(q, first_mp, ipha, ire))
 		goto slow_done;
 
@@ -12041,7 +12158,7 @@
 	u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
 	if (u1 & (IPH_MF | IPH_OFFSET)) {
 fragmented:
-		if (!ip_rput_fragment(q, &mp, ipha))
+		if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL))
 			goto slow_done;
 		/*
 		 * Make sure that first_mp points back to mp as
@@ -12183,7 +12300,7 @@
 	 * Clear the indication that this may have a hardware checksum
 	 * as we are not using it
 	 */
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	DB_CKSUMFLAGS(mp) = 0;
 
 	/*
 	 * Now hand the packet to ip_newroute.
@@ -12351,7 +12468,7 @@
 			 * Clear the indication that this may have
 			 * hardware checksum as we are not using it.
 			 */
-			mp->b_datap->db_struioun.cksum.flags = 0;
+			DB_CKSUMFLAGS(mp) = 0;
 			icmp_unreachable(q, mp,
 			    ICMP_SOURCE_ROUTE_FAILED);
 			ire_refrele(ire);
@@ -12361,7 +12478,7 @@
 	}
 
 	/* Packet is being forwarded. Turning off hwcksum flag. */
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	DB_CKSUMFLAGS(mp) = 0;
 	if (ip_g_send_redirects) {
 		/*
 		 * Check whether the incoming interface and outgoing
@@ -12435,15 +12552,17 @@
 {
 	queue_t		*q;
 	ire_t		*ire;
+	uint16_t	hcksumflags;
 
 	q = *qp;
 	ire = *irep;
 
 	/*
 	 * Clear the indication that this may have hardware
-	 * checksum as we are not using it.
-	 */
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	 * checksum as we are not using it for forwarding.
+	 */
+	hcksumflags = DB_CKSUMFLAGS(mp);
+	DB_CKSUMFLAGS(mp) = 0;
 
 	/*
 	 * Directed broadcast forwarding: if the packet came in over a
@@ -12613,6 +12732,9 @@
 	}
 
 	*irep = ire;
+
+	/* Restore any hardware checksum flags */
+	DB_CKSUMFLAGS(mp) = hcksumflags;
 	return (B_FALSE);
 }
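(Note: where the old code unconditionally discarded the offload state,
the hunk above parks it in a local across the broadcast checks and puts
it back once the packet turns out to be locally destined, so the
receive path can still consume the hardware checksum.  The shape of the
save/restore, as a sketch only:

	uint16_t saved_flags = DB_CKSUMFLAGS(mp);

	DB_CKSUMFLAGS(mp) = 0;	/* forwarding must not trust offload */
	/* ... broadcast and forwarding checks ... */
	DB_CKSUMFLAGS(mp) = saved_flags;	/* local delivery: restore */)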
 
@@ -12632,7 +12754,7 @@
 		 * Clear the indication that this may have hardware
 		 * checksum as we are not using it.
 		 */
-		mp->b_datap->db_struioun.cksum.flags = 0;
+		DB_CKSUMFLAGS(mp) = 0;
 		retval = ip_mforward(ill, ipha, mp);
 		/* ip_mforward updates mib variables if needed */
 		/* clear b_prev - used by ip_mroute_decap */
@@ -12951,7 +13073,7 @@
 			/*
 			 * Also SIOC[GS]TUN* ioctls can come here.
 			 */
-			ip_ioctl_freemsg(mp);
+			inet_freemsg(mp);
 			TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
 			    "ip_input_end: q %p (%S)", q, "uninit");
 			return;
@@ -13300,9 +13422,20 @@
 			continue;
 		}
 
-		/* broadcast? */
+		/*
+		 * A broadcast IRE may indicate either a broadcast or
+		 * a multicast packet.
+		 */
 		if (ire->ire_type == IRE_BROADCAST) {
-			if (ip_rput_process_broadcast(&q, mp, &ire, ipha, ill,
+			/*
+			 * Skip broadcast checks if packet is UDP multicast;
+			 * we'd rather not enter ip_rput_process_broadcast()
+			 * unless the packet is broadcast for real, since
+			 * that routine is a no-op for multicast.
+			 */
+			if ((ipha->ipha_protocol != IPPROTO_UDP ||
+			    !CLASSD(ipha->ipha_dst)) &&
+			    ip_rput_process_broadcast(&q, mp, &ire, ipha, ill,
 			    dst, cgtp_flt_pkt, ll_multicast)) {
 				continue;
 			}
@@ -13533,24 +13666,6 @@
 }
 
 /*
- * This function is used to free a message that has gone through
- * mi_copyin processing which modifies the M_IOCTL mblk's b_next
- * and b_prev pointers. We use this function to set b_next/b_prev
- * to NULL and free them.
- */
-void
-ip_ioctl_freemsg(mblk_t *mp)
-{
-	mblk_t	*bp = mp;
-
-	for (; bp != NULL; bp = bp->b_cont) {
-		bp->b_prev = NULL;
-		bp->b_next = NULL;
-	}
-	freemsg(mp);
-}
-
-/*
  * Handling of DLPI messages that require exclusive access to the ipsq.
  *
  * Need to do ill_pending_mp_release on ioctl completion, which could
@@ -14483,7 +14598,7 @@
 					mp->b_cont->b_prev =
 					    mp1->b_cont->b_prev;
 				}
-				ip_ioctl_freemsg(mp1);
+				inet_freemsg(mp1);
 				ASSERT(ipsq->ipsq_current_ipif != NULL);
 				ASSERT(connp != NULL);
 				ip_ioctl_finish(CONNP_TO_WQ(connp), mp,
@@ -14515,7 +14630,7 @@
 					mp->b_cont->b_prev =
 					    mp1->b_cont->b_prev;
 				}
-				ip_ioctl_freemsg(mp1);
+				inet_freemsg(mp1);
 				if (iocp->ioc_error == 0)
 					mp->b_datap->db_type = M_IOCDATA;
 				ASSERT(connp != NULL);
@@ -14596,7 +14711,7 @@
 					mp->b_cont->b_prev =
 					    mp1->b_cont->b_prev;
 				}
-				ip_ioctl_freemsg(mp1);
+				inet_freemsg(mp1);
 				if (iocp->ioc_error == 0)
 					iocp->ioc_error = EINVAL;
 				ASSERT(connp != NULL);
@@ -15321,7 +15436,7 @@
 		 */
 		ASSERT(!mctl_present);
 		ASSERT(first_mp == mp);
-		if (!ip_rput_fragment(q, &mp, ipha)) {
+		if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) {
 			return;
 		}
 		/*
@@ -15337,7 +15452,7 @@
 	 * Clear hardware checksumming flag as it is currently only
 	 * used by TCP and UDP.
 	 */
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	DB_CKSUMFLAGS(mp) = 0;
 
 	/* Now we have a complete datagram, destined for this machine. */
 	u1 = IPH_HDR_LENGTH(ipha);
@@ -15839,7 +15954,7 @@
 bad_src_route:
 	q = WR(q);
 	/* make sure we clear any indication of a hardware checksum */
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	DB_CKSUMFLAGS(mp) = 0;
 	icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED);
 	return (B_FALSE);
 
@@ -16022,14 +16137,14 @@
 param_prob:
 	q = WR(q);
 	/* make sure we clear any indication of a hardware checksum */
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	DB_CKSUMFLAGS(mp) = 0;
 	icmp_param_problem(q, mp, (uint8_t)code);
 	return (-1);
 
 bad_src_route:
 	q = WR(q);
 	/* make sure we clear any indication of a hardware checksum */
-	mp->b_datap->db_struioun.cksum.flags = 0;
+	DB_CKSUMFLAGS(mp) = 0;
 	icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED);
 	return (-1);
 }
@@ -17571,7 +17686,7 @@
  * upper level protocol.  We remove this conn from any fanout hash list it is
  * on, and zero out the bind information.  No reply is expected up above.
  */
-static void
+mblk_t *
 ip_unbind(queue_t *q, mblk_t *mp)
 {
 	conn_t	*connp = Q_TO_CONN(q);
@@ -17591,7 +17706,7 @@
 	 * original message.
 	 */
 	if (mp == NULL)
-		return;
+		return (NULL);
 
 	/*
 	 * Don't bzero the ports if it's TCP since TCP still needs the
@@ -17601,7 +17716,7 @@
 	if (!IPCL_IS_TCP(connp))
 		bzero(&connp->u_port, sizeof (connp->u_port));
 
-	qreply(q, mp);
+	return (mp);
 }
 
 /*
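(Note: with ip_unbind() now returning the acknowledgment mblk instead of
calling qreply() itself, each caller decides how the ack travels
upstream.  A hypothetical caller on the IP device path, shown only to
illustrate the new contract:

	mblk_t *ackmp = ip_unbind(q, mp);

	if (ackmp != NULL)
		qreply(q, ackmp);	/* UDP may reply on its own queue */)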
@@ -17657,7 +17772,9 @@
 	/* is queue flow controlled? */
 	if ((q->q_first != NULL || connp->conn_draining) &&
 	    (caller == IP_WPUT)) {
-		goto doputq;
+		ASSERT(!need_decref);
+		(void) putq(q, mp);
+		return;
 	}
 
 	/* Multidata transmit? */
@@ -17992,11 +18109,6 @@
 		CONN_DEC_REF(connp);
 	return;
 
-doputq:
-	ASSERT(!need_decref);
-	(void) putq(q, mp);
-	return;
-
 qnext:
 	/*
 	 * Upper Level Protocols pass down complete IP datagrams
@@ -18933,7 +19045,7 @@
  * the ipif refcnt has gone to zero and holding the ill_g_lock and ill_lock
  * the above holds.
  */
-static ipif_t *
+ipif_t *
 conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err)
 {
 	ipif_t	*ipif;
@@ -19414,7 +19526,6 @@
 	boolean_t	multirt_send = B_FALSE;
 	int		err;
 	zoneid_t	zoneid;
-	boolean_t	iphdrhwcksum = B_FALSE;
 
 	TRACE_1(TR_FAC_IP, TR_IP_WPUT_IRE_START,
 	    "ip_wput_ire_start: q %p", q);
@@ -19749,102 +19860,6 @@
 	/* pseudo checksum (do it in parts for IP header checksum) */
 	cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
 
-#define	FRAGMENT_NEEDED(mtu, size)	\
-	(((mtu) < (unsigned int)(size)) ? B_TRUE : B_FALSE)
-
-#define	IS_FASTPATH(ire, bp) 					\
-	((ire)->ire_fp_mp != NULL &&				\
-	(MBLKHEAD((bp)) >= (MBLKL((ire)->ire_fp_mp))))		\
-
-#define	IPH_UDPH_CHECKSUMP(ipha, hlen) \
-	((uint16_t *)(((uchar_t *)ipha)+(hlen + UDP_CHECKSUM_OFFSET)))
-#define	IPH_TCPH_CHECKSUMP(ipha, hlen) \
-	    ((uint16_t *)(((uchar_t *)ipha)+(hlen+TCP_CHECKSUM_OFFSET)))
-
-#define	IP_CKSUM_XMIT(ill, ire, mp, up, proto, hlen, max_frag,		\
-	    ipsec_len) { 						\
-	uint32_t	sum;						\
-	uint32_t	xmit_capab = HCKSUM_INET_FULL_V4 |		\
-			    HCKSUM_INET_PARTIAL | HCKSUM_IPHDRCKSUM;	\
-	boolean_t	cksum_offload = B_FALSE;			\
-									\
-	/*								\
-	 * The ire fp mp can change due to the arrival of a		\
-	 * DL_NOTE_FASTPATH_FLUSH in the case of IRE_BROADCAST		\
-	 * and IRE_MIPRTUN. Hence the ire_fp_mp has to be accessed	\
-	 * only under the ire_lock in such cases.			\
-	 */								\
-	LOCK_IRE_FP_MP(ire);						\
-	if ((ill) && (ill->ill_capabilities & ILL_CAPAB_HCKSUM) &&	\
-	    (ill->ill_hcksum_capab->ill_hcksum_txflags &		\
-	    xmit_capab) && (!FRAGMENT_NEEDED(max_frag, 			\
-	    (LENGTH + ipsec_len))) && (!(ire->ire_flags & 		\
-	    RTF_MULTIRT)) && (ipsec_len == 0) && 			\
-	    IS_FASTPATH((ire), (mp)) &&	(dohwcksum)) { 			\
-		/*							\
-		 * Underlying interface supports hardware checksumming.	\
-		 * So postpone the checksum to the interface driver	\
-		 */							\
-									\
-		if ((hlen) == IP_SIMPLE_HDR_LENGTH) {			       \
-			if (ill->ill_hcksum_capab->ill_hcksum_txflags &        \
-			    HCKSUM_IPHDRCKSUM) {			       \
-				mp->b_datap->db_struioun.cksum.flags |=	       \
-				    HCK_IPV4_HDRCKSUM;			       \
-				/* seed the cksum field to 0 */		       \
-				ipha->ipha_hdr_checksum = 0;		       \
-				iphdrhwcksum = B_TRUE;			       \
-			}						       \
-			/*						       \
-			 * If underlying h/w supports full h/w checksumming    \
-			 * and no IP options are present, then offload	       \
-			 * full checksumming to the hardware.		       \
-			 *						       \
-			 * If h/w can do partial checksumming then offload     \
-			 * unless the startpoint offset, including mac-header, \
-			 * is too big for the interface to some of our	       \
-			 * hardware (CE and ERI) which have 6 bit fields.      \
-			 * Sigh.					       \
-			 * Unhappily we don't have the mac-header size here    \
-			 * so punt for any options.			       \
-			 */						       \
-			if (ill->ill_hcksum_capab->ill_hcksum_txflags &        \
-			    HCKSUM_INET_FULL_V4) {			       \
-				UNLOCK_IRE_FP_MP(ire);			       \
-				/* Seed the checksum field to 0 */	       \
-				*up = 0;				       \
-				mp->b_datap->db_struioun.cksum.flags |=	       \
-				    HCK_FULLCKSUM;			       \
-				cksum_offload = B_TRUE;			       \
-			} else if (ill->ill_hcksum_capab->ill_hcksum_txflags & \
-			    HCKSUM_INET_PARTIAL) {			       \
-				UNLOCK_IRE_FP_MP(ire);			       \
-				sum = *up + cksum + proto;		       \
-				sum = (sum & 0xFFFF) + (sum >> 16);	       \
-				*up = (sum & 0xFFFF) + (sum >> 16);	       \
-				/*					       \
-				 * All offsets are relative to the beginning   \
-				 * of the IP header.			       \
-				 */					       \
-				mp->b_datap->db_cksumstart = hlen;	       \
-				mp->b_datap->db_cksumstuff = 		       \
-				    (PROTO == IPPROTO_UDP) ?		       \
-				    (hlen) + UDP_CHECKSUM_OFFSET :	       \
-				    (hlen) + TCP_CHECKSUM_OFFSET;	       \
-				mp->b_datap->db_cksumend = ipha->ipha_length;  \
-				mp->b_datap->db_struioun.cksum.flags |=	       \
-				    HCK_PARTIALCKSUM;			       \
-				cksum_offload = B_TRUE;			       \
-			}						       \
-		}							\
-	} 								\
-	if (!cksum_offload) {						\
-		UNLOCK_IRE_FP_MP(ire);					\
-		IP_STAT(ip_out_sw_cksum);				\
-		(sum) = IP_CSUM((mp), (hlen), cksum + proto);		\
-		*(up) = (uint16_t)((sum) ? (sum) : ~(sum));		\
-	}								\
-}
 	if (!IP_FLOW_CONTROLLED_ULP(PROTO)) {
 		queue_t *dev_q = stq->q_next;
 
@@ -19856,10 +19871,16 @@
 		    (ip_hdr_included != IP_HDR_INCLUDED)) {
 			hlen = (V_HLEN & 0xF) << 2;
 			up = IPH_UDPH_CHECKSUMP(ipha, hlen);
-			if (*up) {
-				IP_CKSUM_XMIT(ill, ire, mp, up,
-				    IP_UDP_CSUM_COMP, hlen, max_frag,
-				    ipsec_len);
+			if (*up != 0) {
+				IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO,
+				    hlen, LENGTH, max_frag, ipsec_len, cksum);
+				/* Software checksum? */
+				if (DB_CKSUMFLAGS(mp) == 0) {
+					IP_STAT(ip_out_sw_cksum);
+					IP_STAT_UPDATE(
+					    ip_udp_out_sw_cksum_bytes,
+					    LENGTH - hlen);
+				}
 			}
 		}
 	} else if (ip_hdr_included != IP_HDR_INCLUDED) {
@@ -19873,8 +19894,14 @@
 			 * replicated via several interfaces, and not all of
 			 * them may have this capability.
 			 */
-			IP_CKSUM_XMIT(ill, ire, mp, up,
-			    IP_TCP_CSUM_COMP, hlen, max_frag, ipsec_len);
+			IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, hlen,
+			    LENGTH, max_frag, ipsec_len, cksum);
+			/* Software checksum? */
+			if (DB_CKSUMFLAGS(mp) == 0) {
+				IP_STAT(ip_out_sw_cksum);
+				IP_STAT_UPDATE(ip_tcp_out_sw_cksum_bytes,
+				    LENGTH - hlen);
+			}
 		} else {
 			sctp_hdr_t	*sctph;
 
@@ -19904,7 +19931,7 @@
 	cksum += ttl_protocol;
 
 	/* fragment the packet */
-	if (FRAGMENT_NEEDED(max_frag, (LENGTH + ipsec_len)))
+	if (max_frag < (uint_t)(LENGTH + ipsec_len))
 		goto fragmentit;
 	/*
 	 * Don't use frag_flag if packet is pre-built or source
@@ -19918,8 +19945,8 @@
 		ipha->ipha_fragment_offset_and_flags |=
 		    htons(ire->ire_frag_flag);
 
-	if (!iphdrhwcksum) {
-		/* checksum */
+	if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
+		/* calculate IP header checksum */
 		cksum += ipha->ipha_ident;
 		cksum += (v_hlen_tos_len >> 16)+(v_hlen_tos_len & 0xFFFF);
 		cksum += ipha->ipha_fragment_offset_and_flags;
@@ -20258,7 +20285,11 @@
 			hlen = (V_HLEN & 0xF) << 2;
 			up = IPH_TCPH_CHECKSUMP(ipha, hlen);
 			IP_STAT(ip_out_sw_cksum);
+			IP_STAT_UPDATE(ip_tcp_out_sw_cksum_bytes,
+			    LENGTH - hlen);
 			*up = IP_CSUM(mp, hlen, cksum + IP_TCP_CSUM_COMP);
+			if (*up == 0)
+				*up = 0xFFFF;
 		} else if (PROTO == IPPROTO_SCTP &&
 		    (ip_hdr_included != IP_HDR_INCLUDED)) {
 			sctp_hdr_t	*sctph;
@@ -20338,17 +20369,18 @@
 				 */
 				hlen = (V_HLEN & 0xF) << 2;
 				up = IPH_UDPH_CHECKSUMP(ipha, hlen);
-				if (*up) {
-					uint_t	sum;
-
-					/*
-					 * NOTE: watch out for compiler high
-					 * bits
-					 */
-					IP_STAT(ip_out_sw_cksum);
-					sum = IP_CSUM(mp, hlen,
-					    cksum + IP_UDP_CSUM_COMP);
-					*up = (uint16_t)(sum ? sum : ~sum);
+				max_frag = ire->ire_max_frag;
+				if (*up != 0) {
+					IP_CKSUM_XMIT(ire_ill, ire, mp, ipha,
+					    up, PROTO, hlen, LENGTH, max_frag,
+					    ipsec_len, cksum);
+					/* Software checksum? */
+					if (DB_CKSUMFLAGS(mp) == 0) {
+						IP_STAT(ip_out_sw_cksum);
+						IP_STAT_UPDATE(
+						    ip_udp_out_sw_cksum_bytes,
+						    LENGTH - hlen);
+					}
 				}
 			}
 		}
@@ -20369,9 +20401,7 @@
 			    conn_multicast_loop));
 
 			/*  Forget header checksum offload */
-			mp->b_datap->db_struioun.cksum.flags &=
-			    ~HCK_IPV4_HDRCKSUM;
-			iphdrhwcksum = B_FALSE;
+			DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
 
 			/*
 			 * Local loopback of multicasts?  Check the
@@ -20459,10 +20489,8 @@
 		}
 		max_frag = ire->ire_max_frag;
 		cksum += ttl_protocol;
-		if (!FRAGMENT_NEEDED(max_frag, (LENGTH + ipsec_len))) {
+		if (max_frag >= (uint_t)(LENGTH + ipsec_len)) {
 			/* No fragmentation required for this one. */
-			/* Complete the IP header checksum. */
-			cksum += ipha->ipha_ident;
 			/*
 			 * Don't use frag_flag if packet is pre-built or source
 			 * routed or if multicast (since multicast packets do
@@ -20475,26 +20503,32 @@
 				ipha->ipha_fragment_offset_and_flags |=
 				    htons(ire->ire_frag_flag);
 
-			cksum += (v_hlen_tos_len >> 16)+
-			    (v_hlen_tos_len & 0xFFFF);
-			cksum += ipha->ipha_fragment_offset_and_flags;
-			hlen = (V_HLEN & 0xF) - IP_SIMPLE_HDR_LENGTH_IN_WORDS;
-			if (hlen) {
-			    checksumoptions:
-				/*
-				 * Account for the IP Options in the IP
-				 * header checksum.
-				 */
-				up = (uint16_t *)(rptr+IP_SIMPLE_HDR_LENGTH);
-				do {
-					cksum += up[0];
-					cksum += up[1];
-					up += 2;
-				} while (--hlen);
-			}
-			cksum = ((cksum & 0xFFFF) + (cksum >> 16));
-			cksum = ~(cksum + (cksum >> 16));
-			ipha->ipha_hdr_checksum = (uint16_t)cksum;
+			if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
+				/* Complete the IP header checksum. */
+				cksum += ipha->ipha_ident;
+				cksum += (v_hlen_tos_len >> 16)+
+				    (v_hlen_tos_len & 0xFFFF);
+				cksum += ipha->ipha_fragment_offset_and_flags;
+				hlen = (V_HLEN & 0xF) -
+				    IP_SIMPLE_HDR_LENGTH_IN_WORDS;
+				if (hlen) {
+				    checksumoptions:
+					/*
+					 * Account for the IP Options in the IP
+					 * header checksum.
+					 */
+					up = (uint16_t *)(rptr+
+					    IP_SIMPLE_HDR_LENGTH);
+					do {
+						cksum += up[0];
+						cksum += up[1];
+						up += 2;
+					} while (--hlen);
+				}
+				cksum = ((cksum & 0xFFFF) + (cksum >> 16));
+				cksum = ~(cksum + (cksum >> 16));
+				ipha->ipha_hdr_checksum = (uint16_t)cksum;
+			}
 			if (ipsec_len != 0) {
 				ipsec_out_process(q, first_mp, ire, ill_index);
 				if (!next_mp) {
@@ -20991,6 +21025,298 @@
 }
 
 /*
+ * Check if ip_wput_frag_mdt() and ip_wput_frag_mdt_v6() can handle a message
+ * block chain. We could rewrite to handle arbitrary message block chains but
+ * that would make the code complicated and slow. Right now there are three
+ * restrictions:
+ *
+ *   1. The first message block must contain the complete IP header and
+ *	at least 1 byte of payload data.
+ *   2. At most MULTIDATA_MAX_PBUFS non-empty message blocks are allowed
+ *	so that we can use a single Multidata message.
+ *   3. No frag may span more than two message blocks, so that we
+ *	never need more than two packet descriptors per frag.
+ *
+ * The above restrictions allow us to support userland applications (which
+ * will send down a single message block) and NFS over UDP (which will
+ * send down a chain of at most three message blocks).
+ *
+ * We also don't use MDT for payloads of ip_wput_frag_mdt_min bytes or
+ * fewer, because the setup overhead would outweigh the benefit.
+ */
+boolean_t
+ip_can_frag_mdt(mblk_t *mp, ssize_t hdr_len, ssize_t len)
+{
+	int	blocks;
+	ssize_t	total, missing, size;
+
+	ASSERT(mp != NULL);
+	ASSERT(hdr_len > 0);
+
+	size = MBLKL(mp) - hdr_len;
+	if (size <= 0)
+		return (B_FALSE);
+
+	/* The first mblk contains the header and some payload. */
+	blocks = 1;
+	total = size;
+	size %= len;
+	missing = (size == 0) ? 0 : (len - size);
+	mp = mp->b_cont;
+
+	while (mp != NULL) {
+		/*
+		 * Give up if we encounter a zero length message block.
+		 * In practice, this should rarely happen and therefore
+		 * not worth the trouble of freeing and re-linking the
+		 * mblk from the chain to handle such case.
+		 */
+		if ((size = MBLKL(mp)) == 0)
+			return (B_FALSE);
+
+		/* Too many payload buffers for a single Multidata message? */
+		if (++blocks > MULTIDATA_MAX_PBUFS)
+			return (B_FALSE);
+
+		total += size;
+		/* Would a frag span more than two message blocks? */
+		if (missing > size)
+			return (B_FALSE);
+		size -= missing;
+
+		size %= len;
+		missing = (size == 0) ? 0 : (len - size);
+
+		mp = mp->b_cont;
+	}
+
+	return (total > ip_wput_frag_mdt_min);
+}
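
The boundary bookkeeping above can be exercised outside the kernel. Below is a
minimal user-level sketch of the same walk (the can_frag helper and the block
sizes are hypothetical illustrations, not part of this changeset; the
MULTIDATA_MAX_PBUFS and ip_wput_frag_mdt_min checks are omitted):

	#include <stdio.h>
	#include <stddef.h>

	/*
	 * Walk a chain of payload sizes and reject it if any fragment of
	 * "len" bytes would span more than two blocks, mirroring the
	 * restriction checked by ip_can_frag_mdt().
	 */
	static int
	can_frag(const size_t *blks, int nblks, size_t len)
	{
		size_t size = blks[0] % len;	/* leftover in first block */
		size_t missing = (size == 0) ? 0 : len - size;
		int i;

		for (i = 1; i < nblks; i++) {
			if (blks[i] == 0)	/* zero-length mblk: give up */
				return (0);
			if (missing > blks[i])	/* frag spans 3+ blocks */
				return (0);
			size = (blks[i] - missing) % len;
			missing = (size == 0) ? 0 : len - size;
		}
		return (1);
	}

	int
	main(void)
	{
		/* NFS-over-UDP-like chain: 20 + 2960 + 1480 payload bytes */
		size_t chain[] = { 20, 2960, 1480 };

		printf("qualifies: %d\n", can_frag(chain, 3, 1480));
		return (0);
	}
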
+
+/*
+ * Outbound IPv4 fragmentation routine using MDT.
+ */
+static void
+ip_wput_frag_mdt(ire_t *ire, mblk_t *mp, ip_pkt_t pkt_type, int len,
+    uint32_t frag_flag, int offset)
+{
+	ipha_t		*ipha_orig;
+	int		i1, ip_data_end;
+	uint_t		pkts, wroff, hdr_chunk_len, pbuf_idx;
+	mblk_t		*hdr_mp, *md_mp = NULL;
+	unsigned char	*hdr_ptr, *pld_ptr;
+	multidata_t	*mmd;
+	ip_pdescinfo_t	pdi;
+
+	ASSERT(DB_TYPE(mp) == M_DATA);
+	ASSERT(MBLKL(mp) > sizeof (ipha_t));
+
+	ipha_orig = (ipha_t *)mp->b_rptr;
+	mp->b_rptr += sizeof (ipha_t);
+
+	/* Calculate how many packets we will send out */
+	i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp);
+	pkts = (i1 + len - 1) / len;
+	ASSERT(pkts > 1);
+
+	/* Allocate a message block which will hold all the IP Headers. */
+	wroff = ip_wroff_extra;
+	hdr_chunk_len = wroff + IP_SIMPLE_HDR_LENGTH;
+
+	i1 = pkts * hdr_chunk_len;
+	/*
+	 * Create the header buffer, the Multidata, and the destination
+	 * address and SAP attribute that should be associated with it.
+	 */
+	if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL ||
+	    ((hdr_mp->b_wptr += i1),
+	    (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) ||
+	    !ip_md_addr_attr(mmd, NULL, ire->ire_dlureq_mp)) {
+		freemsg(mp);
+		if (md_mp == NULL) {
+			freemsg(hdr_mp);
+		} else {
+free_mmd:		IP_STAT(ip_frag_mdt_discarded);
+			freemsg(md_mp);
+		}
+		IP_STAT(ip_frag_mdt_allocfail);
+		UPDATE_MIB(&ip_mib, ipOutDiscards, pkts);
+		return;
+	}
+	IP_STAT(ip_frag_mdt_allocd);
+
+	/*
+	 * Add a payload buffer to the Multidata; this operation must not
+	 * fail, or otherwise our logic in this routine is broken.  There
+	 * is no memory allocation done by the routine, so any returned
+	 * failure simply tells us that we've done something wrong.
+	 *
+	 * A failure tells us that either we're adding the same payload
+	 * buffer more than once, or we're trying to add more buffers than
+	 * allowed.  Neither case should happen, and we panic because
+	 * there's either horrible heap corruption or a programming
+	 * mistake.
+	 */
+	if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+		goto pbuf_panic;
+
+	hdr_ptr = hdr_mp->b_rptr;
+	pld_ptr = mp->b_rptr;
+
+	/* Establish the ending byte offset, based on the starting offset. */
+	offset <<= 3;
+	ip_data_end = offset + ntohs(ipha_orig->ipha_length) -
+	    IP_SIMPLE_HDR_LENGTH;
+
+	pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF;
+
+	while (pld_ptr < mp->b_wptr) {
+		ipha_t		*ipha;
+		uint16_t	offset_and_flags;
+		uint16_t	ip_len;
+		int		error;
+
+		ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr);
+		ipha = (ipha_t *)(hdr_ptr + wroff);
+		ASSERT(OK_32PTR(ipha));
+		*ipha = *ipha_orig;
+
+		if (ip_data_end - offset > len) {
+			offset_and_flags = IPH_MF;
+		} else {
+			/*
+			 * Last frag. Set len to the length of this last piece.
+			 */
+			len = ip_data_end - offset;
+			/* A frag of a frag might have IPH_MF non-zero */
+			offset_and_flags =
+			    ntohs(ipha->ipha_fragment_offset_and_flags) &
+			    IPH_MF;
+		}
+		offset_and_flags |= (uint16_t)(offset >> 3);
+		offset_and_flags |= (uint16_t)frag_flag;
+		/* Store the offset and flags in the IP header. */
+		ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags);
+
+		/* Store the length in the IP header. */
+		ip_len = (uint16_t)(len + IP_SIMPLE_HDR_LENGTH);
+		ipha->ipha_length = htons(ip_len);
+
+		/*
+		 * Set the IP header checksum.  Note that mp is just
+		 * the header, so this is easy to pass to ip_csum.
+		 */
+		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
+
+		/*
+		 * Record offset and size of header and data of the next packet
+		 * in the multidata message.
+		 */
+		PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, IP_SIMPLE_HDR_LENGTH, 0);
+		PDESC_PLD_INIT(&pdi);
+		i1 = MIN(mp->b_wptr - pld_ptr, len);
+		ASSERT(i1 > 0);
+		PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1);
+		if (i1 == len) {
+			pld_ptr += len;
+		} else {
+			i1 = len - i1;
+			mp = mp->b_cont;
+			ASSERT(mp != NULL);
+			ASSERT(MBLKL(mp) >= i1);
+			/*
+			 * Attach the next payload message block to the
+			 * multidata message.
+			 */
+			if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+				goto pbuf_panic;
+			PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1);
+			pld_ptr = mp->b_rptr + i1;
+		}
+
+		if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error,
+		    KM_NOSLEEP)) == NULL) {
+			/*
+			 * Any failure other than ENOMEM indicates that we
+			 * have passed in invalid pdesc info or parameters
+			 * to mmd_addpdesc, which must not happen.
+			 *
+			 * EINVAL is a result of failure on boundary checks
+			 * against the pdesc info contents.  It should not
+			 * happen, and we panic because there's either
+			 * horrible heap corruption or a programming
+			 * mistake.
+			 */
+			if (error != ENOMEM) {
+				cmn_err(CE_PANIC, "ip_wput_frag_mdt: "
+				    "pdesc logic error detected for "
+				    "mmd %p pinfo %p (%d)\n",
+				    (void *)mmd, (void *)&pdi, error);
+				/* NOTREACHED */
+			}
+			IP_STAT(ip_frag_mdt_addpdescfail);
+			/* Free unattached payload message blocks as well */
+			md_mp->b_cont = mp->b_cont;
+			goto free_mmd;
+		}
+
+		/* Advance fragment offset. */
+		offset += len;
+
+		/* Advance to location for next header in the buffer. */
+		hdr_ptr += hdr_chunk_len;
+
+		/* Did we reach the next payload message block? */
+		if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) {
+			mp = mp->b_cont;
+			/*
+			 * Attach the next message block with payload
+			 * data to the multidata message.
+			 */
+			if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+				goto pbuf_panic;
+			pld_ptr = mp->b_rptr;
+		}
+	}
+
+	ASSERT(hdr_mp->b_wptr == hdr_ptr);
+	ASSERT(mp->b_wptr == pld_ptr);
+
+	/* Update IP statistics */
+	UPDATE_MIB(&ip_mib, ipFragCreates, pkts);
+	BUMP_MIB(&ip_mib, ipFragOKs);
+	IP_STAT_UPDATE(ip_frag_mdt_pkt_out, pkts);
+
+	if (pkt_type == OB_PKT) {
+		ire->ire_ob_pkt_count += pkts;
+		if (ire->ire_ipif != NULL)
+			atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts);
+	} else {
+		/*
+		 * The type is IB_PKT in the forwarding path and in
+		 * the mobile IP case when the packet is being reverse-
+		 * tunneled to the home agent.
+		 */
+		ire->ire_ib_pkt_count += pkts;
+		ASSERT(!IRE_IS_LOCAL(ire));
+		if (ire->ire_type & IRE_BROADCAST)
+			atomic_add_32(&ire->ire_ipif->ipif_ib_pkt_count, pkts);
+		else
+			atomic_add_32(&ire->ire_ipif->ipif_fo_pkt_count, pkts);
+	}
+	ire->ire_last_used_time = lbolt;
+	/* Send it down */
+	putnext(ire->ire_stq, md_mp);
+	return;
+
+pbuf_panic:
+	cmn_err(CE_PANIC, "ip_wput_frag_mdt: payload buffer logic "
+	    "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp,
+	    pbuf_idx);
+	/* NOTREACHED */
+}
+
+/*
  * Outbound IP fragmentation routine.
  *
  * NOTE : This routine does not ire_refrele the ire that is passed in
@@ -21000,29 +21326,30 @@
 ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
     uint32_t frag_flag)
 {
-	int	i1;
-	mblk_t	*ll_hdr_mp;
-	int 	ll_hdr_len;
-	int	hdr_len;
-	mblk_t	*hdr_mp;
-	ipha_t	*ipha;
-	int	ip_data_end;
-	int	len;
-	mblk_t	*mp = mp_orig;
-	int	offset;
-	queue_t	*q;
+	int		i1;
+	mblk_t		*ll_hdr_mp;
+	int 		ll_hdr_len;
+	int		hdr_len;
+	mblk_t		*hdr_mp;
+	ipha_t		*ipha;
+	int		ip_data_end;
+	int		len;
+	mblk_t		*mp = mp_orig;
+	int		offset;
+	queue_t		*q;
 	uint32_t	v_hlen_tos_len;
-	mblk_t	*first_mp;
-	boolean_t mctl_present;
-	mblk_t	*xmit_mp;
-	mblk_t	*carve_mp;
-	ire_t   *ire1 = NULL;
-	ire_t   *save_ire = NULL;
-	mblk_t  *next_mp = NULL;
-	boolean_t last_frag = B_FALSE;
-	boolean_t multirt_send = B_FALSE;
-	ire_t *first_ire = NULL;
-	irb_t *irb = NULL;
+	mblk_t		*first_mp;
+	boolean_t	mctl_present;
+	ill_t		*ill;
+	mblk_t		*xmit_mp;
+	mblk_t		*carve_mp;
+	ire_t		*ire1 = NULL;
+	ire_t		*save_ire = NULL;
+	mblk_t  	*next_mp = NULL;
+	boolean_t	last_frag = B_FALSE;
+	boolean_t	multirt_send = B_FALSE;
+	ire_t		*first_ire = NULL;
+	irb_t		*irb = NULL;
 
 	TRACE_0(TR_FAC_IP, TR_IP_WPUT_FRAG_START,
 	    "ip_wput_frag_start:");
@@ -21036,6 +21363,7 @@
 		mctl_present = B_FALSE;
 	}
 
+	ASSERT(MBLKL(mp) >= sizeof (ipha_t));
 	ipha = (ipha_t *)mp->b_rptr;
 
 	/*
@@ -21079,8 +21407,37 @@
 	}
 
 	hdr_len = (V_HLEN & 0xF) << 2;
+
 	ipha->ipha_hdr_checksum = 0;
 
+	/*
+	 * Establish the number of bytes maximum per frag, after putting
+	 * in the header.
+	 */
+	len = (max_frag - hdr_len) & ~7;
+
+	/* Check if we can use MDT to send out the frags. */
+	ASSERT(!IRE_IS_LOCAL(ire));
+	if (hdr_len == IP_SIMPLE_HDR_LENGTH && ip_multidata_outbound &&
+	    !(ire->ire_flags & RTF_MULTIRT) && !IPP_ENABLED(IPP_LOCAL_OUT) &&
+	    (ill = ire_to_ill(ire)) != NULL && ILL_MDT_CAPABLE(ill) &&
+	    IP_CAN_FRAG_MDT(mp, IP_SIMPLE_HDR_LENGTH, len)) {
+		ASSERT(ill->ill_mdt_capab != NULL);
+		if (!ill->ill_mdt_capab->ill_mdt_on) {
+			/*
+			 * If MDT was turned off earlier and we can now
+			 * do MDT again (due to IPQoS policy removal,
+			 * etc.), then enable it for this interface.
+			 */
+			ill->ill_mdt_capab->ill_mdt_on = 1;
+			ip1dbg(("ip_wput_frag: enabled MDT for interface %s\n",
+			    ill->ill_name));
+		}
+		ip_wput_frag_mdt(ire, mp, pkt_type, len, frag_flag,
+		    offset);
+		return;
+	}
+
 	/* Get a copy of the header for the trailing frags */
 	hdr_mp = ip_wput_frag_copyhdr((uchar_t *)ipha, hdr_len, offset);
 	if (!hdr_mp) {
@@ -21100,12 +21457,6 @@
 	offset <<= 3;
 	ip_data_end = offset + ntohs(ipha->ipha_length) - hdr_len;
 
-	/*
-	 * Establish the number of bytes maximum per frag, after putting
-	 * in the header.
-	 */
-	len = (max_frag - hdr_len) & ~7;
-
 	/* Store the length of the first fragment in the IP header. */
 	i1 = len + hdr_len;
 	ASSERT(i1 <= IP_MAXPACKET);
@@ -22565,8 +22916,6 @@
 	zoneid_t zoneid;
 	uint32_t cksum;
 	uint16_t *up;
-	/* Hack until the UDP merge into IP happens. */
-	extern boolean_t udp_compute_checksum(void);
 #ifdef	_BIG_ENDIAN
 #define	LENGTH	(v_hlen_tos_len & 0xFFFF)
 #else
@@ -22741,6 +23090,8 @@
 
 		offset = IP_SIMPLE_HDR_LENGTH + UDP_CHECKSUM_OFFSET;
 		IP_STAT(ip_out_sw_cksum);
+		IP_STAT_UPDATE(ip_udp_out_sw_cksum_bytes,
+		    ntohs(htons(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH));
 #define	iphs	((uint16_t *)ipha)
 		cksum = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
 		    iphs[9] + ntohs(htons(ipha->ipha_length) -
@@ -23790,10 +24141,10 @@
 void
 ip_resume_tcp_bind(void *arg, mblk_t *mp, void *arg2)
 {
-	conn_t *connp = (conn_t *)arg;
+	conn_t *connp = arg;
 	tcp_t	*tcp;
 
-	ASSERT(connp != NULL && connp->conn_tcp != NULL);
+	ASSERT(connp != NULL && IPCL_IS_TCP(connp) && connp->conn_tcp != NULL);
 	tcp = connp->conn_tcp;
 
 	if (connp->conn_tcp->tcp_state == TCPS_CLOSED)
@@ -23801,7 +24152,6 @@
 	else
 		tcp_rput_other(tcp, mp);
 	CONN_OPER_PENDING_DONE(connp);
-
 }
 
 /* Called from ip_wput for all non data messages */
@@ -24031,31 +24381,48 @@
 		case T_BIND_REQ: {
 			/* Request can get queued in bind */
 			ASSERT(connp != NULL);
+			/*
+			 * Both TCP and UDP call ip_bind_{v4,v6}() directly
+			 * instead of going through this path.  We only get
+			 * here in the following cases:
+			 *
+			 * a. Bind retries, where ipsq is non-NULL.
+			 * b. T_BIND_REQ is issued from a non-TCP/UDP
+			 *    transport, e.g. ICMP for a raw socket,
+			 *    in which case ipsq will be NULL.
+			 */
+			ASSERT(ipsq != NULL ||
+			    (!IPCL_IS_TCP(connp) && !IPCL_IS_UDP(connp)));
+
 			/* Don't increment refcnt if this is a re-entry */
 			if (ipsq == NULL)
 				CONN_INC_REF(connp);
-			mp = connp->conn_af_isv6 ?
-			    ip_bind_v6(q, mp, connp, NULL) :
-				ip_bind_v4(q, mp, connp);
-			if (mp != NULL) {
-				tcp_t	*tcp;
-
-				tcp = connp->conn_tcp;
-				if (tcp != NULL) {
-					if (ipsq == NULL) {
-						tcp_rput_other(tcp, mp);
-					} else {
-						CONN_INC_REF(connp);
-						squeue_fill(connp->conn_sqp, mp,
-						    ip_resume_tcp_bind,
-						    connp, SQTAG_TCP_RPUTOTHER);
-						return;
-					}
-				} else {
-					qreply(q, mp);
-				}
-				CONN_OPER_PENDING_DONE(connp);
-			}
+			mp = connp->conn_af_isv6 ? ip_bind_v6(q, mp,
+			    connp, NULL) : ip_bind_v4(q, mp, connp);
+			if (mp == NULL)
+				return;
+			if (IPCL_IS_TCP(connp)) {
+				/*
+				 * In the case of a TCP endpoint we
+				 * come here only for bind retries.
+				 */
+				ASSERT(ipsq != NULL);
+				CONN_INC_REF(connp);
+				squeue_fill(connp->conn_sqp, mp,
+				    ip_resume_tcp_bind, connp,
+				    SQTAG_BIND_RETRY);
+				return;
+			} else if (IPCL_IS_UDP(connp)) {
+				/*
+				 * In the case of a UDP endpoint we
+				 * come here only for bind retries.
+				 */
+				ASSERT(ipsq != NULL);
+				udp_resume_bind(connp, mp);
+				return;
+			}
+			qreply(q, mp);
+			CONN_OPER_PENDING_DONE(connp);
 			return;
 		}
 		case T_SVR4_OPTMGMT_REQ:
@@ -24111,7 +24478,8 @@
 			}
 			return;
 		case T_UNBIND_REQ:
-			ip_unbind(q, mp);
+			mp = ip_unbind(q, mp);
+			qreply(q, mp);
 			return;
 		default:
 			/*
--- a/usr/src/uts/common/inet/ip/ip6.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip/ip6.c	Sat Oct 22 22:50:14 2005 -0700
@@ -58,6 +58,7 @@
 #include <sys/policy.h>
 #include <net/if.h>
 #include <net/if_arp.h>
+#include <net/if_types.h>
 #include <net/route.h>
 #include <net/if_dl.h>
 #include <sys/sockio.h>
@@ -74,9 +75,12 @@
 #include <inet/snmpcom.h>
 
 #include <inet/ip.h>
+#include <inet/ip_impl.h>
 #include <inet/ip6.h>
 #include <inet/ip6_asp.h>
 #include <inet/tcp.h>
+#include <inet/tcp_impl.h>
+#include <inet/udp_impl.h>
 #include <inet/ipp_common.h>
 
 #include <inet/ip_multi.h>
@@ -103,20 +107,51 @@
 /*
  * IP statistics.
  */
-#define	IP6_STAT(x)	(ip6_statistics.x.value.ui64++)
+#define	IP6_STAT(x)		(ip6_statistics.x.value.ui64++)
+#define	IP6_STAT_UPDATE(x, n)	(ip6_statistics.x.value.ui64 += (n))
 
 typedef struct ip6_stat {
 	kstat_named_t	ip6_udp_fast_path;
 	kstat_named_t	ip6_udp_slow_path;
 	kstat_named_t	ip6_udp_fannorm;
 	kstat_named_t	ip6_udp_fanmb;
+	kstat_named_t   ip6_out_sw_cksum;
+	kstat_named_t   ip6_in_sw_cksum;
+	kstat_named_t	ip6_tcp_in_full_hw_cksum_err;
+	kstat_named_t	ip6_tcp_in_part_hw_cksum_err;
+	kstat_named_t	ip6_tcp_in_sw_cksum_err;
+	kstat_named_t	ip6_tcp_out_sw_cksum_bytes;
+	kstat_named_t	ip6_udp_in_full_hw_cksum_err;
+	kstat_named_t	ip6_udp_in_part_hw_cksum_err;
+	kstat_named_t	ip6_udp_in_sw_cksum_err;
+	kstat_named_t	ip6_udp_out_sw_cksum_bytes;
+	kstat_named_t	ip6_frag_mdt_pkt_out;
+	kstat_named_t	ip6_frag_mdt_discarded;
+	kstat_named_t	ip6_frag_mdt_allocfail;
+	kstat_named_t	ip6_frag_mdt_addpdescfail;
+	kstat_named_t	ip6_frag_mdt_allocd;
 } ip6_stat_t;
 
 static ip6_stat_t ip6_statistics = {
-	{ "ip6_udp_fast_path", 	KSTAT_DATA_UINT64 },
-	{ "ip6_udp_slow_path", 	KSTAT_DATA_UINT64 },
-	{ "ip6_udp_fannorm", 	KSTAT_DATA_UINT64 },
-	{ "ip6_udp_fanmb", 	KSTAT_DATA_UINT64 },
+	{ "ip6_udp_fast_path",			KSTAT_DATA_UINT64 },
+	{ "ip6_udp_slow_path",			KSTAT_DATA_UINT64 },
+	{ "ip6_udp_fannorm",			KSTAT_DATA_UINT64 },
+	{ "ip6_udp_fanmb",			KSTAT_DATA_UINT64 },
+	{ "ip6_out_sw_cksum",			KSTAT_DATA_UINT64 },
+	{ "ip6_in_sw_cksum",			KSTAT_DATA_UINT64 },
+	{ "ip6_tcp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
+	{ "ip6_tcp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
+	{ "ip6_tcp_in_sw_cksum_err",		KSTAT_DATA_UINT64 },
+	{ "ip6_tcp_out_sw_cksum_bytes",		KSTAT_DATA_UINT64 },
+	{ "ip6_udp_in_full_hw_cksum_err",	KSTAT_DATA_UINT64 },
+	{ "ip6_udp_in_part_hw_cksum_err",	KSTAT_DATA_UINT64 },
+	{ "ip6_udp_in_sw_cksum_err",		KSTAT_DATA_UINT64 },
+	{ "ip6_udp_out_sw_cksum_bytes",		KSTAT_DATA_UINT64 },
+	{ "ip6_frag_mdt_pkt_out",		KSTAT_DATA_UINT64 },
+	{ "ip6_frag_mdt_discarded",		KSTAT_DATA_UINT64 },
+	{ "ip6_frag_mdt_allocfail",		KSTAT_DATA_UINT64 },
+	{ "ip6_frag_mdt_addpdescfail",		KSTAT_DATA_UINT64 },
+	{ "ip6_frag_mdt_allocd",		KSTAT_DATA_UINT64 },
 };
 
 static kstat_t *ip6_kstat;
@@ -221,7 +256,7 @@
 static int	ip_process_options_v6(queue_t *, mblk_t *, ip6_t *,
     uint8_t *, uint_t, uint8_t);
 static mblk_t	*ip_rput_frag_v6(queue_t *, mblk_t *, ip6_t *,
-    ip6_frag_t *, uint_t, uint_t *);
+    ip6_frag_t *, uint_t, uint_t *, uint32_t *, uint16_t *);
 static boolean_t	ip_source_routed_v6(ip6_t *, mblk_t *);
 static void	ip_wput_ire_v6(queue_t *, mblk_t *, ire_t *, int, int,
     conn_t *, int, int, int);
@@ -2302,7 +2337,8 @@
 			connp->conn_recv = tcp_input;
 	}
 	/* Update qinfo if v4/v6 changed */
-	if ((orig_pkt_isv6 != connp->conn_pkt_isv6) && !IS_TCP_CONN(connp)) {
+	if ((orig_pkt_isv6 != connp->conn_pkt_isv6) &&
+	    !(IPCL_IS_TCP(connp) || IPCL_IS_UDP(connp))) {
 		if (connp->conn_pkt_isv6)
 			ip_setqinfo(RD(q), IPV6_MINOR, B_TRUE);
 		else
@@ -2531,7 +2567,6 @@
     void *dummy_arg)
 {
 	conn_t	*connp = NULL;
-	tcp_t *tcp;
 	t_scalar_t prim;
 
 	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
@@ -2543,24 +2578,24 @@
 	prim = ((union T_primitives *)mp->b_rptr)->type;
 	ASSERT(prim == O_T_BIND_REQ || prim == T_BIND_REQ);
 
-	tcp = connp->conn_tcp;
-	if (tcp != NULL) {
+	if (IPCL_IS_TCP(connp)) {
 		/* Pass sticky_ipp for scope_id and pktinfo */
-		mp = ip_bind_v6(q, mp, connp, &tcp->tcp_sticky_ipp);
+		mp = ip_bind_v6(q, mp, connp, &connp->conn_tcp->tcp_sticky_ipp);
 	} else {
 		/* For UDP and ICMP */
 		mp = ip_bind_v6(q, mp, connp, NULL);
 	}
 	if (mp != NULL) {
-		if (tcp != NULL) {
+		if (IPCL_IS_TCP(connp)) {
 			CONN_INC_REF(connp);
-			squeue_fill(connp->conn_sqp, mp,
-			    ip_resume_tcp_bind, connp, SQTAG_TCP_RPUTOTHER);
-			return;
+			squeue_fill(connp->conn_sqp, mp, ip_resume_tcp_bind,
+			    connp, SQTAG_TCP_RPUTOTHER);
+		} else if (IPCL_IS_UDP(connp)) {
+			udp_resume_bind(connp, mp);
 		} else {
 			qreply(q, mp);
-		}
-		CONN_OPER_PENDING_DONE(connp);
+			CONN_OPER_PENDING_DONE(connp);
+		}
 	}
 }
 
@@ -2719,7 +2754,7 @@
 	if (ip_multidata_outbound && !ipsec_policy_set && dst_ire != NULL &&
 	    !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) &&
 	    (md_ill = ire_to_ill(dst_ire), md_ill != NULL) &&
-	    (md_ill->ill_capabilities & ILL_CAPAB_MDT)) {
+	    ILL_MDT_CAPABLE(md_ill)) {
 		md_dst_ire = dst_ire;
 		IRE_REFHOLD(md_dst_ire);
 	}
@@ -2936,7 +2971,7 @@
 		 */
 		error = ipcl_conn_insert_v6(connp, protocol, v6src, v6dst,
 		    connp->conn_ports,
-		    IS_TCP_CONN(connp) ? connp->conn_tcp->tcp_bound_if : 0);
+		    IPCL_IS_TCP(connp) ? connp->conn_tcp->tcp_bound_if : 0);
 	}
 	if (error == 0) {
 		connp->conn_fully_bound = B_TRUE;
@@ -3411,8 +3446,7 @@
 		ASSERT((dp->db_struioflag & STRUIO_IP) == 0);
 
 		/* Initiate IPPf processing, if needed. */
-		if (IPP_ENABLED(IPP_LOCAL_IN) &&
-			(flags & (IP6_NO_IPPOLICY|IP6_IN_NOCKSUM))) {
+		if (IPP_ENABLED(IPP_LOCAL_IN) && (flags & IP6_NO_IPPOLICY)) {
 			ill_index = ill->ill_phyint->phyint_ifindex;
 			ip_process(IPP_LOCAL_IN, &first_mp, ill_index);
 			if (first_mp == NULL) {
@@ -3447,14 +3481,14 @@
 			}
 
 			mp->b_datap->db_struioflag |= STRUIO_EAGER;
-			mp->b_datap->db_cksumstart = (intptr_t)sqp;
+			DB_CKSUMSTART(mp) = (intptr_t)sqp;
 
 			/*
 			 * db_cksumstuff is unused in the incoming
 			 * path; Thus store the ifindex here. It will
 			 * be cleared in tcp_conn_create_v6().
 			 */
-			mp->b_datap->db_cksumstuff =
+			DB_CKSUMSTUFF(mp) =
 			    (intptr_t)ill->ill_phyint->phyint_ifindex;
 			syn_present = B_TRUE;
 		}
@@ -3587,7 +3621,6 @@
     ill_t *ill, ill_t *inill, uint_t flags, boolean_t mctl_present,
     zoneid_t zoneid)
 {
-	queue_t		*rq;
 	uint32_t	dstport, srcport;
 	in6_addr_t	dst;
 	mblk_t		*first_mp;
@@ -3637,9 +3670,8 @@
 		/* Found a client */
 		CONN_INC_REF(connp);
 		mutex_exit(&connfp->connf_lock);
-		rq = connp->conn_rq;
-
-		if (!canputnext(rq)) {
+
+		if (CONN_UDP_FLOWCTLD(connp)) {
 			freemsg(first_mp);
 			BUMP_MIB(ill->ill_ip6_mib, udpInOverflows);
 			CONN_DEC_REF(connp);
@@ -3691,7 +3723,10 @@
 			}
 		}
 		BUMP_MIB(ill->ill_ip6_mib, ipv6InDelivers);
-		putnext(rq, mp);
+
+		/* Send it upstream */
+		CONN_UDP_RECV(connp, mp);
+
 		IP6_STAT(ip6_udp_fannorm);
 		CONN_DEC_REF(connp);
 		if (mctl_present)
@@ -3746,7 +3781,6 @@
 		mp1 = mctl_present ? first_mp1->b_cont : first_mp1;
 		CONN_INC_REF(connp);
 		mutex_exit(&connfp->connf_lock);
-		rq = connp->conn_rq;
 		/*
 		 * For link-local always add ifindex so that transport
 		 * can set sin6_scope_id. Avoid it for ICMP error
@@ -3762,7 +3796,7 @@
 			BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards);
 			goto next_one;
 		}
-		if (!canputnext(rq)) {
+		if (CONN_UDP_FLOWCTLD(connp)) {
 			BUMP_MIB(ill->ill_ip6_mib, udpInOverflows);
 			freemsg(mp1);
 			goto next_one;
@@ -3778,7 +3812,9 @@
 			if (mctl_present)
 				freeb(first_mp1);
 			BUMP_MIB(ill->ill_ip6_mib, ipv6InDelivers);
-			putnext(rq, mp1);
+
+			/* Send it upstream */
+			CONN_UDP_RECV(connp, mp1);
 		}
 next_one:
 		mutex_enter(&connfp->connf_lock);
@@ -3791,7 +3827,6 @@
 
 	/* Last one.  Send it upstream. */
 	mutex_exit(&connfp->connf_lock);
-	rq = connp->conn_rq;
 
 	/* Initiate IPPF processing */
 	if (IP6_IN_IPP(flags)) {
@@ -3830,7 +3865,7 @@
 			first_mp = mp;
 		}
 	}
-	if (!canputnext(rq)) {
+	if (CONN_UDP_FLOWCTLD(connp)) {
 		BUMP_MIB(ill->ill_ip6_mib, udpInOverflows);
 		freemsg(mp);
 	} else {
@@ -3844,7 +3879,9 @@
 			}
 		}
 		BUMP_MIB(ill->ill_ip6_mib, ipv6InDelivers);
-		putnext(rq, mp);
+
+		/* Send it upstream */
+		CONN_UDP_RECV(connp, mp);
 	}
 	IP6_STAT(ip6_udp_fanmb);
 	CONN_DEC_REF(connp);
@@ -6447,7 +6484,7 @@
 		 */
 		if ((mp->b_datap->db_type != M_PCPROTO) ||
 		    (dl->dl_primitive == DL_UNITDATA_IND)) {
-			ip_ioctl_freemsg(mp);
+			inet_freemsg(mp);
 			return;
 		}
 	}
@@ -6835,14 +6872,16 @@
 	mblk_t		*first_mp1;
 	boolean_t	no_forward;
 	ip6_hbh_t	*hbhhdr;
-	boolean_t	no_cksum = (flags & IP6_IN_NOCKSUM);
 	boolean_t	ll_multicast = (flags & IP6_IN_LLMCAST);
 	conn_t		*connp;
-	int		off;
 	ilm_t		*ilm;
 	uint32_t	ports;
 	uint_t		ipif_id = 0;
 	zoneid_t	zoneid = GLOBAL_ZONEID;
+	uint16_t	hck_flags, reass_hck_flags;
+	uint32_t	reass_sum;
+	boolean_t	cksum_err;
+	mblk_t		*mp1;
 
 	EXTRACT_PKT_MP(mp, first_mp, mctl_present);
 
@@ -6899,11 +6938,14 @@
 		pkt_len -= diff;
 	}
 
-	/*
-	 * XXX When zero-copy support is added, this turning off of
-	 * checksum flag  will need to be done more selectively.
-	 */
-	mp->b_datap->db_struioun.cksum.flags &= ~HCK_PARTIALCKSUM;
+	if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
+		hck_flags = DB_CKSUMFLAGS(mp);
+	else
+		hck_flags = 0;
+
+	/* Clear checksum flags in case we need to forward */
+	DB_CKSUMFLAGS(mp) = 0;
+	reass_sum = reass_hck_flags = 0;
 
 	nexthdr = ip6h->ip6_nxt;
 
@@ -7168,7 +7210,6 @@
 			/* TBD add site-local check at site boundary? */
 		} else if (ipv6_send_redirects) {
 			in6_addr_t	*v6targ;
-			mblk_t		*mp1;
 			in6_addr_t	gw_addr_v6;
 			ire_t		*src_ire_v6 = NULL;
 
@@ -7313,7 +7354,6 @@
 		case IPPROTO_TCP: {
 			uint16_t	*up;
 			uint32_t	sum;
-			dblk_t		*dp;
 			int		offset;
 
 			hdr_len = pkt_len - remlen;
@@ -7336,6 +7376,7 @@
 					freemsg(first_mp);
 					return;
 				}
+				hck_flags = 0;
 				ip6h = (ip6_t *)mp->b_rptr;
 				whereptr = (uint8_t *)ip6h + hdr_len;
 			}
@@ -7368,30 +7409,12 @@
 						freemsg(first_mp);
 						return;
 					}
+					hck_flags = 0;
 					ip6h = (ip6_t *)mp->b_rptr;
 					whereptr = (uint8_t *)ip6h + hdr_len;
 				}
 			}
 
-			/*
-			 * If packet is being looped back locally checksums
-			 * aren't used
-			 */
-			if (no_cksum) {
-				if (mp->b_datap->db_type == M_DATA) {
-					/*
-					 * M_DATA mblk, so init mblk (chain)
-					 * for no struio().
-					 */
-					mblk_t  *mp1 = mp;
-
-					do {
-						mp1->b_datap->db_struioflag = 0;
-					} while ((mp1 = mp1->b_cont) != NULL);
-				}
-				goto tcp_fanout;
-			}
-
 			up = (uint16_t *)&ip6h->ip6_src;
 			/*
 			 * TCP checksum calculation.  First sum up the
@@ -7400,44 +7423,38 @@
 			 *  -	Destination IPv6 address
 			 *  -	TCP payload length
 			 *  -	TCP protocol ID
-			 * XXX need zero-copy support here
 			 */
 			sum = htons(IPPROTO_TCP + remlen) +
 			    up[0] + up[1] + up[2] + up[3] +
 			    up[4] + up[5] + up[6] + up[7] +
 			    up[8] + up[9] + up[10] + up[11] +
 			    up[12] + up[13] + up[14] + up[15];
+
+			/* Fold initial sum */
 			sum = (sum & 0xffff) + (sum >> 16);
-			dp = mp->b_datap;
-			if (dp->db_type != M_DATA || dp->db_ref > 1) {
-				/*
-				 * Not M_DATA mblk or its a dup, so do the
-				 * checksum now.
-				 */
-				sum = IP_CSUM(mp, hdr_len, sum);
-				if (sum) {
-					/* checksum failed */
-					ip1dbg(("ip_rput_data_v6: TCP checksum"
-					    " failed %x off %d\n",
-					    sum, hdr_len));
-					BUMP_MIB(&ip_mib, tcpInErrs);
-					freemsg(first_mp);
-					return;
-				}
-			} else {
-				/*
-				 * M_DATA mblk and not a dup
-				 * compute checksum here
-				 */
-				off = (int)(whereptr - mp->b_rptr);
-
-				if (IP_CSUM(mp, off, sum)) {
-					BUMP_MIB(&ip_mib, tcpInErrs);
-					ipcsumdbg("ip_rput_data_v6 "
-					    "swcksumerr\n", mp);
-					freemsg(first_mp);
-					return;
-				}
+
+			mp1 = mp->b_cont;
+
+			if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
+				IP6_STAT(ip6_in_sw_cksum);
+
+			IP_CKSUM_RECV(hck_flags, sum, (uchar_t *)
+			    ((uchar_t *)mp->b_rptr + DB_CKSUMSTART(mp)),
+			    (int32_t)(whereptr - (uchar_t *)mp->b_rptr),
+			    mp, mp1, cksum_err);
+
+			if (cksum_err) {
+				BUMP_MIB(&ip_mib, tcpInErrs);
+
+				if (hck_flags & HCK_FULLCKSUM)
+					IP6_STAT(ip6_tcp_in_full_hw_cksum_err);
+				else if (hck_flags & HCK_PARTIALCKSUM)
+					IP6_STAT(ip6_tcp_in_part_hw_cksum_err);
+				else
+					IP6_STAT(ip6_tcp_in_sw_cksum_err);
+
+				freemsg(first_mp);
+				return;
 			}
 tcp_fanout:
 			ip_fanout_tcp_v6(q, first_mp, ip6h, ill, inill,
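
The pseudo-header seed computed above relies on folding a 32-bit accumulator
back into 16 bits before handing it to IP_CKSUM_RECV. A standalone sketch of
the fold, with a hypothetical partial sum (not taken from this changeset):

	#include <stdio.h>
	#include <stdint.h>

	/* Fold a 32-bit one's-complement accumulator into 16 bits. */
	static uint16_t
	fold16(uint32_t sum)
	{
		sum = (sum & 0xffff) + (sum >> 16);	/* fold carries */
		sum = (sum & 0xffff) + (sum >> 16);	/* fold any new carry */
		return ((uint16_t)sum);
	}

	int
	main(void)
	{
		/* hypothetical: protocol + length + address-word partial sum */
		uint32_t sum = 0x0006 + 0x05a0 + 0x2fffa;

		printf("seed: 0x%04x\n", fold16(sum));	/* prints 0x05a3 */
		return (0);
	}
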
@@ -7468,18 +7485,16 @@
 			}
 
 			sctph = (sctp_hdr_t *)(mp->b_rptr + hdr_len);
-			if (!no_cksum) {
-				/* checksum */
-				pktsum = sctph->sh_chksum;
-				sctph->sh_chksum = 0;
-				calcsum = sctp_cksum(mp, hdr_len);
-				if (calcsum != pktsum) {
-					BUMP_MIB(&sctp_mib, sctpChecksumError);
-					freemsg(mp);
-					return;
-				}
-				sctph->sh_chksum = pktsum;
-			}
+			/* checksum */
+			pktsum = sctph->sh_chksum;
+			sctph->sh_chksum = 0;
+			calcsum = sctp_cksum(mp, hdr_len);
+			if (calcsum != pktsum) {
+				BUMP_MIB(&sctp_mib, sctpChecksumError);
+				freemsg(mp);
+				return;
+			}
+			sctph->sh_chksum = pktsum;
 			ports = *(uint32_t *)(mp->b_rptr + hdr_len);
 			if ((connp = sctp_find_conn(&ip6h->ip6_src,
 			    &ip6h->ip6_dst, ports, ipif_id, zoneid)) == NULL) {
@@ -7501,8 +7516,6 @@
 
 			hdr_len = pkt_len - remlen;
 
-#define	UDPH_SIZE 8
-
 			if (hada_mp != NULL) {
 				ip0dbg(("udp hada drop\n"));
 				goto hada_drop;
@@ -7519,16 +7532,10 @@
 					freemsg(first_mp);
 					return;
 				}
+				hck_flags = 0;
 				ip6h = (ip6_t *)mp->b_rptr;
 				whereptr = (uint8_t *)ip6h + hdr_len;
 			}
-#undef UDPH_SIZE
-			/*
-			 * If packet is being looped back locally checksums
-			 * aren't used
-			 */
-			if (no_cksum)
-				goto udp_fanout;
 
 			/*
 			 *  Before going through the regular checksum
@@ -7568,15 +7575,37 @@
 			    up[8] + up[9] + up[10] + up[11] +
 			    up[12] + up[13] + up[14] + up[15];
 
+			/* Fold initial sum */
 			sum = (sum & 0xffff) + (sum >> 16);
-			/* Next sum in the UDP packet */
-			sum = IP_CSUM(mp, hdr_len, sum);
-			if (sum) {
-				/* UDP checksum failed */
-				ip1dbg(("ip_rput_data_v6: UDP checksum "
-				    "failed %x\n",
-				    sum));
+
+			if (reass_hck_flags != 0) {
+				hck_flags = reass_hck_flags;
+
+				IP_CKSUM_RECV_REASS(hck_flags,
+				    (int32_t)(whereptr - (uchar_t *)mp->b_rptr),
+				    sum, reass_sum, cksum_err);
+			} else {
+				mp1 = mp->b_cont;
+
+				IP_CKSUM_RECV(hck_flags, sum, (uchar_t *)
+				    ((uchar_t *)mp->b_rptr + DB_CKSUMSTART(mp)),
+				    (int32_t)(whereptr - (uchar_t *)mp->b_rptr),
+				    mp, mp1, cksum_err);
+			}
+
+			if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
+				IP6_STAT(ip6_in_sw_cksum);
+
+			if (cksum_err) {
 				BUMP_MIB(ill->ill_ip6_mib, udpInCksumErrs);
+
+				if (hck_flags & HCK_FULLCKSUM)
+					IP6_STAT(ip6_udp_in_full_hw_cksum_err);
+				else if (hck_flags & HCK_PARTIALCKSUM)
+					IP6_STAT(ip6_udp_in_part_hw_cksum_err);
+				else
+					IP6_STAT(ip6_udp_in_sw_cksum_err);
+
 				freemsg(first_mp);
 				return;
 			}
@@ -7592,13 +7621,6 @@
 				goto hada_drop;
 			}
 
-			/*
-			 * If packet is being looped back locally checksums
-			 * aren't used
-			 */
-			if (no_cksum)
-				goto icmp_fanout;
-
 			up = (uint16_t *)&ip6h->ip6_src;
 			sum = htons(IPPROTO_ICMPV6 + remlen) +
 			    up[0] + up[1] + up[2] + up[3] +
@@ -7607,7 +7629,7 @@
 			    up[12] + up[13] + up[14] + up[15];
 			sum = (sum & 0xffff) + (sum >> 16);
 			sum = IP_CSUM(mp, hdr_len, sum);
-			if (sum) {
+			if (sum != 0) {
 				/* IPv6 ICMP checksum failed */
 				ip1dbg(("ip_rput_data_v6: ICMPv6 checksum "
 				    "failed %x\n",
@@ -7795,6 +7817,7 @@
 					freemsg(mp);
 					return;
 				}
+				hck_flags = 0;
 				ip6h = (ip6_t *)mp->b_rptr;
 				whereptr = (uint8_t *)ip6h + pkt_len - remlen;
 			}
@@ -7820,8 +7843,12 @@
 				}
 			}
 
+			/* Restore the flags */
+			DB_CKSUMFLAGS(mp) = hck_flags;
+
 			mp = ip_rput_frag_v6(q, mp, ip6h, fraghdr,
-			    remlen - used, &prev_nexthdr_offset);
+			    remlen - used, &prev_nexthdr_offset,
+			    &reass_sum, &reass_hck_flags);
 			if (mp == NULL) {
 				/* Reassembly is still pending */
 				return;
@@ -8032,7 +8059,7 @@
 		return;
 	}
 
-	if (!canputnext(connp->conn_upq)) {
+	if (CONN_UDP_FLOWCTLD(connp)) {
 		freemsg(first_mp);
 		BUMP_MIB(ill->ill_ip6_mib, udpInOverflows);
 		CONN_DEC_REF(connp);
@@ -8062,7 +8089,9 @@
 	IP6_STAT(ip6_udp_fast_path);
 	BUMP_MIB(ill->ill_ip6_mib, ipv6InReceives);
 	BUMP_MIB(ill->ill_ip6_mib, ipv6InDelivers);
-	putnext(connp->conn_upq, mp);
+
+	/* Send it upstream */
+	CONN_UDP_RECV(connp, mp);
 
 	CONN_DEC_REF(connp);
 	freemsg(hada_mp);
@@ -8086,7 +8115,8 @@
  */
 static mblk_t *
 ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
-    ip6_frag_t *fraghdr, uint_t remlen, uint_t *prev_nexthdr_offset)
+    ip6_frag_t *fraghdr, uint_t remlen, uint_t *prev_nexthdr_offset,
+    uint32_t *cksum_val, uint16_t *cksum_flags)
 {
 	ill_t		*ill = (ill_t *)q->q_ptr;
 	uint32_t	ident = ntohl(fraghdr->ip6f_ident);
@@ -8107,6 +8137,62 @@
 	mblk_t		*tail_mp;
 	mblk_t		*t_mp;
 	boolean_t	pruned = B_FALSE;
+	uint32_t	sum_val;
+	uint16_t	sum_flags;
+
+
+	if (cksum_val != NULL)
+		*cksum_val = 0;
+	if (cksum_flags != NULL)
+		*cksum_flags = 0;
+
+	/*
+	 * We utilize hardware-computed checksum info only for UDP, since
+	 * IP fragmentation is a normal occurrence for the protocol.  In
+	 * addition, checksum offload support for IP fragments carrying
+	 * UDP payload is commonly implemented across network adapters.
+	 */
+	ASSERT(ill != NULL);
+	if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) &&
+	    (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
+		mblk_t *mp1 = mp->b_cont;
+		int32_t len;
+
+		/* Record checksum information from the packet */
+		sum_val = (uint32_t)DB_CKSUM16(mp);
+		sum_flags = DB_CKSUMFLAGS(mp);
+
+		/* fragmented payload offset from beginning of mblk */
+		offset = (uint16_t)((uchar_t *)&fraghdr[1] - mp->b_rptr);
+
+		if ((sum_flags & HCK_PARTIALCKSUM) &&
+		    (mp1 == NULL || mp1->b_cont == NULL) &&
+		    offset >= (uint16_t)DB_CKSUMSTART(mp) &&
+		    ((len = offset - (uint16_t)DB_CKSUMSTART(mp)) & 1) == 0) {
+			uint32_t adj;
+			/*
+			 * Partial checksum has been calculated by hardware
+			 * and attached to the packet; in addition, any
+			 * prepended extraneous data is even-byte aligned.
+			 * If any such data exists, we adjust the checksum;
+			 * this would also handle any postpended data.
+			 */
+			IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
+			    mp, mp1, len, adj);
+
+			/* One's complement subtract extraneous checksum */
+			if (adj >= sum_val)
+				sum_val = ~(adj - sum_val) & 0xFFFF;
+			else
+				sum_val -= adj;
+		}
+	} else {
+		sum_val = 0;
+		sum_flags = 0;
+	}
+
+	/* Clear hardware checksumming flag */
+	DB_CKSUMFLAGS(mp) = 0;
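
The one's-complement subtraction above (removing the contribution of
extraneous prepended or postpended bytes from the hardware sum) can be
checked in isolation; a user-level sketch with hypothetical values:

	#include <stdio.h>
	#include <stdint.h>

	/*
	 * Subtract "adj" from a 16-bit one's-complement accumulator,
	 * borrowing through the complement when adj exceeds the stored
	 * value, as in the adjustment above.
	 */
	static uint32_t
	cksum_sub(uint32_t sum_val, uint32_t adj)
	{
		if (adj >= sum_val)
			return (~(adj - sum_val) & 0xFFFF);
		return (sum_val - adj);
	}

	int
	main(void)
	{
		printf("0x%04x\n", cksum_sub(0x1234, 0x0034));	/* 0x1200 */
		printf("0x%04x\n", cksum_sub(0x0010, 0x0030));	/* 0xffdf */
		return (0);
	}
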
 
 	/*
 	 * Note: Fragment offset in header is in 8-octet units.
@@ -8159,7 +8245,6 @@
 	 * Drop the fragmented as early as possible, if
 	 * we don't have resource(s) to re-assemble.
 	 */
-
 	if (ip_reass_queue_bytes == 0) {
 		freemsg(mp);
 		return (NULL);
@@ -8183,12 +8268,11 @@
 	 * there is anything on the reassembly queue, the timer will
 	 * be running.
 	 */
-	msg_len = mp->b_datap->db_lim - mp->b_datap->db_base;
+	msg_len = MBLKSIZE(mp);
 	tail_mp = mp;
 	while (tail_mp->b_cont != NULL) {
 		tail_mp = tail_mp->b_cont;
-		msg_len += tail_mp->b_datap->db_lim -
-		    tail_mp->b_datap->db_base;
+		msg_len += MBLKSIZE(tail_mp);
 	}
 	/*
 	 * If the reassembly list for this ILL will get too big
@@ -8287,7 +8371,7 @@
 		ipf->ipf_timestamp = gethrestime_sec();
 		/* Record ipf generation and account for frag header */
 		ipf->ipf_gen = ill->ill_ipf_gen++;
-		ipf->ipf_count = mp1->b_datap->db_lim - mp1->b_datap->db_base;
+		ipf->ipf_count = MBLKSIZE(mp1);
 		ipf->ipf_protocol = nexthdr;
 		ipf->ipf_nf_hdr_len = 0;
 		ipf->ipf_prev_nexthdr_offset = 0;
@@ -8295,6 +8379,16 @@
 		ipf->ipf_ecn = ecn_info;
 		ipf->ipf_num_dups = 0;
 		ipfb->ipfb_frag_pkts++;
+		ipf->ipf_checksum = 0;
+		ipf->ipf_checksum_flags = 0;
+
+		/* Store checksum value in fragment header */
+		if (sum_flags != 0) {
+			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+			sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+			ipf->ipf_checksum = sum_val;
+			ipf->ipf_checksum_flags = sum_flags;
+		}
 
 		/*
 		 * We handle reassembly two ways.  In the easy case,
@@ -8326,6 +8420,10 @@
 			 * on easy reassembly.
 			 */
 			ipf->ipf_end = 0;
+
+			/* Forget checksum offload from now on */
+			ipf->ipf_checksum_flags = 0;
+
 			/*
 			 * ipf_hole_cnt is set by ip_reassemble.
 			 * ipf_count is updated by ip_reassemble.
@@ -8349,6 +8447,23 @@
 	}
 
 	/*
+	 * If this fragment's checksum flags differ from the stored ones
+	 * (it could be coming up from an interface different from the
+	 * previous one, and therefore with different checksum
+	 * capabilities), then forget any stored checksum state.
+	 * Otherwise add the value to the one stored in the fragment header.
+	 */
+	if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
+		sum_val += ipf->ipf_checksum;
+		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+		sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+		ipf->ipf_checksum = sum_val;
+	} else if (ipf->ipf_checksum_flags != 0) {
+		/* Forget checksum offload from now on */
+		ipf->ipf_checksum_flags = 0;
+	}
+
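
Combining per-fragment hardware sums as above is plain one's-complement
accumulation; a sketch with hypothetical fragment sums (not from this
changeset):

	#include <stdio.h>
	#include <stdint.h>

	/* Fold a 32-bit accumulator back into 16 bits (two passes). */
	static uint32_t
	fold(uint32_t sum)
	{
		sum = (sum & 0xFFFF) + (sum >> 16);
		return ((sum & 0xFFFF) + (sum >> 16));
	}

	int
	main(void)
	{
		/* hypothetical per-fragment sums reported by the NIC */
		uint32_t frags[] = { 0xbeef, 0xcafe, 0x1234 };
		uint32_t total = 0;
		int i;

		for (i = 0; i < 3; i++)
			total = fold(total + frags[i]);
		printf("combined: 0x%04x\n", total);	/* prints 0x9c22 */
		return (0);
	}
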
+	/*
 	 * We have a new piece of a datagram which is already being
 	 * reassembled.  Update the ECN info if all IP fragments
 	 * are ECN capable.  If there is one which is not, clear
@@ -8443,6 +8558,13 @@
 	nexthdr = ipf->ipf_protocol;
 	*prev_nexthdr_offset = ipf->ipf_prev_nexthdr_offset;
 	ipfp = ipf->ipf_ptphn;
+
+	/* We need to supply these to caller */
+	if ((sum_flags = ipf->ipf_checksum_flags) != 0)
+		sum_val = ipf->ipf_checksum;
+	else
+		sum_val = 0;
+
 	mp1 = ipf->ipf_mp;
 	count = ipf->ipf_count;
 	ipf = ipf->ipf_hash_next;
@@ -8508,6 +8630,12 @@
 	ip6h->ip6_vcf &= htonl(0xFFCFFFFF);
 	ip6h->ip6_vcf |= htonl(ecn_info << 20);
 
+	/* Reassembly is successful; return checksum information if needed */
+	if (cksum_val != NULL)
+		*cksum_val = sum_val;
+	if (cksum_flags != NULL)
+		*cksum_flags = sum_flags;
+
 	return (mp);
 }
 
@@ -9954,7 +10082,7 @@
 	if (q->q_next == NULL) {
 		connp = Q_TO_CONN(q);
 
-		if (IS_TCP_CONN(connp)) {
+		if (IPCL_IS_TCP(connp)) {
 			/* change conn_send for the tcp_v4_connections */
 			connp->conn_send = ip_output;
 		} else if (connp->conn_ulp == IPPROTO_SCTP) {
@@ -10426,12 +10554,52 @@
 		uint32_t	sum;
 		uint_t		ill_index =  ((ill_t *)ire->ire_stq->q_ptr)->
 		    ill_phyint->phyint_ifindex;
+		queue_t		*dev_q = ire->ire_stq->q_next;
 
 		/*
 		 * non-NULL send-to queue - packet is to be sent
 		 * out an interface.
 		 */
 
+		/* Driver is flow-controlling? */
+		if (!IP_FLOW_CONTROLLED_ULP(nexthdr) &&
+		    ((dev_q->q_next || dev_q->q_first) && !canput(dev_q))) {
+			/*
+			 * Queue the packet if we have a conn to give
+			 * back pressure.  We can't queue packets that
+			 * are intended for hardware acceleration, since
+			 * we've tossed that state already.  If the packet
+			 * is being fed back from ire_send_v6, we don't
+			 * know where in the queue to enqueue it, so we
+			 * discard the packet.
+			 */
+			ASSERT(mp == first_mp);
+			if (ip_output_queue && connp != NULL &&
+			    !mctl_present && caller != IRE_SEND) {
+				if (caller == IP_WSRV) {
+					connp->conn_did_putbq = 1;
+					(void) putbq(connp->conn_wq, mp);
+					conn_drain_insert(connp);
+					/*
+					 * caller == IP_WSRV implies we are
+					 * the service thread, and the
+					 * queue is already noenabled.
+					 * The check for canput and
+					 * the putbq is not atomic.
+					 * So we need to check again.
+					 */
+					if (canput(dev_q))
+						connp->conn_did_putbq = 0;
+				} else {
+					(void) putq(connp->conn_wq, mp);
+				}
+				return;
+			}
+			BUMP_MIB(mibptr, ipv6OutDiscards);
+			freemsg(mp);
+			return;
+		}
+
 		/*
 		 * Look for reachability confirmations from the transport.
 		 */
@@ -10490,20 +10658,20 @@
 			    up[12] + up[13] + up[14] + up[15];
 			sum = (sum & 0xffff) + (sum >> 16);
 			*insp = IP_CSUM(mp, hdr_length, sum);
+			if (*insp == 0)
+				*insp = 0xFFFF;
 		} else if (nexthdr == IPPROTO_TCP) {
 			uint16_t	*up;
 
 			/*
 			 * Check for full IPv6 header + enough TCP header
 			 * to get at the checksum field.
-			 * XXX need hardware checksum support.
-			 */
-#define	TCP_CSUM_OFFSET	16
-#define	TCP_CSUM_SIZE	2
+			 */
 			if ((mp->b_wptr - mp->b_rptr) <
-			    (hdr_length + TCP_CSUM_OFFSET + TCP_CSUM_SIZE)) {
+			    (hdr_length + TCP_CHECKSUM_OFFSET +
+			    TCP_CHECKSUM_SIZE)) {
 				if (!pullupmsg(mp, hdr_length +
-				    TCP_CSUM_OFFSET + TCP_CSUM_SIZE)) {
+				    TCP_CHECKSUM_OFFSET + TCP_CHECKSUM_SIZE)) {
 					ip1dbg(("ip_wput_v6: TCP hdr pullupmsg"
 					    " failed\n"));
 					BUMP_MIB(mibptr, ipv6OutDiscards);
@@ -10519,30 +10687,28 @@
 			 * into the tcp checksum field, so we don't
 			 * need to explicitly sum it in here.
 			 */
-			if (hdr_length == IPV6_HDR_LEN) {
-				/* src, dst, tcp consequtive */
-				up = (uint16_t *)(((uchar_t *)ip6h) +
-				    IPV6_HDR_LEN + TCP_CSUM_OFFSET);
-				*up = IP_CSUM(mp,
-				    IPV6_HDR_LEN - 2 * sizeof (in6_addr_t),
-				    htons(IPPROTO_TCP));
-			} else {
-				sum = htons(IPPROTO_TCP) +
-				    up[0] + up[1] + up[2] + up[3] +
-				    up[4] + up[5] + up[6] + up[7] +
-				    up[8] + up[9] + up[10] + up[11] +
-				    up[12] + up[13] + up[14] + up[15];
-				/*
-				 * Fold the initial sum.
-				 */
-				sum = (sum & 0xffff) + (sum >> 16);
-				up = (uint16_t *)(((uchar_t *)ip6h) +
-				    hdr_length + TCP_CSUM_OFFSET);
-				*up = IP_CSUM(mp, hdr_length, sum);
-			}
-#undef TCP_CSUM_OFFSET
-#undef TCP_CSUM_SIZE
-
+			sum = up[0] + up[1] + up[2] + up[3] +
+			    up[4] + up[5] + up[6] + up[7] +
+			    up[8] + up[9] + up[10] + up[11] +
+			    up[12] + up[13] + up[14] + up[15];
+
+			/* Fold the initial sum */
+			sum = (sum & 0xffff) + (sum >> 16);
+
+			up = (uint16_t *)(((uchar_t *)ip6h) +
+			    hdr_length + TCP_CHECKSUM_OFFSET);
+
+			IP_CKSUM_XMIT(ill, ire, mp, ip6h, up, IPPROTO_TCP,
+			    hdr_length, ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN,
+			    ire->ire_max_frag, mctl_present, sum);
+
+			/* Software checksum? */
+			if (DB_CKSUMFLAGS(mp) == 0) {
+				IP6_STAT(ip6_out_sw_cksum);
+				IP6_STAT_UPDATE(ip6_tcp_out_sw_cksum_bytes,
+				    (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN) -
+				    hdr_length);
+			}
 		} else if (nexthdr == IPPROTO_UDP) {
 			uint16_t	*up;
 
@@ -10550,12 +10716,10 @@
 			 * check for full IPv6 header + enough UDP header
 			 * to get at the UDP checksum field
 			 */
-#define	UDP_CSUM_OFFSET	6
-#define	UDP_CSUM_SIZE	2
 			if ((mp->b_wptr - mp->b_rptr) < (hdr_length +
-			    UDP_CSUM_OFFSET + UDP_CSUM_SIZE)) {
+			    UDP_CHECKSUM_OFFSET + UDP_CHECKSUM_SIZE)) {
 				if (!pullupmsg(mp, hdr_length +
-				    UDP_CSUM_OFFSET + UDP_CSUM_SIZE)) {
+				    UDP_CHECKSUM_OFFSET + UDP_CHECKSUM_SIZE)) {
 					ip1dbg(("ip_wput_v6: UDP hdr pullupmsg"
 					    " failed\n"));
 					BUMP_MIB(mibptr, ipv6OutDiscards);
@@ -10570,34 +10734,28 @@
 			 * into the udp checksum field, so we don't
 			 * need to explicitly sum it in here.
 			 */
-			if (hdr_length == IPV6_HDR_LEN) {
-				/* src, dst, udp consequtive */
-				up = (uint16_t *)(((uchar_t *)ip6h) +
-				    IPV6_HDR_LEN + UDP_CSUM_OFFSET);
-				*up = IP_CSUM(mp,
-				    IPV6_HDR_LEN - 2 * sizeof (in6_addr_t),
-				    htons(IPPROTO_UDP));
-			} else {
-				sum = htons(IPPROTO_UDP) +
-				    up[0] + up[1] + up[2] + up[3] +
-				    up[4] + up[5] + up[6] + up[7] +
-				    up[8] + up[9] + up[10] + up[11] +
-				    up[12] + up[13] + up[14] + up[15];
-				sum = (sum & 0xffff) + (sum >> 16);
-				up = (uint16_t *)(((uchar_t *)ip6h) +
-				    hdr_length + UDP_CSUM_OFFSET);
-				*up = IP_CSUM(mp, hdr_length, sum);
-			}
-
-			/*
-			 * According to RFC 2460, UDP in IPv6 shouldn't
-			 * appear with all zero checksum on the wire and
-			 * should be changed to 0xffff.
-			 */
-			if (*up == 0)
-				*up = 0xffff;
-#undef UDP_CSUM_OFFSET
-#undef UDP_CSUM_SIZE
+			sum = up[0] + up[1] + up[2] + up[3] +
+			    up[4] + up[5] + up[6] + up[7] +
+			    up[8] + up[9] + up[10] + up[11] +
+			    up[12] + up[13] + up[14] + up[15];
+
+			/* Fold the initial sum */
+			sum = (sum & 0xffff) + (sum >> 16);
+
+			up = (uint16_t *)(((uchar_t *)ip6h) +
+			    hdr_length + UDP_CHECKSUM_OFFSET);
+
+			IP_CKSUM_XMIT(ill, ire, mp, ip6h, up, IPPROTO_UDP,
+			    hdr_length, ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN,
+			    ire->ire_max_frag, mctl_present, sum);
+
+			/* Software checksum? */
+			if (DB_CKSUMFLAGS(mp) == 0) {
+				IP6_STAT(ip6_out_sw_cksum);
+				IP6_STAT_UPDATE(ip6_udp_out_sw_cksum_bytes,
+				    (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN) -
+				    hdr_length);
+			}
 		} else if (nexthdr == IPPROTO_ICMPV6) {
 			uint16_t	*up;
 			icmp6_t *icmp6;
@@ -10627,6 +10785,9 @@
 			    up[12] + up[13] + up[14] + up[15];
 			sum = (sum & 0xffff) + (sum >> 16);
 			icmp6->icmp6_cksum = IP_CSUM(mp, hdr_length, sum);
+			if (icmp6->icmp6_cksum == 0)
+				icmp6->icmp6_cksum = 0xFFFF;
+
 			/* Update output mib stats */
 			icmp_update_out_mib_v6(ill, icmp6);
 		} else if (nexthdr == IPPROTO_SCTP) {
@@ -10764,6 +10925,223 @@
 }
 
 /*
+ * Outbound IPv6 fragmentation routine using MDT.
+ */
+static void
+ip_wput_frag_mdt_v6(mblk_t *mp, ire_t *ire, size_t max_chunk,
+    size_t unfragmentable_len, uint8_t nexthdr, uint_t prev_nexthdr_offset)
+{
+	ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
+	uint_t		pkts, wroff, hdr_chunk_len, pbuf_idx;
+	mblk_t		*hdr_mp, *md_mp = NULL;
+	int		i1;
+	multidata_t	*mmd;
+	unsigned char	*hdr_ptr, *pld_ptr;
+	ip_pdescinfo_t	pdi;
+	uint32_t	ident;
+	size_t		len;
+	uint16_t	offset;
+	queue_t		*stq = ire->ire_stq;
+	ill_t		*ill = (ill_t *)stq->q_ptr;
+
+	ASSERT(DB_TYPE(mp) == M_DATA);
+	ASSERT(MBLKL(mp) > unfragmentable_len);
+
+	/*
+	 * Move the read pointer past the unfragmentable portion; we don't
+	 * want this part of the data in our fragments.
+	 */
+	mp->b_rptr += unfragmentable_len;
+
+	/* Calculate how many packets we will send out */
+	i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp);
+	pkts = (i1 + max_chunk - 1) / max_chunk;
+	ASSERT(pkts > 1);
+
+	/* Allocate a message block which will hold all the IP Headers. */
+	wroff = ip_wroff_extra;
+	hdr_chunk_len = wroff + unfragmentable_len + sizeof (ip6_frag_t);
+
+	i1 = pkts * hdr_chunk_len;
+	/*
+	 * Create the header buffer, the Multidata, and the destination
+	 * address and SAP attribute that should be associated with it.
+	 */
+	if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL ||
+	    ((hdr_mp->b_wptr += i1),
+	    (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) ||
+	    !ip_md_addr_attr(mmd, NULL, ire->ire_nce->nce_res_mp)) {
+		freemsg(mp);
+		if (md_mp == NULL) {
+			freemsg(hdr_mp);
+		} else {
+free_mmd:		IP6_STAT(ip6_frag_mdt_discarded);
+			freemsg(md_mp);
+		}
+		IP6_STAT(ip6_frag_mdt_allocfail);
+		BUMP_MIB(ill->ill_ip6_mib, ipv6OutFragFails);
+		UPDATE_MIB(ill->ill_ip6_mib, ipv6OutDiscards, pkts);
+		return;
+	}
+	IP6_STAT(ip6_frag_mdt_allocd);
+
+	/*
+	 * Add a payload buffer to the Multidata; this operation must not
+	 * fail, or otherwise our logic in this routine is broken.  There
+	 * is no memory allocation done by the routine, so any returned
+	 * failure simply tells us that we've done something wrong.
+	 *
+	 * A failure tells us that either we're adding the same payload
+	 * buffer more than once, or we're trying to add more buffers than
+	 * allowed.  Neither case should happen, and we panic because
+	 * there's either horrible heap corruption or a programming
+	 * mistake.
+	 */
+	if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) {
+		goto pbuf_panic;
+	}
+
+	hdr_ptr = hdr_mp->b_rptr;
+	pld_ptr = mp->b_rptr;
+
+	pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF;
+
+	ident = htonl(atomic_add_32_nv(&ire->ire_ident, 1));
+
+	/*
+	 * len is the total length of the fragmentable data in this
+	 * datagram.  For each fragment sent, we will decrement len
+	 * by the amount of fragmentable data sent in that fragment
+	 * until len reaches zero.
+	 */
+	len = ntohs(ip6h->ip6_plen) - (unfragmentable_len - IPV6_HDR_LEN);
+
+	offset = 0;
+	prev_nexthdr_offset += wroff;
+
+	while (len != 0) {
+		size_t		mlen;
+		ip6_t		*fip6h;
+		ip6_frag_t	*fraghdr;
+		int		error;
+
+		ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr);
+		mlen = MIN(len, max_chunk);
+		len -= mlen;
+
+		fip6h = (ip6_t *)(hdr_ptr + wroff);
+		ASSERT(OK_32PTR(fip6h));
+		bcopy(ip6h, fip6h, unfragmentable_len);
+		hdr_ptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT;
+
+		fip6h->ip6_plen = htons((uint16_t)(mlen +
+		    unfragmentable_len - IPV6_HDR_LEN + sizeof (ip6_frag_t)));
+
+		fraghdr = (ip6_frag_t *)((unsigned char *)fip6h +
+		    unfragmentable_len);
+		fraghdr->ip6f_nxt = nexthdr;
+		fraghdr->ip6f_reserved = 0;
+		fraghdr->ip6f_offlg = htons(offset) |
+		    ((len != 0) ? IP6F_MORE_FRAG : 0);
+		fraghdr->ip6f_ident = ident;
+
+		/*
+		 * Record offset and size of header and data of the next packet
+		 * in the multidata message.
+		 */
+		PDESC_HDR_ADD(&pdi, hdr_ptr, wroff,
+		    unfragmentable_len + sizeof (ip6_frag_t), 0);
+		PDESC_PLD_INIT(&pdi);
+		i1 = MIN(mp->b_wptr - pld_ptr, mlen);
+		ASSERT(i1 > 0);
+		PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1);
+		if (i1 == mlen) {
+			pld_ptr += mlen;
+		} else {
+			i1 = mlen - i1;
+			mp = mp->b_cont;
+			ASSERT(mp != NULL);
+			ASSERT(MBLKL(mp) >= i1);
+			/*
+			 * Attach the next payload message block to the
+			 * multidata message.
+			 */
+			if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+				goto pbuf_panic;
+			PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1);
+			pld_ptr = mp->b_rptr + i1;
+		}
+
+		if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error,
+		    KM_NOSLEEP)) == NULL) {
+			/*
+			 * Any failure other than ENOMEM indicates that we
+			 * have passed in invalid pdesc info or parameters
+			 * to mmd_addpdesc, which must not happen.
+			 *
+			 * EINVAL is a result of failure on boundary checks
+			 * against the pdesc info contents.  It should not
+			 * happen, and we panic because there's either
+			 * horrible heap corruption or a programming
+			 * mistake.
+			 */
+			if (error != ENOMEM) {
+				cmn_err(CE_PANIC, "ip_wput_frag_mdt_v6: "
+				    "pdesc logic error detected for "
+				    "mmd %p pinfo %p (%d)\n",
+				    (void *)mmd, (void *)&pdi, error);
+				/* NOTREACHED */
+			}
+			IP6_STAT(ip6_frag_mdt_addpdescfail);
+			/* Free unattached payload message blocks as well */
+			md_mp->b_cont = mp->b_cont;
+			goto free_mmd;
+		}
+
+		/* Advance fragment offset. */
+		offset += mlen;
+
+		/* Advance to location for next header in the buffer. */
+		hdr_ptr += hdr_chunk_len;
+
+		/* Did we reach the next payload message block? */
+		if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) {
+			mp = mp->b_cont;
+			/*
+			 * Attach the next message block with payload
+			 * data to the multidata message.
+			 */
+			if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+				goto pbuf_panic;
+			pld_ptr = mp->b_rptr;
+		}
+	}
+
+	ASSERT(hdr_mp->b_wptr == hdr_ptr);
+	ASSERT(mp->b_wptr == pld_ptr);
+
+	/* Update IP statistics */
+	UPDATE_MIB(ill->ill_ip6_mib, ipv6OutFragCreates, pkts);
+	BUMP_MIB(ill->ill_ip6_mib, ipv6OutFragOKs);
+	IP6_STAT_UPDATE(ip6_frag_mdt_pkt_out, pkts);
+
+	ire->ire_ob_pkt_count += pkts;
+	if (ire->ire_ipif != NULL)
+		atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts);
+
+	ire->ire_last_used_time = lbolt;
+	/* Send it down */
+	putnext(stq, md_mp);
+	return;
+
+pbuf_panic:
+	cmn_err(CE_PANIC, "ip_wput_frag_mdt_v6: payload buffer logic "
+	    "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp,
+	    pbuf_idx);
+	/* NOTREACHED */
+}
+
+/*
  * IPv6 fragmentation.  Essentially the same as IPv4 fragmentation.
  * We have not optimized this in terms of number of mblks
  * allocated. For instance, for each fragment sent we always allocate a
@@ -10779,7 +11157,7 @@
  */
 void
 ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp,
-    boolean_t caller, int max_frag)
+    int caller, int max_frag)
 {
 	ip6_t		*ip6h = (ip6_t *)mp->b_rptr;
 	ip6_t		*fip6h;
@@ -10849,6 +11227,19 @@
 	}
 	unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h);
 
+	max_chunk = (min(max_frag, ire->ire_max_frag) - unfragmentable_len -
+	    sizeof (ip6_frag_t)) & ~7;
+
+	/* Check if we can use MDT to send out the frags. */
+	ASSERT(!IRE_IS_LOCAL(ire));
+	if (ip_multidata_outbound && reachable == 0 &&
+	    !(ire->ire_flags & RTF_MULTIRT) && ILL_MDT_CAPABLE(ill) &&
+	    IP_CAN_FRAG_MDT(mp, unfragmentable_len, max_chunk)) {
+		ip_wput_frag_mdt_v6(mp, ire, max_chunk, unfragmentable_len,
+		    nexthdr, prev_nexthdr_offset);
+		return;
+	}
+
 	/*
 	 * Allocate an mblk with enough room for the link-layer
 	 * header, the unfragmentable part of the datagram, and the
@@ -10875,7 +11266,7 @@
 
 	fraghdr->ip6f_nxt = nexthdr;
 	fraghdr->ip6f_reserved = 0;
-	fraghdr->ip6f_offlg = htons(0);
+	fraghdr->ip6f_offlg = 0;
 	fraghdr->ip6f_ident = htonl(ident);
 
 	/*
@@ -10886,9 +11277,6 @@
 	 */
 	len = ntohs(ip6h->ip6_plen) - (unfragmentable_len - IPV6_HDR_LEN);
 
-	max_chunk = (min(max_frag, ire->ire_max_frag) - unfragmentable_len -
-	    sizeof (ip6_frag_t)) & ~7;
-
 	/*
 	 * Move read ptr past unfragmentable portion, we don't want this part
 	 * of the data in our fragments.
@@ -11117,7 +11505,9 @@
 		}
 	}
 
-	if (IP_FLOW_CONTROLLED_ULP(ip6h->ip6_nxt) || canput(stq->q_next)) {
+	/* Flow-control check has been done in ip_wput_ire_v6 */
+	if (IP_FLOW_CONTROLLED_ULP(ip6h->ip6_nxt) || caller == IP_WPUT ||
+	    caller == IP_WSRV || canput(stq->q_next)) {
 		uint32_t ill_index;
 
 		/*
@@ -11164,7 +11554,7 @@
 				ill = ire_to_ill(ire);
 			}
 			IRB_REFRELE(irb);
-		} else if (connp != NULL && IS_TCP_CONN(connp) &&
+		} else if (connp != NULL && IPCL_IS_TCP(connp) &&
 		    connp->conn_mdt_ok && !connp->conn_tcp->tcp_mdt &&
 		    ILL_MDT_USABLE(ill)) {
 			/*
@@ -11583,7 +11973,7 @@
 				(void) putbq(connp->conn_wq, mp);
 				conn_drain_insert(connp);
 				/*
-				 * called_from_wsrv implies we are
+				 * caller == IP_WSRV implies we are
 				 * the service thread, and the
 				 * queue is already noenabled.
 				 * The check for canput and
--- a/usr/src/uts/common/inet/ip/ip_if.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip/ip_if.c	Sat Oct 22 22:50:14 2005 -0700
@@ -80,6 +80,7 @@
 #include <inet/ip_rts.h>
 #include <inet/ip_ndp.h>
 #include <inet/ip_if.h>
+#include <inet/ip_impl.h>
 #include <inet/tun.h>
 #include <inet/sctp_ip.h>
 
@@ -1232,10 +1233,10 @@
 	} else {
 		/*
 		 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
-		 * be just ip_ioctl_freemsg. we have to restart it
+		 * be just inet_freemsg. we have to restart it
 		 * otherwise the thread will be stuck.
 		 */
-		ip_ioctl_freemsg(mp);
+		inet_freemsg(mp);
 	}
 	return (B_TRUE);
 }
@@ -1344,10 +1345,10 @@
 		} else {
 			/*
 			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
-			 * this can't be just ip_ioctl_freemsg. we have to
+			 * this can't be just inet_freemsg.  We have to
 			 * restart it otherwise the thread will be stuck.
 			 */
-			ip_ioctl_freemsg(curr);
+			inet_freemsg(curr);
 		}
 	}
 }
@@ -1384,7 +1385,7 @@
 	if (curr != NULL) {
 		mutex_exit(&connp->conn_lock);
 		CONN_DEC_REF(connp);
-		ip_ioctl_freemsg(curr);
+		inet_freemsg(curr);
 		return;
 	}
 	/*
@@ -2042,7 +2043,7 @@
 	dl_capability_sub_t *dl_subcap;
 	int size;
 
-	if (!(ill->ill_capabilities & ILL_CAPAB_MDT))
+	if (!ILL_MDT_CAPABLE(ill))
 		return;
 
 	ASSERT(ill->ill_mdt_capab != NULL);
@@ -2857,6 +2858,9 @@
 	bcopy((void *)&poll, (void *)opoll, sizeof (dl_capab_poll_t));
 	ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
 
+	ip1dbg(("ill_capability_poll_capable: asking interface %s "
+	    "to enable polling\n", ill->ill_name));
+
 	/* nmp points to a DL_CAPABILITY_REQ message to enable polling */
 	ill_dlpi_send(ill, nmp);
 }
@@ -2944,6 +2948,8 @@
 			ASSERT(ill->ill_poll_capab != NULL);
 			ill->ill_capabilities |= ILL_CAPAB_POLL;
 		}
+		ip1dbg(("ill_capability_poll_ack: interface %s "
+		    "has enabled polling\n", ill->ill_name));
 		break;
 	}
 }
@@ -3048,8 +3054,9 @@
 		return;
 	}
 
-#define	CURR_HCKSUM_CAPAB \
-	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM)
+#define	CURR_HCKSUM_CAPAB				\
+	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
+	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)
 
 	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
 	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
@@ -3126,10 +3133,11 @@
 		 * hardware checksum acceleration.
 		 */
 		ill_dlpi_send(ill, nmp);
-	} else
+	} else {
 		ip1dbg(("ill_capability_hcksum_ack: interface %s has "
 		    "advertised %x hardware checksum capability flags\n",
 		    ill->ill_name, ihck->hcksum_txflags));
+	}
 }
 
 static void
@@ -3140,7 +3148,7 @@
 	dl_capability_sub_t *dl_subcap;
 	int size;
 
-	if (!(ill->ill_capabilities & ILL_CAPAB_HCKSUM))
+	if (!ILL_HCKSUM_CAPABLE(ill))
 		return;
 
 	ASSERT(ill->ill_hcksum_capab != NULL);
@@ -7300,7 +7308,7 @@
 				ASSERT(mp_next == NULL);
 				ipsq->ipsq_mptail = prev;
 			}
-			ip_ioctl_freemsg(mp);
+			inet_freemsg(mp);
 		} else {
 			prev = mp;
 		}
@@ -8838,7 +8846,7 @@
 		if (mp1 != NULL)
 			freeb(mp1);
 		if (pending_mp != NULL)
-			ip_ioctl_freemsg(pending_mp);
+			inet_freemsg(pending_mp);
 		return (ENOMEM);
 	}
 
@@ -8848,7 +8856,7 @@
 	    (caddr_t)&ipaddr);
 	if (mp2 == NULL) {
 		freeb(mp1);
-		ip_ioctl_freemsg(pending_mp);
+		inet_freemsg(pending_mp);
 		return (ENOMEM);
 	}
 	/* Put together the chain. */
@@ -9743,7 +9751,7 @@
 	pending_mp = ill_pending_mp_get(ill, &connp, ioc_id);
 	if (pending_mp == NULL) {
 		ASSERT(connp == NULL);
-		ip_ioctl_freemsg(mp);
+		inet_freemsg(mp);
 		return;
 	}
 	ASSERT(connp != NULL);
@@ -9760,7 +9768,7 @@
 	 */
 	orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next;
 	orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev;
-	ip_ioctl_freemsg(pending_mp);
+	inet_freemsg(pending_mp);
 
 	/*
 	 * We're done if there was an error or if this is not an SIOCG{X}ARP
@@ -18114,6 +18122,8 @@
 	icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
 	bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
 	icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0);
+	if (icmph->icmph_checksum == 0)
+		icmph->icmph_checksum = 0xffff;
 
 	put(ipif->ipif_wq, mp);
 
--- a/usr/src/uts/common/inet/ip/ip_multi.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip/ip_multi.c	Sat Oct 22 22:50:14 2005 -0700
@@ -65,6 +65,7 @@
 #include <inet/ipsec_impl.h>
 #include <inet/sctp_ip.h>
 #include <inet/ip_listutils.h>
+#include <inet/udp_impl.h>
 
 #include <netinet/igmp.h>
 
@@ -1186,14 +1187,39 @@
 ip_multicast_loopback(queue_t *q, ill_t *ill, mblk_t *mp_orig, int fanout_flags,
     zoneid_t zoneid)
 {
-	mblk_t		*mp;
-	mblk_t		*ipsec_mp;
-
-	/* TODO this could use dup'ed messages except for the IP header. */
-	mp = ip_copymsg(mp_orig);
+	mblk_t	*mp;
+	mblk_t	*ipsec_mp;
+
+	if (DB_TYPE(mp_orig) == M_DATA &&
+	    ((ipha_t *)mp_orig->b_rptr)->ipha_protocol == IPPROTO_UDP) {
+		uint_t hdrsz;
+
+		hdrsz = IPH_HDR_LENGTH((ipha_t *)mp_orig->b_rptr) +
+		    sizeof (udpha_t);
+		ASSERT(MBLKL(mp_orig) >= hdrsz);
+
+		if (((mp = allocb(hdrsz, BPRI_MED)) != NULL) &&
+		    (mp_orig = dupmsg(mp_orig)) != NULL) {
+			bcopy(mp_orig->b_rptr, mp->b_rptr, hdrsz);
+			mp->b_wptr += hdrsz;
+			mp->b_cont = mp_orig;
+			mp_orig->b_rptr += hdrsz;
+			if (MBLKL(mp_orig) == 0) {
+				mp->b_cont = mp_orig->b_cont;
+				mp_orig->b_cont = NULL;
+				freeb(mp_orig);
+			}
+		} else if (mp != NULL) {
+			freeb(mp);
+			mp = NULL;
+		}
+	} else {
+		mp = ip_copymsg(mp_orig);
+	}
+
 	if (mp == NULL)
 		return;
-	if (mp->b_datap->db_type == M_CTL) {
+	if (DB_TYPE(mp) == M_CTL) {
 		ipsec_mp = mp;
 		mp = mp->b_cont;
 	} else {
@@ -2553,7 +2579,7 @@
 	zoneid = connp->conn_zoneid;
 
 	/* don't allow multicast operations on a tcp conn */
-	if (IS_TCP_CONN(connp))
+	if (IPCL_IS_TCP(connp))
 		return (ENOPROTOOPT);
 
 	if (cmd == SIOCSIPMSFILTER || cmd == SIOCGIPMSFILTER) {
--- a/usr/src/uts/common/inet/ip/ip_ndp.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip/ip_ndp.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -144,7 +144,6 @@
 	mblk_t		*mp;
 	mblk_t		*template;
 	nce_t		**ncep;
-	int		err = 0;
 	boolean_t	dropped = B_FALSE;
 
 	ASSERT(MUTEX_HELD(&ndp_g_lock));
@@ -280,8 +279,15 @@
 		mutex_exit(&nce->nce_lock);
 		mutex_enter(&ndp_g_lock);
 	}
-done:
-	return (err);
+	/*
+	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
+	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
+	 * We call nce_fastpath from nce_update if the link layer
+	 * address of the peer changes.
+	 */
+	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
+		nce_fastpath(nce);
+	return (0);
 }
 
 int
@@ -1028,7 +1034,6 @@
 		 * Cache entry with a proper resolver cookie was
 		 * created.
 		 */
-		nce_fastpath(nce);
 		NCE_REFRELE(nce);
 		break;
 	case EEXIST:
@@ -1108,7 +1113,6 @@
 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
 		return (err);
 	}
-	nce_fastpath(nce);
 	NCE_REFRELE(nce);
 	return (0);
 }
@@ -2168,8 +2172,7 @@
 
 	ASSERT(ll_addr != NULL);
 	/* Always called before fast_path_probe */
-	if (nce->nce_fp_mp != NULL)
-		return;
+	ASSERT(nce->nce_fp_mp == NULL);
 	if (ill->ill_sap_length != 0) {
 		/*
 		 * Copy the SAP type specified in the
@@ -2265,8 +2268,8 @@
 		if (nce->nce_fp_mp != NULL) {
 			freemsg(nce->nce_fp_mp);
 			nce->nce_fp_mp = NULL;
-			need_fastpath_update = B_TRUE;
 		}
+		need_fastpath_update = B_TRUE;
 	}
 	mutex_exit(&nce->nce_lock);
 	if (need_stop_timer) {
--- a/usr/src/uts/common/inet/ip/ipclassifier.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip/ipclassifier.c	Sat Oct 22 22:50:14 2005 -0700
@@ -233,6 +233,7 @@
 #include <inet/ip_rts.h>
 #include <inet/optcom.h>
 #include <inet/ip_ndp.h>
+#include <inet/udp_impl.h>
 #include <inet/sctp_ip.h>
 
 #include <sys/ethernet.h>
@@ -351,8 +352,7 @@
 
 	ipcl_conn_cache = kmem_cache_create("ipcl_conn_cache",
 	    sizeof (conn_t), CACHE_ALIGN_SIZE,
-	    NULL, NULL,
-	    NULL, NULL, NULL, 0);
+	    NULL, NULL, NULL, NULL, NULL, 0);
 
 	ipcl_tcpconn_cache = kmem_cache_create("ipcl_tcpconn_cache",
 	    sizeof (itc_t), CACHE_ALIGN_SIZE,
@@ -501,17 +501,19 @@
 	case IPCL_IPCCONN:
 		connp = kmem_cache_alloc(ipcl_conn_cache, sleep);
 		if (connp == NULL)
-			return (connp);
+			return (NULL);
 		bzero(connp, sizeof (conn_t));
-		mutex_init(&connp->conn_lock, NULL,
-		    MUTEX_DEFAULT, NULL);
+		mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
 		cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
-		connp->conn_flags |= IPCL_IPCCONN;
+		connp->conn_flags = IPCL_IPCCONN;
 		connp->conn_ref = 1;
 		IPCL_DEBUG_LVL(1,
 		    ("ipcl_conn_create: connp = %p\n", (void *)connp));
 		ipcl_globalhash_insert(connp);
 		break;
+	default:
+		connp = NULL;
+		ASSERT(0);
 	}
 
 	return (connp);
@@ -521,7 +523,6 @@
 ipcl_conn_destroy(conn_t *connp)
 {
 	mblk_t	*mp;
-	tcp_t	*tcp = connp->conn_tcp;
 
 	ASSERT(!MUTEX_HELD(&connp->conn_lock));
 	ASSERT(connp->conn_ref == 0);
@@ -531,6 +532,8 @@
 
 	cv_destroy(&connp->conn_cv);
 	if (connp->conn_flags & IPCL_TCPCONN) {
+		tcp_t	*tcp = connp->conn_tcp;
+
 		mutex_destroy(&connp->conn_lock);
 		ASSERT(connp->conn_tcp != NULL);
 		tcp_free(tcp);
@@ -567,6 +570,7 @@
 	} else if (connp->conn_flags & IPCL_SCTPCONN) {
 		sctp_free(connp);
 	} else {
+		ASSERT(connp->conn_udp == NULL);
 		mutex_destroy(&connp->conn_lock);
 		kmem_cache_free(ipcl_conn_cache, connp);
 	}
@@ -1863,6 +1867,57 @@
 	return (NULL);
 }
 
+/*
+ * ipcl_get_next_conn
+ *	Get the next entry in the conn global list
+ *	and put a reference on the next_conn.
+ *	Decrement the reference on the current conn.
+ *
+ * This is an iterator-based walker function that also provides for
+ * some selection by the caller.  It walks through the conn_hash bucket
+ * searching for the next valid connp in the list, and selects connections
+ * that are neither closed nor condemned.  It also REFHOLDs the conn,
+ * thus ensuring that the conn exists when the caller uses it.
+ */
+conn_t *
+ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
+{
+	conn_t	*next_connp;
+
+	if (connfp == NULL)
+		return (NULL);
+
+	mutex_enter(&connfp->connf_lock);
+
+	next_connp = (connp == NULL) ?
+	    connfp->connf_head : connp->conn_g_next;
+
+	while (next_connp != NULL) {
+		mutex_enter(&next_connp->conn_lock);
+		if (!(next_connp->conn_flags & conn_flags) ||
+		    (next_connp->conn_state_flags &
+		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
+			/*
+			 * This conn has been condemned or
+			 * is closing, or the flags don't match
+			 */
+			mutex_exit(&next_connp->conn_lock);
+			next_connp = next_connp->conn_g_next;
+			continue;
+		}
+		CONN_INC_REF_LOCKED(next_connp);
+		mutex_exit(&next_connp->conn_lock);
+		break;
+	}
+
+	mutex_exit(&connfp->connf_lock);
+
+	if (connp != NULL)
+		CONN_DEC_REF(connp);
+
+	return (next_connp);
+}
+
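+/*
+ * Illustrative use of the walker above (a sketch; the bucket and flag
+ * choices are hypothetical):
+ *
+ *	conn_t *connp = NULL;
+ *
+ *	while ((connp = ipcl_get_next_conn(connfp, connp,
+ *	    IPCL_UDP)) != NULL) {
+ *		... examine connp; the walker holds the reference ...
+ *	}
+ *
+ * Each call drops the reference on the previously returned conn, so
+ * CONN_DEC_REF is only needed if the caller abandons the walk early.
+ */
+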
 #ifdef CONN_DEBUG
 /*
  * Trace of the last NBUF refhold/refrele
--- a/usr/src/uts/common/inet/ip/tun.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip/tun.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -3693,6 +3693,8 @@
 	*nicmp = *icmp;
 	nicmp->icmph_checksum = 0;
 	nicmp->icmph_checksum = IP_CSUM(send_mp, sizeof (ipha_t), 0);
+	if (nicmp->icmph_checksum == 0)
+		nicmp->icmph_checksum = 0xffff;
 
 	/* let ip know we are an icmp message */
 	atomic_add_64(&atp->tun_HCInOctets,
@@ -3757,6 +3759,8 @@
 	    up[12] + up[13] + up[14] + up[15];
 	sum = (sum & 0xffff) + (sum >> 16);
 	nicmp6->icmp6_cksum = IP_CSUM(send_mp, IPV6_HDR_LEN, sum);
+	if (nicmp6->icmp6_cksum == 0)
+		nicmp6->icmp6_cksum = 0xffff;
 
 	/* let ip know we are an icmp message */
 	atomic_add_64(&atp->tun_HCInOctets,
--- a/usr/src/uts/common/inet/ip6.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip6.h	Sat Oct 22 22:50:14 2005 -0700
@@ -370,8 +370,7 @@
 		    uint16_t *, uint8_t **);
 extern int	ip_hdr_length_v6(mblk_t *, ip6_t *);
 extern uint32_t	ip_massage_options_v6(ip6_t *, ip6_rthdr_t *);
-extern void	ip_wput_frag_v6(mblk_t *, ire_t *, uint_t, conn_t *,
-		    boolean_t, int);
+extern void	ip_wput_frag_v6(mblk_t *, ire_t *, uint_t, conn_t *, int, int);
 extern void 	ip_wput_ipsec_out_v6(queue_t *, mblk_t *, ip6_t *, ill_t *,
     ire_t *);
 extern int	ip_total_hdrs_len_v6(ip6_pkt_t *);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/inet/ip_impl.h	Sat Oct 22 22:50:14 2005 -0700
@@ -0,0 +1,493 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_INET_IP_IMPL_H
+#define	_INET_IP_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * IP implementation private declarations.  These interfaces are
+ * used to build the IP module and are not meant to be accessed
+ * by any modules except IP itself.  They are undocumented and are
+ * subject to change without notice.
+ */
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#define	IP_MOD_ID		5701
+
+#ifdef	_BIG_ENDIAN
+#define	IP_HDR_CSUM_TTL_ADJUST	256
+#define	IP_TCP_CSUM_COMP	IPPROTO_TCP
+#define	IP_UDP_CSUM_COMP	IPPROTO_UDP
+#else
+#define	IP_HDR_CSUM_TTL_ADJUST	1
+#define	IP_TCP_CSUM_COMP	(IPPROTO_TCP << 8)
+#define	IP_UDP_CSUM_COMP	(IPPROTO_UDP << 8)
+#endif
+
+#define	TCP_CHECKSUM_OFFSET	16
+#define	TCP_CHECKSUM_SIZE	2
+
+#define	UDP_CHECKSUM_OFFSET	6
+#define	UDP_CHECKSUM_SIZE	2
+
+#define	IPH_TCPH_CHECKSUMP(ipha, hlen)	\
+	((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + TCP_CHECKSUM_OFFSET)))
+
+#define	IPH_UDPH_CHECKSUMP(ipha, hlen)	\
+	((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + UDP_CHECKSUM_OFFSET)))
+
+#define	ILL_HCKSUM_CAPABLE(ill)		\
+	(((ill)->ill_capabilities & ILL_CAPAB_HCKSUM) != 0)
+/*
+ * Macro that performs software checksum calculation on the IP header.
+ */
+#define	IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) {		\
+	(sum) += (ttl_protocol) + (ipha)->ipha_ident +			\
+	    ((v_hlen_tos_len) >> 16) +					\
+	    ((v_hlen_tos_len) & 0xFFFF) +				\
+	    (ipha)->ipha_fragment_offset_and_flags;			\
+	(sum) = (((sum) & 0xFFFF) + ((sum) >> 16));			\
+	(sum) = ~((sum) + ((sum) >> 16));				\
+	(ipha)->ipha_hdr_checksum = (uint16_t)(sum);			\
+}
+
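+/*
+ * The one's-complement folding used by IP_HDR_CKSUM (and the macros
+ * below) is equivalent to the following plain function; this is only
+ * an illustrative sketch and the name is hypothetical:
+ *
+ *	static uint16_t
+ *	ip_csum_fold(uint32_t sum)
+ *	{
+ *		sum = (sum & 0xFFFF) + (sum >> 16);
+ *		sum = (sum & 0xFFFF) + (sum >> 16);
+ *		return ((uint16_t)~sum);
+ *	}
+ */
+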
+#define	IS_IP_HDR_HWCKSUM(ipsec, mp, ill)				\
+	((!ipsec) && (DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) &&		\
+	ILL_HCKSUM_CAPABLE(ill) && dohwcksum)
+
+/*
+ * This macro acts as a wrapper around IP_CKSUM_XMIT_FAST, and it performs
+ * several checks on the IRE and ILL (among other things) in order to see
+ * whether or not hardware checksum offload is allowed for the outgoing
+ * packet.  It assumes that the caller has held a reference to the IRE.
+ */
+#define	IP_CKSUM_XMIT(ill, ire, mp, ihp, up, proto, start, end,		\
+	    max_frag, ipsec_len, pseudo) {				\
+	uint32_t _hck_flags;						\
+	/*								\
+	 * We offload checksum calculation to hardware when IPsec isn't	\
+	 * present and fragmentation isn't required.  We also check	\
+	 * if M_DATA fastpath is safe to be used on the	corresponding	\
+	 * IRE; this check is performed without grabbing ire_lock but	\
+	 * instead by holding a reference to it.  This is sufficient	\
+	 * for IRE_CACHE; for IRE_BROADCAST on non-Ethernet links, the	\
+	 * DL_NOTE_FASTPATH_FLUSH indication could come up from the	\
+	 * driver and trigger the IRE (hence fp_mp) deletion.  This is	\
+	 * why only IRE_CACHE type is eligible for offload.		\
+	 *								\
+	 * The presence of IP options also forces the network stack to	\
+	 * calculate the checksum in software.  This is because:	\
+	 *								\
+	 * Wrap around: certain partial-checksum NICs (eri, ce) limit	\
+	 * the width of the "start offset" field to 6 bits.  This	\
+	 * effectively caps the offset at 64 bytes, starting		\
+	 * from the MAC header.  When the cumulative MAC and IP headers	\
+	 * exceed such limit, the offset will wrap around.  This causes	\
+	 * the checksum to be calculated at the wrong place.		\
+	 *								\
+	 * IPv4 source routing: none of the full-checksum capable NICs	\
+	 * correctly handles the IPv4 source-routing option for		\
+	 * purposes of calculating the pseudo-header; the actual	\
+	 * destination differs from the destination in the header,	\
+	 * which is that of the next hop.  (This case may not be	\
+	 * true for NICs which can parse IPv6 extension headers, but	\
+	 * we choose to simplify the implementation by not offloading	\
+	 * checksum when they are present.)				\
+	 *								\
+	 */								\
+	if ((ill) != NULL && ILL_HCKSUM_CAPABLE(ill) &&			\
+	    !((ire)->ire_flags & RTF_MULTIRT) &&			\
+	    (!((ire)->ire_type & (IRE_BROADCAST|IRE_MIPRTUN)) ||	\
+	    (ill)->ill_type == IFT_ETHER) &&				\
+	    (ipsec_len) == 0 &&						\
+	    (((ire)->ire_ipversion == IPV4_VERSION &&			\
+	    (start) == IP_SIMPLE_HDR_LENGTH &&				\
+	    (ire)->ire_fp_mp != NULL &&					\
+	    MBLKHEAD(mp) >= MBLKL((ire)->ire_fp_mp)) ||			\
+	    ((ire)->ire_ipversion == IPV6_VERSION &&			\
+	    (start) == IPV6_HDR_LEN &&					\
+	    (ire)->ire_nce->nce_fp_mp != NULL &&			\
+	    MBLKHEAD(mp) >= MBLKL((ire)->ire_nce->nce_fp_mp))) &&	\
+	    (max_frag) >= (uint_t)((end) + (ipsec_len)) &&		\
+	    dohwcksum) {						\
+		_hck_flags = (ill)->ill_hcksum_capab->ill_hcksum_txflags; \
+	} else {							\
+		_hck_flags = 0;						\
+	}								\
+	IP_CKSUM_XMIT_FAST((ire)->ire_ipversion, _hck_flags, mp, ihp,	\
+	    up, proto, start, end, pseudo);				\
+}
+
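+/*
+ * Illustrative call shape for the wrapper above (a sketch; the actual
+ * argument values are the caller's):
+ *
+ *	up = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
+ *	IP_CKSUM_XMIT(ill, ire, mp, ipha, up, IPPROTO_UDP,
+ *	    IP_SIMPLE_HDR_LENGTH, end, max_frag, ipsec_len, cksum);
+ *
+ * where "cksum" holds the partial pseudo-header sum computed by the
+ * caller and "end" marks where the checksummed data ends.
+ */
+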
+/*
+ * Based on the device capabilities, this macro either marks an outgoing
+ * packet with hardware checksum offload information or calculate the
+ * checksum in software.  If the latter is performed, the checksum field
+ * of the dblk is cleared; otherwise it will be non-zero and contain the
+ * necessary flag(s) for the driver.
+ */
+#define	IP_CKSUM_XMIT_FAST(ipver, hck_flags, mp, ihp, up, proto, start,	\
+	    end, pseudo) {						\
+	uint32_t _sum;							\
+	/*								\
+	 * Underlying interface supports hardware checksum offload for	\
+	 * the payload; leave the payload checksum for the hardware to	\
+	 * calculate.  N.B: We only need to set up checksum info on the	\
+	 * first mblk.							\
+	 */								\
+	DB_CKSUMFLAGS(mp) = 0;						\
+	if (((ipver) == IPV4_VERSION &&					\
+	    ((hck_flags) & HCKSUM_INET_FULL_V4)) ||			\
+	    ((ipver) == IPV6_VERSION &&					\
+	    ((hck_flags) & HCKSUM_INET_FULL_V6))) {			\
+		/*							\
+		 * Hardware calculates pseudo-header, header and the	\
+		 * payload checksums, so clear the checksum field in	\
+		 * the protocol header.					\
+		 */							\
+		*(up) = 0;						\
+		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;			\
+	} else if ((hck_flags) & HCKSUM_INET_PARTIAL)  {		\
+		/*							\
+		 * Partial checksum offload has been enabled.  Fill	\
+		 * the checksum field in the protocol header with the	\
+		 * pseudo-header checksum value.			\
+		 */							\
+		_sum = ((proto) == IPPROTO_UDP) ?			\
+		    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP;		\
+		_sum += *(up) + (pseudo);				\
+		_sum = (_sum & 0xFFFF) + (_sum >> 16);			\
+		*(up) = (_sum & 0xFFFF) + (_sum >> 16);			\
+		/*							\
+		 * Offsets are relative to beginning of IP header.	\
+		 */							\
+		DB_CKSUMSTART(mp) = (start);				\
+		DB_CKSUMSTUFF(mp) = ((proto) == IPPROTO_UDP) ?		\
+		    (start) + UDP_CHECKSUM_OFFSET :			\
+		    (start) + TCP_CHECKSUM_OFFSET;			\
+		DB_CKSUMEND(mp) = (end);				\
+		DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;			\
+	} else {							\
+		/*							\
+		 * Software checksumming.				\
+		 */							\
+		_sum = ((proto) == IPPROTO_UDP) ?			\
+		    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP;		\
+		_sum += (pseudo);					\
+		_sum = IP_CSUM(mp, start, _sum);			\
+		*(up) = (uint16_t)(_sum ? _sum : ~_sum);		\
+	}								\
+	/*								\
+	 * Hardware supports IP header checksum offload; clear the	\
+	 * contents of IP header checksum field as expected by NIC.	\
+	 * Do this only if we offloaded either full or partial sum.	\
+	 */								\
+	if ((ipver) == IPV4_VERSION && DB_CKSUMFLAGS(mp) != 0 &&	\
+	    ((hck_flags) & HCKSUM_IPHDRCKSUM)) {			\
+		DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;			\
+		((ipha_t *)(ihp))->ipha_hdr_checksum = 0;		\
+	}								\
+}
+
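+/*
+ * Worked example for the partial-checksum branch above, with
+ * hypothetical little-endian values IP_UDP_CSUM_COMP = 0x1100,
+ * *up = 0xF234 and pseudo = 0x2345:
+ *
+ *	_sum = 0x1100 + 0xF234 + 0x2345 = 0x12679
+ *	_sum = 0x2679 + 0x1 = 0x267A
+ *	*up  = 0x267A				(no residual carry)
+ *
+ * leaving the adjusted pseudo-header sum in the checksum field for
+ * the NIC to complete.
+ */
+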
+/*
+ * Macro to inspect the checksum of a fully-reassembled incoming datagram.
+ */
+#define	IP_CKSUM_RECV_REASS(hck_flags, off, pseudo, sum, err) {		\
+	(err) = B_FALSE;						\
+	if ((hck_flags) & HCK_FULLCKSUM) {				\
+		/*							\
+		 * The sum of all fragment checksums should		\
+		 * result in -0 (0xFFFF); anything else is invalid.	\
+		 */							\
+		if ((sum) != 0xFFFF)					\
+			(err) = B_TRUE;					\
+	} else if ((hck_flags) & HCK_PARTIALCKSUM) {			\
+		(sum) += (pseudo);					\
+		(sum) = ((sum) & 0xFFFF) + ((sum) >> 16);		\
+		(sum) = ((sum) & 0xFFFF) + ((sum) >> 16);		\
+		if (~(sum) & 0xFFFF)					\
+			(err) = B_TRUE;					\
+	} else if (((sum) = IP_CSUM(mp, off, pseudo)) != 0) {		\
+		(err) = B_TRUE;						\
+	}								\
+}
+
+/*
+ * This macro inspects an incoming packet to see if the checksum value
+ * contained in it is valid; if the hardware has provided the information,
+ * the value is verified, otherwise it performs software checksumming.
+ * The checksum value is returned to caller.
+ */
+#define	IP_CKSUM_RECV(hck_flags, sum, cksum_start, ulph_off, mp, mp1, err) { \
+	int32_t _len;							\
+									\
+	(err) = B_FALSE;						\
+	if ((hck_flags) & HCK_FULLCKSUM) {				\
+		/*							\
+		 * Full checksum has been computed by the hardware	\
+		 * and has been attached.  If the driver wants us to	\
+		 * verify the correctness of the attached value, in	\
+		 * order to protect against faulty hardware, compare	\
+		 * it against -0 (0xFFFF) to see if it's valid.		\
+		 */							\
+		(sum) = DB_CKSUM16(mp);					\
+		if (!((hck_flags) & HCK_FULLCKSUM_OK) && (sum) != 0xFFFF) \
+			(err) = B_TRUE;					\
+	} else if (((hck_flags) & HCK_PARTIALCKSUM) &&			\
+	    ((mp1) == NULL || (mp1)->b_cont == NULL) &&			\
+	    (ulph_off) >= DB_CKSUMSTART(mp) &&				\
+	    ((_len = (ulph_off) - DB_CKSUMSTART(mp)) & 1) == 0) {	\
+		uint32_t _adj;						\
+		/*							\
+		 * Partial checksum has been calculated by hardware	\
+		 * and attached to the packet; in addition, any		\
+		 * prepended extraneous data is even byte aligned,	\
+		 * and there are at most two mblks associated with	\
+		 * the packet.  If any such data exists, we adjust	\
+		 * the checksum; also take care of any postpended data.	\
+		 */							\
+		IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, _len, _adj);	\
+		/*							\
+		 * One's complement subtract extraneous checksum	\
+		 */							\
+		(sum) += DB_CKSUM16(mp);				\
+		if (_adj >= (sum))					\
+			(sum) = ~(_adj - (sum)) & 0xFFFF;		\
+		else							\
+			(sum) -= _adj;					\
+		(sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16);		\
+		(sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16);		\
+		if (~(sum) & 0xFFFF)					\
+			(err) = B_TRUE;					\
+	} else if (((sum) = IP_CSUM(mp, ulph_off, sum)) != 0) {		\
+		(err) = B_TRUE;						\
+	}								\
+}
+
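+/*
+ * Typical call shape (a sketch; the arguments mirror the macro's
+ * formals and are supplied by the receive path):
+ *
+ *	IP_CKSUM_RECV(hck_flags, sum, cksum_start, ulph_off,
+ *	    mp, mp1, cksum_err);
+ *	if (cksum_err)
+ *		... count the bad checksum and drop the packet ...
+ *
+ * "sum" enters holding the pseudo-header sum and exits holding the
+ * verified (or software-computed) result.
+ */
+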
+/*
+ * Macro to adjust a given checksum value depending on any prepended
+ * or postpended data on the packet.  It expects the start offset to
+ * begin at an even boundary and that the packet consists of at most
+ * two mblks.
+ */
+#define	IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj) {		\
+	/*								\
+	 * Prepended extraneous data; adjust checksum.			\
+	 */								\
+	if ((len) > 0)							\
+		(adj) = IP_BCSUM_PARTIAL(cksum_start, len, 0);		\
+	else								\
+		(adj) = 0;						\
+	/*								\
+	 * len is now the total length of mblk(s)			\
+	 */								\
+	(len) = MBLKL(mp);						\
+	if ((mp1) == NULL)						\
+		(mp1) = (mp);						\
+	else								\
+		(len) += MBLKL(mp1);					\
+	/*								\
+	 * Postpended extraneous data; adjust checksum.			\
+	 */								\
+	if (((len) = (DB_CKSUMEND(mp) - len)) > 0) {			\
+		uint32_t _pad;						\
+									\
+		_pad = IP_BCSUM_PARTIAL((mp1)->b_wptr, len, 0);		\
+		/*							\
+		 * If the postpended extraneous data was odd		\
+		 * byte aligned, swap resulting checksum bytes.		\
+		 */							\
+		if ((uintptr_t)(mp1)->b_wptr & 1)			\
+			(adj) += ((_pad << 8) & 0xFFFF) | (_pad >> 8);	\
+		else							\
+			(adj) += _pad;					\
+		(adj) = ((adj) & 0xFFFF) + ((int)(adj) >> 16);		\
+	}								\
+}
+
+#define	ILL_MDT_CAPABLE(ill)		\
+	(((ill)->ill_capabilities & ILL_CAPAB_MDT) != 0)
+
+/*
+ * ioctl identifier and structure for Multidata Transmit update
+ * private M_CTL communication from IP to ULP.
+ */
+#define	MDT_IOC_INFO_UPDATE	(('M' << 8) + 1020)
+
+typedef struct ip_mdt_info_s {
+	uint_t	mdt_info_id;	/* MDT_IOC_INFO_UPDATE */
+	ill_mdt_capab_t	mdt_capab; /* ILL MDT capabilities */
+} ip_mdt_info_t;
+
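+/*
+ * Sketch of how a ULP read side might recognize the update (the
+ * surrounding dispatch logic is assumed):
+ *
+ *	ip_mdt_info_t *mdti = (ip_mdt_info_t *)mp->b_rptr;
+ *
+ *	if (DB_TYPE(mp) == M_CTL &&
+ *	    MBLKL(mp) >= sizeof (ip_mdt_info_t) &&
+ *	    mdti->mdt_info_id == MDT_IOC_INFO_UPDATE) {
+ *		... refresh cached capabilities from mdti->mdt_capab ...
+ *	}
+ */
+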
+/*
+ * Macro that determines whether or not a given ILL is allowed for MDT.
+ */
+#define	ILL_MDT_USABLE(ill)						\
+	(ILL_MDT_CAPABLE(ill) &&					\
+	ill->ill_mdt_capab != NULL &&					\
+	ill->ill_mdt_capab->ill_mdt_version == MDT_VERSION_2 &&		\
+	ill->ill_mdt_capab->ill_mdt_on != 0)
+
+/*
+ * Macro that determines whether or not a given CONN may be considered
+ * for fast path prior to proceeding further with Multidata.
+ */
+#define	CONN_IS_MD_FASTPATH(connp)	\
+	((connp)->conn_dontroute == 0 &&	/* SO_DONTROUTE */	\
+	(connp)->conn_nofailover_ill == NULL &&	/* IPIF_NOFAILOVER */	\
+	(connp)->conn_xmit_if_ill == NULL &&	/* IP_XMIT_IF */	\
+	(connp)->conn_outgoing_pill == NULL &&	/* IP{V6}_BOUND_PIF */	\
+	(connp)->conn_outgoing_ill == NULL)	/* IP{V6}_BOUND_IF */
+
+/* Definitions for fragmenting IP packets using MDT. */
+
+/*
+ * Smaller and private version of pdescinfo_t used specifically for IP,
+ * which allows for only a single payload span per packet.
+ */
+typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2)	ip_pdescinfo_t;
+
+/*
+ * Macro version of ip_can_frag_mdt() which avoids the function call if we
+ * only examine a single message block.
+ */
+#define	IP_CAN_FRAG_MDT(mp, hdr_len, len)			\
+	(((mp)->b_cont == NULL) ?				\
+	(MBLKL(mp) >= ((hdr_len) + ip_wput_frag_mdt_min)) :	\
+	ip_can_frag_mdt((mp), (hdr_len), (len)))
+
+/*
+ * Macro that determines whether or not a given IPC requires
+ * outbound IPSEC processing.
+ */
+#define	CONN_IPSEC_OUT_ENCAPSULATED(connp)	\
+	((connp)->conn_out_enforce_policy ||	\
+	((connp)->conn_latch != NULL &&		\
+	(connp)->conn_latch->ipl_out_policy != NULL))
+
+/*
+ * These are used by the synchronous streams code in tcp and udp.
+ */
+#define	STR_WAKEUP_CLEAR(stp) {						\
+	mutex_enter(&stp->sd_lock);					\
+	stp->sd_wakeq &= ~RSLEEP;					\
+	mutex_exit(&stp->sd_lock);					\
+}
+
+#define	STR_WAKEUP_SET(stp) {						\
+	mutex_enter(&stp->sd_lock);					\
+	if (stp->sd_flag & RSLEEP) {					\
+		stp->sd_flag &= ~RSLEEP;				\
+		cv_broadcast(&_RD(stp->sd_wrq)->q_wait);		\
+	} else {							\
+		stp->sd_wakeq |= RSLEEP;				\
+	}								\
+	mutex_exit(&stp->sd_lock);					\
+}
+
+#define	STR_SENDSIG(stp) {						\
+	int _events;							\
+	mutex_enter(&stp->sd_lock);					\
+	if ((_events = stp->sd_sigflags & (S_INPUT | S_RDNORM)) != 0)	\
+		strsendsig(stp->sd_siglist, _events, 0, 0);		\
+	if (stp->sd_rput_opt & SR_POLLIN) {				\
+		stp->sd_rput_opt &= ~SR_POLLIN;				\
+		mutex_exit(&stp->sd_lock);				\
+		pollwakeup(&stp->sd_pollist, POLLIN | POLLRDNORM);	\
+	} else {							\
+		mutex_exit(&stp->sd_lock);				\
+	}								\
+}
+
+#define	CONN_UDP_SYNCSTR(connp)						\
+	(IPCL_IS_UDP(connp) && (connp)->conn_udp->udp_direct_sockfs)
+
+/*
+ * Macro that checks whether or not a particular UDP conn is
+ * flow-controlling on the read-side.  If udp module is directly
+ * above ip, check to see if the drain queue is full; note here
+ * that we check this without any lock protection because this
+ * is a coarse granularity inbound flow-control.  If the module
+ * above ip is not udp, then use canputnext to determine the
+ * flow-control.
+ *
+ * Note that these checks are done after the conn is found in
+ * the UDP fanout table.  A UDP conn in that table may have its
+ * IPCL_UDP bit cleared from the conn_flags when the application
+ * pops the udp module without issuing an unbind; in this case
+ * IP will still receive packets for the conn and deliver them
+ * upstream via putnext.  This is the reason why we have to test
+ * against IPCL_UDP.
+ */
+#define	CONN_UDP_FLOWCTLD(connp)					\
+	((CONN_UDP_SYNCSTR(connp) &&					\
+	(connp)->conn_udp->udp_drain_qfull) ||				\
+	(!CONN_UDP_SYNCSTR(connp) && !canputnext((connp)->conn_rq)))
+
+/*
+ * Macro that delivers a given message upstream; if udp module
+ * is directly above ip, the message is passed directly into
+ * the stream-less entry point.  Otherwise putnext is used.
+ */
+#define	CONN_UDP_RECV(connp, mp) {					\
+	if (IPCL_IS_UDP(connp))						\
+		udp_conn_recv(connp, mp);				\
+	else								\
+		putnext((connp)->conn_rq, mp);				\
+}
+
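+/*
+ * The two macros above are meant to pair up in the fanout path;
+ * an illustrative sketch:
+ *
+ *	if (CONN_UDP_FLOWCTLD(connp))
+ *		freemsg(mp);		... and count the drop ...
+ *	else
+ *		CONN_UDP_RECV(connp, mp);
+ */
+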
+#define	ILL_POLL_CAPABLE(ill)	\
+	(((ill)->ill_capabilities & ILL_CAPAB_POLL) != 0)
+
+/*
+ * Macro that hands off one or more messages directly to DLD
+ * when the interface is marked with ILL_CAPAB_POLL.
+ */
+#define	IP_POLL_ILL_TX(ill, mp) {					\
+	ill_poll_capab_t *ill_poll = ill->ill_poll_capab;		\
+	ASSERT(ILL_POLL_CAPABLE(ill));					\
+	ASSERT(ill_poll != NULL);					\
+	ASSERT(ill_poll->ill_tx != NULL);				\
+	ASSERT(ill_poll->ill_tx_handle != NULL);			\
+	ill_poll->ill_tx(ill_poll->ill_tx_handle, mp);			\
+}
+
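+/*
+ * Illustrative sketch of the intended use (the else arm assumes the
+ * conventional STREAMS transmit path):
+ *
+ *	if (ILL_POLL_CAPABLE(ill))
+ *		IP_POLL_ILL_TX(ill, mp);
+ *	else
+ *		putnext(ill->ill_wq, mp);
+ */
+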
+extern int	ip_wput_frag_mdt_min;
+extern boolean_t ip_can_frag_mdt(mblk_t *, ssize_t, ssize_t);
+
+#endif	/* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _INET_IP_IMPL_H */
--- a/usr/src/uts/common/inet/ipclassifier.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ipclassifier.h	Sat Oct 22 22:50:14 2005 -0700
@@ -37,6 +37,7 @@
 #include <inet/ip.h>
 #include <inet/mi.h>
 #include <inet/tcp.h>
+#include <inet/udp_impl.h>
 #include <inet/ip6.h>
 #include <netinet/in.h>		/* for IPPROTO_* constants */
 #include <sys/sdt.h>
@@ -58,17 +59,19 @@
  */
 
 /* Conn Flags */
-#define	IPCL_BOUND		0x80000000	/* Conn in bind table */
-#define	IPCL_CONNECTED		0x40000000	/* Conn in connected table */
-#define	IPCL_TCP4		0x08000000	/* A TCP connection */
-#define	IPCL_TCP6		0x04000000	/* A TCP6 connection */
-#define	IPCL_EAGER		0x01000000	/* Incoming connection */
+#define	IPCL_UDPMOD		0x00020000	/* Is UDP module instance */
+#define	IPCL_TCPMOD		0x00040000	/* Is TCP module instance */
+#define	IPCL_FULLY_BOUND	0x00080000	/* Bound to correct squeue */
+#define	IPCL_CHECK_POLICY	0x00100000	/* Needs policy checking */
+#define	IPCL_SOCKET		0x00200000	/* Sockfs connection */
+#define	IPCL_ACCEPTOR		0x00400000	/* Sockfs priv acceptor */
 #define	IPCL_CL_LISTENER	0x00800000	/* Cluster listener */
-#define	IPCL_ACCEPTOR		0x00400000	/* Sockfs priv acceptor */
-#define	IPCL_SOCKET		0x00200000	/* Sockfs connection */
-#define	IPCL_CHECK_POLICY	0x00100000	/* Needs policy checking */
-#define	IPCL_FULLY_BOUND	0x00080000	/* Bound to correct squeue */
-#define	IPCL_TCPMOD		0x00040000	/* Is tcp module instance */
+#define	IPCL_EAGER		0x01000000	/* Incoming connection */
+#define	IPCL_UDP		0x02000000	/* A UDP connection */
+#define	IPCL_TCP6		0x04000000	/* A TCP6 connection */
+#define	IPCL_TCP4		0x08000000	/* A TCP connection */
+#define	IPCL_CONNECTED		0x40000000	/* Conn in connected table */
+#define	IPCL_BOUND		0x80000000	/* Conn in bind table */
 
 /* Flags identifying the type of conn */
 #define	IPCL_TCPCONN		0x00000001	/* Flag to indicate cache */
@@ -81,8 +84,6 @@
 #define	IPCL_REMOVED		0x00000020
 #define	IPCL_REUSED		0x00000040
 
-#define	IS_TCP_CONN(connp)	(((connp)->conn_flags & IPCL_TCP) != 0)
-
 #define	IPCL_IS_TCP4(connp)						\
 	(((connp)->conn_flags & IPCL_TCP4))
 
@@ -108,6 +109,13 @@
 #define	IPCL_IS_TCP(connp)						\
 	((connp)->conn_flags & (IPCL_TCP4|IPCL_TCP6))
 
+/*
+ * IPCL_UDP is set on the conn when udp is directly above ip;
+ * this flag is cleared the moment udp is popped.
+ */
+#define	IPCL_IS_UDP(connp)						\
+	((connp)->conn_flags & IPCL_UDP)
+
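+/*
+ * For example, CONN_UDP_RECV in <inet/ip_impl.h> keys off this flag
+ * to choose between the direct udp entry point and putnext:
+ *
+ *	if (IPCL_IS_UDP(connp))
+ *		udp_conn_recv(connp, mp);
+ *	else
+ *		putnext((connp)->conn_rq, mp);
+ */
+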
 #define	IPCL_IS_IPTUN(connp)						\
 	((connp)->conn_ulp == IPPROTO_ENCAP || \
 	(connp)->conn_ulp == IPPROTO_IPV6)
@@ -169,6 +177,8 @@
 		pad_to_bit_31 : 2;
 
 	tcp_t		*conn_tcp;		/* Pointer to the tcp struct */
+	udp_t		*conn_udp;		/* Pointer to the udp struct */
+
 	squeue_t	*conn_sqp;		/* Squeue for processing */
 	edesc_rpf	conn_recv;		/* Pointer to recv routine */
 	void		*conn_pad1;
@@ -483,6 +493,7 @@
 		    uint32_t);
 extern int	ipcl_conn_insert_v6(conn_t *, uint8_t, const in6_addr_t *,
 		    const in6_addr_t *, uint32_t, uint_t);
+extern conn_t	*ipcl_get_next_conn(connf_t *, conn_t *, uint32_t);
 
 void ipcl_proto_insert(conn_t *, uint8_t);
 void ipcl_proto_insert_v6(conn_t *, uint8_t);
--- a/usr/src/uts/common/inet/ipp_common.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ipp_common.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2002, 2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -52,7 +52,7 @@
 
 /* Apply IPQoS policies for inbound traffic? */
 #define	IP6_IN_IPP(flags) (IPP_ENABLED(IPP_LOCAL_IN) &&	\
-	(!((flags) & (IP6_NO_IPPOLICY|IP6_IN_NOCKSUM))))
+	(!((flags) & IP6_NO_IPPOLICY)))
 
 /* Apply IPQoS policies for outbound traffic? */
 #define	IP6_OUT_IPP(flags)	\
--- a/usr/src/uts/common/inet/led.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/led.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -44,12 +44,12 @@
 #include <sys/types.h>
 
 /*
- * Intel x86 can handle unaligned access. However, the checksum routine
+ * x86 can handle unaligned access. However, the checksum routine
  * assumes that the source is 16 bit aligned so we always make sure
  * that packet headers are 16 bit aligned.
  */
 #define	OK_16PTR(p)	(!((uintptr_t)(p) & 0x1))
-#if defined(__i386)
+#if defined(__x86)
 #define	OK_32PTR(p)	OK_16PTR(p)
 #else
 #define	OK_32PTR(p)	(!((uintptr_t)(p) & 0x3))
--- a/usr/src/uts/common/inet/optcom.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/optcom.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -82,8 +82,6 @@
 static t_uscalar_t optcom_max_optbuf_len(opdes_t *, uint_t);
 static boolean_t opt_bloated_maxsize(opdes_t *);
 
-extern optdb_obj_t tcp_opt_obj;
-
 /* Common code for sending back a T_ERROR_ACK. */
 void
 optcom_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
@@ -220,9 +218,12 @@
 	opdes_t	*optd;
 	boolean_t	pass_to_next = B_FALSE;
 	boolean_t	pass_to_ip = B_FALSE;
+	boolean_t	is_tcp;
 	struct T_optmgmt_ack *toa;
 	struct T_optmgmt_req *tor;
 
+	is_tcp = (dbobjp == &tcp_opt_obj);
+
 	/*
 	 * Allocate M_CTL and prepend to the packet for restarting this
 	 * option if needed. IP may need to queue and restart the option
@@ -550,14 +551,14 @@
 				opt1->len = opt->len;
 				bcopy(&opt[1], &opt1[1], opt->len);
 				/*
-				 * Pass the option down to IP only if
-				 * TCP hasn't processed it.
+				 * Pass the option down to IP only
+				 * if TCP hasn't processed it.
 				 */
-				if (dbobjp == &tcp_opt_obj)
+				if (is_tcp)
 					pass_to_ip = B_TRUE;
+			} else {
+				opt1->len = (t_uscalar_t)len;
 			}
-			else
-				opt1->len = (t_uscalar_t)len;
 			opt1 = (struct opthdr *)((uchar_t *)&opt1[1] +
 			    _TPI_ALIGN_OPT(opt1->len));
 		} /* end for loop */
@@ -639,10 +640,10 @@
 				optcom_err_ack(q, mp, TSYSERR, error);
 				freeb(first_mp);
 				return (0);
-			} else if (error < 0 && dbobjp == &tcp_opt_obj) {
+			} else if (error < 0 && is_tcp) {
 				/*
-				 * Pass the option down to IP only if
-				 * TCP hasn't processed it.
+				 * Pass the option down to IP only
+				 * if TCP hasn't processed it.
 				 */
 				pass_to_ip = B_TRUE;
 			}
--- a/usr/src/uts/common/inet/optcom.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/optcom.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -205,6 +205,18 @@
 #define	SETFN_CONN_NEGOTIATE		4 /* semantics for T_CONN_*_REQ */
 
 /*
+ * Object to represent database of options to search passed to
+ * {sock,tpi}optcom_req() interface routine to take care of option
+ * management and associated methods.
+ */
+extern optdb_obj_t tcp_opt_obj;
+extern optdb_obj_t udp_opt_obj;
+extern optdb_obj_t ip_opt_obj;
+
+extern uint_t	tcp_max_optsize;
+extern uint_t	udp_max_optsize;
+
+/*
  * Function prototypes
  */
 extern void optcom_err_ack(queue_t *, mblk_t *, t_scalar_t, int);
--- a/usr/src/uts/common/inet/snmpcom.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/snmpcom.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 1992,1997-2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -51,6 +51,11 @@
 #include <inet/optcom.h>
 #include <inet/snmpcom.h>
 
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <inet/tcp.h>
+#include <inet/udp_impl.h>
+
 #define	DEFAULT_LENGTH	sizeof (long)
 #define	DATA_MBLK_SIZE	1024
 #define	TOAHDR_SIZE	(sizeof (struct T_optmgmt_ack) +\
@@ -90,10 +95,7 @@
  * ctl buffer.
  */
 int
-snmp_append_data(mpdata, blob, len)
-	mblk_t	*mpdata;
-	char	*blob;
-	int	len;
+snmp_append_data(mblk_t *mpdata, char *blob, int len)
 {
 
 	if (!mpdata)
@@ -169,12 +171,7 @@
  *   for them: getfn() returns 0, setfn() returns 1.
  */
 boolean_t
-snmpcom_req(q, mp, setfn, getfn, credp)
-	queue_t	*q;
-	mblk_t	*mp;
-	pfi_t	setfn;
-	pfi_t	getfn;
-	cred_t	*credp;
+snmpcom_req(queue_t *q, mblk_t *mp, pfi_t setfn, pfi_t getfn, cred_t *credp)
 {
 	mblk_t			*mpctl;
 	struct opthdr		*req;
@@ -184,6 +181,7 @@
 	sor_t			*sreq;
 	struct T_optmgmt_req	*tor = (struct T_optmgmt_req *)mp->b_rptr;
 	struct T_optmgmt_ack	*toa;
+	boolean_t		pass_to_ip = B_FALSE;
 
 	if (mp->b_cont) {	/* don't deal with multiple mblk's */
 		freemsg(mp->b_cont);
@@ -209,6 +207,10 @@
 			req_start->level <= EXPER_RANGE_END)))
 		return (B_FALSE);
 
+	if (setfn == tcp_snmp_set || setfn == udp_snmp_set ||
+	    getfn == tcp_snmp_get || getfn == udp_snmp_get)
+		pass_to_ip = B_TRUE;
+
 	switch (tor->MGMT_flags) {
 
 	case T_NEGOTIATE:
@@ -235,8 +237,10 @@
 				(uchar_t *)&req[1], req->len))
 				goto bad_req4;
 		}
-		if (q->q_next)
+		if (q->q_next != NULL)
 			putnext(q, mp);
+		else if (pass_to_ip)
+			ip_output(Q_TO_CONN(q), mp, q, IP_WPUT);
 		else
 			freemsg(mp);
 		return (B_TRUE);
@@ -268,9 +272,12 @@
 		 * this is bottom module of stream, send up an EOD ctl msg,
 		 * otherwise pass onto the next guy for processing.
 		 */
-		if (q->q_next) {
+		if (q->q_next != NULL) {
 			putnext(q, mp);
 			return (B_TRUE);
+		} else if (pass_to_ip) {
+			ip_output(Q_TO_CONN(q), mp, q, IP_WPUT);
+			return (B_TRUE);
 		}
 		if (mp->b_cont) {
 			freemsg(mp->b_cont);
--- a/usr/src/uts/common/inet/squeue.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/squeue.c	Sat Oct 22 22:50:14 2005 -0700
@@ -729,7 +729,8 @@
 #endif
 #if SQUEUE_DEBUG
 	conn_t 	*connp = (conn_t *)arg;
-	ASSERT(connp->conn_tcp->tcp_connp == connp);
+	ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp);
+	ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp);
 #endif
 
 	ASSERT(proc != NULL);
@@ -954,9 +955,10 @@
 	ASSERT(sqp != NULL);
 	ASSERT(mp != NULL);
 	ASSERT(mp->b_next == NULL);
-	ASSERT(connp->conn_tcp->tcp_connp == connp);
+	ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp);
+	ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp);
+	ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
 
-	ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
 	mutex_enter(&sqp->sq_lock);
 
 	being_processed = (sqp->sq_state & SQS_PROC);
@@ -1100,7 +1102,8 @@
 	ASSERT(sqp != NULL);
 	ASSERT(mp != NULL);
 	ASSERT(mp->b_next == NULL);
-	ASSERT(connp->conn_tcp->tcp_connp == connp);
+	ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp);
+	ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp);
 
 	ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
 	mutex_enter(&sqp->sq_lock);
--- a/usr/src/uts/common/inet/tcp.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/tcp.h	Sat Oct 22 22:50:14 2005 -0700
@@ -286,11 +286,8 @@
 		tcp_accept_error : 1,	/* Error during TLI accept */
 
 		tcp_send_discon_ind : 1, /* TLI accept err, send discon ind */
-		tcp_fused : 1,		/* loopback tcp in fusion mode */
-		tcp_unfusable : 1,	/* fusion not allowed on endpoint */
-		tcp_fused_sigurg : 1,	/* send SIGURG upon draining */
 		tcp_cork : 1,		/* tcp_cork option */
-		tcp_pad_to_bit_31 : 15;
+		tcp_pad_to_bit_31 : 18;
 
 	uint32_t	tcp_if_mtu;	/* Outgoing interface MTU. */
 
@@ -514,10 +511,29 @@
 #define	tcp_ipp_use_min_mtu	tcp_sticky_ipp.ipp_use_min_mtu
 	struct tcp_s *tcp_saved_listener;	/* saved value of listener */
 
+	uint32_t	tcp_in_ack_unsent;	/* ACK for unsent data cnt. */
+
+	/*
+	 * The following fusion-related fields are protected by squeue.
+	 */
 	struct tcp_s *tcp_loopback_peer;	/* peer tcp for loopback */
 	mblk_t	*tcp_fused_sigurg_mp;		/* M_PCSIG mblk for SIGURG */
+	size_t	tcp_fuse_rcv_hiwater;		/* fusion receive queue size */
+	uint_t	tcp_fuse_rcv_unread_hiwater;	/* max # of outstanding pkts */
+	/*
+	 * The following fusion-related fields and bit fields are to be
+	 * manipulated with squeue protection or with tcp_fuse_lock held.
+	 */
+	kmutex_t tcp_fuse_lock;
+	uint_t tcp_fuse_rcv_unread_cnt;	/* # of outstanding pkts */
+	uint32_t
+		tcp_fused : 1,		/* loopback tcp in fusion mode */
+		tcp_unfusable : 1,	/* fusion not allowed on endpoint */
+		tcp_fused_sigurg : 1,	/* send SIGURG upon draining */
+		tcp_direct_sockfs : 1,	/* direct calls to sockfs */
 
-	uint32_t	tcp_in_ack_unsent;	/* ACK for unsent data cnt. */
+		tcp_fuse_syncstr_stopped : 1, /* synchronous streams stopped */
+		tcp_fuse_to_bit_31 : 27;
 
 	/*
 	 * This variable is accessed without any lock protection
@@ -525,6 +541,8 @@
 	 * with the rest which require such condition.
 	 */
 	boolean_t	tcp_issocket;	/* this is a socket tcp */
+
+	uint32_t	tcp_squeue_bytes;
 } tcp_t;
 
 extern void 	tcp_free(tcp_t *tcp);
@@ -537,7 +555,8 @@
 extern void	tcp_rput_data(void *arg, mblk_t *mp, void *arg2);
 extern void 	*tcp_get_conn(void *arg);
 extern void	tcp_time_wait_collector(void *arg);
-
+extern int	tcp_snmp_get(queue_t *, mblk_t *);
+extern int	tcp_snmp_set(queue_t *, int, int, uchar_t *, int len);
 /*
  * The TCP Fanout structure.
  * The hash tables and their linkage (tcp_*_hash_next, tcp_ptp*hn) are
@@ -610,18 +629,6 @@
 #pragma pack()
 #endif
 
-/* Named Dispatch Parameter Management Structure */
-typedef struct tcpparam_s {
-	uint32_t	tcp_param_min;
-	uint32_t	tcp_param_max;
-	uint32_t	tcp_param_val;
-	char		*tcp_param_name;
-} tcpparam_t;
-
-extern tcpparam_t	tcp_param_arr[];
-
-extern boolean_t	do_tcp_fusion;
-
 #if (defined(_KERNEL) || defined(_KMEMUSER))
 extern void tcp_rput_other(tcp_t *tcp, mblk_t *mp);
 #endif
--- a/usr/src/uts/common/inet/tcp/tcp.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/tcp/tcp.c	Sat Oct 22 22:50:14 2005 -0700
@@ -73,6 +73,7 @@
 
 #include <inet/common.h>
 #include <inet/ip.h>
+#include <inet/ip_impl.h>
 #include <inet/ip6.h>
 #include <inet/ip_ndp.h>
 #include <inet/mi.h>
@@ -82,6 +83,7 @@
 #include <inet/snmpcom.h>
 #include <inet/kstatcom.h>
 #include <inet/tcp.h>
+#include <inet/tcp_impl.h>
 #include <net/pfkeyv2.h>
 #include <inet/ipsec_info.h>
 #include <inet/ipdrop.h>
@@ -230,8 +232,6 @@
 squeue_func_t tcp_squeue_close_proc;
 squeue_func_t tcp_squeue_wput_proc;
 
-extern vmem_t *ip_minor_arena;
-
 /*
  * This controls how tiny a write must be before we try to copy it
  * into the the mblk on the tail of the transmit queue.  Not much
@@ -278,9 +278,6 @@
  * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
  */
 
-#define	TCP_COUNTERS 1
-#define	TCP_CLD_COUNTERS 0
-
 #ifndef TCP_DEBUG_COUNTER
 #ifdef DEBUG
 #define	TCP_DEBUG_COUNTER 1
@@ -289,6 +286,7 @@
 #endif
 #endif
 
+#define	TCP_CLD_COUNTERS 0
 
 #define	TCP_TAG_CLEAN_DEATH 1
 #define	TCP_MAX_CLEAN_DEATH_TAG 32
@@ -297,20 +295,6 @@
 static int _lint_dummy_;
 #endif
 
-#if TCP_COUNTERS
-#define	TCP_STAT(x)		(tcp_statistics.x.value.ui64++)
-#define	TCP_STAT_UPDATE(x, n)	(tcp_statistics.x.value.ui64 += (n))
-#define	TCP_STAT_SET(x, n)	(tcp_statistics.x.value.ui64 = (n))
-#elif defined(lint)
-#define	TCP_STAT(x)		ASSERT(_lint_dummy_ == 0);
-#define	TCP_STAT_UPDATE(x, n)	ASSERT(_lint_dummy_ == 0);
-#define	TCP_STAT_SET(x, n)	ASSERT(_lint_dummy_ == 0);
-#else
-#define	TCP_STAT(x)
-#define	TCP_STAT_UPDATE(x, n)
-#define	TCP_STAT_SET(x, n)
-#endif
-
 #if TCP_CLD_COUNTERS
 static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
 #define	TCP_CLD_STAT(x) tcp_clean_death_stat[x]++
@@ -328,96 +312,7 @@
 #define	TCP_DBGSTAT(x)
 #endif
 
-typedef struct tcp_stat {
-	kstat_named_t	tcp_time_wait;
-	kstat_named_t	tcp_time_wait_syn;
-	kstat_named_t	tcp_time_wait_syn_success;
-	kstat_named_t	tcp_time_wait_syn_fail;
-	kstat_named_t	tcp_reinput_syn;
-	kstat_named_t	tcp_ip_output;
-	kstat_named_t	tcp_detach_non_time_wait;
-	kstat_named_t	tcp_detach_time_wait;
-	kstat_named_t	tcp_time_wait_reap;
-	kstat_named_t	tcp_clean_death_nondetached;
-	kstat_named_t	tcp_reinit_calls;
-	kstat_named_t	tcp_eager_err1;
-	kstat_named_t	tcp_eager_err2;
-	kstat_named_t	tcp_eager_blowoff_calls;
-	kstat_named_t	tcp_eager_blowoff_q;
-	kstat_named_t	tcp_eager_blowoff_q0;
-	kstat_named_t	tcp_not_hard_bound;
-	kstat_named_t	tcp_no_listener;
-	kstat_named_t	tcp_found_eager;
-	kstat_named_t	tcp_wrong_queue;
-	kstat_named_t	tcp_found_eager_binding1;
-	kstat_named_t	tcp_found_eager_bound1;
-	kstat_named_t	tcp_eager_has_listener1;
-	kstat_named_t	tcp_open_alloc;
-	kstat_named_t	tcp_open_detached_alloc;
-	kstat_named_t	tcp_rput_time_wait;
-	kstat_named_t	tcp_listendrop;
-	kstat_named_t	tcp_listendropq0;
-	kstat_named_t	tcp_wrong_rq;
-	kstat_named_t	tcp_rsrv_calls;
-	kstat_named_t	tcp_eagerfree2;
-	kstat_named_t	tcp_eagerfree3;
-	kstat_named_t	tcp_eagerfree4;
-	kstat_named_t	tcp_eagerfree5;
-	kstat_named_t	tcp_timewait_syn_fail;
-	kstat_named_t	tcp_listen_badflags;
-	kstat_named_t	tcp_timeout_calls;
-	kstat_named_t	tcp_timeout_cached_alloc;
-	kstat_named_t	tcp_timeout_cancel_reqs;
-	kstat_named_t	tcp_timeout_canceled;
-	kstat_named_t	tcp_timermp_alloced;
-	kstat_named_t	tcp_timermp_freed;
-	kstat_named_t	tcp_timermp_allocfail;
-	kstat_named_t	tcp_timermp_allocdblfail;
-	kstat_named_t	tcp_push_timer_cnt;
-	kstat_named_t	tcp_ack_timer_cnt;
-	kstat_named_t	tcp_ire_null1;
-	kstat_named_t	tcp_ire_null;
-	kstat_named_t	tcp_ip_send;
-	kstat_named_t	tcp_ip_ire_send;
-	kstat_named_t   tcp_wsrv_called;
-	kstat_named_t   tcp_flwctl_on;
-	kstat_named_t	tcp_timer_fire_early;
-	kstat_named_t	tcp_timer_fire_miss;
-	kstat_named_t	tcp_freelist_cleanup;
-	kstat_named_t	tcp_rput_v6_error;
-	kstat_named_t	tcp_out_sw_cksum;
-	kstat_named_t	tcp_zcopy_on;
-	kstat_named_t	tcp_zcopy_off;
-	kstat_named_t	tcp_zcopy_backoff;
-	kstat_named_t	tcp_zcopy_disable;
-	kstat_named_t	tcp_mdt_pkt_out;
-	kstat_named_t	tcp_mdt_pkt_out_v4;
-	kstat_named_t	tcp_mdt_pkt_out_v6;
-	kstat_named_t	tcp_mdt_discarded;
-	kstat_named_t	tcp_mdt_conn_halted1;
-	kstat_named_t	tcp_mdt_conn_halted2;
-	kstat_named_t	tcp_mdt_conn_halted3;
-	kstat_named_t	tcp_mdt_conn_resumed1;
-	kstat_named_t	tcp_mdt_conn_resumed2;
-	kstat_named_t	tcp_mdt_legacy_small;
-	kstat_named_t	tcp_mdt_legacy_all;
-	kstat_named_t	tcp_mdt_legacy_ret;
-	kstat_named_t	tcp_mdt_allocfail;
-	kstat_named_t	tcp_mdt_addpdescfail;
-	kstat_named_t	tcp_mdt_allocd;
-	kstat_named_t	tcp_mdt_linked;
-	kstat_named_t	tcp_fusion_flowctl;
-	kstat_named_t	tcp_fusion_backenabled;
-	kstat_named_t	tcp_fusion_urg;
-	kstat_named_t	tcp_fusion_putnext;
-	kstat_named_t	tcp_fusion_unfusable;
-	kstat_named_t	tcp_fusion_aborted;
-	kstat_named_t	tcp_fusion_unqualified;
-	kstat_named_t	tcp_in_ack_unsent_drop;
-} tcp_stat_t;
-
-#if (TCP_COUNTERS || TCP_DEBUG_COUNTER)
-static tcp_stat_t tcp_statistics = {
+tcp_stat_t tcp_statistics = {
 	{ "tcp_time_wait",		KSTAT_DATA_UINT64 },
 	{ "tcp_time_wait_syn",		KSTAT_DATA_UINT64 },
 	{ "tcp_time_wait_success",	KSTAT_DATA_UINT64 },
@@ -475,6 +370,7 @@
 	{ "tcp_freelist_cleanup",	KSTAT_DATA_UINT64 },
 	{ "tcp_rput_v6_error",		KSTAT_DATA_UINT64 },
 	{ "tcp_out_sw_cksum",		KSTAT_DATA_UINT64 },
+	{ "tcp_out_sw_cksum_bytes",	KSTAT_DATA_UINT64 },
 	{ "tcp_zcopy_on",		KSTAT_DATA_UINT64 },
 	{ "tcp_zcopy_off",		KSTAT_DATA_UINT64 },
 	{ "tcp_zcopy_backoff",		KSTAT_DATA_UINT64 },
@@ -502,13 +398,14 @@
 	{ "tcp_fusion_unfusable",	KSTAT_DATA_UINT64 },
 	{ "tcp_fusion_aborted",		KSTAT_DATA_UINT64 },
 	{ "tcp_fusion_unqualified",	KSTAT_DATA_UINT64 },
+	{ "tcp_fusion_rrw_busy",	KSTAT_DATA_UINT64 },
+	{ "tcp_fusion_rrw_msgcnt",	KSTAT_DATA_UINT64 },
 	{ "tcp_in_ack_unsent_drop",	KSTAT_DATA_UINT64 },
+	{ "tcp_sock_fallback",		KSTAT_DATA_UINT64 },
 };
 
 static kstat_t *tcp_kstat;
 
-#endif
-
 /*
  * Call either ip_output or ip_output_v6. This replaces putnext() calls on the
  * tcp write side.
@@ -519,12 +416,6 @@
 	connp->conn_send(connp, (mp), (q), IP_WPUT);			\
 }
 
-/*
- * Was this tcp created via socket() interface?
- */
-#define	TCP_IS_SOCKET(tcp) ((tcp)->tcp_issocket)
-
-
 /* Macros for timestamp comparisons */
 #define	TSTMP_GEQ(a, b)	((int32_t)((a)-(b)) >= 0)
 #define	TSTMP_LT(a, b)	((int32_t)((a)-(b)) < 0)
@@ -569,8 +460,6 @@
  */
 #define	TCP_OLD_URP_INTERPRETATION	1
 
-#define	TCP_IS_DETACHED(tcp)		((tcp)->tcp_detached)
-
 #define	TCP_IS_DETACHED_NONEAGER(tcp)	\
 	(TCP_IS_DETACHED(tcp) && \
 	    (!(tcp)->tcp_hard_binding))
@@ -687,22 +576,6 @@
 kmem_cache_t	*tcp_sack_info_cache;
 kmem_cache_t	*tcp_iphc_cache;
 
-#define	TCP_TIMER(tcp, f, tim) tcp_timeout(tcp->tcp_connp, f, tim)
-#define	TCP_TIMER_CANCEL(tcp, id) tcp_timeout_cancel(tcp->tcp_connp, id)
-
-/*
- * To restart the TCP retransmission timer.
- */
-#define	TCP_TIMER_RESTART(tcp, intvl) \
-{ \
-	if ((tcp)->tcp_timer_tid != 0) { \
-		(void) TCP_TIMER_CANCEL((tcp),	\
-					(tcp)->tcp_timer_tid); \
-	} \
-	(tcp)->tcp_timer_tid = TCP_TIMER((tcp), tcp_timer, \
-	    MSEC_TO_TICK(intvl)); \
-}
-
 /*
  * For scalability, we must not run a timer for every TCP connection
  * in TIME_WAIT state.  To see why, consider (for time wait interval of
@@ -951,7 +824,6 @@
 static mblk_t	*tcp_ire_mp(mblk_t *mp);
 static void	tcp_iss_init(tcp_t *tcp);
 static void	tcp_keepalive_killer(void *arg);
-static int	tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk);
 static int	tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt);
 static void	tcp_mss_set(tcp_t *tcp, uint32_t size);
 static int	tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
@@ -985,7 +857,6 @@
 		    tcp_t *thisstream, cred_t *cr);
 
 static uint_t	tcp_rcv_drain(queue_t *q, tcp_t *tcp);
-static void	tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len);
 static void	tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
 static boolean_t tcp_send_rst_chk(void);
 static void	tcp_ss_rexmit(tcp_t *tcp);
@@ -994,9 +865,6 @@
 static void	tcp_rput_common(tcp_t *tcp, mblk_t *mp);
 static void	tcp_rsrv(queue_t *q);
 static int	tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd);
-static int	tcp_snmp_get(queue_t *q, mblk_t *mpctl);
-static int	tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr,
-		    int len);
 static int	tcp_snmp_state(tcp_t *tcp);
 static int	tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
 		    cred_t *cr);
@@ -1018,7 +886,6 @@
 static void	tcp_timer_callback(void *);
 static in_port_t tcp_update_next_port(in_port_t port, boolean_t random);
 static in_port_t tcp_get_next_priv_port(void);
-static void	tcp_wput(queue_t *q, mblk_t *mp);
 static void	tcp_wput_sock(queue_t *q, mblk_t *mp);
 void		tcp_wput_accept(queue_t *q, mblk_t *mp);
 static void	tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
@@ -1044,7 +911,6 @@
 		    boolean_t sendall, uint32_t *seg_len, boolean_t rexmit);
 static void	tcp_ack_timer(void *arg);
 static mblk_t	*tcp_ack_mp(tcp_t *tcp);
-static void	tcp_push_timer(void *arg);
 static void	tcp_xmit_early_reset(char *str, mblk_t *mp,
 		    uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len);
 static void	tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
@@ -1076,9 +942,6 @@
 boolean_t	tcp_reserved_port_check(in_port_t);
 static tcp_t	*tcp_alloc_temp_tcp(in_port_t);
 static int	tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *);
-static void	tcp_timers_stop(tcp_t *);
-static timeout_id_t tcp_timeout(conn_t *, void (*)(void *), clock_t);
-static clock_t	tcp_timeout_cancel(conn_t *, timeout_id_t);
 static mblk_t	*tcp_mdt_info_mp(mblk_t *);
 static void	tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t);
 static int	tcp_mdt_add_attrs(multidata_t *, const mblk_t *,
@@ -1098,7 +961,6 @@
 static void	tcp_kstat_fini(void);
 static int	tcp_kstat_update(kstat_t *kp, int rw);
 void		tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp);
-conn_t		*tcp_get_next_conn(connf_t *, conn_t *);
 static int	tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
 			tcph_t *tcph, uint_t ipvers, mblk_t *idmp);
 static int	tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
@@ -1118,14 +980,6 @@
 static mblk_t	*tcp_zcopy_backoff(tcp_t *, mblk_t *, int);
 static void	tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t);
 
-static void	tcp_fuse(tcp_t *, uchar_t *, tcph_t *);
-static void	tcp_unfuse(tcp_t *);
-static boolean_t tcp_fuse_output(tcp_t *, mblk_t *);
-static void	tcp_fuse_output_urg(tcp_t *, mblk_t *);
-static boolean_t tcp_fuse_rcv_drain(queue_t *, tcp_t *, mblk_t **);
-
-extern mblk_t	*allocb_tryhard(size_t);
-
 /*
  * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
  *
@@ -1155,17 +1009,12 @@
 static int	tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
     boolean_t);
 
-
-static void	tcp_clrqfull(tcp_t *);
-static void	tcp_setqfull(tcp_t *);
-
 static struct module_info tcp_rinfo =  {
-#define	TCP_MODULE_ID	5105
-	TCP_MODULE_ID, "tcp", 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
+	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
 };
 
 static struct module_info tcp_winfo =  {
-	TCP_MODULE_ID, "tcp", 0, INFPSZ, 127, 16
+	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
 };
 
 /*
@@ -1173,11 +1022,12 @@
  * to pass through.
  */
 struct qinit tcp_mod_rinit = {
-	(pfi_t)putnext, NULL, tcp_open, tcp_modclose, NULL, &tcp_rinfo
+	(pfi_t)putnext, NULL, tcp_open, ip_snmpmod_close, NULL, &tcp_rinfo,
 };
 
 struct qinit tcp_mod_winit = {
-	(pfi_t)tcp_wput_mod, NULL, tcp_open, tcp_modclose, NULL, &tcp_rinfo
+	(pfi_t)ip_snmpmod_wput, NULL, tcp_open, ip_snmpmod_close, NULL,
+	&tcp_rinfo
 };
 
 /*
@@ -1210,11 +1060,18 @@
 	(pfi_t)tcp_wput_accept, NULL, NULL, NULL, NULL, &tcp_winfo
 };
 
+/*
+ * Entry points for TCP loopback (read side only)
+ */
+struct qinit tcp_loopback_rinit = {
+	(pfi_t)0, (pfi_t)tcp_rsrv, tcp_open, tcp_close, (pfi_t)0,
+	&tcp_rinfo, NULL, tcp_fuse_rrw, tcp_fuse_rinfop, STRUIOT_STANDARD
+};
+
 struct streamtab tcpinfo = {
 	&tcp_rinit, &tcp_winit
 };
 
-
 extern squeue_func_t tcp_squeue_wput_proc;
 extern squeue_func_t tcp_squeue_timer_proc;
 
@@ -1306,15 +1163,6 @@
 mib2_tcp_t	tcp_mib;	/* SNMP fixed size info */
 kstat_t		*tcp_mibkp;	/* kstat exporting tcp_mib data */
 
-/*
- * Object to represent database of options to search passed to
- * {sock,tpi}optcom_req() interface routine to take care of option
- * management and associated methods.
- * XXX These and other externs should ideally move to a TCP header
- */
-extern optdb_obj_t	tcp_opt_obj;
-extern uint_t		tcp_max_optsize;
-
 boolean_t tcp_icmp_source_quench = B_FALSE;
 /*
  * Following assumes TPI alignment requirements stay along 32 bit
@@ -1454,76 +1302,6 @@
 };
 /* END CSTYLED */
 
-
-#define	tcp_time_wait_interval			tcp_param_arr[0].tcp_param_val
-#define	tcp_conn_req_max_q			tcp_param_arr[1].tcp_param_val
-#define	tcp_conn_req_max_q0			tcp_param_arr[2].tcp_param_val
-#define	tcp_conn_req_min			tcp_param_arr[3].tcp_param_val
-#define	tcp_conn_grace_period			tcp_param_arr[4].tcp_param_val
-#define	tcp_cwnd_max_				tcp_param_arr[5].tcp_param_val
-#define	tcp_dbg					tcp_param_arr[6].tcp_param_val
-#define	tcp_smallest_nonpriv_port		tcp_param_arr[7].tcp_param_val
-#define	tcp_ip_abort_cinterval			tcp_param_arr[8].tcp_param_val
-#define	tcp_ip_abort_linterval			tcp_param_arr[9].tcp_param_val
-#define	tcp_ip_abort_interval			tcp_param_arr[10].tcp_param_val
-#define	tcp_ip_notify_cinterval			tcp_param_arr[11].tcp_param_val
-#define	tcp_ip_notify_interval			tcp_param_arr[12].tcp_param_val
-#define	tcp_ipv4_ttl				tcp_param_arr[13].tcp_param_val
-#define	tcp_keepalive_interval_high		tcp_param_arr[14].tcp_param_max
-#define	tcp_keepalive_interval			tcp_param_arr[14].tcp_param_val
-#define	tcp_keepalive_interval_low		tcp_param_arr[14].tcp_param_min
-#define	tcp_maxpsz_multiplier			tcp_param_arr[15].tcp_param_val
-#define	tcp_mss_def_ipv4			tcp_param_arr[16].tcp_param_val
-#define	tcp_mss_max_ipv4			tcp_param_arr[17].tcp_param_val
-#define	tcp_mss_min				tcp_param_arr[18].tcp_param_val
-#define	tcp_naglim_def				tcp_param_arr[19].tcp_param_val
-#define	tcp_rexmit_interval_initial		tcp_param_arr[20].tcp_param_val
-#define	tcp_rexmit_interval_max			tcp_param_arr[21].tcp_param_val
-#define	tcp_rexmit_interval_min			tcp_param_arr[22].tcp_param_val
-#define	tcp_deferred_ack_interval		tcp_param_arr[23].tcp_param_val
-#define	tcp_snd_lowat_fraction			tcp_param_arr[24].tcp_param_val
-#define	tcp_sth_rcv_hiwat			tcp_param_arr[25].tcp_param_val
-#define	tcp_sth_rcv_lowat			tcp_param_arr[26].tcp_param_val
-#define	tcp_dupack_fast_retransmit		tcp_param_arr[27].tcp_param_val
-#define	tcp_ignore_path_mtu			tcp_param_arr[28].tcp_param_val
-#define	tcp_smallest_anon_port			tcp_param_arr[29].tcp_param_val
-#define	tcp_largest_anon_port			tcp_param_arr[30].tcp_param_val
-#define	tcp_xmit_hiwat				tcp_param_arr[31].tcp_param_val
-#define	tcp_xmit_lowat				tcp_param_arr[32].tcp_param_val
-#define	tcp_recv_hiwat				tcp_param_arr[33].tcp_param_val
-#define	tcp_recv_hiwat_minmss			tcp_param_arr[34].tcp_param_val
-#define	tcp_fin_wait_2_flush_interval		tcp_param_arr[35].tcp_param_val
-#define	tcp_co_min				tcp_param_arr[36].tcp_param_val
-#define	tcp_max_buf				tcp_param_arr[37].tcp_param_val
-#define	tcp_strong_iss				tcp_param_arr[38].tcp_param_val
-#define	tcp_rtt_updates				tcp_param_arr[39].tcp_param_val
-#define	tcp_wscale_always			tcp_param_arr[40].tcp_param_val
-#define	tcp_tstamp_always			tcp_param_arr[41].tcp_param_val
-#define	tcp_tstamp_if_wscale			tcp_param_arr[42].tcp_param_val
-#define	tcp_rexmit_interval_extra		tcp_param_arr[43].tcp_param_val
-#define	tcp_deferred_acks_max			tcp_param_arr[44].tcp_param_val
-#define	tcp_slow_start_after_idle		tcp_param_arr[45].tcp_param_val
-#define	tcp_slow_start_initial			tcp_param_arr[46].tcp_param_val
-#define	tcp_co_timer_interval			tcp_param_arr[47].tcp_param_val
-#define	tcp_sack_permitted			tcp_param_arr[48].tcp_param_val
-#define	tcp_trace				tcp_param_arr[49].tcp_param_val
-#define	tcp_compression_enabled			tcp_param_arr[50].tcp_param_val
-#define	tcp_ipv6_hoplimit			tcp_param_arr[51].tcp_param_val
-#define	tcp_mss_def_ipv6			tcp_param_arr[52].tcp_param_val
-#define	tcp_mss_max_ipv6			tcp_param_arr[53].tcp_param_val
-#define	tcp_rev_src_routes			tcp_param_arr[54].tcp_param_val
-#define	tcp_local_dack_interval			tcp_param_arr[55].tcp_param_val
-#define	tcp_ndd_get_info_interval		tcp_param_arr[56].tcp_param_val
-#define	tcp_local_dacks_max			tcp_param_arr[57].tcp_param_val
-#define	tcp_ecn_permitted			tcp_param_arr[58].tcp_param_val
-#define	tcp_rst_sent_rate_enabled		tcp_param_arr[59].tcp_param_val
-#define	tcp_rst_sent_rate			tcp_param_arr[60].tcp_param_val
-#define	tcp_push_timer_interval			tcp_param_arr[61].tcp_param_val
-#define	tcp_use_smss_as_mss_opt			tcp_param_arr[62].tcp_param_val
-#define	tcp_keepalive_abort_interval_high	tcp_param_arr[63].tcp_param_max
-#define	tcp_keepalive_abort_interval		tcp_param_arr[63].tcp_param_val
-#define	tcp_keepalive_abort_interval_low	tcp_param_arr[63].tcp_param_min
-
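
The block of #defines deleted above aliased each ndd tunable onto a slot of tcp_param_arr; the aliases evidently move out of tcp.c along with the other centralized definitions. The underlying pattern is one table row per tunable carrying its bounds and current value, with a positional macro hiding the index. A sketch with hypothetical names and illustrative values only:

    /* One row per tunable: bounds, current value, ndd name. */
    typedef struct tunable {
        long        min;
        long        max;
        long        val;
        const char *name;
    } tunable_t;

    static tunable_t params[] = {
        { 1000, 10 * 60 * 1000, 60 * 1000, "time_wait_interval" },
        { 1,    4096,           128,       "conn_req_max_q"     },
    };

    /* Positional aliases, as in the removed block. */
    #define time_wait_interval  params[0].val
    #define conn_req_max_q      params[1].val

The cost of the scheme is that the aliases are brittle against reordering of the array, which is one reason to keep table and aliases in a single header.
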
 /*
  * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of
  * each header fragment in the header buffer.  Each parameter value has
@@ -1720,642 +1498,6 @@
  */
 int cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg);
 
-#define	IPH_TCPH_CHECKSUMP(ipha, hlen) \
-	((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + 16)))
-
-#ifdef  _BIG_ENDIAN
-#define	IP_TCP_CSUM_COMP	IPPROTO_TCP
-#else
-#define	IP_TCP_CSUM_COMP	(IPPROTO_TCP << 8)
-#endif
-
-#define	IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) {		\
-	(sum) += (ttl_protocol) + (ipha)->ipha_ident +			\
-	    ((v_hlen_tos_len) >> 16) +					\
-	    ((v_hlen_tos_len) & 0xFFFF) +				\
-	    (ipha)->ipha_fragment_offset_and_flags;			\
-	(sum) = (((sum) & 0xFFFF) + ((sum) >> 16));			\
-	(sum) = ~((sum) + ((sum) >> 16));				\
-	(ipha)->ipha_hdr_checksum = (uint16_t)(sum);			\
-}
-
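
The removed IP_HDR_CKSUM macro finishes a 16-bit one's-complement sum by folding the carry bits back into the low word and inverting. A self-contained version of the same computation over a raw IPv4 header (the checksum field must be zero on entry, and hlen must be even); this is a sketch, not the kernel macro:

    #include <stdint.h>
    #include <stddef.h>

    static uint16_t
    ipv4_hdr_cksum(const uint8_t *hdr, size_t hlen)
    {
        uint32_t sum = 0;
        size_t i;

        /* Sum the header as big-endian 16-bit words. */
        for (i = 0; i + 1 < hlen; i += 2)
            sum += (uint32_t)((hdr[i] << 8) | hdr[i + 1]);

        /* Fold twice, as in IP_HDR_CKSUM: one fold can carry again. */
        sum = (sum & 0xFFFF) + (sum >> 16);
        sum = (sum & 0xFFFF) + (sum >> 16);
        return ((uint16_t)~sum);
    }
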
-/*
- * Macros that determine whether or not IP processing is needed for TCP.
- */
-#define	TCP_IPOPT_POLICY_V4(tcp)					\
-	((tcp)->tcp_ipversion == IPV4_VERSION &&			\
-	((tcp)->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH ||		\
-	CONN_OUTBOUND_POLICY_PRESENT((tcp)->tcp_connp) ||		\
-	CONN_INBOUND_POLICY_PRESENT((tcp)->tcp_connp)))
-
-#define	TCP_IPOPT_POLICY_V6(tcp)					\
-	((tcp)->tcp_ipversion == IPV6_VERSION &&			\
-	((tcp)->tcp_ip_hdr_len != IPV6_HDR_LEN ||			\
-	CONN_OUTBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp) ||		\
-	CONN_INBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp)))
-
-#define	TCP_LOOPBACK_IP(tcp)						\
-	(TCP_IPOPT_POLICY_V4(tcp) || TCP_IPOPT_POLICY_V6(tcp) ||	\
-	!CONN_IS_MD_FASTPATH((tcp)->tcp_connp))
-
-boolean_t do_tcp_fusion = B_TRUE;
-
-/*
- * This routine gets called by the eager tcp upon changing state from
- * SYN_RCVD to ESTABLISHED.  It fuses a direct path between itself
- * and the active connect tcp such that the regular tcp processings
- * may be bypassed under allowable circumstances.  Because the fusion
- * requires both endpoints to be in the same squeue, it does not work
- * for simultaneous active connects because there is no easy way to
- * switch from one squeue to another once the connection is created.
- * This is different from the eager tcp case where we assign it the
- * same squeue as the one given to the active connect tcp during open.
- */
-static void
-tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
-{
-	conn_t *peer_connp, *connp = tcp->tcp_connp;
-	tcp_t *peer_tcp;
-
-	ASSERT(!tcp->tcp_fused);
-	ASSERT(tcp->tcp_loopback);
-	ASSERT(tcp->tcp_loopback_peer == NULL);
-	/*
-	 * We need to check the listener tcp to make sure it's a socket
-	 * endpoint, but we can't really use tcp_listener since we get
-	 * here after sending up T_CONN_IND and tcp_wput_accept() may be
-	 * called independently, at which point tcp_listener is cleared;
-	 * this is why we use tcp_saved_listener.  The listener itself
-	 * is guaranteed to be around until tcp_accept_finish() is called
-	 * on this eager -- this won't happen until we're done since
-	 * we're inside the eager's perimeter now.
-	 */
-	ASSERT(tcp->tcp_saved_listener != NULL);
-
-	/*
-	 * Lookup peer endpoint; search for the remote endpoint having
-	 * the reversed address-port quadruplet in ESTABLISHED state,
-	 * which is guaranteed to be unique in the system.  Zone check
-	 * is applied accordingly for loopback address, but not for
-	 * local address since we want fusion to happen across Zones.
-	 */
-	if (tcp->tcp_ipversion == IPV4_VERSION) {
-		peer_connp = ipcl_conn_tcp_lookup_reversed_ipv4(connp,
-		    (ipha_t *)iphdr, tcph);
-	} else {
-		peer_connp = ipcl_conn_tcp_lookup_reversed_ipv6(connp,
-		    (ip6_t *)iphdr, tcph);
-	}
-
-	/*
-	 * We can only proceed if peer exists, resides in the same squeue
-	 * as our conn and is not raw-socket.  The squeue assignment of
-	 * this eager tcp was done earlier at the time of SYN processing
-	 * in ip_fanout_tcp{_v6}.  Note that a matching squeue by itself
-	 * doesn't guarantee a safe condition to fuse, hence we perform
-	 * additional tests below.
-	 */
-	ASSERT(peer_connp == NULL || peer_connp != connp);
-	if (peer_connp == NULL || peer_connp->conn_sqp != connp->conn_sqp ||
-	    !IPCL_IS_TCP(peer_connp)) {
-		if (peer_connp != NULL) {
-			TCP_STAT(tcp_fusion_unqualified);
-			CONN_DEC_REF(peer_connp);
-		}
-		return;
-	}
-	peer_tcp = peer_connp->conn_tcp;	/* active connect tcp */
-
-	ASSERT(peer_tcp != NULL && peer_tcp != tcp && !peer_tcp->tcp_fused);
-	ASSERT(peer_tcp->tcp_loopback && peer_tcp->tcp_loopback_peer == NULL);
-	ASSERT(peer_connp->conn_sqp == connp->conn_sqp);
-
-	/*
-	 * Fuse the endpoints; we perform further checks against both
-	 * tcp endpoints to ensure that a fusion is allowed to happen.
-	 * In particular we bail out for TPI, non-simple TCP/IP or if
-	 * IPsec/IPQoS policy exists.  We could actually do it for the
-	 * XTI/TLI/TPI case but this requires more testing, so for now
-	 * we handle only the socket case.
-	 */
-	if (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable &&
-	    TCP_IS_SOCKET(tcp->tcp_saved_listener) && TCP_IS_SOCKET(peer_tcp) &&
-	    !TCP_LOOPBACK_IP(tcp) && !TCP_LOOPBACK_IP(peer_tcp) &&
-	    !IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) {
-		mblk_t *mp;
-		struct stroptions *stropt;
-		queue_t *peer_rq = peer_tcp->tcp_rq;
-		size_t sth_hiwat;
-
-		ASSERT(!TCP_IS_DETACHED(peer_tcp) && peer_rq != NULL);
-
-		/*
-		 * We need to drain data on both endpoints during unfuse.
-		 * If we need to send up SIGURG at the time of draining,
-		 * we want to be sure that an mblk is readily available.
-		 * This is why we pre-allocate the M_PCSIG mblks for both
-		 * endpoints which will only be used during/after unfuse.
-		 */
-		if ((mp = allocb(1, BPRI_HI)) == NULL) {
-			CONN_DEC_REF(peer_connp);
-			return;
-		}
-		ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
-		tcp->tcp_fused_sigurg_mp = mp;
-
-		if ((mp = allocb(1, BPRI_HI)) == NULL) {
-			freeb(tcp->tcp_fused_sigurg_mp);
-			tcp->tcp_fused_sigurg_mp = NULL;
-			CONN_DEC_REF(peer_connp);
-			return;
-		}
-		ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL);
-		peer_tcp->tcp_fused_sigurg_mp = mp;
-
-		/* Allocate M_SETOPTS mblk */
-		mp = allocb(sizeof (*stropt), BPRI_HI);
-		if (mp == NULL) {
-			freeb(tcp->tcp_fused_sigurg_mp);
-			tcp->tcp_fused_sigurg_mp = NULL;
-			freeb(peer_tcp->tcp_fused_sigurg_mp);
-			peer_tcp->tcp_fused_sigurg_mp = NULL;
-			CONN_DEC_REF(peer_connp);
-			return;
-		}
-
-		/* Fuse both endpoints */
-		peer_tcp->tcp_loopback_peer = tcp;
-		tcp->tcp_loopback_peer = peer_tcp;
-		peer_tcp->tcp_fused = tcp->tcp_fused = B_TRUE;
-
-		/*
-		 * We never use regular tcp paths in fusion and should
-		 * therefore clear tcp_unsent on both endpoints.  Having
-		 * them set to non-zero values means asking for trouble
-		 * especially after unfuse, where we may end up sending
-		 * through regular tcp paths which expect xmit_list and
-		 * friends to be correctly setup.
-		 */
-		peer_tcp->tcp_unsent = tcp->tcp_unsent = 0;
-
-		tcp_timers_stop(tcp);
-		tcp_timers_stop(peer_tcp);
-
-		/*
-		 * Set the stream head's write offset value to zero, since we
-		 * won't be needing any room for TCP/IP headers, and tell it
-		 * to not break up the writes.  This would reduce the amount
-		 * of work done by kmem.  In addition, we set the receive
-		 * buffer to twice that of q_hiwat in order to simulate the
-		 * non-fusion case.  Note that we can only do this for the
-		 * active connect tcp since our eager is still detached;
-		 * it will be dealt with later in tcp_accept_finish().
-		 */
-		DB_TYPE(mp) = M_SETOPTS;
-		mp->b_wptr += sizeof (*stropt);
-
-		sth_hiwat = peer_rq->q_hiwat << 1;
-		if (sth_hiwat > tcp_max_buf)
-			sth_hiwat = tcp_max_buf;
-
-		stropt = (struct stroptions *)mp->b_rptr;
-		stropt->so_flags = SO_MAXBLK | SO_WROFF | SO_HIWAT;
-		stropt->so_maxblk = tcp_maxpsz_set(peer_tcp, B_FALSE);
-		stropt->so_wroff = 0;
-		stropt->so_hiwat = MAX(sth_hiwat, tcp_sth_rcv_hiwat);
-
-		/* Send the options up */
-		putnext(peer_rq, mp);
-	} else {
-		TCP_STAT(tcp_fusion_unqualified);
-	}
-	CONN_DEC_REF(peer_connp);
-}
-
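
tcp_fuse() above is removed from tcp.c but evidently relocated within this changeset, since later hunks still call tcp_fuse_output(), tcp_fuse_rrw() and friends. One detail worth noting is how it unwinds partially-completed allocations by hand at each failure point: every later failure frees what the earlier successes produced. The same shape written as a goto unwind, with hypothetical names and malloc standing in for allocb():

    #include <stdlib.h>

    struct fuse_bufs { void *sig_a, *sig_b, *opts; };

    /* Allocate all three buffers or none; returns 0 on success. */
    static int
    fuse_bufs_alloc(struct fuse_bufs *fb)
    {
        if ((fb->sig_a = malloc(1)) == NULL)
            goto fail0;
        if ((fb->sig_b = malloc(1)) == NULL)
            goto fail1;
        if ((fb->opts = malloc(64)) == NULL)
            goto fail2;
        return (0);

    fail2:
        free(fb->sig_b);
        fb->sig_b = NULL;
    fail1:
        free(fb->sig_a);
        fb->sig_a = NULL;
    fail0:
        return (-1);
    }
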
-/*
- * Unfuse a previously-fused pair of tcp loopback endpoints.
- */
-static void
-tcp_unfuse(tcp_t *tcp)
-{
-	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
-
-	ASSERT(tcp->tcp_fused && peer_tcp != NULL);
-	ASSERT(peer_tcp->tcp_fused && peer_tcp->tcp_loopback_peer == tcp);
-	ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
-	ASSERT(tcp->tcp_unsent == 0 && peer_tcp->tcp_unsent == 0);
-	ASSERT(tcp->tcp_fused_sigurg_mp != NULL);
-	ASSERT(peer_tcp->tcp_fused_sigurg_mp != NULL);
-
-	/*
-	 * Drain any pending data; the detached check is needed because
-	 * we may be called from tcp_fuse_output().  Note that in case of
-	 * a detached tcp, the draining will happen later after the tcp
-	 * is unfused.  For non-urgent data, this can be handled by the
-	 * regular tcp_rcv_drain().  If we have urgent data sitting in
-	 * the receive list, we will need to send up a SIGURG signal first
-	 * before draining the data.  All of these will be handled by the
-	 * code in tcp_fuse_rcv_drain() when called from tcp_rcv_drain().
-	 */
-	if (!TCP_IS_DETACHED(tcp)) {
-		(void) tcp_fuse_rcv_drain(tcp->tcp_rq, tcp,
-		    &tcp->tcp_fused_sigurg_mp);
-	}
-	if (!TCP_IS_DETACHED(peer_tcp)) {
-		(void) tcp_fuse_rcv_drain(peer_tcp->tcp_rq, peer_tcp,
-		    &peer_tcp->tcp_fused_sigurg_mp);
-	}
-	/* Lift up any flow-control conditions */
-	if (tcp->tcp_flow_stopped) {
-		tcp_clrqfull(tcp);
-		tcp->tcp_flow_stopped = B_FALSE;
-		TCP_STAT(tcp_fusion_backenabled);
-	}
-	if (peer_tcp->tcp_flow_stopped) {
-		tcp_clrqfull(peer_tcp);
-		peer_tcp->tcp_flow_stopped = B_FALSE;
-		TCP_STAT(tcp_fusion_backenabled);
-	}
-
-	/* Free up M_PCSIG mblk(s) if not needed */
-	if (!tcp->tcp_fused_sigurg && tcp->tcp_fused_sigurg_mp != NULL) {
-		freeb(tcp->tcp_fused_sigurg_mp);
-		tcp->tcp_fused_sigurg_mp = NULL;
-	}
-	if (!peer_tcp->tcp_fused_sigurg &&
-	    peer_tcp->tcp_fused_sigurg_mp != NULL) {
-		freeb(peer_tcp->tcp_fused_sigurg_mp);
-		peer_tcp->tcp_fused_sigurg_mp = NULL;
-	}
-
-	/*
-	 * Update th_seq and th_ack in the header template
-	 */
-	U32_TO_ABE32(tcp->tcp_snxt, tcp->tcp_tcph->th_seq);
-	U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
-	U32_TO_ABE32(peer_tcp->tcp_snxt, peer_tcp->tcp_tcph->th_seq);
-	U32_TO_ABE32(peer_tcp->tcp_rnxt, peer_tcp->tcp_tcph->th_ack);
-
-	/* Unfuse the endpoints */
-	peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE;
-	peer_tcp->tcp_loopback_peer = tcp->tcp_loopback_peer = NULL;
-}
-
-/*
- * Fusion output routine for urgent data.  This routine is called by
- * tcp_fuse_output() for handling non-M_DATA mblks.
- */
-static void
-tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp)
-{
-	mblk_t *mp1;
-	struct T_exdata_ind *tei;
-	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
-	mblk_t *head, *prev_head = NULL;
-
-	ASSERT(tcp->tcp_fused);
-	ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
-	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
-	ASSERT(mp->b_cont != NULL && DB_TYPE(mp->b_cont) == M_DATA);
-	ASSERT(MBLKL(mp) >= sizeof (*tei) && MBLKL(mp->b_cont) > 0);
-
-	/*
-	 * Urgent data arrives in the form of T_EXDATA_REQ from above.
-	 * Each occurrence denotes a new urgent pointer.  For each new
-	 * urgent pointer we signal (SIGURG) the receiving app to indicate
-	 * that it needs to go into urgent mode.  This is similar to the
-	 * urgent data handling in the regular tcp.  We don't need to keep
-	 * track of where the urgent pointer is, because each T_EXDATA_REQ
-	 * "advances" the urgent pointer for us.
-	 *
-	 * The actual urgent data carried by T_EXDATA_REQ is then prepended
-	 * by a T_EXDATA_IND before being enqueued behind any existing data
-	 * destined for the receiving app.  There is only a single urgent
-	 * pointer (out-of-band mark) for a given tcp.  If the new urgent
-	 * data arrives before the receiving app reads some existing urgent
-	 * data, the previous marker is lost.  This behavior is emulated
-	 * accordingly below, by removing any existing T_EXDATA_IND messages
-	 * and essentially converting old urgent data into non-urgent.
-	 */
-	ASSERT(tcp->tcp_valid_bits & TCP_URG_VALID);
-	/* Let sender get out of urgent mode */
-	tcp->tcp_valid_bits &= ~TCP_URG_VALID;
-
-	/*
-	 * Send up SIGURG to the receiving peer; if the peer is detached
-	 * or if we can't allocate the M_PCSIG, indicate that we need to
-	 * signal upon draining to the peer by marking tcp_fused_sigurg.
-	 * This flag will only get cleared once SIGURG is delivered and
-	 * is not affected by the tcp_fused flag -- delivery will still
-	 * happen even after an endpoint is unfused, to handle the case
-	 * where the sending endpoint immediately closes/unfuses after
-	 * sending urgent data and the accept is not yet finished.
-	 */
-	if (!TCP_IS_DETACHED(peer_tcp) &&
-	    ((mp1 = allocb(1, BPRI_HI)) != NULL ||
-	    (mp1 = allocb_tryhard(1)) != NULL)) {
-		peer_tcp->tcp_fused_sigurg = B_FALSE;
-		/* Send up the signal */
-		DB_TYPE(mp1) = M_PCSIG;
-		*mp1->b_wptr++ = (uchar_t)SIGURG;
-		putnext(peer_tcp->tcp_rq, mp1);
-	} else {
-		peer_tcp->tcp_fused_sigurg = B_TRUE;
-	}
-
-	/* Reuse T_EXDATA_REQ mblk for T_EXDATA_IND */
-	DB_TYPE(mp) = M_PROTO;
-	tei = (struct T_exdata_ind *)mp->b_rptr;
-	tei->PRIM_type = T_EXDATA_IND;
-	tei->MORE_flag = 0;
-	mp->b_wptr = (uchar_t *)&tei[1];
-
-	TCP_STAT(tcp_fusion_urg);
-	BUMP_MIB(&tcp_mib, tcpOutUrg);
-
-	head = peer_tcp->tcp_rcv_list;
-	while (head != NULL) {
-		/*
-		 * Remove existing T_EXDATA_IND, keep the data which follows
-		 * it and relink our list.  Note that we don't modify the
-		 * tcp_rcv_last_tail since it never points to T_EXDATA_IND.
-		 */
-		if (DB_TYPE(head) != M_DATA) {
-			mp1 = head;
-
-			ASSERT(DB_TYPE(mp1->b_cont) == M_DATA);
-			head = mp1->b_cont;
-			mp1->b_cont = NULL;
-			head->b_next = mp1->b_next;
-			mp1->b_next = NULL;
-			if (prev_head != NULL)
-				prev_head->b_next = head;
-			if (peer_tcp->tcp_rcv_list == mp1)
-				peer_tcp->tcp_rcv_list = head;
-			if (peer_tcp->tcp_rcv_last_head == mp1)
-				peer_tcp->tcp_rcv_last_head = head;
-			freeb(mp1);
-		}
-		prev_head = head;
-		head = head->b_next;
-	}
-}
-
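
The while loop at the end of tcp_fuse_output_urg() splices stale T_EXDATA_IND wrappers out of the peer's receive list, promoting each wrapper's M_DATA continuation into its place so that old urgent data degrades into ordinary data. Stripped of the mblk details, it is the classic delete-while-walking pattern for a singly linked list. A self-contained sketch with markers reduced to a flag and no payload promotion (both simplifications relative to the code above):

    #include <stddef.h>

    typedef struct node {
        struct node *next;
        int is_marker;
    } node_t;

    static node_t *
    remove_markers(node_t *head)
    {
        node_t *prev = NULL, *n = head;

        while (n != NULL) {
            if (n->is_marker) {
                node_t *dead = n;
                n = n->next;
                if (prev == NULL)
                    head = n;       /* marker was the list head */
                else
                    prev->next = n;
                (void)dead;         /* caller would free/recycle here */
            } else {
                prev = n;
                n = n->next;
            }
        }
        return (head);
    }
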
-/*
- * Fusion output routine, called by tcp_output() and tcp_wput_proto().
- */
-static boolean_t
-tcp_fuse_output(tcp_t *tcp, mblk_t *mp)
-{
-	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
-	queue_t *peer_rq;
-	mblk_t *mp_tail = mp;
-	uint32_t send_size = 0;
-
-	ASSERT(tcp->tcp_fused);
-	ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
-	ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
-	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO ||
-	    DB_TYPE(mp) == M_PCPROTO);
-
-	peer_rq = peer_tcp->tcp_rq;
-
-	/* If this connection requires IP, unfuse and use regular path */
-	if (TCP_LOOPBACK_IP(tcp) || TCP_LOOPBACK_IP(peer_tcp) ||
-	    IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) {
-		TCP_STAT(tcp_fusion_aborted);
-		tcp_unfuse(tcp);
-		return (B_FALSE);
-	}
-
-	for (;;) {
-		if (DB_TYPE(mp_tail) == M_DATA)
-			send_size += MBLKL(mp_tail);
-		if (mp_tail->b_cont == NULL)
-			break;
-		mp_tail = mp_tail->b_cont;
-	}
-
-	if (send_size == 0) {
-		freemsg(mp);
-		return (B_TRUE);
-	}
-
-	/*
-	 * Handle urgent data; we either send up SIGURG to the peer now
-	 * or do it later when we drain, in case the peer is detached
-	 * or if we're short of memory for M_PCSIG mblk.
-	 */
-	if (DB_TYPE(mp) != M_DATA)
-		tcp_fuse_output_urg(tcp, mp);
-
-	/*
-	 * Enqueue data into the peer's receive list; we may or may not
-	 * drain the contents depending on the conditions below.
-	 */
-	tcp_rcv_enqueue(peer_tcp, mp, send_size);
-
-	/* In case it wrapped around and also to keep it constant */
-	peer_tcp->tcp_rwnd += send_size;
-
-	/*
-	 * If peer is detached, exercise flow-control when needed; we will
-	 * get back-enabled either in tcp_accept_finish() or tcp_unfuse().
-	 */
-	if (TCP_IS_DETACHED(peer_tcp) &&
-	    peer_tcp->tcp_rcv_cnt > peer_rq->q_hiwat) {
-		tcp_setqfull(tcp);
-		tcp->tcp_flow_stopped = B_TRUE;
-		TCP_STAT(tcp_fusion_flowctl);
-	}
-
-	loopback_packets++;
-	tcp->tcp_last_sent_len = send_size;
-
-	/* Need to adjust the following SNMP MIB-related variables */
-	tcp->tcp_snxt += send_size;
-	tcp->tcp_suna = tcp->tcp_snxt;
-	peer_tcp->tcp_rnxt += send_size;
-	peer_tcp->tcp_rack = peer_tcp->tcp_rnxt;
-
-	BUMP_MIB(&tcp_mib, tcpOutDataSegs);
-	UPDATE_MIB(&tcp_mib, tcpOutDataBytes, send_size);
-
-	BUMP_MIB(&tcp_mib, tcpInSegs);
-	BUMP_MIB(&tcp_mib, tcpInDataInorderSegs);
-	UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, send_size);
-
-	BUMP_LOCAL(tcp->tcp_obsegs);
-	BUMP_LOCAL(peer_tcp->tcp_ibsegs);
-
-	if (!TCP_IS_DETACHED(peer_tcp)) {
-		/*
-		 * If we can't send SIGURG above due to lack of memory,
-		 * schedule push timer and try again.  Otherwise drain
-		 * the data if we're not flow-controlled.
-		 */
-		if (peer_tcp->tcp_fused_sigurg) {
-			if (peer_tcp->tcp_push_tid == 0) {
-				peer_tcp->tcp_push_tid =
-				    TCP_TIMER(peer_tcp, tcp_push_timer,
-				    MSEC_TO_TICK(tcp_push_timer_interval));
-			}
-		} else if (!tcp->tcp_flow_stopped) {
-			if (!canputnext(peer_rq)) {
-				tcp_setqfull(tcp);
-				tcp->tcp_flow_stopped = B_TRUE;
-				TCP_STAT(tcp_fusion_flowctl);
-			} else {
-				ASSERT(peer_tcp->tcp_rcv_list != NULL);
-				(void) tcp_fuse_rcv_drain(peer_rq,
-				    peer_tcp, NULL);
-				TCP_STAT(tcp_fusion_putnext);
-			}
-		}
-	}
-	return (B_TRUE);
-}
-
-/*
- * This routine gets called to deliver data upstream on a fused or
- * previously fused tcp loopback endpoint; the latter happens only
- * when there is a pending SIGURG signal plus urgent data that could
- * not be sent upstream earlier.
- */
-static boolean_t
-tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp)
-{
-	mblk_t *mp;
-#ifdef DEBUG
-	uint_t cnt = 0;
-#endif
-
-	ASSERT(tcp->tcp_loopback);
-	ASSERT(tcp->tcp_fused || tcp->tcp_fused_sigurg);
-	ASSERT(!tcp->tcp_fused || tcp->tcp_loopback_peer != NULL);
-	ASSERT(sigurg_mpp != NULL || tcp->tcp_fused);
-
-	/* No need for the push timer now, in case it was scheduled */
-	if (tcp->tcp_push_tid != 0) {
-		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
-		tcp->tcp_push_tid = 0;
-	}
-	/*
-	 * If there's urgent data sitting in receive list and we didn't
-	 * get a chance to send up a SIGURG signal, make sure we send
-	 * it first before draining in order to ensure that SIOCATMARK
-	 * works properly.
-	 */
-	if (tcp->tcp_fused_sigurg) {
-		/*
-		 * sigurg_mpp is normally NULL, i.e. when we're still
-		 * fused and didn't get here because of tcp_unfuse().
-		 * In this case try hard to allocate the M_PCSIG mblk.
-		 */
-		if (sigurg_mpp == NULL &&
-		    (mp = allocb(1, BPRI_HI)) == NULL &&
-		    (mp = allocb_tryhard(1)) == NULL) {
-			/* Alloc failed; try again next time */
-			tcp->tcp_push_tid = TCP_TIMER(tcp, tcp_push_timer,
-			    MSEC_TO_TICK(tcp_push_timer_interval));
-			return (B_TRUE);
-		} else if (sigurg_mpp != NULL) {
-			/*
-			 * Use the supplied M_PCSIG mblk; it means we're
-			 * either unfused or in the process of unfusing,
-			 * and the drain must happen now.
-			 */
-			mp = *sigurg_mpp;
-			*sigurg_mpp = NULL;
-		}
-		ASSERT(mp != NULL);
-
-		tcp->tcp_fused_sigurg = B_FALSE;
-		/* Send up the signal */
-		DB_TYPE(mp) = M_PCSIG;
-		*mp->b_wptr++ = (uchar_t)SIGURG;
-		putnext(q, mp);
-		/*
-		 * Let the regular tcp_rcv_drain() path handle
-		 * draining the data if we're no longer fused.
-		 */
-		if (!tcp->tcp_fused)
-			return (B_FALSE);
-	}
-
-	/* Drain the data */
-	while ((mp = tcp->tcp_rcv_list) != NULL) {
-		tcp->tcp_rcv_list = mp->b_next;
-		mp->b_next = NULL;
-#ifdef DEBUG
-		cnt += msgdsize(mp);
-#endif
-		putnext(q, mp);
-	}
-
-	ASSERT(cnt == tcp->tcp_rcv_cnt);
-	tcp->tcp_rcv_last_head = NULL;
-	tcp->tcp_rcv_last_tail = NULL;
-	tcp->tcp_rcv_cnt = 0;
-	tcp->tcp_rwnd = q->q_hiwat;
-
-	return (B_TRUE);
-}
-
-/*
- * This is the walker function, which is TCP specific.
- * It walks through the conn_hash bucket searching for the
- * next valid connp/tcp in the list, selecting connp/tcp
- * which haven't closed or condemned. It also REFHOLDS the
- * reference for the tcp, ensuring that the tcp exists
- * when the caller uses the tcp.
- *
- * tcp_get_next_conn
- * 	get the next entry in the conn global list
- * 	and put a reference on the next_conn.
- * 	decrement the reference on the current conn.
- */
-conn_t *
-tcp_get_next_conn(connf_t *connfp, conn_t *connp)
-{
-	conn_t	*next_connp;
-
-	if (connfp == NULL)
-		return (NULL);
-
-	mutex_enter(&connfp->connf_lock);
-
-	next_connp = (connp == NULL) ?
-	    connfp->connf_head : connp->conn_g_next;
-
-	while (next_connp != NULL) {
-		mutex_enter(&next_connp->conn_lock);
-		if ((next_connp->conn_state_flags &
-		    (CONN_CONDEMNED | CONN_INCIPIENT)) ||
-			!IPCL_IS_TCP(next_connp)) {
-			/*
-			 * This conn has been condemned or
-			 * is closing.
-			 */
-			mutex_exit(&next_connp->conn_lock);
-			next_connp = next_connp->conn_g_next;
-			continue;
-		}
-		ASSERT(next_connp->conn_tcp != NULL);
-		CONN_INC_REF_LOCKED(next_connp);
-		mutex_exit(&next_connp->conn_lock);
-		break;
-	}
-
-	mutex_exit(&connfp->connf_lock);
-
-	if (connp != NULL) {
-		CONN_DEC_REF(connp);
-	}
-
-	return (next_connp);
-}
-
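
The walker removed above is replaced later in this diff by ipcl_get_next_conn(connfp, connp, IPCL_TCP), which generalizes it with a conn-class argument. The pattern either way: under the bucket lock, advance from the current element, skip condemned entries, take a reference on the survivor, then drop the caller's reference on the element it arrived with. A user-space analogue with hypothetical types; unlike the original, which drops the old reference outside the bucket lock via CONN_DEC_REF, this sketch folds both reference operations under one lock for brevity:

    #include <pthread.h>
    #include <stddef.h>

    typedef struct conn {
        struct conn *next;
        int refcnt;             /* protected by the bucket lock here */
        int condemned;
    } conn_t;

    typedef struct bucket {
        pthread_mutex_t lock;
        conn_t *head;
    } bucket_t;

    static conn_t *
    next_conn(bucket_t *b, conn_t *cur)
    {
        conn_t *n;

        pthread_mutex_lock(&b->lock);
        n = (cur == NULL) ? b->head : cur->next;
        while (n != NULL && n->condemned)
            n = n->next;
        if (n != NULL)
            n->refcnt++;        /* hold it for the caller */
        if (cur != NULL)
            cur->refcnt--;      /* release the previous hold */
        pthread_mutex_unlock(&b->lock);
        return (n);
    }
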
 /*
  * Figure out the value of the window scale option.  Note that the rwnd is
  * ASSUMED to be rounded up to the nearest MSS before the calculation.
@@ -2808,7 +1950,7 @@
 		acceptor = tcp_acceptor_hash_lookup(acceptor_id);
 		if (acceptor == NULL) {
 			if (listener->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_ERROR|SL_TRACE,
 				    "tcp_accept: did not find acceptor 0x%x\n",
 				    acceptor_id);
@@ -3737,7 +2879,7 @@
 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 			    "tcp_bind: bad req, len %u",
 			    (uint_t)(mp->b_wptr - mp->b_rptr));
 		}
@@ -3768,7 +2910,7 @@
 			goto do_bind;
 		}
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 			    "tcp_bind: bad state, %d", tcp->tcp_state);
 		}
 		tcp_err_ack(tcp, mp, TOUTSTATE, 0);
@@ -3805,7 +2947,7 @@
 		    sizeof (sin_t));
 		if (sin == NULL || !OK_32PTR((char *)sin)) {
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_ERROR|SL_TRACE,
 				    "tcp_bind: bad address parameter, "
 				    "offset %d, len %d",
@@ -3835,7 +2977,7 @@
 		    tbr->ADDR_offset, sizeof (sin6_t));
 		if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_ERROR|SL_TRACE,
 				    "tcp_bind: bad IPv6 address parameter, "
 				    "offset %d, len %d", tbr->ADDR_offset,
@@ -3857,7 +2999,7 @@
 
 	default:
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 			    "tcp_bind: bad address length, %d",
 			    tbr->ADDR_length);
 		}
@@ -3945,7 +3087,7 @@
 
 			if (secpolicy_net_privaddr(cr, requested_port) != 0) {
 				if (tcp->tcp_debug) {
-					(void) strlog(TCP_MODULE_ID, 0, 1,
+					(void) strlog(TCP_MOD_ID, 0, 1,
 					    SL_ERROR|SL_TRACE,
 					    "tcp_bind: no priv for port %d",
 					    requested_port);
@@ -3963,7 +3105,7 @@
 	if (allocated_port == 0) {
 		if (bind_to_req_port_only) {
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_ERROR|SL_TRACE,
 				    "tcp_bind: requested addr busy");
 			}
@@ -3971,7 +3113,7 @@
 		} else {
 			/* If we are out of ports, fail the bind. */
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_ERROR|SL_TRACE,
 				    "tcp_bind: out of ports?");
 			}
@@ -4436,7 +3578,7 @@
 			(void) putnextctl1(q, M_FLUSH, FLUSHR);
 		}
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE|SL_ERROR,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
 			    "tcp_clean_death: discon err %d", err);
 		}
 		mp = mi_tpi_discon_ind(NULL, err, 0);
@@ -4444,7 +3586,7 @@
 			putnext(q, mp);
 		} else {
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_ERROR|SL_TRACE,
 				    "tcp_clean_death, sending M_ERROR");
 			}
@@ -4476,7 +3618,6 @@
 	if (tcp->tcp_state > TCPS_LISTEN) {
 		tcp_acceptor_hash_remove(tcp);
 		if (tcp->tcp_flow_stopped) {
-			tcp->tcp_flow_stopped = B_FALSE;
 			tcp_clrqfull(tcp);
 		}
 
@@ -4621,23 +3762,6 @@
 	return (0);
 }
 
-int
-tcp_modclose(queue_t *q)
-{
-	conn_t *connp = Q_TO_CONN(q);
-	ASSERT((connp->conn_flags & IPCL_TCPMOD) != 0);
-
-	qprocsoff(q);
-
-	if (connp->conn_cred != NULL) {
-		crfree(connp->conn_cred);
-		connp->conn_cred = NULL;
-	}
-	CONN_DEC_REF(connp);
-	q->q_ptr = WR(q)->q_ptr = NULL;
-	return (0);
-}
-
 static int
 tcpclose_accept(queue_t *q)
 {
@@ -4798,7 +3922,6 @@
 		tcp_acceptor_hash_remove(tcp);
 
 		if (tcp->tcp_flow_stopped) {
-			tcp->tcp_flow_stopped = B_FALSE;
 			tcp_clrqfull(tcp);
 		}
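
This hunk and several later ones drop the manual tcp_flow_stopped assignment that used to sit next to tcp_clrqfull()/tcp_setqfull(); the flag maintenance has presumably been folded into the helpers themselves, which keeps the flag and the actual queue-full state from drifting apart. The invariant, as a sketch with hypothetical names:

    #include <stdbool.h>

    struct ep { bool flow_stopped; };

    static void
    set_qfull(struct ep *e)
    {
        if (!e->flow_stopped) {
            e->flow_stopped = true;
            /* mark the queue full / assert backpressure here */
        }
    }

    static void
    clr_qfull(struct ep *e)
    {
        if (e->flow_stopped) {
            e->flow_stopped = false;
            /* backenable the queue here */
        }
    }
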
 
@@ -4922,7 +4045,7 @@
 /*
  * Stop all TCP timers, and free the timer mblks if requested.
  */
-static void
+void
 tcp_timers_stop(tcp_t *tcp)
 {
 	if (tcp->tcp_timer_tid != 0) {
@@ -5285,7 +4408,7 @@
 		return (B_FALSE);
 
 	if (tcp->tcp_debug) {
-		(void) strlog(TCP_MODULE_ID, 0, 3, SL_TRACE,
+		(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
 		    "tcp_drop_q0: listen half-open queue (max=%d) overflow"
 		    " (%d pending) on %s, drop one", tcp_conn_req_max_q0,
 		    tcp->tcp_conn_req_cnt_q0,
@@ -5371,8 +4494,8 @@
 		connp->conn_remv6 = ip6h->ip6_src;
 
 		/* db_cksumstuff is set at ip_fanout_tcp_v6 */
-		ifindex = (int)mp->b_datap->db_cksumstuff;
-		mp->b_datap->db_cksumstuff = 0;
+		ifindex = (int)DB_CKSUMSTUFF(mp);
+		DB_CKSUMSTUFF(mp) = 0;
 
 		sin6 = sin6_null;
 		sin6.sin6_addr = ip6h->ip6_src;
@@ -5727,8 +4850,8 @@
 		mp->b_datap->db_struioflag &= ~STRUIO_POLICY;
 	}
 
-	new_sqp = (squeue_t *)mp->b_datap->db_cksumstart;
-	mp->b_datap->db_cksumstart = 0;
+	new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
+	DB_CKSUMSTART(mp) = 0;
 
 	ASSERT(OK_32PTR(mp->b_rptr));
 	ipvers = IPH_HDR_VERSION(mp->b_rptr);
@@ -6012,7 +5135,7 @@
 		TCP_STAT(tcp_listendrop);
 		BUMP_MIB(&tcp_mib, tcpListenDrop);
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE|SL_ERROR,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
 			    "tcp_conn_request: listen backlog (max=%d) "
 			    "overflow (%d pending) on %s",
 			    tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q,
@@ -6037,7 +5160,7 @@
 			mutex_exit(&tcp->tcp_eager_lock);
 			BUMP_MIB(&tcp_mib, tcpListenDropQ0);
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 3, SL_TRACE,
+				(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
 				    "tcp_conn_request: listen half-open queue "
 				    "(max=%d) full (%d pending) on %s",
 				    tcp_conn_req_max_q0,
@@ -6058,8 +5181,8 @@
 	 * otherwise an error case if neither of them is set.
 	 */
 	if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
-		new_sqp = (squeue_t *)mp->b_datap->db_cksumstart;
-		mp->b_datap->db_cksumstart = 0;
+		new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
+		DB_CKSUMSTART(mp) = 0;
 		mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
 		econnp = (conn_t *)tcp_get_conn(arg2);
 		if (econnp == NULL)
@@ -6420,7 +5543,7 @@
 	uint32_t	conn_flags;
 
 	if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
-		new_sqp = (squeue_t *)mp->b_datap->db_cksumstart;
+		new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
 	} else {
 		goto done;
 	}
@@ -7174,7 +6297,7 @@
 	 */
 	if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) {
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 			    "tcp_disconnect: bad state, %d", tcp->tcp_state);
 		}
 		tcp_err_ack(tcp, mp, TOUTSTATE, 0);
@@ -7988,10 +7111,6 @@
 	/* Cancel outstanding timers */
 	tcp_timers_stop(tcp);
 
-	if (tcp->tcp_flow_stopped) {
-		tcp->tcp_flow_stopped = B_FALSE;
-		tcp_clrqfull(tcp);
-	}
 	/*
 	 * Reset everything in the state vector, after updating global
 	 * MIB data from instance counters.
@@ -8006,6 +7125,10 @@
 		tcp_zcopy_notify(tcp);
 	tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL;
 	tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;
+	if (tcp->tcp_flow_stopped &&
+	    TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+		tcp_clrqfull(tcp);
+	}
 	tcp_close_mpp(&tcp->tcp_reass_head);
 	tcp->tcp_reass_tail = NULL;
 	if (tcp->tcp_rcv_list != NULL) {
@@ -8193,7 +7316,6 @@
 	tcp->tcp_fin_sent = 0;
 	tcp->tcp_ordrel_done = 0;
 
-	ASSERT(tcp->tcp_flow_stopped == 0);
 	tcp->tcp_debug = 0;
 	tcp->tcp_dontroute = 0;
 	tcp->tcp_broadcast = 0;
@@ -8390,14 +7512,22 @@
 	ASSERT(tcp->tcp_rthdrlen == 0);
 	PRESERVE(tcp->tcp_drop_opt_ack_cnt);
 
+	/* Reset fusion-related fields */
 	tcp->tcp_fused = B_FALSE;
 	tcp->tcp_unfusable = B_FALSE;
 	tcp->tcp_fused_sigurg = B_FALSE;
+	tcp->tcp_direct_sockfs = B_FALSE;
+	tcp->tcp_fuse_syncstr_stopped = B_FALSE;
 	tcp->tcp_loopback_peer = NULL;
+	tcp->tcp_fuse_rcv_hiwater = 0;
+	tcp->tcp_fuse_rcv_unread_hiwater = 0;
+	tcp->tcp_fuse_rcv_unread_cnt = 0;
 
 	tcp->tcp_in_ack_unsent = 0;
 	tcp->tcp_cork = B_FALSE;
 
+	PRESERVE(tcp->tcp_squeue_bytes);
+
 #undef	DONTCARE
 #undef	PRESERVE
 }
@@ -8469,10 +7599,16 @@
 	tcp->tcp_mdt_hdr_head = 0;
 	tcp->tcp_mdt_hdr_tail = 0;
 
+	/* Reset fusion-related fields */
 	tcp->tcp_fused = B_FALSE;
 	tcp->tcp_unfusable = B_FALSE;
 	tcp->tcp_fused_sigurg = B_FALSE;
+	tcp->tcp_direct_sockfs = B_FALSE;
+	tcp->tcp_fuse_syncstr_stopped = B_FALSE;
 	tcp->tcp_loopback_peer = NULL;
+	tcp->tcp_fuse_rcv_hiwater = 0;
+	tcp->tcp_fuse_rcv_unread_hiwater = 0;
+	tcp->tcp_fuse_rcv_unread_cnt = 0;
 
 	/* Initialize the header template */
 	if (tcp->tcp_ipversion == IPV4_VERSION) {
@@ -9505,7 +8641,7 @@
 	    MSEC_TO_TICK(firetime));
 }
 
-static int
+int
 tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
 {
 	queue_t	*q = tcp->tcp_rq;
@@ -9515,7 +8651,10 @@
 	if (TCP_IS_DETACHED(tcp))
 		return (mss);
 
-	if (tcp->tcp_mdt || tcp->tcp_maxpsz == 0) {
+	if (tcp->tcp_fused) {
+		maxpsz = tcp_fuse_maxpsz_set(tcp);
+		mss = INFPSZ;
+	} else if (tcp->tcp_mdt || tcp->tcp_maxpsz == 0) {
 		/*
 		 * Set the sd_qn_maxpsz according to the socket send buffer
 		 * size, and sd_maxblk to INFPSZ (-1).  This will essentially
@@ -9545,9 +8684,6 @@
 	if (set_maxblk)
 		(void) mi_set_sth_maxblk(q, mss);
 
-	if (tcp->tcp_loopback)
-		(void) mi_set_sth_copyopt(tcp->tcp_rq, COPYCACHED);
-
 	return (mss);
 }
 
@@ -9868,7 +9004,6 @@
 		 */
 		connp->conn_flags |= IPCL_SOCKET;
 		tcp->tcp_issocket = 1;
-
 		WR(q)->q_qinfo = &tcp_sock_winit;
 	} else {
 #ifdef	_ILP32
@@ -10452,32 +9587,45 @@
 			if (!checkonly)
 				tcp->tcp_dgram_errind = onoff;
 			break;
-		case SO_SNDBUF:
+		case SO_SNDBUF: {
+			tcp_t *peer_tcp;
+
 			if (*i1 > tcp_max_buf) {
 				*outlenp = 0;
 				return (ENOBUFS);
 			}
-			if (!checkonly) {
-				tcp->tcp_xmit_hiwater = *i1;
-				if (tcp_snd_lowat_fraction != 0)
-					tcp->tcp_xmit_lowater =
-					    tcp->tcp_xmit_hiwater /
-					    tcp_snd_lowat_fraction;
-				(void) tcp_maxpsz_set(tcp, B_TRUE);
-				/*
-				 * If we are flow-controlled, recheck the
-				 * condition. There are apps that increase
-				 * SO_SNDBUF size when flow-controlled
-				 * (EWOULDBLOCK), and expect the flow control
-				 * condition to be lifted right away.
-				 */
-				if (tcp->tcp_flow_stopped &&
-				    tcp->tcp_unsent < tcp->tcp_xmit_hiwater) {
-					tcp->tcp_flow_stopped = B_FALSE;
-					tcp_clrqfull(tcp);
-				}
-			}
-			break;
+			if (checkonly)
+				break;
+
+			tcp->tcp_xmit_hiwater = *i1;
+			if (tcp_snd_lowat_fraction != 0)
+				tcp->tcp_xmit_lowater =
+				    tcp->tcp_xmit_hiwater /
+				    tcp_snd_lowat_fraction;
+			(void) tcp_maxpsz_set(tcp, B_TRUE);
+			/*
+			 * If we are flow-controlled, recheck the condition.
+			 * There are apps that increase SO_SNDBUF size when
+			 * flow-controlled (EWOULDBLOCK), and expect the flow
+			 * control condition to be lifted right away.
+			 *
+			 * For the fused tcp loopback case, in order to avoid
+			 * a race with the peer's tcp_fuse_rrw() we need to
+			 * hold its fuse_lock while accessing tcp_flow_stopped.
+			 */
+			peer_tcp = tcp->tcp_loopback_peer;
+			ASSERT(!tcp->tcp_fused || peer_tcp != NULL);
+			if (tcp->tcp_fused)
+				mutex_enter(&peer_tcp->tcp_fuse_lock);
+
+			if (tcp->tcp_flow_stopped &&
+			    TCP_UNSENT_BYTES(tcp) < tcp->tcp_xmit_hiwater) {
+				tcp_clrqfull(tcp);
+			}
+			if (tcp->tcp_fused)
+				mutex_exit(&peer_tcp->tcp_fuse_lock);
+			break;
+		}
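
The rewritten SO_SNDBUF case rechecks the flow-control condition after the send buffer grows, and for fused loopback it takes the peer's tcp_fuse_lock so the check cannot race with the peer's tcp_fuse_rrw(). A user-space analogue of that shape, with a pthread mutex standing in for the fuse lock and all names hypothetical:

    #include <pthread.h>
    #include <stdbool.h>

    struct ep {
        pthread_mutex_t fuse_lock;
        bool flow_stopped;
        unsigned unsent, xmit_hiwater;
        struct ep *peer;        /* NULL when not fused */
    };

    static void
    sndbuf_grew(struct ep *e, unsigned hiwater)
    {
        e->xmit_hiwater = hiwater;
        if (e->peer != NULL)
            pthread_mutex_lock(&e->peer->fuse_lock);
        if (e->flow_stopped && e->unsent < e->xmit_hiwater)
            e->flow_stopped = false;    /* lift flow control now */
        if (e->peer != NULL)
            pthread_mutex_unlock(&e->peer->fuse_lock);
    }
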
 		case SO_RCVBUF:
 			if (*i1 > tcp_max_buf) {
 				*outlenp = 0;
@@ -11892,7 +11040,7 @@
  * M_DATA messages are added to the current element.
  * Other messages are added as new (b_next) elements.
  */
-static void
+void
 tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len)
 {
 	ASSERT(seg_len == msgdsize(mp));
@@ -12380,7 +11528,7 @@
 		BUMP_MIB(&ip_mib, ipsecInSucceeded);
 		return (B_TRUE);
 	}
-	(void) strlog(TCP_MODULE_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE,
+	(void) strlog(TCP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE,
 	    "tcp inbound policy mismatch: %s, packet dropped\n",
 	    reason);
 	BUMP_MIB(&ip_mib, ipsecInFailed);
@@ -13469,7 +12617,7 @@
 			 */
 			seg_len -= gap;
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+				(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 				    "tcp_rput: unacceptable, gap %d, rgap %d, "
 				    "flags 0x%x, seg_seq %u, seg_ack %u, "
 				    "seg_len %d, rnxt %u, snxt %u, %s",
@@ -13873,7 +13021,7 @@
 			tcp->tcp_urp_mark_mp = mp1;
 			flags |= TH_SEND_URP_MARK;
 #ifdef DEBUG
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 			    "tcp_rput: sent M_PCSIG 2 seq %x urp %x "
 			    "last %x, %s",
 			    seg_seq, urp, tcp->tcp_urp_last,
@@ -14012,7 +13160,7 @@
 				mp1->b_wptr = (uchar_t *)&tei[1];
 				tcp->tcp_urp_mp = mp1;
 #ifdef DEBUG
-				(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+				(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 				    "tcp_rput: allocated exdata_ind %s",
 				    tcp_display(tcp, NULL,
 				    DISP_PORT_ONLY));
@@ -14059,7 +13207,7 @@
 				tcp->tcp_urp_mark_mp->b_flag |= MSGMARKNEXT;
 			}
 #ifdef DEBUG
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 			    "tcp_rput: AT MARK, len %d, flags 0x%x, %s",
 			    seg_len, flags,
 			    tcp_display(tcp, NULL, DISP_PORT_ONLY));
@@ -14067,7 +13215,7 @@
 		} else {
 			/* Data left until we hit mark */
 #ifdef DEBUG
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 			    "tcp_rput: URP %d bytes left, %s",
 			    urp - seg_len, tcp_display(tcp, NULL,
 			    DISP_PORT_ONLY));
@@ -14990,7 +14138,7 @@
 		/* Ready for a new signal. */
 		tcp->tcp_urp_last_valid = B_FALSE;
 #ifdef DEBUG
-		(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 		    "tcp_rput: sending exdata_ind %s",
 		    tcp_display(tcp, NULL, DISP_PORT_ONLY));
 #endif /* DEBUG */
@@ -15026,7 +14174,7 @@
 			    tcp->tcp_fused_sigurg);
 			if (flags & TH_MARKNEXT_NEEDED) {
 #ifdef DEBUG
-				(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+				(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 				    "tcp_rput: sending MSGMARKNEXT %s",
 				    tcp_display(tcp, NULL,
 				    DISP_PORT_ONLY));
@@ -15167,7 +14315,7 @@
 		mp1 = tcp->tcp_urp_mark_mp;
 		tcp->tcp_urp_mark_mp = NULL;
 #ifdef DEBUG
-		(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 		    "tcp_rput: sending zero-length %s %s",
 		    ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" :
 		    "MSGNOTMARKNEXT"),
@@ -15853,7 +15001,7 @@
 			return;
 		case T_ERROR_ACK:
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_TRACE|SL_ERROR,
 				    "tcp_rput_other: case T_ERROR_ACK, "
 				    "ERROR_prim == %d",
@@ -15984,11 +15132,20 @@
 		ASSERT(tcp->tcp_connp->conn_sqp ==
 		    peer_tcp->tcp_connp->conn_sqp);
 
+		/*
+		 * Normally we would not get backenabled in synchronous
+		 * streams mode, but in case this happens, we need to stop
+		 * synchronous streams temporarily to prevent a race with
+		 * tcp_fuse_rrw() or tcp_fuse_rinfop().  It is safe to access
+		 * tcp_rcv_list here because those entry points will return
+		 * right away when synchronous streams is stopped.
+		 */
+		TCP_FUSE_SYNCSTR_STOP(tcp);
 		if (tcp->tcp_rcv_list != NULL)
 			(void) tcp_rcv_drain(tcp->tcp_rq, tcp);
 
 		tcp_clrqfull(peer_tcp);
-		peer_tcp->tcp_flow_stopped = B_FALSE;
+		TCP_FUSE_SYNCSTR_RESUME(tcp);
 		TCP_STAT(tcp_fusion_backenabled);
 		return;
 	}
@@ -16118,6 +15275,30 @@
 	uint32_t	max_transmittable_rwnd;
 	boolean_t	tcp_detached = TCP_IS_DETACHED(tcp);
 
+	if (tcp->tcp_fused) {
+		size_t sth_hiwat;
+		tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+		ASSERT(peer_tcp != NULL);
+		/*
+		 * Record the stream head's high water mark for
+		 * this endpoint; this is used for flow-control
+		 * purposes in tcp_fuse_output().
+		 */
+		sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd);
+		if (!tcp_detached)
+			(void) mi_set_sth_hiwat(tcp->tcp_rq, sth_hiwat);
+
+		/*
+		 * In the fusion case, the maxpsz stream head value of
+		 * our peer is set according to its send buffer size
+		 * and our receive buffer size; since the latter may
+		 * have changed we need to update the peer's maxpsz.
+		 */
+		(void) tcp_maxpsz_set(peer_tcp, B_TRUE);
+		return (rwnd);
+	}
+
 	if (tcp_detached)
 		old_max_rwnd = tcp->tcp_rwnd;
 	else
@@ -16196,23 +15377,16 @@
 	 * Set the Stream head high water mark. This doesn't have to be
 	 * here, since we are simply using default values, but we would
 	 * prefer to choose these values algorithmically, with a likely
-	 * relationship to rwnd.  For fused loopback tcp, we double the
-	 * amount of buffer in order to simulate the normal tcp case.
-	 */
-	if (tcp->tcp_fused) {
-		(void) mi_set_sth_hiwat(tcp->tcp_rq, MAX(rwnd << 1,
-		    tcp_sth_rcv_hiwat));
-	} else {
-		(void) mi_set_sth_hiwat(tcp->tcp_rq, MAX(rwnd,
-		    tcp_sth_rcv_hiwat));
-	}
+	 * relationship to rwnd.
+	 */
+	(void) mi_set_sth_hiwat(tcp->tcp_rq, MAX(rwnd, tcp_sth_rcv_hiwat));
 	return (rwnd);
 }
 
 /*
  * Return SNMP stuff in the buffer in mpdata.
  */
-static int
+int
 tcp_snmp_get(queue_t *q, mblk_t *mpctl)
 {
 	mblk_t			*mpdata;
@@ -16261,7 +15435,8 @@
 
 		connp = NULL;
 
-		while ((connp = tcp_get_next_conn(connfp, connp))) {
+		while ((connp =
+		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
 			tcp_t *tcp;
 
 			if (connp->conn_zoneid != zoneid)
@@ -16406,7 +15581,7 @@
 
 /* Return 0 if invalid set request, 1 otherwise, including non-tcp requests  */
 /* ARGSUSED */
-static int
+int
 tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len)
 {
 	mib2_tcpConnEntry_t	*tce = (mib2_tcpConnEntry_t *)ptr;
@@ -16627,7 +15802,8 @@
 
 		connp = NULL;
 
-		while ((connp = tcp_get_next_conn(connfp, connp))) {
+		while ((connp =
+		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
 			tcp = connp->conn_tcp;
 			if (zoneid != GLOBAL_ZONEID &&
 			    zoneid != connp->conn_zoneid)
@@ -16723,7 +15899,8 @@
 	for (i = 0; i < ipcl_bind_fanout_size; i++) {
 		connfp =  &ipcl_bind_fanout[i];
 		connp = NULL;
-		while ((connp = tcp_get_next_conn(connfp, connp))) {
+		while ((connp =
+		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
 			tcp = connp->conn_tcp;
 			if (zoneid != GLOBAL_ZONEID &&
 			    zoneid != connp->conn_zoneid)
@@ -16770,7 +15947,8 @@
 	for (i = 0; i < ipcl_conn_fanout_size; i++) {
 		connfp =  &ipcl_conn_fanout[i];
 		connp = NULL;
-		while ((connp = tcp_get_next_conn(connfp, connp))) {
+		while ((connp =
+		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
 			tcp = connp->conn_tcp;
 			if (zoneid != GLOBAL_ZONEID &&
 			    zoneid != connp->conn_zoneid)
@@ -16927,7 +16105,7 @@
 			 */
 			if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
 				if (tcp->tcp_debug) {
-					(void) strlog(TCP_MODULE_ID, 0, 1,
+					(void) strlog(TCP_MOD_ID, 0, 1,
 					    SL_TRACE, "tcp_timer: zero win");
 				}
 			} else {
@@ -17040,7 +16218,7 @@
 		return;
 	default:
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE|SL_ERROR,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
 			    "tcp_timer: strange state (%d) %s",
 			    tcp->tcp_state, tcp_display(tcp, NULL,
 			    DISP_PORT_ONLY));
@@ -17372,52 +16550,6 @@
 }
 
 /*
- * Write side put procedure for TCP module instance.
- * TCP as a module is only used for MIB browsers that push TCP over IP or
- * ARP. The only supported primitives are T_SVR4_OPTMGMT_REQ and
- * T_OPTMGMT_REQ. M_FLUSH messages are only passed downstream; we don't flush
- * our queues as we never enqueue messages there. All ioctls are NAKed and
- * everything else is freed.
- */
-static void
-tcp_wput_mod(queue_t *q, mblk_t *mp)
-{
-	switch (DB_TYPE(mp)) {
-	case M_PROTO:
-	case M_PCPROTO:
-		if ((MBLKL(mp) >= sizeof (t_scalar_t)) &&
-		    ((((union T_primitives *)mp->b_rptr)->type ==
-			T_SVR4_OPTMGMT_REQ) ||
-		    (((union T_primitives *)mp->b_rptr)->type ==
-			T_OPTMGMT_REQ))) {
-			/*
-			 * This is the only TPI primitive supported. Its
-			 * handling does not require tcp_t, but it does require
-			 * conn_t to check permissions.
-			 */
-			cred_t	*cr = DB_CREDDEF(mp, Q_TO_CONN(q)->conn_cred);
-			if (!snmpcom_req(q, mp, tcp_snmp_set,
-			    tcp_snmp_get, cr)) {
-				freemsg(mp);
-				return;
-			}
-		} else if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, ENOTSUP))
-		    != NULL)
-			qreply(q, mp);
-		break;
-	case M_FLUSH:
-		putnext(q, mp);
-		break;
-	case M_IOCTL:
-		miocnak(q, mp, 0, ENOTSUP);
-		break;
-	default:
-		freemsg(mp);
-		break;
-	}
-}
-
-/*
  * The TCP fast path write put procedure.
  * NOTE: the logic of the fast path is duplicated from tcp_wput_data()
  */
@@ -17441,6 +16573,7 @@
 	int		usable;
 	conn_t		*connp = (conn_t *)arg;
 	tcp_t		*tcp = connp->conn_tcp;
+	uint32_t	msize;
 
 	/*
 	 * Try and ASSERT the minimum possible references on the
@@ -17455,8 +16588,15 @@
 	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
 
 	/* Bypass tcp protocol for fused tcp loopback */
-	if (tcp->tcp_fused && tcp_fuse_output(tcp, mp))
-		return;
+	if (tcp->tcp_fused) {
+		msize = msgdsize(mp);
+		mutex_enter(&connp->conn_lock);
+		tcp->tcp_squeue_bytes -= msize;
+		mutex_exit(&connp->conn_lock);
+
+		if (tcp_fuse_output(tcp, mp, msize))
+			return;
+	}
 
 	mss = tcp->tcp_mss;
 	if (tcp->tcp_xmit_zc_clean)
@@ -17482,6 +16622,11 @@
 	    (len == 0) ||
 	    (len > mss) ||
 	    (tcp->tcp_valid_bits != 0)) {
+		msize = msgdsize(mp);
+		mutex_enter(&connp->conn_lock);
+		tcp->tcp_squeue_bytes -= msize;
+		mutex_exit(&connp->conn_lock);
+
 		tcp_wput_data(tcp, mp, B_FALSE);
 		return;
 	}
@@ -17489,6 +16634,10 @@
 	ASSERT(tcp->tcp_xmit_tail_unsent == 0);
 	ASSERT(tcp->tcp_fin_sent == 0);
 
+	mutex_enter(&connp->conn_lock);
+	tcp->tcp_squeue_bytes -= len;
+	mutex_exit(&connp->conn_lock);
+
 	/* queue new packet onto retransmission queue */
 	if (tcp->tcp_xmit_head == NULL) {
 		tcp->tcp_xmit_head = mp;
@@ -17536,6 +16685,11 @@
 		goto slow;
 	}
 
+	if (tcp->tcp_flow_stopped &&
+	    TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+		tcp_clrqfull(tcp);
+	}
+
 	/*
 	 * determine if anything to send (Nagle).
 	 *
@@ -17789,6 +16943,13 @@
 	mp = NULL;
 
 	/*
+	 * For a loopback connection with tcp_direct_sockfs on, note that
+	 * we don't have to protect tcp_rcv_list yet because synchronous
+	 * streams has not yet been enabled and tcp_fuse_rrw() cannot
+	 * possibly race with us.
+	 */
+
+	/*
 	 * Set the max window size (tcp_rq->q_hiwat) of the acceptor
 	 * properly.  This is the first time we know of the acceptor's
 	 * queue.  So we do it here.
@@ -17828,9 +16989,8 @@
 	/* Allocate room for SACK options if needed. */
 	stropt->so_flags |= SO_WROFF;
 	if (tcp->tcp_fused) {
-		size_t sth_hiwat;
-
 		ASSERT(tcp->tcp_loopback);
+		ASSERT(tcp->tcp_loopback_peer != NULL);
 		/*
 		 * For fused tcp loopback, set the stream head's write
 		 * offset value to zero since we won't be needing any room
@@ -17839,16 +16999,16 @@
 		 * Non-fused tcp loopback case is handled separately below.
 		 */
 		stropt->so_wroff = 0;
-
-		/*
-		 * Override q_hiwat and set it to be twice that of the
-		 * previous value; this is to simulate non-fusion case.
-		 */
-		sth_hiwat = q->q_hiwat << 1;
-		if (sth_hiwat > tcp_max_buf)
-			sth_hiwat = tcp_max_buf;
-
-		stropt->so_hiwat = MAX(sth_hiwat, tcp_sth_rcv_hiwat);
+		/*
+		 * Record the stream head's high water mark for this endpoint;
+		 * this is used for flow-control purposes in tcp_fuse_output().
+		 */
+		stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(tcp, q->q_hiwat);
+		/*
+		 * Update the peer's transmit parameters according to
+		 * our recently calculated high water mark value.
+		 */
+		(void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE);
 	} else if (tcp->tcp_snd_sack_ok) {
 		stropt->so_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN +
 		    (tcp->tcp_loopback ? 0 : tcp_wroff_xtra);
@@ -17857,15 +17017,6 @@
 		    tcp_wroff_xtra);
 	}
 
-	/*
-	 * If loopback, set COPYCACHED option to make sure NOT to use
-	 * non-temporal access.
-	 */
-	if (tcp->tcp_loopback) {
-		stropt->so_flags |= SO_COPYOPT;
-		stropt->so_copyopt = COPYCACHED;
-	}
-
 	/* Send the options up */
 	putnext(q, stropt_mp);
 
@@ -17909,7 +17060,6 @@
 			ASSERT(peer_tcp->tcp_fused);
 
 			tcp_clrqfull(peer_tcp);
-			peer_tcp->tcp_flow_stopped = B_FALSE;
 			TCP_STAT(tcp_fusion_backenabled);
 		}
 	}
@@ -17924,11 +17074,9 @@
 				 * tcp_clean_death was deferred
 				 * for T_ORDREL_IND - do it now
 				 */
-				(void) tcp_clean_death(
-					tcp,
-					    tcp->tcp_client_errno, 21);
-				tcp->tcp_deferred_clean_death =
-				    B_FALSE;
+				(void) tcp_clean_death(tcp,
+				    tcp->tcp_client_errno, 21);
+				tcp->tcp_deferred_clean_death = B_FALSE;
 			}
 		} else {
 			/*
@@ -17942,8 +17090,14 @@
 		tcp->tcp_hard_binding = B_FALSE;
 		tcp->tcp_hard_bound = B_TRUE;
 	}
+
 	tcp->tcp_detached = B_FALSE;
 
+	/* We can enable synchronous streams now */
+	if (tcp->tcp_fused) {
+		tcp_fuse_syncstr_enable_pair(tcp);
+	}
+
 	if (tcp->tcp_ka_enabled) {
 		tcp->tcp_ka_last_intrvl = 0;
 		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
@@ -18236,7 +17390,7 @@
 	}
 }
 
-static void
+void
 tcp_wput(queue_t *q, mblk_t *mp)
 {
 	conn_t	*connp = Q_TO_CONN(q);
@@ -18245,12 +17399,27 @@
 	t_scalar_t type;
 	uchar_t *rptr;
 	struct iocblk	*iocp;
+	uint32_t	msize;
 
 	ASSERT(connp->conn_ref >= 2);
 
 	switch (DB_TYPE(mp)) {
 	case M_DATA:
-		CONN_INC_REF(connp);
+		tcp = connp->conn_tcp;
+		ASSERT(tcp != NULL);
+
+		msize = msgdsize(mp);
+
+		mutex_enter(&connp->conn_lock);
+		CONN_INC_REF_LOCKED(connp);
+
+		tcp->tcp_squeue_bytes += msize;
+		if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) {
+			mutex_exit(&connp->conn_lock);
+			tcp_setqfull(tcp);
+		} else
+			mutex_exit(&connp->conn_lock);
+
 		(*tcp_squeue_wput_proc)(connp->conn_sqp, mp,
 		    tcp_output, connp, SQTAG_TCP_OUTPUT);
 		return;
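
The new M_DATA arm of tcp_wput() is the heart of the transmit-side flow-control fix: each message is charged to tcp_squeue_bytes under conn_lock before being queued to the squeue, so TCP_UNSENT_BYTES() can count data that is queued but not yet processed and assert backpressure early; the matching credits appear in the tcp_output() hunks just above. The accounting shape, sketched with hypothetical names and a plain counter under a mutex:

    #include <pthread.h>

    struct ep {
        pthread_mutex_t lock;
        unsigned squeue_bytes, unsent, xmit_hiwater;
    };

    /* TCP_UNSENT_BYTES analogue: unsent data plus bytes in the squeue. */
    static unsigned
    total_unsent(struct ep *e)
    {
        return (e->squeue_bytes + e->unsent);
    }

    static int
    enqueue_charge(struct ep *e, unsigned msize)
    {
        int backpressure;

        pthread_mutex_lock(&e->lock);
        e->squeue_bytes += msize;
        backpressure = (total_unsent(e) > e->xmit_hiwater);
        pthread_mutex_unlock(&e->lock);
        return (backpressure);  /* caller asserts qfull when set */
    }
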
@@ -18265,7 +17434,7 @@
 			type = ((union T_primitives *)rptr)->type;
 		} else {
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_ERROR|SL_TRACE,
 				    "tcp_wput_proto, dropping one...");
 			}
@@ -18292,7 +17461,7 @@
 		/*
 		 * Most ioctls can be processed right away without going via
 		 * squeues - process them right here. Those that do require
-		 * squeue (currently TCP_IOC_DEFAULT_Q and SIOCPOPSOCKFS)
+		 * squeue (currently TCP_IOC_DEFAULT_Q and _SIOCSOCKFALLBACK)
 		 * are processed by tcp_wput_ioctl().
 		 */
 		iocp = (struct iocblk *)mp->b_rptr;
@@ -18372,7 +17541,7 @@
 	ASSERT(wq->q_qinfo == &tcp_sock_winit);
 	wq->q_qinfo = &tcp_winit;
 
-	ASSERT(IS_TCP_CONN(connp));
+	ASSERT(IPCL_IS_TCP(connp));
 	ASSERT(TCP_IS_SOCKET(tcp));
 
 	if (DB_TYPE(mp) == M_PCPROTO &&
@@ -18540,7 +17709,6 @@
 	mutex_exit(&stp->sd_lock);
 }
 
-
 static void
 tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
 {
@@ -18555,7 +17723,6 @@
 	uint32_t	hcksum_txflags = 0;
 	mblk_t		*ire_fp_mp;
 	uint_t		ire_fp_mp_len;
-	ill_poll_capab_t *ill_poll;
 
 	ASSERT(DB_TYPE(mp) == M_DATA);
 
@@ -18699,7 +17866,7 @@
 		 */
 	}
 
-	if ((ill->ill_capabilities & ILL_CAPAB_HCKSUM) && dohwcksum) {
+	if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
 		ASSERT(ill->ill_hcksum_capab != NULL);
 		hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags;
 	}
@@ -18710,53 +17877,21 @@
 	ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION);
 	up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
 
-	/*
-	 * Underlying interface supports hardware checksum offload for
-	 * the tcp payload, along with M_DATA fast path; leave the payload
-	 * checksum for the hardware to calculate.
-	 *
-	 * N.B: We only need to set up checksum info on the first mblk.
-	 */
-	if (hcksum_txflags & HCKSUM_INET_FULL_V4) {
-		/*
-		 * Hardware calculates pseudo-header, header and payload
-		 * checksums, so clear checksum field in TCP header.
-		 */
-		*up = 0;
-		mp->b_datap->db_struioun.cksum.flags |= HCK_FULLCKSUM;
-	} else if (hcksum_txflags & HCKSUM_INET_PARTIAL) {
-		uint32_t sum;
-		/*
-		 * Partial checksum offload has been enabled.  Fill the
-		 * checksum field in the TCP header with the pseudo-header
-		 * checksum value.
-		 */
-		sum = *up + cksum + IP_TCP_CSUM_COMP;
-		sum = (sum & 0xFFFF) + (sum >> 16);
-		*up = (sum & 0xFFFF) + (sum >> 16);
-		mp->b_datap->db_cksumstart = IP_SIMPLE_HDR_LENGTH;
-		mp->b_datap->db_cksumstuff = IP_SIMPLE_HDR_LENGTH + 16;
-		mp->b_datap->db_cksumend = ntohs(ipha->ipha_length);
-		mp->b_datap->db_struioun.cksum.flags |= HCK_PARTIALCKSUM;
-	} else {
-		/* software checksumming */
+	IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up,
+	    IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum);
+
+	/* Software checksum? */
+	if (DB_CKSUMFLAGS(mp) == 0) {
 		TCP_STAT(tcp_out_sw_cksum);
-		*up = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH,
-		    cksum + IP_TCP_CSUM_COMP);
-		mp->b_datap->db_struioun.cksum.flags = 0;
+		TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes,
+		    ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH);
 	}
 
 	ipha->ipha_fragment_offset_and_flags |=
 	    (uint32_t)htons(ire->ire_frag_flag);
 
-	/*
-	 * Hardware supports IP header checksum offload; clear contents
-	 * of IP header checksum field.  Otherwise we calculate it.
-	 */
-	if (hcksum_txflags & HCKSUM_IPHDRCKSUM) {
-		ipha->ipha_hdr_checksum = 0;
-		mp->b_datap->db_struioun.cksum.flags |= HCK_IPV4_HDRCKSUM;
-	} else {
+	/* Calculate IP header checksum if hardware isn't capable */
+	if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
 		IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0],
 		    ((uint16_t *)ipha)[4]);
 	}
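For reference, the two-step one's-complement fold that the removed partial-offload code applied to the pseudo-header sum (now hidden inside IP_CKSUM_XMIT_FAST) can be sketched in user-level C; this is an illustrative sketch only, not part of the changeset:

#include <stdint.h>

/*
 * Sketch only: fold a 32-bit accumulated sum into a 16-bit
 * one's-complement checksum.  Two folds suffice because the
 * first can leave at most a one-bit carry in the high half.
 */
static uint16_t
cksum_fold(uint32_t sum)
{
	sum = (sum & 0xFFFF) + (sum >> 16);	/* fold high half */
	sum = (sum & 0xFFFF) + (sum >> 16);	/* fold the carry */
	return ((uint16_t)sum);
}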
@@ -18769,13 +17904,13 @@
 	ire->ire_last_used_time = lbolt;
 	BUMP_MIB(&ip_mib, ipOutRequests);
 
-	if (ill->ill_capabilities & ILL_CAPAB_POLL) {
-		ill_poll = ill->ill_poll_capab;
-		ASSERT(ill_poll != NULL);
-		ASSERT(ill_poll->ill_tx != NULL);
-		ASSERT(ill_poll->ill_tx_handle != NULL);
-
-		ill_poll->ill_tx(ill_poll->ill_tx_handle, mp);
+	if (ILL_POLL_CAPABLE(ill)) {
+		/*
+		 * Send the packet directly to DLD, where it may be queued
+		 * depending on the availability of transmit resources at
+		 * the media layer.
+		 */
+		IP_POLL_ILL_TX(ill, mp);
 	} else {
 		putnext(ire->ire_stq, mp);
 	}
@@ -18876,7 +18011,7 @@
 			    DISP_ADDR_AND_PORT));
 #else
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_TRACE|SL_ERROR,
 				    "tcp_wput_data: data after ordrel, %s\n",
 				    tcp_display(tcp, NULL,
@@ -18888,6 +18023,10 @@
 		    (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) != 0)
 			tcp_zcopy_notify(tcp);
 		freemsg(mp);
+		if (tcp->tcp_flow_stopped &&
+		    TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+			tcp_clrqfull(tcp);
+		}
 		return;
 	}
 
@@ -19214,15 +18353,12 @@
 		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
 	}
 	/* Note that len is the amount we just sent but with a negative sign */
-	len += tcp->tcp_unsent;
-	tcp->tcp_unsent = len;
+	tcp->tcp_unsent += len;
 	if (tcp->tcp_flow_stopped) {
-		if (len <= tcp->tcp_xmit_lowater) {
-			tcp->tcp_flow_stopped = B_FALSE;
+		if (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
 			tcp_clrqfull(tcp);
 		}
-	} else if (len >= tcp->tcp_xmit_hiwater) {
-		tcp->tcp_flow_stopped = B_TRUE;
+	} else if (TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater) {
 		tcp_setqfull(tcp);
 	}
 }
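The hunk above stops manipulating tcp_flow_stopped directly and instead drives tcp_setqfull()/tcp_clrqfull() from TCP_UNSENT_BYTES(tcp), which now also counts squeue-enqueued bytes. A hedged, user-level model of the hysteresis involved (simplified names; not kernel code):

#include <stdbool.h>

/* Sketch only: stop at the high water mark, resume below the low one. */
typedef struct {
	long	unsent;		/* cf. TCP_UNSENT_BYTES(tcp) */
	long	lowater;	/* cf. tcp_xmit_lowater */
	long	hiwater;	/* cf. tcp_xmit_hiwater */
	bool	stopped;	/* cf. tcp_flow_stopped */
} flow_t;

static void
flow_update(flow_t *f, long delta)
{
	f->unsent += delta;
	if (f->stopped) {
		if (f->unsent <= f->lowater)
			f->stopped = false;	/* cf. tcp_clrqfull() */
	} else if (f->unsent >= f->hiwater) {
		f->stopped = true;		/* cf. tcp_setqfull() */
	}
}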
@@ -19361,6 +18497,12 @@
 }
 
 /*
+ * Smaller and private version of pdescinfo_t used specifically for TCP,
+ * which allows for only two payload spans per packet.
+ */
+typedef struct tcp_pdescinfo_s PDESCINFO_STRUCT(2) tcp_pdescinfo_t;
+
+/*
  * tcp_multisend() is called by tcp_wput_data() for the Multidata
  * Transmit scheme, and returns one of the following:
  *
@@ -19404,9 +18546,6 @@
 #define	IPVER(ip6h)	((((uint32_t *)ip6h)[0] >> 4) & 0x7)
 #endif
 
-#define	TCP_CSUM_OFFSET	16
-#define	TCP_CSUM_SIZE	2
-
 #define	PREP_NEW_MULTIDATA() {			\
 	mmd = NULL;				\
 	md_mp = md_hbuf = NULL;			\
@@ -19542,8 +18681,7 @@
 
 	ill = ire_to_ill(ire);
 	ASSERT(ill != NULL);
-	ASSERT((ill->ill_capabilities & ILL_CAPAB_MDT) == 0 ||
-	    ill->ill_mdt_capab != NULL);
+	ASSERT(!ILL_MDT_CAPABLE(ill) || ill->ill_mdt_capab != NULL);
 
 	if (!tcp->tcp_ire_ill_check_done) {
 		tcp_ire_ill_check(tcp, ire, ill, B_TRUE);
@@ -19576,16 +18714,16 @@
 
 	/* does the interface support hardware checksum offload? */
 	hwcksum_flags = 0;
-	if ((ill->ill_capabilities & ILL_CAPAB_HCKSUM) &&
+	if (ILL_HCKSUM_CAPABLE(ill) &&
 	    (ill->ill_hcksum_capab->ill_hcksum_txflags &
-	    (HCKSUM_INET_FULL_V4 | HCKSUM_INET_PARTIAL | HCKSUM_IPHDRCKSUM)) &&
-	    dohwcksum) {
+	    (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6 | HCKSUM_INET_PARTIAL |
+	    HCKSUM_IPHDRCKSUM)) && dohwcksum) {
 		if (ill->ill_hcksum_capab->ill_hcksum_txflags &
 		    HCKSUM_IPHDRCKSUM)
 			hwcksum_flags = HCK_IPV4_HDRCKSUM;
 
 		if (ill->ill_hcksum_capab->ill_hcksum_txflags &
-		    HCKSUM_INET_FULL_V4)
+		    (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6))
 			hwcksum_flags |= HCK_FULLCKSUM;
 		else if (ill->ill_hcksum_capab->ill_hcksum_txflags &
 		    HCKSUM_INET_PARTIAL)
@@ -19726,10 +18864,16 @@
 			 * checksum offload; these are currently for IPv4.
 			 * For full checksum offload, they are set to zero.
 			 */
-			if (af == AF_INET &&
-			    (hwcksum_flags & HCK_PARTIALCKSUM)) {
-				start = IP_SIMPLE_HDR_LENGTH;
-				stuff = IP_SIMPLE_HDR_LENGTH + TCP_CSUM_OFFSET;
+			if ((hwcksum_flags & HCK_PARTIALCKSUM)) {
+				if (af == AF_INET) {
+					start = IP_SIMPLE_HDR_LENGTH;
+					stuff = IP_SIMPLE_HDR_LENGTH +
+					    TCP_CHECKSUM_OFFSET;
+				} else {
+					start = IPV6_HDR_LEN;
+					stuff = IPV6_HDR_LEN +
+					    TCP_CHECKSUM_OFFSET;
+				}
 			} else {
 				start = stuff = 0;
 			}
@@ -19748,8 +18892,8 @@
 			    /* fastpath mblk */
 			    (af == AF_INET) ? ire->ire_dlureq_mp :
 			    ire->ire_nce->nce_res_mp,
-			    /* hardware checksum enabled (IPv4 only) */
-			    (af == AF_INET && hwcksum_flags != 0),
+			    /* hardware checksum enabled */
+			    (hwcksum_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)),
 			    /* hardware checksum offsets */
 			    start, stuff, 0,
 			    /* hardware checksum flag */
@@ -20224,8 +19368,8 @@
 				ASSERT(IPVER(ip6h) == IPV6_VERSION);
 				ASSERT(ip6h->ip6_nxt == IPPROTO_TCP);
 				ASSERT(PDESC_HDRL(pkt_info) >=
-				    (IPV6_HDR_LEN + TCP_CSUM_OFFSET +
-				    TCP_CSUM_SIZE));
+				    (IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET +
+				    TCP_CHECKSUM_SIZE));
 				ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
 
 				if (tcp->tcp_ip_forward_progress) {
@@ -20273,29 +19417,45 @@
 				/* offset for TCP header checksum */
 				up = IPH_TCPH_CHECKSUMP(ipha,
 				    IP_SIMPLE_HDR_LENGTH);
-
-				if (hwcksum_flags & HCK_FULLCKSUM) {
-					/*
-					 * Hardware calculates pseudo-header,
-					 * header and payload checksums, so
-					 * zero out this field.
-					 */
-					*up = 0;
-				} else if (hwcksum_flags & HCK_PARTIALCKSUM) {
-					uint32_t sum;
-
-					/* pseudo-header checksumming */
-					sum = *up + cksum + IP_TCP_CSUM_COMP;
-					sum = (sum & 0xFFFF) + (sum >> 16);
-					*up = (sum & 0xFFFF) + (sum >> 16);
-				} else {
-					/* software checksumming */
-					TCP_STAT(tcp_out_sw_cksum);
-					*up = IP_MD_CSUM(pkt,
-					    IP_SIMPLE_HDR_LENGTH,
-					    cksum + IP_TCP_CSUM_COMP);
-				}
-
+			} else {
+				up = (uint16_t *)&ip6h->ip6_src;
+
+				/* calculate pseudo-header checksum */
+				cksum = up[0] + up[1] + up[2] + up[3] +
+				    up[4] + up[5] + up[6] + up[7] +
+				    up[8] + up[9] + up[10] + up[11] +
+				    up[12] + up[13] + up[14] + up[15];
+
+				/* Fold the initial sum */
+				cksum = (cksum & 0xffff) + (cksum >> 16);
+
+				up = (uint16_t *)(((uchar_t *)ip6h) +
+				    IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET);
+			}
+
+			if (hwcksum_flags & HCK_FULLCKSUM) {
+				/* clear checksum field for hardware */
+				*up = 0;
+			} else if (hwcksum_flags & HCK_PARTIALCKSUM) {
+				uint32_t sum;
+
+				/* pseudo-header checksumming */
+				sum = *up + cksum + IP_TCP_CSUM_COMP;
+				sum = (sum & 0xFFFF) + (sum >> 16);
+				*up = (sum & 0xFFFF) + (sum >> 16);
+			} else {
+				/* software checksumming */
+				TCP_STAT(tcp_out_sw_cksum);
+				TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes,
+				    tcp->tcp_hdr_len + tcp->tcp_last_sent_len);
+				*up = IP_MD_CSUM(pkt, tcp->tcp_ip_hdr_len,
+				    cksum + IP_TCP_CSUM_COMP);
+				if (*up == 0)
+					*up = 0xFFFF;
+			}
+
+			/* IPv4 header checksum */
+			if (af == AF_INET) {
 				ipha->ipha_fragment_offset_and_flags |=
 				    (uint32_t)htons(ire->ire_frag_flag);
 
@@ -20306,19 +19466,6 @@
 					    ((uint32_t *)ipha)[0],
 					    ((uint16_t *)ipha)[4]);
 				}
-			} else {
-				up = (uint16_t *)(((uchar_t *)ip6h) +
-				    IPV6_HDR_LEN + TCP_CSUM_OFFSET);
-
-				/*
-				 * Software checksumming (hardware checksum
-				 * offload for IPv6 will hopefully be
-				 * implemented one day).
-				 */
-				TCP_STAT(tcp_out_sw_cksum);
-				*up = IP_MD_CSUM(pkt,
-				    IPV6_HDR_LEN - 2 * sizeof (in6_addr_t),
-				    htons(IPPROTO_TCP));
 			}
 
 			/* advance header offset */
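The IPv6 branch added above computes a partial pseudo-header sum over the source and destination addresses as sixteen 16-bit words and folds it once; the partial-offload and software paths then fold again after further additions. A hedged user-level sketch of that step (not part of the changeset):

#include <stdint.h>

/*
 * Sketch only: sum sixteen 16-bit words (the IPv6 source and
 * destination addresses) and perform one folding pass; callers
 * fold again after adding the remaining pseudo-header fields.
 */
static uint32_t
ipv6_addr_partial_sum(const uint16_t w[16])
{
	uint32_t sum = 0;
	int i;

	for (i = 0; i < 16; i++)
		sum += w[i];
	return ((sum & 0xffff) + (sum >> 16));
}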
@@ -20373,8 +19520,6 @@
 #undef PREP_NEW_MULTIDATA
 #undef PREP_NEW_PBUF
 #undef IPVER
-#undef TCP_CSUM_OFFSET
-#undef TCP_CSUM_SIZE
 
 	IRE_REFRELE(ire);
 	return (0);
@@ -20999,7 +20144,7 @@
 	 */
 	if (ip_multidata_outbound && check_mdt &&
 	    !(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
-	    ill != NULL && (ill->ill_capabilities & ILL_CAPAB_MDT) &&
+	    ill != NULL && ILL_MDT_CAPABLE(ill) &&
 	    !CONN_IPSEC_OUT_ENCAPSULATED(connp) &&
 	    !(ire->ire_flags & RTF_MULTIRT) &&
 	    !IPP_ENABLED(IPP_LOCAL_OUT) &&
@@ -21112,7 +20257,6 @@
 		 * tcp_xmit_lowater, so re-enable flow.
 		 */
 		if (tcp->tcp_flow_stopped) {
-			tcp->tcp_flow_stopped = B_FALSE;
 			tcp_clrqfull(tcp);
 		}
 	}
@@ -21305,26 +20449,47 @@
 		}
 		tcp_def_q_set(tcp, mp);
 		return;
-	case SIOCPOPSOCKFS:
-		/*
-		 * sockfs is being I_POP'ed, reset the flag
-		 * indicating this
-		 */
-		tcp->tcp_issocket = B_FALSE;
-
-		/*
-		 * Insert this socket into the acceptor hash.
-		 * We might need it for T_CONN_RES message
-		 */
+	case _SIOCSOCKFALLBACK:
+		/*
+		 * Either sockmod is about to be popped and the socket
+		 * would now be treated as a plain stream, or a module
+		 * is about to be pushed so we could no longer use read-
+		 * side synchronous streams for fused loopback tcp.
+		 * Drain any queued data and disable direct sockfs
+		 * interface from now on.
+		 */
+		if (!tcp->tcp_issocket) {
+			DB_TYPE(mp) = M_IOCNAK;
+			iocp->ioc_error = EINVAL;
+		} else {
 #ifdef	_ILP32
-		tcp->tcp_acceptor_id = (t_uscalar_t)RD(q);
+			tcp->tcp_acceptor_id = (t_uscalar_t)RD(q);
 #else
-		tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev;
+			tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev;
 #endif
-		tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
-		mp->b_datap->db_type = M_IOCACK;
+			/*
+			 * Insert this socket into the acceptor hash.
+			 * We might need it for T_CONN_RES message
+			 */
+			tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
+
+			if (tcp->tcp_fused) {
+				/*
+				 * This is a fused loopback tcp; disable
+				 * read-side synchronous streams interface
+				 * and drain any queued data.  It is okay
+				 * to do this for non-synchronous streams
+				 * fused tcp as well.
+				 */
+				tcp_fuse_disable_pair(tcp, B_FALSE);
+			}
+			tcp->tcp_issocket = B_FALSE;
+			TCP_STAT(tcp_sock_fallback);
+
+			DB_TYPE(mp) = M_IOCACK;
+			iocp->ioc_error = 0;
+		}
 		iocp->ioc_count = 0;
-		iocp->ioc_error = 0;
 		iocp->ioc_rval = 0;
 		qreply(q, mp);
 		return;
@@ -21364,7 +20529,9 @@
 	if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
 		type = ((union T_primitives *)rptr)->type;
 		if (type == T_EXDATA_REQ) {
-			len = msgdsize(mp->b_cont) - 1;
+			uint32_t msize = msgdsize(mp->b_cont);
+
+			len = msize - 1;
 			if (len < 0) {
 				freemsg(mp);
 				return;
@@ -21381,7 +20548,7 @@
 			tcp->tcp_valid_bits |= TCP_URG_VALID;
 
 			/* Bypass tcp protocol for fused tcp loopback */
-			if (tcp->tcp_fused && tcp_fuse_output(tcp, mp))
+			if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
 				return;
 		} else if (type != T_DATA_REQ) {
 			goto non_urgent_data;
@@ -21393,7 +20560,7 @@
 		return;
 	} else {
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 			    "tcp_wput_proto, dropping one...");
 		}
 		freemsg(mp);
@@ -21454,7 +20621,7 @@
 			 * the other side. Just ignore it.
 			 */
 			if (tcp->tcp_debug) {
-				(void) strlog(TCP_MODULE_ID, 0, 1,
+				(void) strlog(TCP_MOD_ID, 0, 1,
 				    SL_ERROR|SL_TRACE,
 				    "tcp_wput_proto, T_ORDREL_REQ out of "
 				    "state %s",
@@ -21468,7 +20635,7 @@
 		break;
 	default:
 		if (tcp->tcp_debug) {
-			(void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 			    "tcp_wput_proto, bogus TPI msg, type %d",
 			    tprim->type);
 		}
@@ -21530,7 +20697,7 @@
 
 	/* If a text string is passed in with the request, pass it to strlog. */
 	if (str != NULL && tcp->tcp_debug) {
-		(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 		    "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x",
 		    str, seq, ack, ctl);
 	}
@@ -21737,7 +20904,7 @@
 	}
 
 	if (str && q && tcp_dbg) {
-		(void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
 		    "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, "
 		    "flags 0x%x",
 		    str, seq, ack, ctl);
@@ -22478,7 +21645,7 @@
 }
 
 /* This function handles the push timeout. */
-static void
+void
 tcp_push_timer(void *arg)
 {
 	conn_t	*connp = (conn_t *)arg;
@@ -22488,10 +21655,18 @@
 
 	ASSERT(tcp->tcp_listener == NULL);
 
+	/*
+	 * We need to stop synchronous streams temporarily to prevent a race
+	 * with tcp_fuse_rrw() or tcp_fuse_rinfop().  It is safe to access
+	 * tcp_rcv_list here because those entry points will return right
+	 * away when synchronous streams is stopped.
+	 */
+	TCP_FUSE_SYNCSTR_STOP(tcp);
 	tcp->tcp_push_tid = 0;
 	if ((tcp->tcp_rcv_list != NULL) &&
 	    (tcp_rcv_drain(tcp->tcp_rq, tcp) == TH_ACK_NEEDED))
 		tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
+	TCP_FUSE_SYNCSTR_RESUME(tcp);
 }
 
 /*
@@ -24059,15 +23234,14 @@
 	tcp_iss_key_init((uint8_t *)&tcp_g_t_info_ack,
 	    sizeof (tcp_g_t_info_ack));
 
-#if TCP_COUNTERS || TCP_DEBUG_COUNTER
-	if ((tcp_kstat = kstat_create("tcp", 0, "tcpstat",
+	if ((tcp_kstat = kstat_create(TCP_MOD_NAME, 0, "tcpstat",
 		"net", KSTAT_TYPE_NAMED,
 		sizeof (tcp_statistics) / sizeof (kstat_named_t),
 		KSTAT_FLAG_VIRTUAL)) != NULL) {
 		tcp_kstat->ks_data = &tcp_statistics;
 		kstat_install(tcp_kstat);
 	}
-#endif
+
 	tcp_kstat_init();
 }
 
@@ -24181,7 +23355,8 @@
 		connfp = &ipcl_globalhash_fanout[i];
 		connp = NULL;
 
-		while ((connp = tcp_get_next_conn(connfp, connp))) {
+		while ((connp =
+		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
 
 			tcp = connp->conn_tcp;
 			cl_tcpi.cl_tcpi_version = CL_TCPI_V1;
@@ -24373,7 +23548,7 @@
 	 */
 	if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
 		logflags |= SL_CONSOLE;
-	(void) strlog(TCP_MODULE_ID, 0, 1, logflags,
+	(void) strlog(TCP_MOD_ID, 0, 1, logflags,
 		"TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, "
 		"start = %d, end = %d\n", lbuf, lport, rbuf, rport,
 		acp->ac_start, acp->ac_end);
@@ -24529,7 +23704,7 @@
 	 */
 	if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
 		logflags |= SL_CONSOLE;
-	(void) strlog(TCP_MODULE_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: "
+	(void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: "
 	    "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' '));
 	if (err == 0 && count == 0)
 		err = ENOENT;
@@ -24846,7 +24021,7 @@
 	}
 done:
 	if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
-		mp->b_datap->db_cksumstart = 0;
+		DB_CKSUMSTART(mp) = 0;
 		mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
 		TCP_STAT(tcp_time_wait_syn_fail);
 	}
@@ -24965,7 +24140,7 @@
 /*
  * TCP Timers Implementation.
  */
-static timeout_id_t
+timeout_id_t
 tcp_timeout(conn_t *connp, void (*f)(void *), clock_t tim)
 {
 	mblk_t *mp;
@@ -25038,7 +24213,7 @@
  * it. But since both should execute on the same squeue, this race should not
  * occur.
  */
-static clock_t
+clock_t
 tcp_timeout_cancel(conn_t *connp, timeout_id_t id)
 {
 	mblk_t	*mp = (mblk_t *)id;
@@ -25165,30 +24340,48 @@
  * End of TCP Timers implementation.
  */
 
-static void
+/*
+ * The tcp_{set,clr}qfull() functions set or clear QFULL on the
+ * specified backing STREAMS q.  Note that the caller may base the
+ * decision to call on the tcp_t.tcp_flow_stopped value, which when
+ * checked outside the q's lock is only an advisory check.
+ */
+
+void
 tcp_setqfull(tcp_t *tcp)
 {
 	queue_t *q = tcp->tcp_wq;
 
 	if (!(q->q_flag & QFULL)) {
-		TCP_STAT(tcp_flwctl_on);
 		mutex_enter(QLOCK(q));
-		q->q_flag |= QFULL;
-		mutex_exit(QLOCK(q));
-	}
-}
-
-static void
+		if (!(q->q_flag & QFULL)) {
+			/* still need to set QFULL */
+			q->q_flag |= QFULL;
+			tcp->tcp_flow_stopped = B_TRUE;
+			mutex_exit(QLOCK(q));
+			TCP_STAT(tcp_flwctl_on);
+		} else {
+			mutex_exit(QLOCK(q));
+		}
+	}
+}
+
+void
 tcp_clrqfull(tcp_t *tcp)
 {
 	queue_t *q = tcp->tcp_wq;
 
 	if (q->q_flag & QFULL) {
 		mutex_enter(QLOCK(q));
-		q->q_flag &= ~QFULL;
-		mutex_exit(QLOCK(q));
-		if (q->q_flag & QWANTW)
-			qbackenable(q, 0);
+		if (q->q_flag & QFULL) {
+			q->q_flag &= ~QFULL;
+			tcp->tcp_flow_stopped = B_FALSE;
+			mutex_exit(QLOCK(q));
+			if (q->q_flag & QWANTW)
+				qbackenable(q, 0);
+		} else {
+			mutex_exit(QLOCK(q));
+		}
 	}
 }
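The rewritten helpers above test QFULL twice: once unlocked as an advisory check, then again under QLOCK before changing it, so tcp_flow_stopped is only toggled while the queue lock is held. A hedged generic sketch of this check/lock/re-check pattern (user-level, simplified names):

#include <pthread.h>
#include <stdbool.h>

/* Sketch only: double-checked flag update under a lock. */
typedef struct {
	pthread_mutex_t	lock;	/* cf. QLOCK(q) */
	bool		full;	/* cf. q_flag & QFULL */
} qstate_t;

static void
set_full(qstate_t *q)
{
	if (!q->full) {			/* advisory, unlocked */
		pthread_mutex_lock(&q->lock);
		if (!q->full)		/* authoritative, locked */
			q->full = true;
		pthread_mutex_unlock(&q->lock);
	}
}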
 
@@ -25254,8 +24447,8 @@
 		{ "connTableSize6",	KSTAT_DATA_INT32, 0 }
 	};
 
-	tcp_mibkp = kstat_create("tcp", 0, "tcp", "mib2", KSTAT_TYPE_NAMED,
-	    NUM_OF_FIELDS(tcp_named_kstat_t), 0);
+	tcp_mibkp = kstat_create(TCP_MOD_NAME, 0, TCP_MOD_NAME,
+	    "mib2", KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0);
 
 	if (tcp_mibkp == NULL)
 		return;
@@ -25304,7 +24497,8 @@
 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
 		connfp = &ipcl_globalhash_fanout[i];
 		connp = NULL;
-		while ((connp = tcp_get_next_conn(connfp, connp))) {
+		while ((connp =
+		    ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
 			tcp = connp->conn_tcp;
 			switch (tcp_snmp_state(tcp)) {
 			case MIB2_TCP_established:
@@ -25401,7 +24595,7 @@
 	tcph = (tcph_t *)&mp->b_rptr[hdr_len];
 	if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) {
 		mp->b_datap->db_struioflag |= STRUIO_EAGER;
-		mp->b_datap->db_cksumstart = (intptr_t)sqp;
+		DB_CKSUMSTART(mp) = (intptr_t)sqp;
 	}
 
 	squeue_fill(connp->conn_sqp, mp, connp->conn_recv, connp,
--- a/usr/src/uts/common/inet/tcp/tcp6ddi.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/tcp/tcp6ddi.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -37,7 +37,13 @@
 #define	INET_DEVDESC	"TCP6 STREAMS driver %I%"
 #define	INET_MODDESC	"TCP6 STREAMS module %I%"
 #define	INET_DEVMINOR	TCP_MINOR6
-#define	INET_DEVMTFLAGS	D_MP
+/*
+ * Note that unlike UDP, TCP uses synchronous STREAMS only
+ * for TCP Fusion (loopback); this is why we don't define
+ * D_SYNCSTR here.  Since TCP as a module is used only for
+ * SNMP purposes, we define _D_DIRECT for the device instance.
+ */
+#define	INET_DEVMTFLAGS	(D_MP|_D_DIRECT)
 #define	INET_MODMTFLAGS	D_MP
 
 #include "../inetddi.c"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c	Sat Oct 22 22:50:14 2005 -0700
@@ -0,0 +1,1087 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/strsubr.h>
+#include <sys/debug.h>
+#include <sys/cmn_err.h>
+#include <sys/tihdr.h>
+
+#include <inet/common.h>
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+#include <inet/tcp.h>
+#include <inet/tcp_impl.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ipclassifier.h>
+#include <inet/ipp_common.h>
+
+/*
+ * This file implements TCP fusion - a protocol-less data path for TCP
+ * loopback connections.  The fusion of two local TCP endpoints occurs
+ * at connection establishment time.  Various conditions (see details
+ * in tcp_fuse()) need to be met for fusion to be successful.  If it
+ * fails, we fall back to the regular TCP data path; if it succeeds,
+ * both endpoints proceed to use tcp_fuse_output() as the transmit path.
+ * tcp_fuse_output() enqueues application data directly onto the peer's
+ * receive queue; no protocol processing is involved.  After enqueueing
+ * the data, the sender can either push (putnext) data up the receiver's
+ * read queue; or the sender can simply return and let the receiver
+ * retrieve the enqueued data via the synchronous streams entry point
+ * tcp_fuse_rrw().  The latter path is taken if synchronous streams is
+ * enabled (the default).  It is disabled if sockfs no longer resides
+ * directly on top of tcp module due to a module insertion or removal.
+ * It also needs to be temporarily disabled when sending urgent data
+ * because the tcp_fuse_rrw() path bypasses the M_PROTO processing done
+ * by the strsock_proto() hook.
+ *
+ * Synchronization is handled by the squeue and the mutex tcp_fuse_lock.
+ * One of the requirements for fusion to succeed is that both endpoints
+ * need to be using the same squeue.  This ensures that neither side
+ * can disappear while the other side is still sending data.  By itself,
+ * squeue is not sufficient for guaranteeing safety when synchronous
+ * streams is enabled.  The reason is that tcp_fuse_rrw() doesn't enter
+ * the squeue and its access to tcp_rcv_list and other fusion-related
+ * fields needs to be synchronized with the sender.  tcp_fuse_lock is
+ * used for this purpose.  When there is urgent data, the sender needs
+ * to push the data up the receiver's streams read queue.  In order to
+ * avoid holding the tcp_fuse_lock across putnext(), the sender sets
+ * the peer tcp's tcp_fuse_syncstr_stopped bit and releases tcp_fuse_lock
+ * (see macro TCP_FUSE_SYNCSTR_STOP()).  If tcp_fuse_rrw() enters after
+ * this point, it will see that synchronous streams is temporarily
+ * stopped and it will immediately return EBUSY without accessing the
+ * tcp_rcv_list or other fields protected by the tcp_fuse_lock.  This
+ * will result in strget() calling getq_noenab() to dequeue data from
+ * the stream head instead.  After the sender has finished pushing up
+ * all urgent data, it will clear the tcp_fuse_syncstr_stopped bit using
+ * TCP_FUSE_SYNCSTR_RESUME and the receiver may then resume using
+ * tcp_fuse_rrw() to retrieve data from tcp_rcv_list.
+ *
+ * The following note applies only to the synchronous streams mode.
+ *
+ * Flow control is done by checking the size of the receive buffer and
+ * the number of data blocks, each against its own limit.  This is
+ * different from regular streams flow control, where the cumulative
+ * size check dominates the block count check -- a streams queue high
+ * water mark typically represents bytes.  Each enqueue triggers
+ * notifications to the receiving process; a build-up of data blocks
+ * indicates a slow receiver, and the sender should be blocked or
+ * informed at the earliest moment instead of further wasting system
+ * resources.  In effect, this is equivalent to limiting the number
+ * of outstanding segments in flight.
+ */
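A hedged user-level analogy of the stop/resume handshake described above (simplified names and types; the kernel uses tcp_fuse_lock, tcp_fuse_syncstr_stopped and tcp_rcv_list):

#include <pthread.h>
#include <errno.h>
#include <stddef.h>

typedef struct {
	pthread_mutex_t	lock;		/* cf. tcp_fuse_lock */
	int		stopped;	/* cf. tcp_fuse_syncstr_stopped */
	void		*rcv_list;	/* cf. tcp_rcv_list */
} fuse_ep_t;

/* Sketch only: reader bails out while the sender has stopped syncstr. */
static int
reader_rrw(fuse_ep_t *ep, void **datap)
{
	pthread_mutex_lock(&ep->lock);
	if (ep->stopped) {
		pthread_mutex_unlock(&ep->lock);
		return (EBUSY);	/* strget() then reads the stream head */
	}
	*datap = ep->rcv_list;	/* dequeue everything at once */
	ep->rcv_list = NULL;
	pthread_mutex_unlock(&ep->lock);
	return (0);
}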
+
+/*
+ * Macros that determine whether or not IP processing is needed for TCP.
+ */
+#define	TCP_IPOPT_POLICY_V4(tcp)					\
+	((tcp)->tcp_ipversion == IPV4_VERSION &&			\
+	((tcp)->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH ||		\
+	CONN_OUTBOUND_POLICY_PRESENT((tcp)->tcp_connp) ||		\
+	CONN_INBOUND_POLICY_PRESENT((tcp)->tcp_connp)))
+
+#define	TCP_IPOPT_POLICY_V6(tcp)					\
+	((tcp)->tcp_ipversion == IPV6_VERSION &&			\
+	((tcp)->tcp_ip_hdr_len != IPV6_HDR_LEN ||			\
+	CONN_OUTBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp) ||		\
+	CONN_INBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp)))
+
+#define	TCP_LOOPBACK_IP(tcp)						\
+	(TCP_IPOPT_POLICY_V4(tcp) || TCP_IPOPT_POLICY_V6(tcp) ||	\
+	!CONN_IS_MD_FASTPATH((tcp)->tcp_connp))
+
+/*
+ * Setting this to B_FALSE disables fusion altogether; loopback
+ * connections then go through the regular protocol paths.
+ */
+boolean_t do_tcp_fusion = B_TRUE;
+
+/*
+ * Enabling this flag allows sockfs to retrieve data directly
+ * from a fused tcp endpoint using synchronous streams interface.
+ */
+boolean_t do_tcp_direct_sockfs = B_TRUE;
+
+/*
+ * This is the minimum number of outstanding writes allowed on
+ * a synchronous streams-enabled receiving endpoint before the
+ * sender gets flow-controlled.  Setting this value to 0 means
+ * that the data block limit is equivalent to the byte count
+ * limit, which essentially disables the check.
+ */
+#define	TCP_FUSION_RCV_UNREAD_MIN	8
+uint_t tcp_fusion_rcv_unread_min = TCP_FUSION_RCV_UNREAD_MIN;
+
+static void	tcp_fuse_syncstr_enable(tcp_t *);
+static void	tcp_fuse_syncstr_disable(tcp_t *);
+static void	strrput_sig(queue_t *, boolean_t);
+
+/*
+ * This routine gets called by the eager tcp upon changing state from
+ * SYN_RCVD to ESTABLISHED.  It fuses a direct path between itself
+ * and the active connect tcp such that the regular tcp processing
+ * may be bypassed under allowable circumstances.  Because the fusion
+ * requires both endpoints to be in the same squeue, it does not work
+ * for simultaneous active connects because there is no easy way to
+ * switch from one squeue to another once the connection is created.
+ * This is different from the eager tcp case where we assign it the
+ * same squeue as the one given to the active connect tcp during open.
+ */
+void
+tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
+{
+	conn_t *peer_connp, *connp = tcp->tcp_connp;
+	tcp_t *peer_tcp;
+
+	ASSERT(!tcp->tcp_fused);
+	ASSERT(tcp->tcp_loopback);
+	ASSERT(tcp->tcp_loopback_peer == NULL);
+	/*
+	 * We need to inherit q_hiwat of the listener tcp, but we can't
+	 * really use tcp_listener since we get here after sending up
+	 * T_CONN_IND and tcp_wput_accept() may be called independently,
+	 * at which point tcp_listener is cleared; this is why we use
+	 * tcp_saved_listener.  The listener itself is guaranteed to be
+	 * around until tcp_accept_finish() is called on this eager --
+	 * this won't happen until we're done since we're inside the
+	 * eager's perimeter now.
+	 */
+	ASSERT(tcp->tcp_saved_listener != NULL);
+
+	/*
+	 * Lookup peer endpoint; search for the remote endpoint having
+	 * the reversed address-port quadruplet in ESTABLISHED state,
+	 * which is guaranteed to be unique in the system.  Zone check
+	 * is applied accordingly for loopback address, but not for
+	 * local address since we want fusion to happen across Zones.
+	 */
+	if (tcp->tcp_ipversion == IPV4_VERSION) {
+		peer_connp = ipcl_conn_tcp_lookup_reversed_ipv4(connp,
+		    (ipha_t *)iphdr, tcph);
+	} else {
+		peer_connp = ipcl_conn_tcp_lookup_reversed_ipv6(connp,
+		    (ip6_t *)iphdr, tcph);
+	}
+
+	/*
+	 * We can only proceed if the peer exists, resides on the same
+	 * squeue as our conn, and is not a raw socket.  The squeue
+	 * assignment of this eager tcp was done earlier at the time of
+	 * SYN processing in ip_fanout_tcp{_v6}.  Note that sharing a
+	 * squeue by itself doesn't guarantee a safe condition to fuse,
+	 * hence we perform additional tests below.
+	 */
+	ASSERT(peer_connp == NULL || peer_connp != connp);
+	if (peer_connp == NULL || peer_connp->conn_sqp != connp->conn_sqp ||
+	    !IPCL_IS_TCP(peer_connp)) {
+		if (peer_connp != NULL) {
+			TCP_STAT(tcp_fusion_unqualified);
+			CONN_DEC_REF(peer_connp);
+		}
+		return;
+	}
+	peer_tcp = peer_connp->conn_tcp;	/* active connect tcp */
+
+	ASSERT(peer_tcp != NULL && peer_tcp != tcp && !peer_tcp->tcp_fused);
+	ASSERT(peer_tcp->tcp_loopback && peer_tcp->tcp_loopback_peer == NULL);
+	ASSERT(peer_connp->conn_sqp == connp->conn_sqp);
+
+	/*
+	 * Fuse the endpoints; we perform further checks against both
+	 * tcp endpoints to ensure that a fusion is allowed to happen.
+	 * In particular we bail out for non-simple TCP/IP or if IPsec/
+	 * IPQoS policy exists.
+	 */
+	if (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable &&
+	    !TCP_LOOPBACK_IP(tcp) && !TCP_LOOPBACK_IP(peer_tcp) &&
+	    !IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) {
+		mblk_t *mp;
+		struct stroptions *stropt;
+		queue_t *peer_rq = peer_tcp->tcp_rq;
+
+		ASSERT(!TCP_IS_DETACHED(peer_tcp) && peer_rq != NULL);
+		ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
+		ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL);
+
+		/*
+		 * We need to drain data on both endpoints during unfuse.
+		 * If we need to send up SIGURG at the time of draining,
+		 * we want to be sure that an mblk is readily available.
+		 * This is why we pre-allocate the M_PCSIG mblks for both
+		 * endpoints which will only be used during/after unfuse.
+		 */
+		if ((mp = allocb(1, BPRI_HI)) == NULL)
+			goto failed;
+
+		tcp->tcp_fused_sigurg_mp = mp;
+
+		if ((mp = allocb(1, BPRI_HI)) == NULL)
+			goto failed;
+
+		peer_tcp->tcp_fused_sigurg_mp = mp;
+
+		/* Allocate M_SETOPTS mblk */
+		if ((mp = allocb(sizeof (*stropt), BPRI_HI)) == NULL)
+			goto failed;
+
+		/* Fuse both endpoints */
+		peer_tcp->tcp_loopback_peer = tcp;
+		tcp->tcp_loopback_peer = peer_tcp;
+		peer_tcp->tcp_fused = tcp->tcp_fused = B_TRUE;
+
+		/*
+		 * We never use regular tcp paths in fusion and should
+		 * therefore clear tcp_unsent on both endpoints.  Having
+		 * them set to non-zero values means asking for trouble
+		 * especially after unfuse, where we may end up sending
+		 * through regular tcp paths which expect xmit_list and
+		 * friends to be correctly setup.
+		 */
+		peer_tcp->tcp_unsent = tcp->tcp_unsent = 0;
+
+		tcp_timers_stop(tcp);
+		tcp_timers_stop(peer_tcp);
+
+		/*
+		 * At this point we are a detached eager tcp and therefore
+		 * don't have a queue assigned to us until accept happens.
+		 * In the meantime the peer endpoint may immediately send
+		 * us data as soon as fusion is finished, and we need to be
+		 * able to flow control it in case it sends down a huge
+		 * amount of data while we're still detached.  To prevent
+		 * that we
+		 * inherit the listener's q_hiwat value; this is temporary
+		 * since we'll repeat the process in tcp_accept_finish().
+		 */
+		(void) tcp_fuse_set_rcv_hiwat(tcp,
+		    tcp->tcp_saved_listener->tcp_rq->q_hiwat);
+
+		/*
+		 * Set the stream head's write offset value to zero since we
+		 * won't be needing any room for TCP/IP headers; tell it to
+		 * not break up the writes (this would reduce the amount of
+		 * work done by kmem); and configure our receive buffer.
+		 * Note that we can only do this for the active connect tcp
+		 * since our eager is still detached; it will be dealt with
+		 * later in tcp_accept_finish().
+		 */
+		DB_TYPE(mp) = M_SETOPTS;
+		mp->b_wptr += sizeof (*stropt);
+
+		stropt = (struct stroptions *)mp->b_rptr;
+		stropt->so_flags = SO_MAXBLK | SO_WROFF | SO_HIWAT;
+		stropt->so_maxblk = tcp_maxpsz_set(peer_tcp, B_FALSE);
+		stropt->so_wroff = 0;
+
+		/*
+		 * Record the stream head's high water mark for
+		 * peer endpoint; this is used for flow-control
+		 * purposes in tcp_fuse_output().
+		 */
+		stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(peer_tcp,
+		    peer_rq->q_hiwat);
+
+		/* Send the options up */
+		putnext(peer_rq, mp);
+	} else {
+		TCP_STAT(tcp_fusion_unqualified);
+	}
+	CONN_DEC_REF(peer_connp);
+	return;
+
+failed:
+	if (tcp->tcp_fused_sigurg_mp != NULL) {
+		freeb(tcp->tcp_fused_sigurg_mp);
+		tcp->tcp_fused_sigurg_mp = NULL;
+	}
+	if (peer_tcp->tcp_fused_sigurg_mp != NULL) {
+		freeb(peer_tcp->tcp_fused_sigurg_mp);
+		peer_tcp->tcp_fused_sigurg_mp = NULL;
+	}
+	CONN_DEC_REF(peer_connp);
+}
+
+/*
+ * Unfuse a previously-fused pair of tcp loopback endpoints.
+ */
+void
+tcp_unfuse(tcp_t *tcp)
+{
+	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+	ASSERT(tcp->tcp_fused && peer_tcp != NULL);
+	ASSERT(peer_tcp->tcp_fused && peer_tcp->tcp_loopback_peer == tcp);
+	ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
+	ASSERT(tcp->tcp_unsent == 0 && peer_tcp->tcp_unsent == 0);
+	ASSERT(tcp->tcp_fused_sigurg_mp != NULL);
+	ASSERT(peer_tcp->tcp_fused_sigurg_mp != NULL);
+
+	/*
+	 * We disable synchronous streams, drain any queued data and
+	 * clear tcp_direct_sockfs.  The synchronous streams entry
+	 * points will become no-ops after this point.
+	 */
+	tcp_fuse_disable_pair(tcp, B_TRUE);
+
+	/*
+	 * Update th_seq and th_ack in the header template
+	 */
+	U32_TO_ABE32(tcp->tcp_snxt, tcp->tcp_tcph->th_seq);
+	U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
+	U32_TO_ABE32(peer_tcp->tcp_snxt, peer_tcp->tcp_tcph->th_seq);
+	U32_TO_ABE32(peer_tcp->tcp_rnxt, peer_tcp->tcp_tcph->th_ack);
+
+	/* Unfuse the endpoints */
+	peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE;
+	peer_tcp->tcp_loopback_peer = tcp->tcp_loopback_peer = NULL;
+}
+
+/*
+ * Fusion output routine for urgent data.  This routine is called by
+ * tcp_fuse_output() for handling non-M_DATA mblks.
+ */
+void
+tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp)
+{
+	mblk_t *mp1;
+	struct T_exdata_ind *tei;
+	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+	mblk_t *head, *prev_head = NULL;
+
+	ASSERT(tcp->tcp_fused);
+	ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
+	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
+	ASSERT(mp->b_cont != NULL && DB_TYPE(mp->b_cont) == M_DATA);
+	ASSERT(MBLKL(mp) >= sizeof (*tei) && MBLKL(mp->b_cont) > 0);
+
+	/*
+	 * Urgent data arrives in the form of T_EXDATA_REQ from above.
+	 * Each occurrence denotes a new urgent pointer.  For each new
+	 * urgent pointer we signal (SIGURG) the receiving app to indicate
+	 * that it needs to go into urgent mode.  This is similar to the
+	 * urgent data handling in the regular tcp.  We don't need to keep
+	 * track of where the urgent pointer is, because each T_EXDATA_REQ
+	 * "advances" the urgent pointer for us.
+	 *
+	 * The actual urgent data carried by T_EXDATA_REQ is then prepended
+	 * by a T_EXDATA_IND before being enqueued behind any existing data
+	 * destined for the receiving app.  There is only a single urgent
+	 * pointer (out-of-band mark) for a given tcp.  If the new urgent
+	 * data arrives before the receiving app reads some existing urgent
+	 * data, the previous marker is lost.  This behavior is emulated
+	 * accordingly below, by removing any existing T_EXDATA_IND messages
+	 * and essentially converting old urgent data into non-urgent.
+	 */
+	ASSERT(tcp->tcp_valid_bits & TCP_URG_VALID);
+	/* Let sender get out of urgent mode */
+	tcp->tcp_valid_bits &= ~TCP_URG_VALID;
+
+	/*
+	 * This flag indicates that a signal needs to be sent up.
+	 * This flag will only get cleared once SIGURG is delivered and
+	 * is not affected by the tcp_fused flag -- delivery will still
+	 * happen even after an endpoint is unfused, to handle the case
+	 * where the sending endpoint immediately closes/unfuses after
+	 * sending urgent data and the accept is not yet finished.
+	 */
+	peer_tcp->tcp_fused_sigurg = B_TRUE;
+
+	/* Reuse T_EXDATA_REQ mblk for T_EXDATA_IND */
+	DB_TYPE(mp) = M_PROTO;
+	tei = (struct T_exdata_ind *)mp->b_rptr;
+	tei->PRIM_type = T_EXDATA_IND;
+	tei->MORE_flag = 0;
+	mp->b_wptr = (uchar_t *)&tei[1];
+
+	TCP_STAT(tcp_fusion_urg);
+	BUMP_MIB(&tcp_mib, tcpOutUrg);
+
+	head = peer_tcp->tcp_rcv_list;
+	while (head != NULL) {
+		/*
+		 * Remove existing T_EXDATA_IND, keep the data which follows
+		 * it and relink our list.  Note that we don't modify the
+		 * tcp_rcv_last_tail since it never points to T_EXDATA_IND.
+		 */
+		if (DB_TYPE(head) != M_DATA) {
+			mp1 = head;
+
+			ASSERT(DB_TYPE(mp1->b_cont) == M_DATA);
+			head = mp1->b_cont;
+			mp1->b_cont = NULL;
+			head->b_next = mp1->b_next;
+			mp1->b_next = NULL;
+			if (prev_head != NULL)
+				prev_head->b_next = head;
+			if (peer_tcp->tcp_rcv_list == mp1)
+				peer_tcp->tcp_rcv_list = head;
+			if (peer_tcp->tcp_rcv_last_head == mp1)
+				peer_tcp->tcp_rcv_last_head = head;
+			freeb(mp1);
+		}
+		prev_head = head;
+		head = head->b_next;
+	}
+}
+
+/*
+ * Fusion output routine, called by tcp_output() and tcp_wput_proto().
+ */
+boolean_t
+tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
+{
+	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+	queue_t *peer_rq;
+	uint_t max_unread;
+	boolean_t flow_stopped;
+	boolean_t urgent = (DB_TYPE(mp) != M_DATA);
+
+	ASSERT(tcp->tcp_fused);
+	ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
+	ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
+	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO ||
+	    DB_TYPE(mp) == M_PCPROTO);
+
+	peer_rq = peer_tcp->tcp_rq;
+	max_unread = peer_tcp->tcp_fuse_rcv_unread_hiwater;
+
+	/* If this connection requires IP, unfuse and use regular path */
+	if (TCP_LOOPBACK_IP(tcp) || TCP_LOOPBACK_IP(peer_tcp) ||
+	    IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) {
+		TCP_STAT(tcp_fusion_aborted);
+		tcp_unfuse(tcp);
+		return (B_FALSE);
+	}
+
+	if (send_size == 0) {
+		freemsg(mp);
+		return (B_TRUE);
+	}
+
+	/*
+	 * Handle urgent data; we either send up SIGURG to the peer now
+	 * or do it later when we drain, in case the peer is detached
+	 * or if we're short of memory for M_PCSIG mblk.
+	 */
+	if (urgent) {
+		/*
+		 * We stop synchronous streams when we have urgent data
+		 * queued to prevent tcp_fuse_rrw() from pulling it.  If
+		 * for some reason the urgent data can't be delivered
+		 * below, synchronous streams will remain stopped until
+		 * someone drains the tcp_rcv_list.
+		 */
+		TCP_FUSE_SYNCSTR_STOP(peer_tcp);
+		tcp_fuse_output_urg(tcp, mp);
+	}
+
+	mutex_enter(&peer_tcp->tcp_fuse_lock);
+	/*
+	 * Wake up and signal the peer; it is okay to do this before
+	 * enqueueing because we are holding the lock.  One of the
+	 * advantages of synchronous streams is the ability for us to
+	 * find out when the application performs a read on the socket,
+	 * by way of the tcp_fuse_rrw() entry point being called.  Any
+	 * data that gets enqueued onto the receiver is treated as if
+	 * it has arrived at the receiving endpoint, thus generating
+	 * SIGPOLL/SIGIO for an asynchronous socket just as in the strrput()
+	 * case.  However, we only wake up the application when necessary,
+	 * i.e. during the first enqueue.  When tcp_fuse_rrw() is called
+	 * it will send everything upstream.
+	 */
+	if (peer_tcp->tcp_direct_sockfs && !urgent &&
+	    !TCP_IS_DETACHED(peer_tcp)) {
+		if (peer_tcp->tcp_rcv_list == NULL)
+			STR_WAKEUP_SET(STREAM(peer_tcp->tcp_rq));
+		/* Update poll events and send SIGPOLL/SIGIO if necessary */
+		STR_SENDSIG(STREAM(peer_tcp->tcp_rq));
+	}
+
+	/*
+	 * Enqueue data into the peer's receive list; we may or may not
+	 * drain the contents depending on the conditions below.
+	 */
+	tcp_rcv_enqueue(peer_tcp, mp, send_size);
+
+	/* In case it wrapped around and also to keep it constant */
+	peer_tcp->tcp_rwnd += send_size;
+
+	/*
+	 * Exercise flow-control when needed; we will get back-enabled
+	 * in either tcp_accept_finish(), tcp_unfuse(), or tcp_fuse_rrw().
+	 * If tcp_direct_sockfs is on or if the peer endpoint is detached,
+	 * we emulate streams flow control by checking the peer's queue
+	 * size and high water mark; otherwise we simply use canputnext()
+	 * to decide if we need to stop our flow.
+	 *
+	 * The outstanding unread data block check does not apply for a
+	 * detached receiver; this is to avoid unnecessary blocking of the
+	 * sender while the accept is still in progress; this behavior
+	 * is quite similar to that of the regular tcp.
+	 */
+	if (TCP_IS_DETACHED(peer_tcp) || max_unread == 0)
+		max_unread = UINT_MAX;
+
+	flow_stopped = tcp->tcp_flow_stopped;
+	if (!flow_stopped &&
+	    (((peer_tcp->tcp_direct_sockfs || TCP_IS_DETACHED(peer_tcp)) &&
+	    (peer_tcp->tcp_rcv_cnt >= peer_tcp->tcp_fuse_rcv_hiwater ||
+	    ++peer_tcp->tcp_fuse_rcv_unread_cnt >= max_unread)) ||
+	    (!peer_tcp->tcp_direct_sockfs &&
+	    !TCP_IS_DETACHED(peer_tcp) && !canputnext(peer_tcp->tcp_rq)))) {
+		tcp_setqfull(tcp);
+		flow_stopped = B_TRUE;
+		TCP_STAT(tcp_fusion_flowctl);
+		DTRACE_PROBE4(tcp__fuse__output__flowctl, tcp_t *, tcp,
+		    uint_t, send_size, uint_t, peer_tcp->tcp_rcv_cnt,
+		    uint_t, peer_tcp->tcp_fuse_rcv_unread_cnt);
+	} else if (flow_stopped &&
+	    TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+		tcp_clrqfull(tcp);
+	}
+
+	loopback_packets++;
+	tcp->tcp_last_sent_len = send_size;
+
+	/* Need to adjust the following SNMP MIB-related variables */
+	tcp->tcp_snxt += send_size;
+	tcp->tcp_suna = tcp->tcp_snxt;
+	peer_tcp->tcp_rnxt += send_size;
+	peer_tcp->tcp_rack = peer_tcp->tcp_rnxt;
+
+	BUMP_MIB(&tcp_mib, tcpOutDataSegs);
+	UPDATE_MIB(&tcp_mib, tcpOutDataBytes, send_size);
+
+	BUMP_MIB(&tcp_mib, tcpInSegs);
+	BUMP_MIB(&tcp_mib, tcpInDataInorderSegs);
+	UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, send_size);
+
+	BUMP_LOCAL(tcp->tcp_obsegs);
+	BUMP_LOCAL(peer_tcp->tcp_ibsegs);
+
+	mutex_exit(&peer_tcp->tcp_fuse_lock);
+
+	DTRACE_PROBE2(tcp__fuse__output, tcp_t *, tcp, uint_t, send_size);
+
+	if (!TCP_IS_DETACHED(peer_tcp)) {
+		/*
+		 * Drain the peer's receive queue if it has urgent data or if
+		 * we're not flow-controlled.  There is no need for draining
+		 * normal data when tcp_direct_sockfs is on because the peer
+		 * will pull the data via tcp_fuse_rrw().
+		 */
+		if (urgent || (!flow_stopped && !peer_tcp->tcp_direct_sockfs)) {
+			ASSERT(peer_tcp->tcp_rcv_list != NULL);
+			(void) tcp_fuse_rcv_drain(peer_rq, peer_tcp, NULL);
+			/*
+			 * If synchronous streams was stopped above due
+			 * to the presence of urgent data, re-enable it.
+			 */
+			if (urgent)
+				TCP_FUSE_SYNCSTR_RESUME(peer_tcp);
+		}
+	}
+	return (B_TRUE);
+}
+
+/*
+ * This routine gets called to deliver data upstream on a fused or
+ * previously fused tcp loopback endpoint; the latter happens only
+ * when there is a pending SIGURG signal plus urgent data that can't
+ * be sent upstream in the past.
+ */
+boolean_t
+tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp)
+{
+	mblk_t *mp;
+#ifdef DEBUG
+	uint_t cnt = 0;
+#endif
+
+	ASSERT(tcp->tcp_loopback);
+	ASSERT(tcp->tcp_fused || tcp->tcp_fused_sigurg);
+	ASSERT(!tcp->tcp_fused || tcp->tcp_loopback_peer != NULL);
+	ASSERT(sigurg_mpp != NULL || tcp->tcp_fused);
+
+	/* No need for the push timer now, in case it was scheduled */
+	if (tcp->tcp_push_tid != 0) {
+		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
+		tcp->tcp_push_tid = 0;
+	}
+	/*
+	 * If there's urgent data sitting in the receive list and we didn't
+	 * get a chance to send up a SIGURG signal, make sure we send
+	 * it first before draining in order to ensure that SIOCATMARK
+	 * works properly.
+	 */
+	if (tcp->tcp_fused_sigurg) {
+		/*
+		 * sigurg_mpp is normally NULL, i.e. when we're still
+		 * fused and didn't get here because of tcp_unfuse().
+		 * In this case try hard to allocate the M_PCSIG mblk.
+		 */
+		if (sigurg_mpp == NULL &&
+		    (mp = allocb(1, BPRI_HI)) == NULL &&
+		    (mp = allocb_tryhard(1)) == NULL) {
+			/* Alloc failed; try again next time */
+			tcp->tcp_push_tid = TCP_TIMER(tcp, tcp_push_timer,
+			    MSEC_TO_TICK(tcp_push_timer_interval));
+			return (B_TRUE);
+		} else if (sigurg_mpp != NULL) {
+			/*
+			 * Use the supplied M_PCSIG mblk; it means we're
+			 * either unfused or in the process of unfusing,
+			 * and the drain must happen now.
+			 */
+			mp = *sigurg_mpp;
+			*sigurg_mpp = NULL;
+		}
+		ASSERT(mp != NULL);
+
+		tcp->tcp_fused_sigurg = B_FALSE;
+		/* Send up the signal */
+		DB_TYPE(mp) = M_PCSIG;
+		*mp->b_wptr++ = (uchar_t)SIGURG;
+		putnext(q, mp);
+		/*
+		 * Let the regular tcp_rcv_drain() path handle
+		 * draining the data if we're no longer fused.
+		 */
+		if (!tcp->tcp_fused)
+			return (B_FALSE);
+	}
+
+	/*
+	 * In the synchronous streams case, we generate SIGPOLL/SIGIO for
+	 * each M_DATA that gets enqueued onto the receiver.  At this point
+	 * we are about to drain any queued data via putnext().  In order
+	 * to avoid extraneous signal generation from strrput(), we set
+	 * STRGETINPROG flag at the stream head prior to the draining and
+	 * restore it afterwards.  This masks out signal generation only
+	 * for M_DATA messages and does not affect urgent data.
+	 */
+	if (tcp->tcp_direct_sockfs)
+		strrput_sig(q, B_FALSE);
+
+	/* Drain the data */
+	while ((mp = tcp->tcp_rcv_list) != NULL) {
+		tcp->tcp_rcv_list = mp->b_next;
+		mp->b_next = NULL;
+#ifdef DEBUG
+		cnt += msgdsize(mp);
+#endif
+		putnext(q, mp);
+		TCP_STAT(tcp_fusion_putnext);
+	}
+
+	if (tcp->tcp_direct_sockfs)
+		strrput_sig(q, B_TRUE);
+
+	ASSERT(cnt == tcp->tcp_rcv_cnt);
+	tcp->tcp_rcv_last_head = NULL;
+	tcp->tcp_rcv_last_tail = NULL;
+	tcp->tcp_rcv_cnt = 0;
+	tcp->tcp_fuse_rcv_unread_cnt = 0;
+	tcp->tcp_rwnd = q->q_hiwat;
+
+	return (B_TRUE);
+}
+
+/*
+ * Synchronous stream entry point for sockfs to retrieve
+ * data directly from tcp_rcv_list.
+ */
+int
+tcp_fuse_rrw(queue_t *q, struiod_t *dp)
+{
+	tcp_t *tcp = Q_TO_CONN(q)->conn_tcp;
+	mblk_t *mp;
+
+	mutex_enter(&tcp->tcp_fuse_lock);
+	/*
+	 * If someone had turned off tcp_direct_sockfs or if synchronous
+	 * streams is temporarily disabled, we return EBUSY.  This causes
+	 * strget() to dequeue data from the stream head instead.
+	 */
+	if (!tcp->tcp_direct_sockfs || tcp->tcp_fuse_syncstr_stopped) {
+		mutex_exit(&tcp->tcp_fuse_lock);
+		TCP_STAT(tcp_fusion_rrw_busy);
+		return (EBUSY);
+	}
+
+	if ((mp = tcp->tcp_rcv_list) != NULL) {
+		tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+		DTRACE_PROBE3(tcp__fuse__rrw, tcp_t *, tcp,
+		    uint32_t, tcp->tcp_rcv_cnt, ssize_t, dp->d_uio.uio_resid);
+
+		tcp->tcp_rcv_list = NULL;
+		TCP_STAT(tcp_fusion_rrw_msgcnt);
+
+		/*
+		 * At this point nothing should be left in tcp_rcv_list.
+		 * The only possible case where we would have a chain of
+		 * b_next-linked messages is urgent data, but we wouldn't
+		 * be here if that's true since urgent data is delivered
+		 * via putnext() and synchronous streams is stopped until
+		 * tcp_fuse_rcv_drain() is finished.
+		 */
+		ASSERT(DB_TYPE(mp) == M_DATA && mp->b_next == NULL);
+
+		tcp->tcp_rcv_last_head = NULL;
+		tcp->tcp_rcv_last_tail = NULL;
+		tcp->tcp_rcv_cnt = 0;
+		tcp->tcp_fuse_rcv_unread_cnt = 0;
+
+		if (peer_tcp->tcp_flow_stopped) {
+			tcp_clrqfull(peer_tcp);
+			TCP_STAT(tcp_fusion_backenabled);
+		}
+	}
+
+	/*
+	 * Either we just dequeued everything or we get here from sockfs
+	 * and have nothing to return; in this case clear RSLEEP.
+	 */
+	ASSERT(tcp->tcp_rcv_last_head == NULL);
+	ASSERT(tcp->tcp_rcv_last_tail == NULL);
+	ASSERT(tcp->tcp_rcv_cnt == 0);
+	ASSERT(tcp->tcp_fuse_rcv_unread_cnt == 0);
+	STR_WAKEUP_CLEAR(STREAM(q));
+
+	mutex_exit(&tcp->tcp_fuse_lock);
+	dp->d_mp = mp;
+	return (0);
+}
+
+/*
+ * Synchronous stream entry point used by certain ioctls to retrieve
+ * information about or peek into the tcp_rcv_list.
+ */
+int
+tcp_fuse_rinfop(queue_t *q, infod_t *dp)
+{
+	tcp_t	*tcp = Q_TO_CONN(q)->conn_tcp;
+	mblk_t	*mp;
+	uint_t	cmd = dp->d_cmd;
+	int	res = 0;
+	int	error = 0;
+	struct stdata *stp = STREAM(q);
+
+	mutex_enter(&tcp->tcp_fuse_lock);
+	/* If shutdown on read has happened, return nothing */
+	mutex_enter(&stp->sd_lock);
+	if (stp->sd_flag & STREOF) {
+		mutex_exit(&stp->sd_lock);
+		goto done;
+	}
+	mutex_exit(&stp->sd_lock);
+
+	/*
+	 * It is OK not to return an answer if tcp_rcv_list is
+	 * currently not accessible.
+	 */
+	if (!tcp->tcp_direct_sockfs || tcp->tcp_fuse_syncstr_stopped ||
+	    (mp = tcp->tcp_rcv_list) == NULL)
+		goto done;
+
+	if (cmd & INFOD_COUNT) {
+		/*
+		 * We have at least one message and
+		 * could return only one at a time.
+		 */
+		dp->d_count++;
+		res |= INFOD_COUNT;
+	}
+	if (cmd & INFOD_BYTES) {
+		/*
+		 * Return size of all data messages.
+		 */
+		dp->d_bytes += tcp->tcp_rcv_cnt;
+		res |= INFOD_BYTES;
+	}
+	if (cmd & INFOD_FIRSTBYTES) {
+		/*
+		 * Return size of first data message.
+		 */
+		dp->d_bytes = msgdsize(mp);
+		res |= INFOD_FIRSTBYTES;
+		dp->d_cmd &= ~INFOD_FIRSTBYTES;
+	}
+	if (cmd & INFOD_COPYOUT) {
+		mblk_t *mp1;
+		int n;
+
+		if (DB_TYPE(mp) == M_DATA) {
+			mp1 = mp;
+		} else {
+			mp1 = mp->b_cont;
+			ASSERT(mp1 != NULL);
+		}
+
+		/*
+		 * Return data contents of first message.
+		 */
+		ASSERT(DB_TYPE(mp1) == M_DATA);
+		while (mp1 != NULL && dp->d_uiop->uio_resid > 0) {
+			n = MIN(dp->d_uiop->uio_resid, MBLKL(mp1));
+			if (n != 0 && (error = uiomove((char *)mp1->b_rptr, n,
+			    UIO_READ, dp->d_uiop)) != 0) {
+				goto done;
+			}
+			mp1 = mp1->b_cont;
+		}
+		res |= INFOD_COPYOUT;
+		dp->d_cmd &= ~INFOD_COPYOUT;
+	}
+done:
+	mutex_exit(&tcp->tcp_fuse_lock);
+
+	dp->d_res |= res;
+
+	return (error);
+}
+
+/*
+ * Enable synchronous streams on a fused tcp loopback endpoint.
+ */
+static void
+tcp_fuse_syncstr_enable(tcp_t *tcp)
+{
+	queue_t *rq = tcp->tcp_rq;
+	struct stdata *stp = STREAM(rq);
+
+	/* We can only enable synchronous streams for sockfs mode */
+	tcp->tcp_direct_sockfs = tcp->tcp_issocket && do_tcp_direct_sockfs;
+
+	if (!tcp->tcp_direct_sockfs)
+		return;
+
+	mutex_enter(&stp->sd_lock);
+	mutex_enter(QLOCK(rq));
+
+	/*
+	 * We replace our q_qinfo with one that has the qi_rwp entry point.
+	 * Clear SR_SIGALLDATA because we generate the equivalent signal(s)
+	 * for every enqueued data in tcp_fuse_output().
+	 */
+	rq->q_qinfo = &tcp_loopback_rinit;
+	rq->q_struiot = tcp_loopback_rinit.qi_struiot;
+	stp->sd_struiordq = rq;
+	stp->sd_rput_opt &= ~SR_SIGALLDATA;
+
+	mutex_exit(QLOCK(rq));
+	mutex_exit(&stp->sd_lock);
+}
+
+/*
+ * Disable synchronous streams on a fused tcp loopback endpoint.
+ */
+static void
+tcp_fuse_syncstr_disable(tcp_t *tcp)
+{
+	queue_t *rq = tcp->tcp_rq;
+	struct stdata *stp = STREAM(rq);
+
+	if (!tcp->tcp_direct_sockfs)
+		return;
+
+	mutex_enter(&stp->sd_lock);
+	mutex_enter(QLOCK(rq));
+
+	/*
+	 * Reset q_qinfo to point to the default tcp entry points.
+	 * Also restore SR_SIGALLDATA so that strrput() can generate
+	 * the signals again for future M_DATA messages.
+	 */
+	rq->q_qinfo = &tcp_rinit;
+	rq->q_struiot = tcp_rinit.qi_struiot;
+	stp->sd_struiordq = NULL;
+	stp->sd_rput_opt |= SR_SIGALLDATA;
+	tcp->tcp_direct_sockfs = B_FALSE;
+
+	mutex_exit(QLOCK(rq));
+	mutex_exit(&stp->sd_lock);
+}
+
+/*
+ * Enable synchronous streams on a pair of fused tcp endpoints.
+ */
+void
+tcp_fuse_syncstr_enable_pair(tcp_t *tcp)
+{
+	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+	ASSERT(tcp->tcp_fused);
+	ASSERT(peer_tcp != NULL);
+
+	tcp_fuse_syncstr_enable(tcp);
+	tcp_fuse_syncstr_enable(peer_tcp);
+}
+
+/*
+ * Allow or disallow signals to be generated by strrput().
+ */
+static void
+strrput_sig(queue_t *q, boolean_t on)
+{
+	struct stdata *stp = STREAM(q);
+
+	mutex_enter(&stp->sd_lock);
+	if (on)
+		stp->sd_flag &= ~STRGETINPROG;
+	else
+		stp->sd_flag |= STRGETINPROG;
+	mutex_exit(&stp->sd_lock);
+}
+
+/*
+ * Disable synchronous streams on a pair of fused tcp endpoints and drain
+ * any queued data; called either during unfuse or upon transitioning from
+ * a socket to a stream endpoint due to _SIOCSOCKFALLBACK.
+ */
+void
+tcp_fuse_disable_pair(tcp_t *tcp, boolean_t unfusing)
+{
+	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+	ASSERT(tcp->tcp_fused);
+	ASSERT(peer_tcp != NULL);
+
+	/*
+	 * We need to prevent tcp_fuse_rrw() from entering before
+	 * we can disable synchronous streams.
+	 */
+	TCP_FUSE_SYNCSTR_STOP(tcp);
+	TCP_FUSE_SYNCSTR_STOP(peer_tcp);
+
+	/*
+	 * Drain any pending data; the detached check is needed because
+	 * we may be called as a result of a tcp_unfuse() triggered by
+	 * tcp_fuse_output().  Note that in case of a detached tcp, the
+	 * draining will happen later after the tcp is unfused.  For non-
+	 * urgent data, this can be handled by the regular tcp_rcv_drain().
+	 * If we have urgent data sitting in the receive list, we will
+	 * need to send up a SIGURG signal first before draining the data.
+	 * All of these will be handled by the code in tcp_fuse_rcv_drain()
+	 * when called from tcp_rcv_drain().
+	 */
+	if (!TCP_IS_DETACHED(tcp)) {
+		(void) tcp_fuse_rcv_drain(tcp->tcp_rq, tcp,
+		    (unfusing ? &tcp->tcp_fused_sigurg_mp : NULL));
+	}
+	if (!TCP_IS_DETACHED(peer_tcp)) {
+		(void) tcp_fuse_rcv_drain(peer_tcp->tcp_rq, peer_tcp,
+		    (unfusing ? &peer_tcp->tcp_fused_sigurg_mp : NULL));
+	}
+
+	/* Lift up any flow-control conditions */
+	if (tcp->tcp_flow_stopped) {
+		tcp_clrqfull(tcp);
+		TCP_STAT(tcp_fusion_backenabled);
+	}
+	if (peer_tcp->tcp_flow_stopped) {
+		tcp_clrqfull(peer_tcp);
+		TCP_STAT(tcp_fusion_backenabled);
+	}
+
+	/* Disable synchronous streams */
+	tcp_fuse_syncstr_disable(tcp);
+	tcp_fuse_syncstr_disable(peer_tcp);
+}
+
+/*
+ * Calculate the size of receive buffer for a fused tcp endpoint.
+ */
+size_t
+tcp_fuse_set_rcv_hiwat(tcp_t *tcp, size_t rwnd)
+{
+	ASSERT(tcp->tcp_fused);
+
+	/* Ensure that value is within the maximum upper bound */
+	if (rwnd > tcp_max_buf)
+		rwnd = tcp_max_buf;
+
+	/* Obey the absolute minimum tcp receive high water mark */
+	if (rwnd < tcp_sth_rcv_hiwat)
+		rwnd = tcp_sth_rcv_hiwat;
+
+	/*
+	 * Round up to system page size in case SO_RCVBUF is modified
+	 * after SO_SNDBUF; the latter is also similarly rounded up.
+	 */
+	rwnd = P2ROUNDUP_TYPED(rwnd, PAGESIZE, size_t);
+	tcp->tcp_fuse_rcv_hiwater = rwnd;
+	return (rwnd);
+}
+
+/*
+ * Calculate the maxpsz value and the peer's limit of outstanding
+ * unread data blocks for a fused tcp endpoint.
+ */
+int
+tcp_fuse_maxpsz_set(tcp_t *tcp)
+{
+	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+	uint_t sndbuf = tcp->tcp_xmit_hiwater;
+	uint_t maxpsz = sndbuf;
+
+	ASSERT(tcp->tcp_fused);
+	ASSERT(peer_tcp != NULL);
+	ASSERT(peer_tcp->tcp_fuse_rcv_hiwater != 0);
+	/*
+	 * In the fused loopback case, we want the stream head to split
+	 * up larger writes into smaller chunks for a more accurate flow-
+	 * control accounting.  Our maxpsz is half of the sender's send
+	 * buffer or the receiver's receive buffer, whichever is smaller.
+	 * We round up the buffer to system page size due to the lack of
+	 * TCP MSS concept in Fusion.
+	 */
+	if (maxpsz > peer_tcp->tcp_fuse_rcv_hiwater)
+		maxpsz = peer_tcp->tcp_fuse_rcv_hiwater;
+	maxpsz = P2ROUNDUP_TYPED(maxpsz, PAGESIZE, uint_t) >> 1;
+
+	/*
+	 * Calculate the peer's limit for the number of outstanding unread
+	 * data blocks.  This is the number of data blocks that are allowed
+	 * to reside in the receiver's queue before the sender gets flow
+	 * controlled.  It is used only in the synchronous streams mode as
+	 * a way to throttle the sender when it performs consecutive writes
+	 * faster than can be read.  The value is derived from SO_SNDBUF in
+	 * order to give the sender some control; we divide it by a large
+	 * value (16KB) to produce a fairly low initial limit.
+	 */
+	if (tcp_fusion_rcv_unread_min == 0) {
+		/* A value of 0 means that we disable the check */
+		peer_tcp->tcp_fuse_rcv_unread_hiwater = 0;
+	} else {
+		peer_tcp->tcp_fuse_rcv_unread_hiwater =
+		    MAX(sndbuf >> 14, tcp_fusion_rcv_unread_min);
+	}
+	return (maxpsz);
+}
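To make the computation above concrete, a worked example with assumed values (PAGESIZE 8192, SO_SNDBUF 48KB, peer receive high water 32KB, tcp_fusion_rcv_unread_min 8); this is a hedged user-level sketch, not kernel code:

#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	P2ROUNDUP(x, a)	(-(-(x) & -(a)))	/* power-of-2 round-up */

int
main(void)
{
	unsigned int sndbuf = 49152;	/* assumed SO_SNDBUF */
	unsigned int rcv_hiwat = 32768;	/* assumed tcp_fuse_rcv_hiwater */
	unsigned int pagesize = 8192;	/* assumed PAGESIZE */
	unsigned int maxpsz, unread;

	maxpsz = (sndbuf < rcv_hiwat) ? sndbuf : rcv_hiwat;
	maxpsz = P2ROUNDUP(maxpsz, pagesize) >> 1;
	unread = MAX(sndbuf >> 14, 8);	/* 49152 >> 14 == 3, so 8 wins */

	/* prints: maxpsz=16384 unread=8 */
	(void) printf("maxpsz=%u unread=%u\n", maxpsz, unread);
	return (0);
}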
--- a/usr/src/uts/common/inet/tcp/tcpddi.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/tcp/tcpddi.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -38,7 +38,13 @@
 #define	INET_DEVDESC	"TCP STREAMS driver %I%"
 #define	INET_MODDESC	"TCP STREAMS module %I%"
 #define	INET_DEVMINOR	TCP_MINOR
-#define	INET_DEVMTFLAGS	D_MP
+/*
+ * Note that unlike UDP, TCP uses synchronous STREAMS only
+ * for TCP Fusion (loopback); this is why we don't define
+ * D_SYNCSTR here.  Since TCP as a module is used only for
+ * SNMP purposes, we define _D_DIRECT for the device instance.
+ */
+#define	INET_DEVMTFLAGS	(D_MP|_D_DIRECT)
 #define	INET_MODMTFLAGS	D_MP
 
 #include "../inetddi.c"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/inet/tcp_impl.h	Sat Oct 22 22:50:14 2005 -0700
@@ -0,0 +1,332 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_INET_TCP_IMPL_H
+#define	_INET_TCP_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * TCP implementation private declarations.  These interfaces are
+ * used to build the IP module and are not meant to be accessed
+ * by any modules except IP itself.  They are undocumented and are
+ * subject to change without notice.
+ */
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#include <inet/tcp.h>
+
+#define	TCP_MOD_ID	5105
+
+/*
+ * Was this tcp created via the socket() interface?
+ */
+#define	TCP_IS_SOCKET(tcp)	((tcp)->tcp_issocket)
+
+/*
+ * Is this tcp not attached to any upper client?
+ */
+#define	TCP_IS_DETACHED(tcp)	((tcp)->tcp_detached)
+
+#define	TCP_TIMER(tcp, f, tim)		\
+	tcp_timeout(tcp->tcp_connp, f, tim)
+#define	TCP_TIMER_CANCEL(tcp, id)	\
+	tcp_timeout_cancel(tcp->tcp_connp, id)
+
+/*
+ * To restart the TCP retransmission timer.
+ */
+#define	TCP_TIMER_RESTART(tcp, intvl) {					\
+	if ((tcp)->tcp_timer_tid != 0)					\
+		(void) TCP_TIMER_CANCEL((tcp), (tcp)->tcp_timer_tid);	\
+	(tcp)->tcp_timer_tid = TCP_TIMER((tcp), tcp_timer,		\
+	    MSEC_TO_TICK(intvl));					\
+}
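
For instance, a retransmit path would typically re-arm the timer with the
current RTO.  A hedged one-liner, with tcp_rto as the assumed name of the
per-connection RTO in milliseconds:

	/* re-arm the retransmission timer after (re)sending a segment */
	TCP_TIMER_RESTART(tcp, tcp->tcp_rto);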
+
+/*
+ * This stops synchronous streams for a fused tcp endpoint
+ * and prevents tcp_rrw() from pulling data from it.
+ */
+#define	TCP_FUSE_SYNCSTR_STOP(tcp) {				\
+	if ((tcp)->tcp_direct_sockfs) {				\
+		mutex_enter(&(tcp)->tcp_fuse_lock);		\
+		(tcp)->tcp_fuse_syncstr_stopped = B_TRUE;	\
+		mutex_exit(&(tcp)->tcp_fuse_lock);		\
+	}							\
+}
+
+/*
+ * This resumes synchronous streams for this fused tcp endpoint
+ * and allows tcp_rrw() to pull data from it again.
+ */
+#define	TCP_FUSE_SYNCSTR_RESUME(tcp) {				\
+	if ((tcp)->tcp_direct_sockfs) {				\
+		mutex_enter(&(tcp)->tcp_fuse_lock);		\
+		(tcp)->tcp_fuse_syncstr_stopped = B_FALSE;	\
+		mutex_exit(&(tcp)->tcp_fuse_lock);		\
+	}							\
+}
+
+/*
+ * Write-side flow-control is implemented via the per instance STREAMS
+ * write-side Q by explicitly setting QFULL to stop the flow of mblk_t(s)
+ * and clearing QFULL and calling qbackenable() to restart the flow based
+ * on the number of TCP unsent bytes (i.e. those not on the wire waiting
+ * for a remote ACK).
+ *
+ * This differs from a standard STREAMS kmod, where the framework would
+ * automatically flow-control based on the defined hiwat/lowat values as
+ * mblk_t's are enqueued/dequeued on the STREAMS Q.
+ *
+ * As of FireEngine, TCP write-side flow-control needs to take into account
+ * not only the unsent tcp_xmit list bytes but also any squeue_t enqueued
+ * bytes (i.e. from tcp_wput() -> tcp_output()).
+ *
+ * This is accomplished by adding a new tcp_t field, tcp_squeue_bytes, to
+ * count the number of bytes enqueued by tcp_wput() and the number of bytes
+ * dequeued and processed by tcp_output().
+ *
+ * So, the total number of unsent bytes is (squeue_bytes + unsent), with all
+ * flow-control uses of unsent replaced with the macro TCP_UNSENT_BYTES.
+ */
+extern void	tcp_clrqfull(tcp_t *);
+extern void	tcp_setqfull(tcp_t *);
+
+#define	TCP_UNSENT_BYTES(tcp) \
+	((tcp)->tcp_squeue_bytes + (tcp)->tcp_unsent)
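
A minimal sketch of the check this implies on the transmit path is shown
below; tcp_xmit_lowater and tcp_flow_stopped are assumed per-connection
fields named for illustration only (tcp_setqfull/tcp_clrqfull above are
the real hooks):

static void
tcp_flow_check(tcp_t *tcp)
{
	/* stop the flow once aggregate unsent bytes reach the high water */
	if (!tcp->tcp_flow_stopped &&
	    TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater)
		tcp_setqfull(tcp);	/* sets QFULL on the write-side Q */
	else if (tcp->tcp_flow_stopped &&
	    TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater)
		tcp_clrqfull(tcp);	/* clears QFULL, qbackenable()s */
}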
+
+/* Named Dispatch Parameter Management Structure */
+typedef struct tcpparam_s {
+	uint32_t	tcp_param_min;
+	uint32_t	tcp_param_max;
+	uint32_t	tcp_param_val;
+	char		*tcp_param_name;
+} tcpparam_t;
+
+extern tcpparam_t tcp_param_arr[];
+
+#define	tcp_time_wait_interval			tcp_param_arr[0].tcp_param_val
+#define	tcp_conn_req_max_q			tcp_param_arr[1].tcp_param_val
+#define	tcp_conn_req_max_q0			tcp_param_arr[2].tcp_param_val
+#define	tcp_conn_req_min			tcp_param_arr[3].tcp_param_val
+#define	tcp_conn_grace_period			tcp_param_arr[4].tcp_param_val
+#define	tcp_cwnd_max_				tcp_param_arr[5].tcp_param_val
+#define	tcp_dbg					tcp_param_arr[6].tcp_param_val
+#define	tcp_smallest_nonpriv_port		tcp_param_arr[7].tcp_param_val
+#define	tcp_ip_abort_cinterval			tcp_param_arr[8].tcp_param_val
+#define	tcp_ip_abort_linterval			tcp_param_arr[9].tcp_param_val
+#define	tcp_ip_abort_interval			tcp_param_arr[10].tcp_param_val
+#define	tcp_ip_notify_cinterval			tcp_param_arr[11].tcp_param_val
+#define	tcp_ip_notify_interval			tcp_param_arr[12].tcp_param_val
+#define	tcp_ipv4_ttl				tcp_param_arr[13].tcp_param_val
+#define	tcp_keepalive_interval_high		tcp_param_arr[14].tcp_param_max
+#define	tcp_keepalive_interval			tcp_param_arr[14].tcp_param_val
+#define	tcp_keepalive_interval_low		tcp_param_arr[14].tcp_param_min
+#define	tcp_maxpsz_multiplier			tcp_param_arr[15].tcp_param_val
+#define	tcp_mss_def_ipv4			tcp_param_arr[16].tcp_param_val
+#define	tcp_mss_max_ipv4			tcp_param_arr[17].tcp_param_val
+#define	tcp_mss_min				tcp_param_arr[18].tcp_param_val
+#define	tcp_naglim_def				tcp_param_arr[19].tcp_param_val
+#define	tcp_rexmit_interval_initial		tcp_param_arr[20].tcp_param_val
+#define	tcp_rexmit_interval_max			tcp_param_arr[21].tcp_param_val
+#define	tcp_rexmit_interval_min			tcp_param_arr[22].tcp_param_val
+#define	tcp_deferred_ack_interval		tcp_param_arr[23].tcp_param_val
+#define	tcp_snd_lowat_fraction			tcp_param_arr[24].tcp_param_val
+#define	tcp_sth_rcv_hiwat			tcp_param_arr[25].tcp_param_val
+#define	tcp_sth_rcv_lowat			tcp_param_arr[26].tcp_param_val
+#define	tcp_dupack_fast_retransmit		tcp_param_arr[27].tcp_param_val
+#define	tcp_ignore_path_mtu			tcp_param_arr[28].tcp_param_val
+#define	tcp_smallest_anon_port			tcp_param_arr[29].tcp_param_val
+#define	tcp_largest_anon_port			tcp_param_arr[30].tcp_param_val
+#define	tcp_xmit_hiwat				tcp_param_arr[31].tcp_param_val
+#define	tcp_xmit_lowat				tcp_param_arr[32].tcp_param_val
+#define	tcp_recv_hiwat				tcp_param_arr[33].tcp_param_val
+#define	tcp_recv_hiwat_minmss			tcp_param_arr[34].tcp_param_val
+#define	tcp_fin_wait_2_flush_interval		tcp_param_arr[35].tcp_param_val
+#define	tcp_co_min				tcp_param_arr[36].tcp_param_val
+#define	tcp_max_buf				tcp_param_arr[37].tcp_param_val
+#define	tcp_strong_iss				tcp_param_arr[38].tcp_param_val
+#define	tcp_rtt_updates				tcp_param_arr[39].tcp_param_val
+#define	tcp_wscale_always			tcp_param_arr[40].tcp_param_val
+#define	tcp_tstamp_always			tcp_param_arr[41].tcp_param_val
+#define	tcp_tstamp_if_wscale			tcp_param_arr[42].tcp_param_val
+#define	tcp_rexmit_interval_extra		tcp_param_arr[43].tcp_param_val
+#define	tcp_deferred_acks_max			tcp_param_arr[44].tcp_param_val
+#define	tcp_slow_start_after_idle		tcp_param_arr[45].tcp_param_val
+#define	tcp_slow_start_initial			tcp_param_arr[46].tcp_param_val
+#define	tcp_co_timer_interval			tcp_param_arr[47].tcp_param_val
+#define	tcp_sack_permitted			tcp_param_arr[48].tcp_param_val
+#define	tcp_trace				tcp_param_arr[49].tcp_param_val
+#define	tcp_compression_enabled			tcp_param_arr[50].tcp_param_val
+#define	tcp_ipv6_hoplimit			tcp_param_arr[51].tcp_param_val
+#define	tcp_mss_def_ipv6			tcp_param_arr[52].tcp_param_val
+#define	tcp_mss_max_ipv6			tcp_param_arr[53].tcp_param_val
+#define	tcp_rev_src_routes			tcp_param_arr[54].tcp_param_val
+#define	tcp_local_dack_interval			tcp_param_arr[55].tcp_param_val
+#define	tcp_ndd_get_info_interval		tcp_param_arr[56].tcp_param_val
+#define	tcp_local_dacks_max			tcp_param_arr[57].tcp_param_val
+#define	tcp_ecn_permitted			tcp_param_arr[58].tcp_param_val
+#define	tcp_rst_sent_rate_enabled		tcp_param_arr[59].tcp_param_val
+#define	tcp_rst_sent_rate			tcp_param_arr[60].tcp_param_val
+#define	tcp_push_timer_interval			tcp_param_arr[61].tcp_param_val
+#define	tcp_use_smss_as_mss_opt			tcp_param_arr[62].tcp_param_val
+#define	tcp_keepalive_abort_interval_high	tcp_param_arr[63].tcp_param_max
+#define	tcp_keepalive_abort_interval		tcp_param_arr[63].tcp_param_val
+#define	tcp_keepalive_abort_interval_low	tcp_param_arr[63].tcp_param_min
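
Each #define above makes an array slot read like a plain global, with the
min/max entries in the same slot bounding any ndd set.  A hedged sketch of
the set path this implies (tcp_param_set_sketch is an illustrative name):

static int
tcp_param_set_sketch(tcpparam_t *pa, uint32_t new_value)
{
	if (new_value < pa->tcp_param_min || new_value > pa->tcp_param_max)
		return (EINVAL);
	pa->tcp_param_val = new_value;
	return (0);
}

For example, tcp_param_set_sketch(&tcp_param_arr[37], 4*1024*1024) would
be the equivalent of setting tcp_max_buf to 4MB via ndd.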
+
+/* Kstats */
+typedef struct tcp_stat {
+	kstat_named_t	tcp_time_wait;
+	kstat_named_t	tcp_time_wait_syn;
+	kstat_named_t	tcp_time_wait_syn_success;
+	kstat_named_t	tcp_time_wait_syn_fail;
+	kstat_named_t	tcp_reinput_syn;
+	kstat_named_t	tcp_ip_output;
+	kstat_named_t	tcp_detach_non_time_wait;
+	kstat_named_t	tcp_detach_time_wait;
+	kstat_named_t	tcp_time_wait_reap;
+	kstat_named_t	tcp_clean_death_nondetached;
+	kstat_named_t	tcp_reinit_calls;
+	kstat_named_t	tcp_eager_err1;
+	kstat_named_t	tcp_eager_err2;
+	kstat_named_t	tcp_eager_blowoff_calls;
+	kstat_named_t	tcp_eager_blowoff_q;
+	kstat_named_t	tcp_eager_blowoff_q0;
+	kstat_named_t	tcp_not_hard_bound;
+	kstat_named_t	tcp_no_listener;
+	kstat_named_t	tcp_found_eager;
+	kstat_named_t	tcp_wrong_queue;
+	kstat_named_t	tcp_found_eager_binding1;
+	kstat_named_t	tcp_found_eager_bound1;
+	kstat_named_t	tcp_eager_has_listener1;
+	kstat_named_t	tcp_open_alloc;
+	kstat_named_t	tcp_open_detached_alloc;
+	kstat_named_t	tcp_rput_time_wait;
+	kstat_named_t	tcp_listendrop;
+	kstat_named_t	tcp_listendropq0;
+	kstat_named_t	tcp_wrong_rq;
+	kstat_named_t	tcp_rsrv_calls;
+	kstat_named_t	tcp_eagerfree2;
+	kstat_named_t	tcp_eagerfree3;
+	kstat_named_t	tcp_eagerfree4;
+	kstat_named_t	tcp_eagerfree5;
+	kstat_named_t	tcp_timewait_syn_fail;
+	kstat_named_t	tcp_listen_badflags;
+	kstat_named_t	tcp_timeout_calls;
+	kstat_named_t	tcp_timeout_cached_alloc;
+	kstat_named_t	tcp_timeout_cancel_reqs;
+	kstat_named_t	tcp_timeout_canceled;
+	kstat_named_t	tcp_timermp_alloced;
+	kstat_named_t	tcp_timermp_freed;
+	kstat_named_t	tcp_timermp_allocfail;
+	kstat_named_t	tcp_timermp_allocdblfail;
+	kstat_named_t	tcp_push_timer_cnt;
+	kstat_named_t	tcp_ack_timer_cnt;
+	kstat_named_t	tcp_ire_null1;
+	kstat_named_t	tcp_ire_null;
+	kstat_named_t	tcp_ip_send;
+	kstat_named_t	tcp_ip_ire_send;
+	kstat_named_t	tcp_wsrv_called;
+	kstat_named_t	tcp_flwctl_on;
+	kstat_named_t	tcp_timer_fire_early;
+	kstat_named_t	tcp_timer_fire_miss;
+	kstat_named_t	tcp_freelist_cleanup;
+	kstat_named_t	tcp_rput_v6_error;
+	kstat_named_t	tcp_out_sw_cksum;
+	kstat_named_t	tcp_out_sw_cksum_bytes;
+	kstat_named_t	tcp_zcopy_on;
+	kstat_named_t	tcp_zcopy_off;
+	kstat_named_t	tcp_zcopy_backoff;
+	kstat_named_t	tcp_zcopy_disable;
+	kstat_named_t	tcp_mdt_pkt_out;
+	kstat_named_t	tcp_mdt_pkt_out_v4;
+	kstat_named_t	tcp_mdt_pkt_out_v6;
+	kstat_named_t	tcp_mdt_discarded;
+	kstat_named_t	tcp_mdt_conn_halted1;
+	kstat_named_t	tcp_mdt_conn_halted2;
+	kstat_named_t	tcp_mdt_conn_halted3;
+	kstat_named_t	tcp_mdt_conn_resumed1;
+	kstat_named_t	tcp_mdt_conn_resumed2;
+	kstat_named_t	tcp_mdt_legacy_small;
+	kstat_named_t	tcp_mdt_legacy_all;
+	kstat_named_t	tcp_mdt_legacy_ret;
+	kstat_named_t	tcp_mdt_allocfail;
+	kstat_named_t	tcp_mdt_addpdescfail;
+	kstat_named_t	tcp_mdt_allocd;
+	kstat_named_t	tcp_mdt_linked;
+	kstat_named_t	tcp_fusion_flowctl;
+	kstat_named_t	tcp_fusion_backenabled;
+	kstat_named_t	tcp_fusion_urg;
+	kstat_named_t	tcp_fusion_putnext;
+	kstat_named_t	tcp_fusion_unfusable;
+	kstat_named_t	tcp_fusion_aborted;
+	kstat_named_t	tcp_fusion_unqualified;
+	kstat_named_t	tcp_fusion_rrw_busy;
+	kstat_named_t	tcp_fusion_rrw_msgcnt;
+	kstat_named_t	tcp_in_ack_unsent_drop;
+	kstat_named_t	tcp_sock_fallback;
+} tcp_stat_t;
+
+extern tcp_stat_t tcp_statistics;
+
+#define	TCP_STAT(x)		(tcp_statistics.x.value.ui64++)
+#define	TCP_STAT_UPDATE(x, n)	(tcp_statistics.x.value.ui64 += (n))
+#define	TCP_STAT_SET(x, n)	(tcp_statistics.x.value.ui64 = (n))
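
A hedged usage example, e.g. from a transmit path that had to fall back
to software checksumming (mp being the segment about to be sent):

	/* one more software-checksummed segment, plus its payload bytes */
	TCP_STAT(tcp_out_sw_cksum);
	TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes, msgdsize(mp));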
+
+extern struct qinit tcp_loopback_rinit, tcp_rinit;
+extern boolean_t do_tcp_fusion;
+
+extern int	tcp_maxpsz_set(tcp_t *, boolean_t);
+extern void	tcp_timers_stop(tcp_t *);
+extern void	tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t);
+extern void	tcp_push_timer(void *);
+extern timeout_id_t tcp_timeout(conn_t *, void (*)(void *), clock_t);
+extern clock_t	tcp_timeout_cancel(conn_t *, timeout_id_t);
+
+extern void	tcp_fuse(tcp_t *, uchar_t *, tcph_t *);
+extern void	tcp_unfuse(tcp_t *);
+extern boolean_t tcp_fuse_output(tcp_t *, mblk_t *, uint32_t);
+extern void	tcp_fuse_output_urg(tcp_t *, mblk_t *);
+extern boolean_t tcp_fuse_rcv_drain(queue_t *, tcp_t *, mblk_t **);
+extern void	tcp_fuse_syncstr_enable_pair(tcp_t *);
+extern void	tcp_fuse_disable_pair(tcp_t *, boolean_t);
+extern int	tcp_fuse_rrw(queue_t *, struiod_t *);
+extern int	tcp_fuse_rinfop(queue_t *, infod_t *);
+extern size_t	tcp_fuse_set_rcv_hiwat(tcp_t *, size_t);
+extern int	tcp_fuse_maxpsz_set(tcp_t *);
+
+#endif	/* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _INET_TCP_IMPL_H */
--- a/usr/src/uts/common/inet/udp/udp.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/udp/udp.c	Sat Oct 22 22:50:14 2005 -0700
@@ -31,6 +31,8 @@
 
 #include <sys/types.h>
 #include <sys/stream.h>
+#include <sys/dlpi.h>
+#include <sys/pattr.h>
 #include <sys/stropts.h>
 #include <sys/strlog.h>
 #include <sys/strsun.h>
@@ -50,6 +52,7 @@
 #include <sys/zone.h>
 
 #include <sys/socket.h>
+#include <sys/sockio.h>
 #include <sys/vtrace.h>
 #include <sys/debug.h>
 #include <sys/isa_defs.h>
@@ -59,11 +62,15 @@
 #include <netinet/icmp6.h>
 #include <netinet/udp.h>
 #include <net/if.h>
+#include <net/route.h>
 
 #include <inet/common.h>
 #include <inet/ip.h>
+#include <inet/ip_impl.h>
 #include <inet/ip6.h>
 #include <inet/ip_ire.h>
+#include <inet/ip_if.h>
+#include <inet/ip_multi.h>
 #include <inet/mi.h>
 #include <inet/mib2.h>
 #include <inet/nd.h>
@@ -71,9 +78,12 @@
 #include <inet/snmpcom.h>
 #include <inet/kstatcom.h>
 #include <inet/udp_impl.h>
+#include <inet/ipclassifier.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ipp_common.h>
 
 /*
- * The ipsec_info.h header file is here since it has the defination for the
+ * The ipsec_info.h header file is here since it has the definition for the
  * M_CTL message types used by IP to convey information to the ULP. The
  * ipsec_info.h needs the pfkeyv2.h, hence the latter's presence.
  */
@@ -81,40 +91,138 @@
 #include <inet/ipsec_info.h>
 
 /*
- * Object to represent database of options to search passed to
- * {sock,tpi}optcom_req() interface routine to take care of option
- * management and associated methods.
- * XXX. These and other externs should really move to a udp header file.
- */
-extern optdb_obj_t	udp_opt_obj;
-extern uint_t		udp_max_optsize;
-
-
-/*
  * Synchronization notes:
  *
- * UDP uses a combination of the queue-pair STREAMS perimeter, a global
- * lock and a set of bind hash locks to protect its data structures.
+ * UDP uses a combination of its internal perimeter, a global lock and
+ * a set of bind hash locks to protect its data structures.  Please see
+ * the note above udp_mode_assertions for details about the internal
+ * perimeter.
  *
- * The queue-pair perimeter is not acquired exclusively in the put
- * procedures thus when udp_rput or udp_wput needs exclusive access to
- * the udp_t instance structure it will use qwriter(..., PERIM_INNER) to
- * asynchronously acquire exclusive access to the udp_t instance.
- *
- * When UDP global data needs to be modified the udp_g_lock mutex is acquired.
- * Currently, udp_g_head and udp_g_epriv_ports[] are protected by it.
- *
- * When an UDP endpoint is bound to a local port, it is inserted into
+ * When a UDP endpoint is bound to a local port, it is inserted into
  * a bind hash list.  The list consists of an array of udp_fanout_t buckets.
  * The size of the array is controlled by the udp_bind_fanout_size variable.
  * This variable can be changed in /etc/system if the default value is
- * not large enough.  Each bind hash bucket is protected by a per bucket lock.
- * It protects the udp_bind_hash and udp_ptpbhn fields in the udp_t
+ * not large enough.  Each bind hash bucket is protected by a per bucket
+ * lock.  It protects the udp_bind_hash and udp_ptpbhn fields in the udp_t
  * structure.  An UDP endpoint is removed from the bind hash list only
  * when it is being unbound or being closed.  The per bucket lock also
- * protects an UDP endpoint's state changes.
+ * protects a UDP endpoint's state changes.
+ *
+ * Plumbing notes:
+ *
+ * Both udp and ip are merged, but the streams plumbing is kept unchanged
+ * in that udp is always pushed atop /dev/ip.  This is done to preserve
+ * backwards compatibility for certain applications which rely on such
+ * plumbing geometry to do things such as issuing I_POP on the stream
+ * in order to obtain direct access to /dev/ip, etc.
+ *
+ * All UDP processing happens in the /dev/ip instance; the udp module
+ * instance does not possess any state about the endpoint, and merely
+ * acts as a dummy module whose presence keeps the streams plumbing
+ * appearance unchanged.  At open time /dev/ip allocates a conn_t that
+ * happens to embed a udp_t.  This stays dormant until the time udp is
+ * pushed, which indicates to /dev/ip that it must convert itself from
+ * an IP to a UDP endpoint.
+ *
+ * We only allow for the following plumbing cases:
+ *
+ * Normal:
+ *	/dev/ip is first opened and later udp is pushed directly on top.
+ *	This is the default action that happens when a udp socket or
+ *	/dev/udp is opened.  The conn_t created by /dev/ip instance is
+ *	now shared and is marked with IPCL_UDP.
+ *
+ * SNMP-only:
+ *	udp is pushed on top of a module other than /dev/ip.  When this
+ *	happens it will support only SNMP semantics.  A new conn_t is
+ *	allocated and marked with IPCL_UDPMOD.
+ *
+ * The above cases imply that we don't support any intermediate module
+ * residing between /dev/ip and udp -- in fact, we have never supported
+ * such a scenario in the past, as the inter-layer communication semantics
+ * have always been private.  Also note that the normal case allows for SNMP
+ * requests to be processed in addition to the rest of UDP operations.
+ *
+ * The normal case plumbing is depicted by the following diagram:
+ *
+ *	+---------------+---------------+
+ *	|		|		| udp
+ *	|     udp_wq	|    udp_rq	|
+ *	|		|    UDP_RD	|
+ *	|		|		|
+ *	+---------------+---------------+
+ *		|		^
+ *		v		|
+ *	+---------------+---------------+
+ *	|		|		| /dev/ip
+ *	|     ip_wq	|     ip_rq	| conn_t
+ *	|     UDP_WR	|		|
+ *	|		|		|
+ *	+---------------+---------------+
+ *
+ * Messages arriving at udp_wq from above will end up in ip_wq before
+ * they get processed, i.e. udp write entry points will advance udp_wq
+ * and use its q_next value as ip_wq in order to use the conn_t that
+ * is stored in its q_ptr.  Likewise, messages generated by ip to the
+ * module above udp will appear to originate from udp_rq, i.e.
+ * putnext() calls to the module above udp are done using udp_rq
+ * instead of ip_rq in order to avoid udp_rput(), which does nothing
+ * more than calling putnext().
+ *
+ * The above implies the following rules of thumb:
+ *
+ *   1. udp_t is obtained from conn_t, which is created by the /dev/ip
+ *	instance and is stored in q_ptr of both ip_wq and ip_rq.  There
+ *	is no direct reference to conn_t from either udp_wq or udp_rq.
+ *
+ *   2. Write-side entry points of udp can obtain the conn_t via the
+ *	Q_TO_CONN() macro, using the queue value obtained from UDP_WR().
+ *
+ *   3. While in /dev/ip context, putnext() to the module above udp can
+ *	be done by supplying the queue value obtained from UDP_RD().
+ *
  */
 
+static queue_t *UDP_WR(queue_t *);
+static queue_t *UDP_RD(queue_t *);
+
+udp_stat_t udp_statistics = {
+	{ "udp_ip_send",		KSTAT_DATA_UINT64 },
+	{ "udp_ip_ire_send",		KSTAT_DATA_UINT64 },
+	{ "udp_ire_null",		KSTAT_DATA_UINT64 },
+	{ "udp_drain",			KSTAT_DATA_UINT64 },
+	{ "udp_sock_fallback",		KSTAT_DATA_UINT64 },
+	{ "udp_rrw_busy",		KSTAT_DATA_UINT64 },
+	{ "udp_rrw_msgcnt",		KSTAT_DATA_UINT64 },
+	{ "udp_out_sw_cksum",		KSTAT_DATA_UINT64 },
+	{ "udp_out_sw_cksum_bytes",	KSTAT_DATA_UINT64 },
+	{ "udp_out_opt",		KSTAT_DATA_UINT64 },
+	{ "udp_out_err_notconn",	KSTAT_DATA_UINT64 },
+	{ "udp_out_err_output",		KSTAT_DATA_UINT64 },
+	{ "udp_out_err_tudr",		KSTAT_DATA_UINT64 },
+	{ "udp_in_pktinfo",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvdstaddr",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvopts",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvif",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvslla",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvucred",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvttl",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvhopopts",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvhoplimit",	KSTAT_DATA_UINT64 },
+	{ "udp_in_recvdstopts",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvrtdstopts",	KSTAT_DATA_UINT64 },
+	{ "udp_in_recvrthdr",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvpktinfo",		KSTAT_DATA_UINT64 },
+	{ "udp_in_recvtclass",		KSTAT_DATA_UINT64 },
+#ifdef DEBUG
+	{ "udp_data_conn",		KSTAT_DATA_UINT64 },
+	{ "udp_data_notconn",		KSTAT_DATA_UINT64 },
+#endif
+};
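
These counters are bumped from the data paths; a hedged example, assuming
udp_impl.h defines UDP_STAT() and UDP_STAT_UPDATE() counterparts to the
TCP_STAT() macros:

	/* account a datagram whose checksum was computed in software */
	UDP_STAT(udp_out_sw_cksum);
	UDP_STAT_UPDATE(udp_out_sw_cksum_bytes, msgdsize(mp));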
+
+static kstat_t *udp_ksp;
+struct kmem_cache *udp_cache;
+
 /*
  * Bind hash list size and hash function.  It has to be a power of 2 for
  * hashing.
@@ -151,14 +259,6 @@
 	"later.\n"
 #define	NDD_OUT_OF_BUF_MSG	"<< Out of buffer >>\n"
 
-/* Named Dispatch Parameter Management Structure */
-typedef struct udpparam_s {
-	uint32_t udp_param_min;
-	uint32_t udp_param_max;
-	uint32_t udp_param_value;
-	char	*udp_param_name;
-} udpparam_t;
-
 static void	udp_addr_req(queue_t *q, mblk_t *mp);
 static void	udp_bind(queue_t *q, mblk_t *mp);
 static void	udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp);
@@ -188,15 +288,6 @@
 static  int	udp_unitdata_opt_process(queue_t *q, mblk_t *mp,
 		    int *errorp, void *thisdg_attrs);
 static boolean_t udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
-int		udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name,
-		    uchar_t *ptr);
-int		udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name,
-		    uchar_t *ptr);
-int		udp_opt_set(queue_t *q, uint_t optset_context,
-		    int level, int name,
-		    uint_t inlen, uchar_t *invalp,
-		    uint_t *outlenp, uchar_t *outvalp,
-		    void *thisdg_attrs, cred_t *cr, mblk_t *mblk);
 static int	udp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
 static boolean_t udp_param_register(udpparam_t *udppa, int cnt);
 static int	udp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
@@ -205,62 +296,91 @@
 		    uchar_t **optbufp, uint_t *optlenp);
 static void	udp_report_item(mblk_t *mp, udp_t *udp);
 static void	udp_rput(queue_t *q, mblk_t *mp);
+static void	udp_rput_other(queue_t *, mblk_t *);
+static int	udp_rinfop(queue_t *q, infod_t *dp);
+static int	udp_rrw(queue_t *q, struiod_t *dp);
 static	void	udp_rput_bind_ack(queue_t *q, mblk_t *mp);
-static void	udp_rput_other(queue_t *q, mblk_t *mp);
-static int	udp_snmp_get(queue_t *q, mblk_t *mpctl);
-static int	udp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
-		    uchar_t *ptr, int len);
 static int	udp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
 		    cred_t *cr);
-static void	udp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
+static void	udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha);
+static void	udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr,
+		    t_scalar_t destlen, t_scalar_t err);
 static void	udp_unbind(queue_t *q, mblk_t *mp);
 static in_port_t udp_update_next_port(in_port_t port, boolean_t random);
 static void	udp_wput(queue_t *q, mblk_t *mp);
-static void	udp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6,
-		    t_scalar_t tudr_optlen);
+static mblk_t	*udp_output_v4(conn_t *, mblk_t *mp, ipaddr_t v4dst,
+		    uint16_t port, uint_t srcid, int *error);
+static mblk_t	*udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6,
+		    t_scalar_t tudr_optlen, int *error);
 static void	udp_wput_other(queue_t *q, mblk_t *mp);
 static void	udp_wput_iocdata(queue_t *q, mblk_t *mp);
+static void	udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr,
+		    socklen_t addrlen);
+static size_t	udp_set_rcv_hiwat(udp_t *udp, size_t size);
 
 static void	udp_kstat_init(void);
 static void	udp_kstat_fini(void);
 static int	udp_kstat_update(kstat_t *kp, int rw);
+static void	udp_input_wrapper(void *arg, mblk_t *mp, void *arg2);
+static void	udp_rput_other_wrapper(void *arg, mblk_t *mp, void *arg2);
+static void	udp_wput_other_wrapper(void *arg, mblk_t *mp, void *arg2);
+static void	udp_resume_bind_cb(void *arg, mblk_t *mp, void *arg2);
+
+static void	udp_rcv_enqueue(queue_t *q, udp_t *udp, mblk_t *mp,
+		    uint_t pkt_len);
+static void	udp_rcv_drain(queue_t *q, udp_t *udp, boolean_t closing);
+static void	udp_enter(conn_t *, mblk_t *, sqproc_t, uint8_t);
+static void	udp_exit(conn_t *);
+static void	udp_become_writer(conn_t *, mblk_t *, sqproc_t, uint8_t);
+#ifdef DEBUG
+static void	udp_mode_assertions(udp_t *, int);
+#endif /* DEBUG */
 
 major_t UDP6_MAJ;
-#define		UDP6		"udp6"
-
-#define		UDP_MAXPACKET_IPV4	\
-	(IP_MAXPACKET - UDPH_SIZE - IP_SIMPLE_HDR_LENGTH)
-#define		UDP_MAXPACKET_IPV6	\
-	(IP_MAXPACKET - UDPH_SIZE - IPV6_HDR_LEN)
-
-static struct module_info info =  {
-	5607, "udp", 1, INFPSZ, 512, 128
+#define	UDP6 "udp6"
+
+#define	UDP_RECV_HIWATER	(56 * 1024)
+#define	UDP_RECV_LOWATER	128
+#define	UDP_XMIT_HIWATER	(56 * 1024)
+#define	UDP_XMIT_LOWATER	1024
+
+static struct module_info udp_info =  {
+	UDP_MOD_ID, UDP_MOD_NAME, 1, INFPSZ, UDP_RECV_HIWATER, UDP_RECV_LOWATER
+};
+
+static struct qinit udp_rinit = {
+	(pfi_t)udp_rput, NULL, udp_open, udp_close, NULL,
+	&udp_info, NULL, udp_rrw, udp_rinfop, STRUIOT_STANDARD
 };
 
-static struct qinit rinit = {
-	(pfi_t)udp_rput, NULL, udp_open, udp_close, NULL, &info
+static struct qinit udp_winit = {
+	(pfi_t)udp_wput, NULL, NULL, NULL, NULL,
+	&udp_info, NULL, NULL, NULL, STRUIOT_NONE
 };
 
-static struct qinit winit = {
-	(pfi_t)udp_wput, NULL, NULL, NULL, NULL, &info
+/* Support for just SNMP if UDP is not pushed directly over device IP */
+struct qinit udp_snmp_rinit = {
+	(pfi_t)putnext, NULL, udp_open, ip_snmpmod_close, NULL,
+	&udp_info, NULL, NULL, NULL, STRUIOT_NONE
+};
+
+struct qinit udp_snmp_winit = {
+	(pfi_t)ip_snmpmod_wput, NULL, udp_open, ip_snmpmod_close, NULL,
+	&udp_info, NULL, NULL, NULL, STRUIOT_NONE
 };
 
 struct streamtab udpinfo = {
-	&rinit, &winit
+	&udp_rinit, &udp_winit
 };
 
 static	sin_t	sin_null;	/* Zero address for quick clears */
 static	sin6_t	sin6_null;	/* Zero address for quick clears */
 
-/* Protected by udp_g_lock */
-static void	*udp_g_head;	/* Head for list of open udp streams. */
-kmutex_t	udp_g_lock;	/* Protects the above variable */
-
 /* Hint not protected by any lock */
 static in_port_t	udp_g_next_port_to_try;
 
 /*
- * Extra privileged ports. In host byte order. Protected by udp_g_lock.
+ * Extra privileged ports. In host byte order.
  */
 #define	UDP_NUM_EPRIV_PORTS	64
 static int	udp_g_num_epriv_ports = UDP_NUM_EPRIV_PORTS;
@@ -273,6 +393,7 @@
 static mib2_udp_t	udp_mib;	/* SNMP fixed size info */
 static kstat_t		*udp_mibkp;	/* kstat exporting udp_mib data */
 
+#define	UDP_MAXPACKET_IPV4 (IP_MAXPACKET - UDPH_SIZE - IP_SIMPLE_HDR_LENGTH)
 
 /* Default structure copied into T_INFO_ACK messages */
 static struct T_info_ack udp_g_t_info_ack_ipv4 = {
@@ -289,6 +410,8 @@
 	(XPG4_1|SENDZERO) /* PROVIDER_flag */
 };
 
+#define	UDP_MAXPACKET_IPV6 (IP_MAXPACKET - UDPH_SIZE - IPV6_HDR_LEN)
+
 static	struct T_info_ack udp_g_t_info_ack_ipv6 = {
 	T_INFO_ACK,
 	UDP_MAXPACKET_IPV6,	/* TSDU_size.  Excl. headers */
@@ -311,33 +434,23 @@
  * in udp_open.
  * All of these are alterable, within the min/max values given, at run time.
  */
-static udpparam_t	udp_param_arr[] = {
-	/* min	max		value		name */
-	{ 0L,	256,		32,		"udp_wroff_extra" },
-	{ 1L,	255,		255,		"udp_ipv4_ttl" },
-	{ 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS,	"udp_ipv6_hoplimit"},
-	{ 1024,	(32 * 1024),	1024,		"udp_smallest_nonpriv_port" },
-	{ 0,	1,		1,		"udp_do_checksum" },
-	{ 1024,	UDP_MAX_PORT,	(32 * 1024),	"udp_smallest_anon_port" },
-	{ 1024,	UDP_MAX_PORT,	UDP_MAX_PORT,	"udp_largest_anon_port" },
-	{ 4096,	1024*1024,	56*1024,	"udp_xmit_hiwat"},
-	{ 0,	1024*1024,	1024,		"udp_xmit_lowat"},
-	{ 4096,	1024*1024,	56*1024,	"udp_recv_hiwat"},
-	{ 65536, 1024*1024*1024, 2*1024*1024,	"udp_max_buf"},
-	{ 100,	60000,		1000,		"udp_ndd_get_info_interval"},
+/* BEGIN CSTYLED */
+udpparam_t udp_param_arr[] = {
+ /*min		max		value		name */
+ { 0L,		256,		32,		"udp_wroff_extra" },
+ { 1L,		255,		255,		"udp_ipv4_ttl" },
+ { 0,		IPV6_MAX_HOPS,	IPV6_DEFAULT_HOPS, "udp_ipv6_hoplimit"},
+ { 1024,	(32 * 1024),	1024,		"udp_smallest_nonpriv_port" },
+ { 0,		1,		1,		"udp_do_checksum" },
+ { 1024,	UDP_MAX_PORT,	(32 * 1024),	"udp_smallest_anon_port" },
+ { 1024,	UDP_MAX_PORT,	UDP_MAX_PORT,	"udp_largest_anon_port" },
+ { UDP_XMIT_LOWATER, (1<<30), UDP_XMIT_HIWATER,	"udp_xmit_hiwat"},
+ { 0,		     (1<<30), UDP_XMIT_LOWATER, "udp_xmit_lowat"},
+ { UDP_RECV_LOWATER, (1<<30), UDP_RECV_HIWATER,	"udp_recv_hiwat"},
+ { 65536,	(1<<30),	2*1024*1024,	"udp_max_buf"},
+ { 100,		60000,		1000,		"udp_ndd_get_info_interval"},
 };
-#define	udp_wroff_extra			udp_param_arr[0].udp_param_value
-#define	udp_ipv4_ttl			udp_param_arr[1].udp_param_value
-#define	udp_ipv6_hoplimit		udp_param_arr[2].udp_param_value
-#define	udp_smallest_nonpriv_port	udp_param_arr[3].udp_param_value
-#define	udp_do_checksum			udp_param_arr[4].udp_param_value
-#define	udp_smallest_anon_port		udp_param_arr[5].udp_param_value
-#define	udp_largest_anon_port		udp_param_arr[6].udp_param_value
-#define	udp_xmit_hiwat			udp_param_arr[7].udp_param_value
-#define	udp_xmit_lowat			udp_param_arr[8].udp_param_value
-#define	udp_recv_hiwat			udp_param_arr[9].udp_param_value
-#define	udp_max_buf			udp_param_arr[10].udp_param_value
-#define	udp_ndd_get_info_interval	udp_param_arr[11].udp_param_value
+/* END CSTYLED */
 
 /*
  * The smallest anonymous port in the privileged port range which UDP
@@ -354,9 +467,434 @@
  */
 
 void (*cl_inet_bind)(uchar_t protocol, sa_family_t addr_family,
-			uint8_t *laddrp, in_port_t lport) = NULL;
+    uint8_t *laddrp, in_port_t lport) = NULL;
 void (*cl_inet_unbind)(uint8_t protocol, sa_family_t addr_family,
-			uint8_t *laddrp, in_port_t lport) = NULL;
+    uint8_t *laddrp, in_port_t lport) = NULL;
+
+typedef union T_primitives *t_primp_t;
+
+#define	UDP_ENQUEUE_MP(udp, mp, proc, tag) {			\
+	ASSERT((mp)->b_prev == NULL && (mp)->b_queue == NULL);	\
+	ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock));	\
+	(mp)->b_queue = (queue_t *)((uintptr_t)tag);		\
+	(mp)->b_prev = (mblk_t *)proc;				\
+	if ((udp)->udp_mphead == NULL)				\
+		(udp)->udp_mphead = (mp);			\
+	else							\
+		(udp)->udp_mptail->b_next = (mp);		\
+	(udp)->udp_mptail = (mp);				\
+	(udp)->udp_mpcount++;					\
+}
+
+#define	UDP_READERS_INCREF(udp) {				\
+	ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock));	\
+	(udp)->udp_reader_count++;				\
+}
+
+#define	UDP_READERS_DECREF(udp) {				\
+	ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock));	\
+	(udp)->udp_reader_count--;				\
+	if ((udp)->udp_reader_count == 0)			\
+		cv_broadcast(&(udp)->udp_connp->conn_cv);	\
+}
+
+#define	UDP_SQUEUE_DECREF(udp) {				\
+	ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock));	\
+	(udp)->udp_squeue_count--;				\
+	if ((udp)->udp_squeue_count == 0)			\
+		cv_broadcast(&(udp)->udp_connp->conn_cv);	\
+}
+
+/*
+ * Notes on UDP endpoint synchronization:
+ *
+ * UDP needs exclusive operation on a per-endpoint basis when executing
+ * functions that modify the endpoint state.  udp_rput_other() deals with
+ * packets with IP options, and processing these packets ends up having
+ * to update the endpoint's option-related state.  udp_wput_other() deals
+ * with control operations from the top, e.g. a connect() that needs to
+ * update the endpoint state.  These could be synchronized using locks,
+ * but the current version uses squeues for this purpose.  squeues may
+ * give a performance improvement for certain cases such as connected
+ * UDP sockets; thus the framework allows for using squeues.
+ *
+ * The perimeter routines are described as follows:
+ *
+ * udp_enter():
+ *	Enter the UDP endpoint perimeter.
+ *
+ * udp_become_writer():
+ *	Become exclusive on the UDP endpoint.  Specifies a function
+ *	that will be called exclusively either immediately or later
+ *	when the perimeter is available exclusively.
+ *
+ * udp_exit():
+ *	Exit the UDP perimeter.
+ *
+ * Entering UDP from the top or from the bottom must be done using
+ * udp_enter().  No lock must be held while attempting to enter the UDP
+ * perimeter.  When finished, udp_exit() must be called to get out of
+ * the perimeter.
+ *
+ * UDP operates in either MT_HOT mode or in SQUEUE mode.  In MT_HOT mode,
+ * multiple threads may enter a UDP endpoint concurrently.  This is used
+ * for sending and/or receiving normal data.  Control operations and other
+ * special cases call udp_become_writer() to become exclusive on a
+ * per-endpoint basis, and this results in transitioning to SQUEUE
+ * mode.  squeue
+ * by definition serializes access to the conn_t.  When there are no more
+ * pending messages on the squeue for the UDP connection, the endpoint
+ * reverts to MT_HOT mode.  During the interregnum when not all MT threads
+ * of an endpoint have finished, messages are queued in the UDP endpoint
+ * and the UDP is in UDP_MT_QUEUED mode or UDP_QUEUED_SQUEUE mode.
+ *
+ * These modes have the following analogs:
+ *
+ *	UDP_MT_HOT/udp_reader_count==0		none
+ *	UDP_MT_HOT/udp_reader_count>0		RW_READ_LOCK
+ *	UDP_MT_QUEUED				RW_WRITE_WANTED
+ *	UDP_SQUEUE or UDP_QUEUED_SQUEUE		RW_WRITE_LOCKED
+ *
+ * Stable modes:	UDP_MT_HOT, UDP_SQUEUE
+ * Transient modes:	UDP_MT_QUEUED, UDP_QUEUED_SQUEUE
+ *
+ * While in stable modes, UDP keeps track of the number of threads
+ * operating on the endpoint.  The udp_reader_count variable represents
+ * the number of threads entering the endpoint as readers while it is
+ * in UDP_MT_HOT mode.  Transitioning to UDP_SQUEUE happens when there
+ * is only a single reader, i.e. when this counter drops to 1.  Likewise,
+ * udp_squeue_count represents the number of threads operating on the
+ * endpoint's squeue while it is in UDP_SQUEUE mode.  The mode transition
+ * to UDP_MT_HOT happens after the last thread exits the endpoint, i.e.
+ * when this counter drops to 0.
+ *
+ * The default mode is set to UDP_MT_HOT and UDP alternates between
+ * UDP_MT_HOT and UDP_SQUEUE as shown in the state transition below.
+ *
+ * Mode transition:
+ * ----------------------------------------------------------------
+ * old mode		Event				New mode
+ * ----------------------------------------------------------------
+ * UDP_MT_HOT		Call to udp_become_writer()	UDP_SQUEUE
+ *			and udp_reader_count == 1
+ *
+ * UDP_MT_HOT		Call to udp_become_writer()	UDP_MT_QUEUED
+ *			and udp_reader_count > 1
+ *
+ * UDP_MT_QUEUED	udp_reader_count drops to zero	UDP_QUEUED_SQUEUE
+ *
+ * UDP_QUEUED_SQUEUE	All messages enqueued on the	UDP_SQUEUE
+ *			internal UDP queue successfully
+ *			moved to squeue AND udp_squeue_count != 0
+ *
+ * UDP_QUEUED_SQUEUE	All messages enqueued on the	UDP_MT_HOT
+ *			internal UDP queue successfully
+ *			moved to squeue AND udp_squeue_count
+ *			drops to zero
+ *
+ * UDP_SQUEUE		udp_squeue_count drops to zero	UDP_MT_HOT
+ * ----------------------------------------------------------------
+ */
+
+static queue_t *
+UDP_WR(queue_t *q)
+{
+	ASSERT(q->q_ptr == NULL && _OTHERQ(q)->q_ptr == NULL);
+	ASSERT(WR(q)->q_next != NULL && WR(q)->q_next->q_ptr != NULL);
+	ASSERT(IPCL_IS_UDP(Q_TO_CONN(WR(q)->q_next)));
+
+	return (_WR(q)->q_next);
+}
+
+static queue_t *
+UDP_RD(queue_t *q)
+{
+	ASSERT(q->q_ptr != NULL && _OTHERQ(q)->q_ptr != NULL);
+	ASSERT(IPCL_IS_UDP(Q_TO_CONN(q)));
+	ASSERT(RD(q)->q_next != NULL && RD(q)->q_next->q_ptr == NULL);
+
+	return (_RD(q)->q_next);
+}
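
Rules 1-3 translate into a usage pattern like the following hedged sketch
(udp_reply_sketch is an illustrative name), given the /dev/ip read queue
q that owns the conn_t:

static void
udp_reply_sketch(queue_t *q, mblk_t *mp)
{
	udp_t *udp = Q_TO_UDP(q);	/* rule 1: state hangs off conn_t */

	/* ... update endpoint state in udp ... */

	/* rule 3: reply upstream via udp_rq, bypassing udp_rput() */
	putnext(UDP_RD(q), mp);
}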
+
+#ifdef DEBUG
+#define	UDP_MODE_ASSERTIONS(udp, caller) udp_mode_assertions(udp, caller)
+#else
+#define	UDP_MODE_ASSERTIONS(udp, caller)
+#endif
+
+/* Invariants */
+#ifdef DEBUG
+
+uint32_t udp_count[4];
+
+/* Context of udp_mode_assertions */
+#define	UDP_ENTER		1
+#define	UDP_BECOME_WRITER	2
+#define	UDP_EXIT		3
+
+static void
+udp_mode_assertions(udp_t *udp, int caller)
+{
+	ASSERT(MUTEX_HELD(&udp->udp_connp->conn_lock));
+
+	switch (udp->udp_mode) {
+	case UDP_MT_HOT:
+		/*
+		 * Messages have not yet been enqueued on the internal queue,
+		 * otherwise we would have switched to UDP_MT_QUEUED. Likewise
+		 * by definition, there can't be any messages enqueued on the
+		 * squeue. The UDP could be quiescent, so udp_reader_count
+		 * could be zero at entry.
+		 */
+		ASSERT(udp->udp_mphead == NULL && udp->udp_mpcount == 0 &&
+		    udp->udp_squeue_count == 0);
+		ASSERT(caller == UDP_ENTER || udp->udp_reader_count != 0);
+		udp_count[0]++;
+		break;
+
+	case UDP_MT_QUEUED:
+		/*
+		 * The last MT thread to exit the udp perimeter empties the
+		 * internal queue and then switches the UDP to
+		 * UDP_QUEUED_SQUEUE mode. Since we are still in UDP_MT_QUEUED
+		 * mode, it means there must be at least 1 MT thread still in
+		 * the perimeter and at least 1 message on the internal queue.
+		 */
+		ASSERT(udp->udp_reader_count >= 1 && udp->udp_mphead != NULL &&
+		    udp->udp_mpcount != 0 && udp->udp_squeue_count == 0);
+		udp_count[1]++;
+		break;
+
+	case UDP_QUEUED_SQUEUE:
+		/*
+		 * The switch has happened from MT to SQUEUE, so there can't
+		 * be any MT threads. Messages could still pile up on the
+		 * internal queue until the transition is complete and we move
+		 * to UDP_SQUEUE mode. We can't assert on a nonzero
+		 * udp_squeue_count since the squeue could drain at any time.
+		 */
+		ASSERT(udp->udp_reader_count == 0);
+		udp_count[2]++;
+		break;
+
+	case UDP_SQUEUE:
+		/*
+		 * The transition is complete. There can't be any messages on
+		 * the internal queue. The udp could be quiescent or the squeue
+		 * could drain at any time, so we can't assert on a nonzero
+		 * udp_squeue_count during entry. Nor can we assert that
+		 * udp_reader_count is zero, since a reader thread could have
+		 * become a writer directly by calling udp_become_writer()
+		 * without going through the queued states.
+		 */
+		ASSERT(udp->udp_mphead == NULL && udp->udp_mpcount == 0);
+		ASSERT(caller == UDP_ENTER || udp->udp_squeue_count != 0);
+		udp_count[3]++;
+		break;
+	}
+}
+#endif
+
+#define	_UDP_ENTER(connp, mp, proc, tag) {				\
+	udp_t *_udp = (connp)->conn_udp;				\
+									\
+	mutex_enter(&(connp)->conn_lock);				\
+	if ((connp)->conn_state_flags & CONN_CLOSING) {			\
+		mutex_exit(&(connp)->conn_lock);			\
+		freemsg(mp);						\
+	} else {							\
+		UDP_MODE_ASSERTIONS(_udp, UDP_ENTER);			\
+									\
+		switch (_udp->udp_mode) {				\
+		case UDP_MT_HOT:					\
+			/* We can execute as reader right away. */	\
+			UDP_READERS_INCREF(_udp);			\
+			mutex_exit(&(connp)->conn_lock);		\
+			(*(proc))(connp, mp, (connp)->conn_sqp);	\
+			break;						\
+									\
+		case UDP_SQUEUE:					\
+			/*						\
+			 * We are in squeue mode, send the		\
+			 * packet to the squeue				\
+			 */						\
+			_udp->udp_squeue_count++;			\
+			CONN_INC_REF_LOCKED(connp);			\
+			mutex_exit(&(connp)->conn_lock);		\
+			squeue_enter((connp)->conn_sqp, mp, proc,	\
+			    connp, tag);				\
+			break;						\
+									\
+		case UDP_MT_QUEUED:					\
+		case UDP_QUEUED_SQUEUE:					\
+			/*						\
+			 * Some messages may have been enqueued		\
+			 * ahead of us.  Enqueue the new message	\
+			 * at the tail of the internal queue to		\
+			 * preserve message ordering.			\
+			 */						\
+			UDP_ENQUEUE_MP(_udp, mp, proc, tag);		\
+			mutex_exit(&(connp)->conn_lock);		\
+			break;						\
+		}							\
+	}								\
+}
+
+static void
+udp_enter(conn_t *connp, mblk_t *mp, sqproc_t proc, uint8_t tag)
+{
+	_UDP_ENTER(connp, mp, proc, tag);
+}
+
+static void
+udp_become_writer(conn_t *connp, mblk_t *mp, sqproc_t proc, uint8_t tag)
+{
+	udp_t	*udp;
+
+	udp = connp->conn_udp;
+
+	mutex_enter(&connp->conn_lock);
+
+	UDP_MODE_ASSERTIONS(udp, UDP_BECOME_WRITER);
+
+	switch (udp->udp_mode) {
+	case UDP_MT_HOT:
+		if (udp->udp_reader_count == 1) {
+			/*
+			 * We are the only MT thread. Switch to squeue mode
+			 * immediately.
+			 */
+			udp->udp_mode = UDP_SQUEUE;
+			udp->udp_squeue_count = 1;
+			CONN_INC_REF_LOCKED(connp);
+			mutex_exit(&connp->conn_lock);
+			squeue_enter(connp->conn_sqp, mp, proc, connp, tag);
+			return;
+		}
+		/* FALLTHRU */
+
+	case UDP_MT_QUEUED:
+		/* Enqueue the packet internally in UDP */
+		udp->udp_mode = UDP_MT_QUEUED;
+		UDP_ENQUEUE_MP(udp, mp, proc, tag);
+		mutex_exit(&connp->conn_lock);
+		return;
+
+	case UDP_SQUEUE:
+	case UDP_QUEUED_SQUEUE:
+		/*
+		 * We are already exclusive. i.e. we are already
+		 * writer. Simply call the desired function.
+		 */
+		udp->udp_squeue_count++;
+		mutex_exit(&connp->conn_lock);
+		(*proc)(connp, mp, connp->conn_sqp);
+		return;
+	}
+}
+
+/*
+ * Transition from MT mode to SQUEUE mode, when the last MT thread
+ * is exiting the UDP perimeter. Move all messages from the internal
+ * udp queue to the squeue. A better way would be to move all the
+ * messages in one shot; this needs more support from the squeue
+ * framework.
+ */
+static void
+udp_switch_to_squeue(udp_t *udp)
+{
+	mblk_t *mp;
+	mblk_t	*mp_next;
+	sqproc_t proc;
+	uint8_t	tag;
+	conn_t	*connp = udp->udp_connp;
+
+	ASSERT(MUTEX_HELD(&connp->conn_lock));
+	ASSERT(udp->udp_mode == UDP_MT_QUEUED);
+	while (udp->udp_mphead != NULL) {
+		mp = udp->udp_mphead;
+		udp->udp_mphead = NULL;
+		udp->udp_mptail = NULL;
+		udp->udp_mpcount = 0;
+		udp->udp_mode = UDP_QUEUED_SQUEUE;
+		mutex_exit(&connp->conn_lock);
+		/*
+		 * It is best not to hold any locks across the calls
+		 * to squeue functions. Since we drop the lock, we
+		 * need to go back and check udp_mphead once again
+		 * after the squeue_fill(), hence the while loop at
+		 * the top of this function.
+		 */
+		for (; mp != NULL; mp = mp_next) {
+			mp_next = mp->b_next;
+			proc = (sqproc_t)mp->b_prev;
+			tag = (uint8_t)((uintptr_t)mp->b_queue);
+			mp->b_next = NULL;
+			mp->b_prev = NULL;
+			mp->b_queue = NULL;
+			CONN_INC_REF(connp);
+			udp->udp_squeue_count++;
+			squeue_fill(connp->conn_sqp, mp, proc, connp,
+			    tag);
+		}
+		mutex_enter(&connp->conn_lock);
+	}
+	/*
+	 * udp_squeue_count of zero implies that the squeue has drained
+	 * even before we arrived here (i.e. after the squeue_fill() above).
+	 */
+	udp->udp_mode = (udp->udp_squeue_count != 0) ?
+	    UDP_SQUEUE : UDP_MT_HOT;
+}
+
+#define	_UDP_EXIT(connp) {						\
+	udp_t *_udp = (connp)->conn_udp;				\
+									\
+	mutex_enter(&(connp)->conn_lock);				\
+	UDP_MODE_ASSERTIONS(_udp, UDP_EXIT);				\
+									\
+	switch (_udp->udp_mode) {					\
+	case UDP_MT_HOT:						\
+		UDP_READERS_DECREF(_udp);				\
+		mutex_exit(&(connp)->conn_lock);			\
+		break;							\
+									\
+	case UDP_SQUEUE:						\
+		UDP_SQUEUE_DECREF(_udp);				\
+		if (_udp->udp_squeue_count == 0)			\
+		    _udp->udp_mode = UDP_MT_HOT;			\
+		mutex_exit(&(connp)->conn_lock);			\
+		break;							\
+									\
+	case UDP_MT_QUEUED:						\
+		/*							\
+		 * If this is the last MT thread, we need to		\
+		 * switch to squeue mode				\
+		 */							\
+		UDP_READERS_DECREF(_udp);				\
+		if (_udp->udp_reader_count == 0)			\
+			udp_switch_to_squeue(_udp);			\
+		mutex_exit(&(connp)->conn_lock);			\
+		break;							\
+									\
+	case UDP_QUEUED_SQUEUE:						\
+		UDP_SQUEUE_DECREF(_udp);				\
+		/*							\
+		 * Even if the udp_squeue_count drops to zero, we	\
+		 * don't want to change udp_mode to UDP_MT_HOT here.	\
+		 * The thread in udp_switch_to_squeue will take care	\
+		 * of the transition to UDP_MT_HOT, after emptying	\
+		 * any more new messages that have been enqueued in	\
+		 * udp_mphead.						\
+		 */							\
+		mutex_exit(&(connp)->conn_lock);			\
+		break;							\
+	}								\
+}
+
+static void
+udp_exit(conn_t *connp)
+{
+	_UDP_EXIT(connp);
+}
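
Every excursion into the endpoint is bracketed by this pair.  A hedged
sketch of a bottom-up delivery path follows; udp_input_sketch and
SQTAG_UDP_INPUT are illustrative names, and the handler signature is the
sqproc_t one used throughout this file:

/* ARGSUSED */
static void
udp_input_sketch(void *arg, mblk_t *mp, void *arg2)
{
	conn_t *connp = arg;

	/* ... normal datagram processing under the perimeter ... */
	freemsg(mp);

	/* may trigger the UDP_MT_QUEUED -> UDP_QUEUED_SQUEUE switch */
	udp_exit(connp);
}

static void
udp_deliver_sketch(conn_t *connp, mblk_t *mp)
{
	/* runs the handler now if MT_HOT, else enqueues it in order */
	udp_enter(connp, mp, udp_input_sketch, SQTAG_UDP_INPUT);
}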
 
 /*
  * Return the next anonymous port in the privileged port range for
@@ -379,9 +917,13 @@
 udp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
 {
 	udp_fanout_t	*udpf;
-	udp_t		*udp;
 	int		i;
 	zoneid_t	zoneid;
+	conn_t		*connp;
+	udp_t		*udp;
+
+	connp = Q_TO_CONN(q);
+	udp = connp->conn_udp;
 
 	/* Refer to comments in udp_status_report(). */
 	if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) {
@@ -403,8 +945,7 @@
 	    " zone lport src addr        dest addr       port  state");
 	/*    1234 12345 xxx.xxx.xxx.xxx xxx.xxx.xxx.xxx 12345 UNBOUND */
 
-	udp = (udp_t *)q->q_ptr;
-	zoneid = udp->udp_zoneid;
+	zoneid = connp->conn_zoneid;
 
 	for (i = 0; i < udp_bind_fanout_size; i++) {
 		udpf = &udp_bind_fanout[i];
@@ -415,7 +956,7 @@
 		if (zoneid != GLOBAL_ZONEID) {
 			/* skip to first entry in this zone; might be none */
 			while (udp != NULL &&
-			    udp->udp_zoneid != zoneid)
+			    udp->udp_connp->conn_zoneid != zoneid)
 				udp = udp->udp_bind_hash;
 		}
 		if (udp != NULL) {
@@ -432,7 +973,7 @@
 			}
 			for (; udp != NULL; udp = udp->udp_bind_hash) {
 				if (zoneid == GLOBAL_ZONEID ||
-				    zoneid == udp->udp_zoneid)
+				    zoneid == udp->udp_connp->conn_zoneid)
 					udp_report_item(mp->b_cont, udp);
 			}
 		}
@@ -542,7 +1083,6 @@
 	in_port_t	port;		/* Host byte order */
 	in_port_t	requested_port;	/* Host byte order */
 	struct T_bind_req *tbr;
-	udp_t		*udp;
 	int		count;
 	in6_addr_t	v6src;
 	boolean_t	bind_to_req_port_only;
@@ -550,8 +1090,11 @@
 	udp_fanout_t	*udpf;
 	in_port_t	lport;		/* Network byte order */
 	zoneid_t	zoneid;
-
-	udp = (udp_t *)q->q_ptr;
+	conn_t		*connp;
+	udp_t		*udp;
+
+	connp = Q_TO_CONN(q);
+	udp = connp->conn_udp;
 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 		    "udp_bind: bad req, len %u",
@@ -559,6 +1102,7 @@
 		udp_err_ack(q, mp, TPROTO, 0);
 		return;
 	}
+
 	if (udp->udp_state != TS_UNBND) {
 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 		    "udp_bind: bad state, %u", udp->udp_state);
@@ -673,7 +1217,7 @@
 		}
 
 		if (priv) {
-			cred_t *cr = DB_CREDDEF(mp, udp->udp_credp);
+			cred_t *cr = DB_CREDDEF(mp, connp->conn_cred);
 
 			if (secpolicy_net_privaddr(cr, port) != 0) {
 				udp_err_ack(q, mp, TACCES, 0);
@@ -736,7 +1280,7 @@
 		loopmax = udp_largest_anon_port - udp_smallest_anon_port + 1;
 	}
 
-	zoneid = udp->udp_zoneid;
+	zoneid = connp->conn_zoneid;
 	for (;;) {
 		udp_t		*udp1;
 		boolean_t	is_inaddr_any;
@@ -753,7 +1297,7 @@
 		for (udp1 = udpf->uf_udp; udp1 != NULL;
 		    udp1 = udp1->udp_bind_hash) {
 			if (lport != udp1->udp_port ||
-			    zoneid != udp1->udp_zoneid)
+			    zoneid != udp1->udp_connp->conn_zoneid)
 				continue;
 
 			/*
@@ -933,7 +1477,39 @@
 		mp->b_cont->b_wptr += sizeof (ire_t);
 		mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
 	}
-	putnext(q, mp);
+	if (udp->udp_family == AF_INET6)
+		mp = ip_bind_v6(q, mp, connp, NULL);
+	else
+		mp = ip_bind_v4(q, mp, connp);
+
+	if (mp != NULL)
+		udp_rput_other(_RD(q), mp);
+	else
+		CONN_INC_REF(connp);
+}
+
+
+void
+udp_resume_bind(conn_t *connp, mblk_t *mp)
+{
+	udp_enter(connp, mp, udp_resume_bind_cb, SQTAG_BIND_RETRY);
+}
+
+/*
+ * This is called from ip_wput_nondata to resume a deferred UDP bind.
+ */
+/* ARGSUSED */
+static void
+udp_resume_bind_cb(void *arg, mblk_t *mp, void *arg2)
+{
+	conn_t *connp = arg;
+
+	ASSERT(connp != NULL && IPCL_IS_UDP(connp));
+
+	udp_rput_other(connp->conn_rq, mp);
+
+	CONN_OPER_PENDING_DONE(connp);
+	udp_exit(connp);
 }
 
 /*
@@ -958,15 +1534,16 @@
 	sin6_t	*sin6;
 	sin_t	*sin;
 	struct T_conn_req	*tcr;
-	udp_t	*udp, *udp1;
 	in6_addr_t v6dst;
 	ipaddr_t v4dst;
 	uint16_t dstport;
 	uint32_t flowinfo;
 	mblk_t	*mp1, *mp2;
 	udp_fanout_t	*udpf;
-
-	udp = (udp_t *)q->q_ptr;
+	udp_t	*udp, *udp1;
+
+	udp = Q_TO_UDP(q);
+
 	tcr = (struct T_conn_req *)mp->b_rptr;
 
 	/* A bit of sanity checking */
@@ -987,6 +1564,7 @@
 	ASSERT(udp->udp_port != 0 && udp->udp_ptpbhn != NULL);
 
 	udpf = &udp_bind_fanout[UDP_BIND_HASH(udp->udp_port)];
+
 	if (udp->udp_state == TS_DATA_XFER) {
 		/* Already connected - clear out state */
 		mutex_enter(&udpf->uf_lock);
@@ -1185,20 +1763,67 @@
 	linkb(mp1, mp);
 	linkb(mp1, mp2);
 
-	putnext(q, mp1);
+	if (udp->udp_family == AF_INET)
+		mp1 = ip_bind_v4(q, mp1, udp->udp_connp);
+	else
+		mp1 = ip_bind_v6(q, mp1, udp->udp_connp, NULL);
+
+	if (mp1 != NULL)
+		udp_rput_other(_RD(q), mp1);
+	else
+		CONN_INC_REF(udp->udp_connp);
 }
 
-/* This is the close routine for udp.  It frees the per-stream data. */
 static int
 udp_close(queue_t *q)
 {
-	udp_t	*udp = (udp_t *)q->q_ptr;
-
-	TRACE_1(TR_FAC_UDP, TR_UDP_CLOSE,
-		"udp_close: q %p", q);
+	conn_t	*connp = Q_TO_CONN(UDP_WR(q));
+	udp_t	*udp;
+	queue_t	*ip_rq = RD(UDP_WR(q));
+
+	ASSERT(connp != NULL && IPCL_IS_UDP(connp));
+	udp = connp->conn_udp;
+
+	ip_quiesce_conn(connp);
+	/*
+	 * Disable read-side synchronous stream
+	 * interface and drain any queued data.
+	 */
+	udp_rcv_drain(q, udp, B_TRUE);
+	ASSERT(!udp->udp_direct_sockfs);
 
 	qprocsoff(q);
 
+	/* restore IP module's high and low water marks to default values */
+	ip_rq->q_hiwat = ip_rq->q_qinfo->qi_minfo->mi_hiwat;
+	WR(ip_rq)->q_hiwat = WR(ip_rq)->q_qinfo->qi_minfo->mi_hiwat;
+	WR(ip_rq)->q_lowat = WR(ip_rq)->q_qinfo->qi_minfo->mi_lowat;
+
+	ASSERT(udp->udp_rcv_cnt == 0);
+	ASSERT(udp->udp_rcv_msgcnt == 0);
+	ASSERT(udp->udp_rcv_list_head == NULL);
+	ASSERT(udp->udp_rcv_list_tail == NULL);
+
+	/* connp is now single threaded. */
+	udp_close_free(connp);
+	/*
+	 * Restore connp as an IP endpoint.  We don't need
+	 * any locks since we are now single threaded
+	 */
+	connp->conn_flags &= ~IPCL_UDP;
+	connp->conn_state_flags &=
+	    ~(CONN_CLOSING | CONN_CONDEMNED | CONN_QUIESCED);
+	return (0);
+}
+
+/*
+ * Called in the close path from IP (ip_quiesce_conn) to quiesce the conn
+ */
+void
+udp_quiesce_conn(conn_t *connp)
+{
+	udp_t	*udp = connp->conn_udp;
+
 	if (cl_inet_unbind != NULL && udp->udp_state == TS_IDLE) {
 		/*
 		 * Running in cluster mode - register unbind information
@@ -1215,16 +1840,30 @@
 	}
 
 	udp_bind_hash_remove(udp, B_FALSE);
-	mutex_enter(&udp_g_lock);
-	/* Unlink the udp structure and release the minor device number. */
-	mi_close_unlink(&udp_g_head, (IDP)udp);
-	mutex_exit(&udp_g_lock);
+
+	mutex_enter(&connp->conn_lock);
+	while (udp->udp_reader_count != 0 || udp->udp_squeue_count != 0 ||
+	    udp->udp_mode != UDP_MT_HOT) {
+		cv_wait(&connp->conn_cv, &connp->conn_lock);
+	}
+	mutex_exit(&connp->conn_lock);
+}
+
+void
+udp_close_free(conn_t *connp)
+{
+	udp_t *udp = connp->conn_udp;
+
 	/* If there are any options associated with the stream, free them. */
-	if (udp->udp_ip_snd_options)
+	if (udp->udp_ip_snd_options) {
 		mi_free((char *)udp->udp_ip_snd_options);
-
-	if (udp->udp_ip_rcv_options)
+		udp->udp_ip_snd_options = NULL;
+	}
+
+	if (udp->udp_ip_rcv_options) {
 		mi_free((char *)udp->udp_ip_rcv_options);
+		udp->udp_ip_rcv_options = NULL;
+	}
 
 	/* Free memory associated with sticky options */
 	if (udp->udp_sticky_hdrs_len != 0) {
@@ -1233,30 +1872,33 @@
 		udp->udp_sticky_hdrs = NULL;
 		udp->udp_sticky_hdrs_len = 0;
 	}
+
 	if (udp->udp_sticky_ipp.ipp_fields & IPPF_HOPOPTS) {
 		kmem_free(udp->udp_sticky_ipp.ipp_hopopts,
 		    udp->udp_sticky_ipp.ipp_hopoptslen);
+		udp->udp_sticky_ipp.ipp_hopopts = NULL;
 	}
 	if (udp->udp_sticky_ipp.ipp_fields & IPPF_RTDSTOPTS) {
 		kmem_free(udp->udp_sticky_ipp.ipp_rtdstopts,
 		    udp->udp_sticky_ipp.ipp_rtdstoptslen);
+		udp->udp_sticky_ipp.ipp_rtdstopts = NULL;
 	}
 	if (udp->udp_sticky_ipp.ipp_fields & IPPF_RTHDR) {
 		kmem_free(udp->udp_sticky_ipp.ipp_rthdr,
 		    udp->udp_sticky_ipp.ipp_rthdrlen);
+		udp->udp_sticky_ipp.ipp_rthdr = NULL;
 	}
 	if (udp->udp_sticky_ipp.ipp_fields & IPPF_DSTOPTS) {
 		kmem_free(udp->udp_sticky_ipp.ipp_dstopts,
 		    udp->udp_sticky_ipp.ipp_dstoptslen);
+		udp->udp_sticky_ipp.ipp_dstopts = NULL;
 	}
 	udp->udp_sticky_ipp.ipp_fields &=
 	    ~(IPPF_HOPOPTS|IPPF_RTDSTOPTS|IPPF_RTHDR|IPPF_DSTOPTS);
 
-	crfree(udp->udp_credp);
-	/* Free the data structure */
-	mi_close_free((IDP)udp);
-	q->q_ptr = WR(q)->q_ptr = NULL;
-	return (0);
+	udp->udp_connp = NULL;
+	connp->conn_udp = NULL;
+	kmem_cache_free(udp_cache, udp);
 }
 
 /*
@@ -1277,12 +1919,10 @@
 static void
 udp_disconnect(queue_t *q, mblk_t *mp)
 {
-	udp_t	*udp;
+	udp_t	*udp = Q_TO_UDP(q);
 	mblk_t	*mp1;
 	udp_fanout_t *udpf;
 
-	udp = (udp_t *)q->q_ptr;
-
 	if (udp->udp_state != TS_DATA_XFER) {
 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 		    "udp_disconnect: bad state, %u", udp->udp_state);
@@ -1331,7 +1971,16 @@
 
 	/* Append the T_OK_ACK to the T_BIND_REQ for udp_rput */
 	linkb(mp1, mp);
-	putnext(q, mp1);
+
+	if (udp->udp_family == AF_INET6)
+		mp1 = ip_bind_v6(q, mp1, udp->udp_connp, NULL);
+	else
+		mp1 = ip_bind_v4(q, mp1, udp->udp_connp);
+
+	if (mp1 != NULL)
+		udp_rput_other(_RD(q), mp1);
+	else
+		CONN_INC_REF(udp->udp_connp);
 }
 
 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
@@ -1339,7 +1988,7 @@
 udp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
 {
 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
-		qreply(q, mp);
+		putnext(UDP_RD(q), mp);
 }
 
 /* Shorthand to generate and send TPI error acks to our client */
@@ -1355,7 +2004,7 @@
 		teackp->ERROR_prim = primitive;
 		teackp->TLI_error = t_error;
 		teackp->UNIX_error = sys_error;
-		qreply(q, mp);
+		putnext(UDP_RD(q), mp);
 	}
 }
 
@@ -1372,10 +2021,6 @@
 	return (0);
 }
 
-/*
- * Hold udp_g_lock to prevent multiple threads from changing udp_g_epriv_ports
- * at the same time.
- */
 /* ARGSUSED */
 static int
 udp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
@@ -1393,11 +2038,9 @@
 		return (EINVAL);
 	}
 
-	mutex_enter(&udp_g_lock);
 	/* Check if the value is already in the list */
 	for (i = 0; i < udp_g_num_epriv_ports; i++) {
 		if (new_value == udp_g_epriv_ports[i]) {
-			mutex_exit(&udp_g_lock);
 			return (EEXIST);
 		}
 	}
@@ -1407,20 +2050,14 @@
 			break;
 	}
 	if (i == udp_g_num_epriv_ports) {
-		mutex_exit(&udp_g_lock);
 		return (EOVERFLOW);
 	}
 
 	/* Set the new value */
 	udp_g_epriv_ports[i] = (in_port_t)new_value;
-	mutex_exit(&udp_g_lock);
 	return (0);
 }
 
-/*
- * Hold udp_g_lock to prevent multiple threads from changing udp_g_epriv_ports
- * at the same time.
- */
 /* ARGSUSED */
 static int
 udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
@@ -1438,20 +2075,17 @@
 		return (EINVAL);
 	}
 
-	mutex_enter(&udp_g_lock);
 	/* Check that the value is already in the list */
 	for (i = 0; i < udp_g_num_epriv_ports; i++) {
 		if (udp_g_epriv_ports[i] == new_value)
 			break;
 	}
 	if (i == udp_g_num_epriv_ports) {
-		mutex_exit(&udp_g_lock);
 		return (ESRCH);
 	}
 
 	/* Clear the value */
 	udp_g_epriv_ports[i] = 0;
-	mutex_exit(&udp_g_lock);
 	return (0);
 }
 
@@ -1478,8 +2112,8 @@
 	sin6_t	sin6;
 	mblk_t	*mp1;
 	int	error = 0;
-	udp_t	*udp = (udp_t *)q->q_ptr;
 	size_t	mp_size = MBLKL(mp);
+	udp_t	*udp = Q_TO_UDP(q);
 
 	/*
 	 * Assume IP provides aligned packets - otherwise toss
@@ -1495,7 +2129,7 @@
 	 */
 	if (!udp->udp_dgram_errind || mp_size < sizeof (ipha_t)) {
 noticmpv4:
-		putnext(q, mp);
+		putnext(UDP_RD(q), mp);
 		return;
 	}
 
@@ -1590,7 +2224,7 @@
 		break;
 	}
 	if (mp1)
-		putnext(q, mp1);
+		putnext(UDP_RD(q), mp1);
 	freemsg(mp);
 }
 
@@ -1609,7 +2243,6 @@
 static void
 udp_icmp_error_ipv6(queue_t *q, mblk_t *mp)
 {
-	udp_t		*udp = (udp_t *)q->q_ptr;
 	icmp6_t		*icmp6;
 	ip6_t		*ip6h, *outer_ip6h;
 	uint16_t	hdr_length;
@@ -1619,13 +2252,14 @@
 	mblk_t		*mp1;
 	int		error = 0;
 	size_t		mp_size = MBLKL(mp);
+	udp_t		*udp = Q_TO_UDP(q);
 
 	/*
 	 * Verify that we have a complete IP header. If not, send it upstream.
 	 */
 	if (mp_size < sizeof (ip6_t)) {
 noticmpv6:
-		putnext(q, mp);
+		putnext(UDP_RD(q), mp);
 		return;
 	}
 
@@ -1736,7 +2370,7 @@
 		 * message.  Free it, then send our empty message.
 		 */
 		freemsg(mp);
-		putnext(q, newmp);
+		putnext(UDP_RD(q), newmp);
 		return;
 	}
 	case ICMP6_TIME_EXCEEDED:
@@ -1766,7 +2400,7 @@
 	mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0,
 	    error);
 	if (mp1)
-		putnext(q, mp1);
+		putnext(UDP_RD(q), mp1);
 	freemsg(mp);
 }
 
@@ -1780,11 +2414,11 @@
 static void
 udp_addr_req(queue_t *q, mblk_t *mp)
 {
-	udp_t	*udp = (udp_t *)q->q_ptr;
 	sin_t	*sin;
 	sin6_t	*sin6;
 	mblk_t	*ackmp;
 	struct T_addr_ack *taa;
+	udp_t	*udp = Q_TO_UDP(q);
 
 	/* Make it large enough for worst case */
 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
@@ -1894,7 +2528,7 @@
 		}
 	}
 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
-	qreply(q, ackmp);
+	putnext(UDP_RD(q), ackmp);
 }
 
 static void
@@ -1918,9 +2552,9 @@
 static void
 udp_capability_req(queue_t *q, mblk_t *mp)
 {
-	udp_t			*udp = (udp_t *)q->q_ptr;
 	t_uscalar_t		cap_bits1;
 	struct T_capability_ack	*tcap;
+	udp_t	*udp = Q_TO_UDP(q);
 
 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
 
@@ -1937,7 +2571,7 @@
 		tcap->CAP_bits1 |= TC1_INFO;
 	}
 
-	qreply(q, mp);
+	putnext(UDP_RD(q), mp);
 }
 
 /*
@@ -1948,7 +2582,7 @@
 static void
 udp_info_req(queue_t *q, mblk_t *mp)
 {
-	udp_t	*udp = (udp_t *)q->q_ptr;
+	udp_t *udp = Q_TO_UDP(q);
 
 	/* Create a T_INFO_ACK message. */
 	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
@@ -1956,7 +2590,7 @@
 	if (!mp)
 		return;
 	udp_copy_info((struct T_info_ack *)mp->b_rptr, udp);
-	qreply(q, mp);
+	putnext(UDP_RD(q), mp);
 }
 
 /*
@@ -2102,20 +2736,19 @@
  * This is the open routine for udp.  It allocates a udp_t structure for
  * the stream and, on the first open of the module, creates an ND table.
  */
+/* ARGSUSED */
 static int
 udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
 {
 	int	err;
 	udp_t	*udp;
+	conn_t *connp;
+	zoneid_t zoneid = getzoneid();
+	queue_t	*ip_wq;
+	char	*name;
 
 	TRACE_1(TR_FAC_UDP, TR_UDP_OPEN, "udp_open: q %p", q);
 
-	/*
-	 * Defer the qprocson until everything is initialized since
-	 * we are D_MTPERQ and after qprocson the rput routine can
-	 * run.
-	 */
-
 	/* If the stream is already open, return immediately. */
 	if (q->q_ptr != NULL)
 		return (0);
@@ -2124,85 +2757,110 @@
 	if (sflag != MODOPEN)
 		return (EINVAL);
 
+	q->q_hiwat = udp_recv_hiwat;
+	WR(q)->q_hiwat = udp_xmit_hiwat;
+	WR(q)->q_lowat = udp_xmit_lowat;
+
+	/* Insert ourselves in the stream since we're about to walk q_next */
+	qprocson(q);
+
+	udp = kmem_cache_alloc(udp_cache, KM_SLEEP);
+	bzero(udp, sizeof (*udp));
+
 	/*
-	 * Create and initialize a udp_t structure for this stream.
+	 * UDP is supported only as a module and it has to be pushed directly
+	 * above the device instance of IP. If UDP is pushed anywhere else
+	 * on a stream, it will support just T_SVR4_OPTMGMT_REQ for the
+	 * sake of MIB browsers and fail everything else.
 	 */
-	udp = (udp_t *)mi_open_alloc_sleep(sizeof (udp_t));
+	ip_wq = WR(q)->q_next;
+	if (ip_wq->q_next != NULL ||
+	    (name = ip_wq->q_qinfo->qi_minfo->mi_idname) == NULL ||
+	    strcmp(name, IP_MOD_NAME) != 0 ||
+	    ip_wq->q_qinfo->qi_minfo->mi_idnum != IP_MOD_ID) {
+		/* Support just SNMP for MIB browsers */
+		connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP);
+		connp->conn_rq = q;
+		connp->conn_wq = WR(q);
+		connp->conn_flags |= IPCL_UDPMOD;
+		connp->conn_cred = credp;
+		connp->conn_zoneid = zoneid;
+		connp->conn_udp = udp;
+		udp->udp_connp = connp;
+		q->q_ptr = WR(q)->q_ptr = connp;
+		crhold(credp);
+		q->q_qinfo = &udp_snmp_rinit;
+		WR(q)->q_qinfo = &udp_snmp_winit;
+		return (0);
+	}
+
+	/*
+	 * Initialize the udp_t structure for this stream.
+	 */
+	q = RD(ip_wq);
+	connp = Q_TO_CONN(q);
+	mutex_enter(&connp->conn_lock);
+	connp->conn_proto = IPPROTO_UDP;
+	connp->conn_flags |= IPCL_UDP;
+	connp->conn_sqp = IP_SQUEUE_GET(lbolt);
+	connp->conn_udp = udp;
 
 	/* Set the initial state of the stream and the privilege status. */
-	q->q_ptr = WR(q)->q_ptr = udp;
+	udp->udp_connp = connp;
 	udp->udp_state = TS_UNBND;
+	udp->udp_mode = UDP_MT_HOT;
 	if (getmajor(*devp) == (major_t)UDP6_MAJ) {
 		udp->udp_family = AF_INET6;
 		udp->udp_ipversion = IPV6_VERSION;
 		udp->udp_max_hdr_len = IPV6_HDR_LEN + UDPH_SIZE;
 		udp->udp_ttl = udp_ipv6_hoplimit;
+		connp->conn_af_isv6 = B_TRUE;
+		connp->conn_flags |= IPCL_ISV6;
 	} else {
 		udp->udp_family = AF_INET;
 		udp->udp_ipversion = IPV4_VERSION;
 		udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE;
 		udp->udp_ttl = udp_ipv4_ttl;
-	}
-
-	/*
-	 * The receive hiwat is only looked at on the stream head queue.
-	 * Store in q_hiwat in order to return on SO_RCVBUF getsockopts.
-	 */
-	q->q_hiwat = udp_recv_hiwat;
+		connp->conn_af_isv6 = B_FALSE;
+		connp->conn_flags &= ~IPCL_ISV6;
+	}
 
 	udp->udp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
-	udp->udp_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
-	udp->udp_credp = credp;
-	crhold(credp);
-
-	udp->udp_zoneid = getzoneid();
-
-	/*
-	 * Acquire the lock and link it into the list of open streams.
-	 */
-	mutex_enter(&udp_g_lock);
-	err = mi_open_link(&udp_g_head, (IDP)udp, devp, flag, sflag, credp);
-	mutex_exit(&udp_g_lock);
-	if (err != 0)
-		goto error;
-
-	qprocson(q);
+	connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
+	connp->conn_zoneid = zoneid;
+
+	if (connp->conn_flags & IPCL_SOCKET) {
+		udp->udp_issocket = B_TRUE;
+		udp->udp_direct_sockfs = B_TRUE;
+	}
+	mutex_exit(&connp->conn_lock);
 
 	/*
 	 * The transmit hiwat/lowat is only looked at on IP's queue.
-	 * Store in q_hiwat in order to return on SO_SNDBUF
+	 * Store in q_hiwat in order to return on SO_SNDBUF/SO_RCVBUF
 	 * getsockopts.
 	 */
+	q->q_hiwat = udp_recv_hiwat;
 	WR(q)->q_hiwat = udp_xmit_hiwat;
-	WR(q)->q_next->q_hiwat = WR(q)->q_hiwat;
 	WR(q)->q_lowat = udp_xmit_lowat;
-	WR(q)->q_next->q_lowat = WR(q)->q_lowat;
 
 	if (udp->udp_family == AF_INET6) {
 		/* Build initial header template for transmit */
 		if ((err = udp_build_hdrs(q, udp)) != 0) {
-			qprocsoff(q);
-			/*
-			 * Unlink the udp structure and release
-			 * the minor device number.
-			 */
-			mutex_enter(&udp_g_lock);
-			mi_close_unlink(&udp_g_head, (IDP)udp);
-			mutex_exit(&udp_g_lock);
-			goto error;
-		}
-	}
-
-	/* Set the Stream head write offset. */
-	(void) mi_set_sth_wroff(q, udp->udp_max_hdr_len + udp_wroff_extra);
-	(void) mi_set_sth_hiwat(q, q->q_hiwat);
+			qprocsoff(UDP_RD(q));
+			udp->udp_connp = NULL;
+			connp->conn_udp = NULL;
+			kmem_cache_free(udp_cache, udp);
+			return (err);
+		}
+	}
+
+	/* Set the Stream head write offset and high watermark. */
+	(void) mi_set_sth_wroff(UDP_RD(q),
+	    udp->udp_max_hdr_len + udp_wroff_extra);
+	(void) mi_set_sth_hiwat(UDP_RD(q), udp_set_rcv_hiwat(udp, q->q_hiwat));
+
 	return (0);
-
-error:
-	q->q_ptr = WR(q)->q_ptr = NULL;
-	crfree(credp);
-	mi_close_free((IDP)udp);
-	return (err);
 }
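
[Editor's note] The placement test in udp_open above is written as an OR-of-negations guarding the SNMP-only fallback, which takes a moment to parse. Restated positively as a sketch (the helper name is invented; the logic mirrors the hunk above):

	/*
	 * Sketch: UDP is "directly above IP" when the next write queue
	 * is the last on the stream and identifies itself as the IP
	 * driver by both module name and id.  Hypothetical helper, not
	 * part of this changeset.
	 */
	static boolean_t
	udp_directly_above_ip(queue_t *wq)
	{
		queue_t *ip_wq = wq->q_next;
		struct module_info *mi = ip_wq->q_qinfo->qi_minfo;

		return (ip_wq->q_next == NULL && mi->mi_idname != NULL &&
		    strcmp(mi->mi_idname, IP_MOD_NAME) == 0 &&
		    mi->mi_idnum == IP_MOD_ID);
	}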
 
 /*
@@ -2212,7 +2870,6 @@
 static boolean_t
 udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
 {
-
 	return (B_TRUE);
 }
 
@@ -2255,15 +2912,22 @@
 }
 
 /*
- * This routine retrieves the current status of socket options.
- * It returns the size of the option retrieved.
+ * This routine retrieves the current status of socket options
+ * and expects the caller to pass in the queue pointer of the
+ * upper instance.  It returns the size of the option retrieved.
  */
 int
 udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
 {
 	int	*i1 = (int *)ptr;
-	udp_t	*udp = (udp_t *)q->q_ptr;
-	ip6_pkt_t	*ipp = &udp->udp_sticky_ipp;
+	conn_t	*connp;
+	udp_t	*udp;
+	ip6_pkt_t *ipp;
+
+	q = UDP_WR(q);
+	connp = Q_TO_CONN(q);
+	udp = connp->conn_udp;
+	ipp = &udp->udp_sticky_ipp;
 
 	switch (level) {
 	case SOL_SOCKET:
@@ -2333,7 +2997,7 @@
 			*(uchar_t *)ptr = udp->udp_multicast_ttl;
 			return (sizeof (uchar_t));
 		case IP_MULTICAST_LOOP:
-			*ptr = udp->udp_multicast_loop;
+			*ptr = connp->conn_multicast_loop;
 			return (sizeof (uint8_t));
 		case IP_RECVOPTS:
 			*i1 = udp->udp_recvopts;
@@ -2394,7 +3058,7 @@
 			*i1 = udp->udp_multicast_ttl;
 			break;	/* goto sizeof (int) option return */
 		case IPV6_MULTICAST_LOOP:
-			*i1 = udp->udp_multicast_loop;
+			*i1 = connp->conn_multicast_loop;
 			break;	/* goto sizeof (int) option return */
 		case IPV6_JOIN_GROUP:
 		case IPV6_LEAVE_GROUP:
@@ -2520,18 +3184,26 @@
 	return (sizeof (int));
 }
 
-/* This routine sets socket options. */
+/*
+ * This routine sets socket options; it expects the caller
+ * to pass in the queue pointer of the upper instance.
+ */
 /* ARGSUSED */
 int
 udp_opt_set(queue_t *q, uint_t optset_context, int level,
     int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
     uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
 {
-	udp_t	*udp = (udp_t *)q->q_ptr;
 	int	*i1 = (int *)invalp;
 	boolean_t onoff = (*i1 == 0) ? 0 : 1;
 	boolean_t checkonly;
 	int	error;
+	conn_t	*connp;
+	udp_t	*udp;
+
+	q = UDP_WR(q);
+	connp = Q_TO_CONN(q);
+	udp = connp->conn_udp;
 
 	switch (optset_context) {
 	case SETFN_OPTCOM_CHECKONLY:
@@ -2619,7 +3291,7 @@
 			}
 			if (!checkonly) {
 				q->q_hiwat = *i1;
-				q->q_next->q_hiwat = *i1;
+				WR(UDP_RD(q))->q_hiwat = *i1;
 			}
 			break;
 		case SO_RCVBUF:
@@ -2629,7 +3301,9 @@
 			}
 			if (!checkonly) {
 				RD(q)->q_hiwat = *i1;
-				(void) mi_set_sth_hiwat(RD(q), *i1);
+				UDP_RD(q)->q_hiwat = *i1;
+				(void) mi_set_sth_hiwat(UDP_RD(q),
+				    udp_set_rcv_hiwat(udp, *i1));
 			}
 			break;
 		case SO_DGRAM_ERRIND:
@@ -2709,7 +3383,7 @@
 			break;
 		case IP_MULTICAST_LOOP:
 			if (!checkonly)
-				udp->udp_multicast_loop = *invalp;
+				connp->conn_multicast_loop = *invalp;
 			break;
 		case IP_RECVOPTS:
 			if (!checkonly)
@@ -2847,7 +3521,7 @@
 				return (EINVAL);
 			}
 			if (!checkonly)
-				udp->udp_multicast_loop = *i1;
+				connp->conn_multicast_loop = *i1;
 			break;
 		case IPV6_JOIN_GROUP:
 		case IPV6_LEAVE_GROUP:
@@ -3093,6 +3767,7 @@
 					ipp->ipp_rtdstopts = NULL;
 					ipp->ipp_rtdstoptslen = 0;
 				}
+
 				ipp->ipp_fields &= ~IPPF_RTDSTOPTS;
 				ipp->ipp_sticky_ignored |= IPPF_RTDSTOPTS;
 			} else {
@@ -3447,12 +4122,13 @@
 }
 
 static void
-udp_rput(queue_t *q, mblk_t *mp)
+udp_input(conn_t *connp, mblk_t *mp)
 {
 	struct T_unitdata_ind	*tudi;
-	uchar_t			*rptr;
-	int			hdr_length;
+	uchar_t			*rptr;		/* Pointer to IP header */
+	int			hdr_length;	/* Length of IP+UDP headers */
 	int			udi_size;	/* Size of T_unitdata_ind */
+	int			mp_len;
 	udp_t			*udp;
 	udpha_t			*udpha;
 	int			ipversion;
@@ -3462,104 +4138,56 @@
 	mblk_t			*mp1;
 	mblk_t			*options_mp = NULL;
 	in_pktinfo_t		*pinfo = NULL;
-	size_t			mp_size = MBLKL(mp);
 	cred_t			*cr = NULL;
+	queue_t			*q = connp->conn_rq;
 	pid_t			cpid;
 
 	TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_START,
 	    "udp_rput_start: q %p mp %p", q, mp);
 
-	udp = (udp_t *)q->q_ptr;
+	udp = connp->conn_udp;
 	rptr = mp->b_rptr;
-
-	switch (mp->b_datap->db_type) {
-	case M_DATA:
-		/*
-		 * M_DATA messages contain IP datagrams.  They are handled
-		 * after this switch.
-		 */
-		break;
-	case M_PROTO:
-	case M_PCPROTO:
-		/* M_PROTO messages contain some type of TPI message. */
-		if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) {
-			freemsg(mp);
-			TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
-				"udp_rput_end: q %p (%S)", q, "protoshort");
-			return;
-		}
-		qwriter(q, mp, udp_rput_other, PERIM_INNER);
-		TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
-			"udp_rput_end: q %p (%S)", q, "proto");
-		return;
-	case M_FLUSH:
-		if (*mp->b_rptr & FLUSHR)
-			flushq(q, FLUSHDATA);
-		putnext(q, mp);
-		TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
-			"udp_rput_end: q %p (%S)", q, "flush");
-		return;
-	case M_CTL:
-		if (udp->udp_recvif || udp->udp_recvslla ||
-		    udp->udp_ipv6_recvpktinfo) {
-			/*
-			 * IP should have prepended the options data in an M_CTL
-			 * Check M_CTL "type" to make sure are not here bcos of
-			 * a valid ICMP message
-			 */
-			if (mp_size == sizeof (in_pktinfo_t) &&
-			    ((in_pktinfo_t *)mp->b_rptr)->in_pkt_ulp_type ==
-			    IN_PKTINFO) {
-				pinfo = (in_pktinfo_t *)mp->b_rptr;
-				/*
-				 * Jump to normal data processing, this is not
-				 * an ICMP message
-				 */
-				break;
-			}
-		}
-		/*
-		 * ICMP messages.
-		 */
-		udp_icmp_error(q, mp);
-		TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
-			"udp_rput_end: q %p (%S)", q, "m_ctl");
-		return;
-	default:
-		putnext(q, mp);
-		TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
-			"udp_rput_end: q %p (%S)", q, "default");
-		return;
-	}
+	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL);
+	ASSERT(OK_32PTR(rptr));
 
 	/*
-	 * If we are here bcos the IP_RECVIF or IP_RECVSLLA then we need to
-	 * extract the mblk and adjust the rptr
+	 * IP should have prepended the options data in an M_CTL.
+	 * Check the M_CTL "type" to make sure we are not here
+	 * because of a valid ICMP message.
 	 */
-	if (pinfo != NULL) {
-		ASSERT(mp->b_datap->db_type == M_CTL);
-		options_mp = mp;
-		mp = mp->b_cont;
-		rptr = mp->b_rptr;
-		mp_size = MBLKL(mp);
-	}
+	if (DB_TYPE(mp) == M_CTL) {
+		if (MBLKL(mp) == sizeof (in_pktinfo_t) &&
+		    ((in_pktinfo_t *)mp->b_rptr)->in_pkt_ulp_type ==
+		    IN_PKTINFO) {
+			/*
+			 * IP_RECVIF or IP_RECVSLLA information has been
+			 * prepended to the packet by IP; extract the
+			 * data mblk and adjust the rptr accordingly.
+			 */
+			pinfo = (in_pktinfo_t *)mp->b_rptr;
+			options_mp = mp;
+			mp = mp->b_cont;
+			rptr = mp->b_rptr;
+			UDP_STAT(udp_in_pktinfo);
+		} else {
+			/*
+			 * ICMP messages.
+			 */
+			udp_icmp_error(q, mp);
+			TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
+				"udp_rput_end: q %p (%S)", q, "m_ctl");
+			return;
+		}
+	}
+
+	mp_len = msgdsize(mp);
 	/*
 	 * This is the inbound data path.
 	 * First, we check to make sure the IP version number is correct,
 	 * and then pull the IP and UDP headers into the first mblk.
-	 */
-	/*
 	 * Assume IP provides aligned packets - otherwise toss.
 	 * Also, check if we have a complete IP header.
 	 */
-	if (!OK_32PTR(rptr) || (mp_size < sizeof (ipha_t))) {
-tossit:
-		freemsg(mp);
-		if (options_mp != NULL)
-			freeb(options_mp);
-		BUMP_MIB(&udp_mib, udpInErrors);
-		return;
-	}
 
 	/* Initialize regardless if ipversion is IPv4 or IPv6 */
 	ipp.ipp_fields = 0;
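
[Editor's note] For reference, the message shape the M_CTL unwrapping a few lines up expects is an in_pktinfo_t control block with the datagram chained off b_cont. A hypothetical sketch of how such a message could be composed -- this is not how ip.c actually builds it, but the field names are exactly the ones udp_input() consumes:

	static mblk_t *
	udp_wrap_pktinfo_sketch(mblk_t *data_mp, uint_t ifindex)
	{
		mblk_t *ctl_mp;
		in_pktinfo_t *pinfo;

		if ((ctl_mp = allocb(sizeof (in_pktinfo_t), BPRI_MED)) == NULL)
			return (data_mp);	/* degrade: no ancillary data */
		DB_TYPE(ctl_mp) = M_CTL;
		pinfo = (in_pktinfo_t *)ctl_mp->b_rptr;
		pinfo->in_pkt_ulp_type = IN_PKTINFO;	/* checked by udp_input */
		pinfo->in_pkt_flags = IPF_RECVIF;
		pinfo->in_pkt_ifindex = ifindex;
		ctl_mp->b_wptr += sizeof (in_pktinfo_t);
		ctl_mp->b_cont = data_mp;	/* IP+UDP datagram follows */
		return (ctl_mp);
	}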
@@ -3567,10 +4195,9 @@
 	ipversion = IPH_HDR_VERSION(rptr);
 	switch (ipversion) {
 	case IPV4_VERSION:
+		ASSERT(MBLKL(mp) >= sizeof (ipha_t));
+		ASSERT(((ipha_t *)rptr)->ipha_protocol == IPPROTO_UDP);
 		hdr_length = IPH_HDR_LENGTH(rptr) + UDPH_SIZE;
-		/* Verify this is a UDP packet */
-		if (((ipha_t *)rptr)->ipha_protocol != IPPROTO_UDP)
-			goto tossit;
 		if ((hdr_length > IP_SIMPLE_HDR_LENGTH + UDPH_SIZE) ||
 		    (udp->udp_ip_rcv_options_len)) {
 			/*
@@ -3587,7 +4214,7 @@
 			 * the packet.
 			 */
 			udpha = (udpha_t *)(rptr + (hdr_length - UDPH_SIZE));
-			if (msgdsize(mp) != (ntohs(udpha->uha_length) +
+			if (mp_len != (ntohs(udpha->uha_length) +
 			    hdr_length - UDPH_SIZE)) {
 				goto tossit;
 			}
@@ -3597,14 +4224,16 @@
 			 */
 			if (pinfo != NULL)
 				mp = options_mp;
-			qwriter(q, mp, udp_rput_other, PERIM_INNER);
+			udp_become_writer(connp, mp, udp_rput_other_wrapper,
+			    SQTAG_UDP_INPUT);
 			TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
 				"udp_rput_end: q %p (%S)", q, "end");
 			return;
 		}
 
 		/* Handle IPV6_RECVHOPLIMIT. */
-		if ((udp->udp_family == AF_INET6) && (pinfo != NULL)) {
+		if ((udp->udp_family == AF_INET6) && (pinfo != NULL) &&
+		    udp->udp_ipv6_recvpktinfo) {
 			if (pinfo->in_pkt_flags & IPF_RECVIF) {
 				ipp.ipp_fields |= IPPF_IFINDEX;
 				ipp.ipp_ifindex = pinfo->in_pkt_ifindex;
@@ -3620,8 +4249,7 @@
 		ASSERT(udp->udp_family == AF_INET6);
 
 		ip6h = (ip6_t *)rptr;
-		if ((uchar_t *)&ip6h[1] > mp->b_wptr)
-			goto tossit;
+		ASSERT((uchar_t *)&ip6h[1] <= mp->b_wptr);
 
 		if (ip6h->ip6_nxt != IPPROTO_UDP) {
 			uint8_t nexthdrp;
@@ -3647,6 +4275,7 @@
 				if (MBLKL(mp) < (IPV6_HDR_LEN + UDPH_SIZE))
 					goto tossit;
 				ip6h = (ip6_t *)rptr;
+				mp_len = msgdsize(mp);
 			}
 			/*
 			 * Find any potentially interesting extension headers
@@ -3655,18 +4284,14 @@
 			 */
 			hdr_length = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdrp) +
 			    UDPH_SIZE;
-			/* Verify this is a UDP packet */
-			if (nexthdrp != IPPROTO_UDP)
-				goto tossit;
+			ASSERT(nexthdrp == IPPROTO_UDP);
 		} else {
 			hdr_length = IPV6_HDR_LEN + UDPH_SIZE;
 			ip6i = NULL;
 		}
 		break;
 	default:
-		TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
-			"udp_rput_end: q %p (%S)", q, "Unknown IP version");
-		goto tossit;
+		ASSERT(0);
 	}
 
 	/*
@@ -3677,14 +4302,15 @@
 	 */
 	udpha = (udpha_t *)(rptr + (hdr_length - UDPH_SIZE));
 	if ((MBLKL(mp) < hdr_length) ||
-	    (msgdsize(mp) != (ntohs(udpha->uha_length) +
-	    hdr_length - UDPH_SIZE))) {
+	    (mp_len != (ntohs(udpha->uha_length) + hdr_length - UDPH_SIZE))) {
 		goto tossit;
 	}
 
 	/* Walk past the headers. */
-	if (!udp->udp_rcvhdr)
+	if (!udp->udp_rcvhdr) {
 		mp->b_rptr = rptr + hdr_length;
+		mp_len -= hdr_length;
+	}
 
 	/*
 	 * This is the inbound data path.  Packets are passed upstream as
@@ -3706,6 +4332,7 @@
 		if (udp->udp_recvdstaddr) {
 			udi_size += sizeof (struct T_opthdr) +
 			    sizeof (struct in_addr);
+			UDP_STAT(udp_in_recvdstaddr);
 		}
 
 		/*
@@ -3714,25 +4341,28 @@
 		 */
 		if (udp->udp_recvif && (pinfo != NULL) &&
 		    (pinfo->in_pkt_flags & IPF_RECVIF)) {
-			udi_size += sizeof (struct T_opthdr) +
-				sizeof (uint_t);
+			udi_size += sizeof (struct T_opthdr) + sizeof (uint_t);
+			UDP_STAT(udp_in_recvif);
 		}
 
 		if (udp->udp_recvslla && (pinfo != NULL) &&
 		    (pinfo->in_pkt_flags & IPF_RECVSLLA)) {
 			udi_size += sizeof (struct T_opthdr) +
-				sizeof (struct sockaddr_dl);
+			    sizeof (struct sockaddr_dl);
+			UDP_STAT(udp_in_recvslla);
 		}
 
 		if (udp->udp_recvucred && (cr = DB_CRED(mp)) != NULL) {
 			udi_size += sizeof (struct T_opthdr) + ucredsize;
 			cpid = DB_CPID(mp);
+			UDP_STAT(udp_in_recvucred);
 		}
 		/*
 		 * If IP_RECVTTL is set allocate the appropriate sized buffer
 		 */
 		if (udp->udp_recvttl) {
 			udi_size += sizeof (struct T_opthdr) + sizeof (uint8_t);
+			UDP_STAT(udp_in_recvttl);
 		}
 
 		ASSERT(IPH_HDR_LENGTH((ipha_t *)rptr) == IP_SIMPLE_HDR_LENGTH);
@@ -3889,12 +4519,14 @@
 			    (ipp.ipp_fields & IPPF_HOPOPTS)) {
 				udi_size += sizeof (struct T_opthdr) +
 				    ipp.ipp_hopoptslen;
+				UDP_STAT(udp_in_recvhopopts);
 			}
 			if ((udp->udp_ipv6_recvdstopts ||
 				udp->udp_old_ipv6_recvdstopts) &&
 			    (ipp.ipp_fields & IPPF_DSTOPTS)) {
 				udi_size += sizeof (struct T_opthdr) +
 				    ipp.ipp_dstoptslen;
+				UDP_STAT(udp_in_recvdstopts);
 			}
 			if (((udp->udp_ipv6_recvdstopts &&
 			    udp->udp_ipv6_recvrthdr &&
@@ -3903,29 +4535,37 @@
 			    (ipp.ipp_fields & IPPF_RTDSTOPTS)) {
 				udi_size += sizeof (struct T_opthdr) +
 				    ipp.ipp_rtdstoptslen;
+				UDP_STAT(udp_in_recvrtdstopts);
 			}
 			if (udp->udp_ipv6_recvrthdr &&
 			    (ipp.ipp_fields & IPPF_RTHDR)) {
 				udi_size += sizeof (struct T_opthdr) +
 				    ipp.ipp_rthdrlen;
+				UDP_STAT(udp_in_recvrthdr);
 			}
 			if (udp->udp_ipv6_recvpktinfo &&
 			    (ipp.ipp_fields & IPPF_IFINDEX)) {
 				udi_size += sizeof (struct T_opthdr) +
 				    sizeof (struct in6_pktinfo);
+				UDP_STAT(udp_in_recvpktinfo);
 			}
 
 		}
 		if (udp->udp_recvucred && (cr = DB_CRED(mp)) != NULL) {
 			udi_size += sizeof (struct T_opthdr) + ucredsize;
 			cpid = DB_CPID(mp);
-		}
-
-		if (udp->udp_ipv6_recvhoplimit)
+			UDP_STAT(udp_in_recvucred);
+		}
+
+		if (udp->udp_ipv6_recvhoplimit) {
 			udi_size += sizeof (struct T_opthdr) + sizeof (int);
-
-		if (udp->udp_ipv6_recvtclass)
+			UDP_STAT(udp_in_recvhoplimit);
+		}
+
+		if (udp->udp_ipv6_recvtclass) {
 			udi_size += sizeof (struct T_opthdr) + sizeof (int);
+			UDP_STAT(udp_in_recvtclass);
+		}
 
 		mp1 = allocb(udi_size, BPRI_MED);
 		if (mp1 == NULL) {
@@ -3960,7 +4600,7 @@
 			sin6->sin6_flowinfo = 0;
 			sin6->sin6_scope_id = 0;
 			sin6->__sin6_src_id = ip_srcid_find_addr(&v6dst,
-			    udp->udp_zoneid);
+			    connp->conn_zoneid);
 		} else {
 			sin6->sin6_addr = ip6h->ip6_src;
 			/* No sin6_flowinfo per API */
@@ -3971,8 +4611,8 @@
 				sin6->sin6_scope_id = ipp.ipp_ifindex;
 			else
 				sin6->sin6_scope_id = 0;
-			sin6->__sin6_src_id =
-			    ip_srcid_find_addr(&ip6h->ip6_dst, udp->udp_zoneid);
+			sin6->__sin6_src_id = ip_srcid_find_addr(
+			    &ip6h->ip6_dst, connp->conn_zoneid);
 		}
 		sin6->sin6_port = udpha->uha_src_port;
 		sin6->sin6_family = udp->udp_family;
@@ -4133,7 +4773,45 @@
 		"udp_rput_end: q %p (%S)", q, "end");
 	if (options_mp != NULL)
 		freeb(options_mp);
-	putnext(q, mp);
+
+	if (udp->udp_direct_sockfs) {
+		/*
+		 * There is nothing above us except for the stream head;
+		 * use the read-side synchronous stream interface in
+	 * order to reduce the time spent in the interrupt thread.
+		 */
+		ASSERT(udp->udp_issocket);
+		udp_rcv_enqueue(UDP_RD(q), udp, mp, mp_len);
+	} else {
+		/*
+		 * Use regular STREAMS interface to pass data upstream
+		 * if this is not a socket endpoint, or if we have
+		 * switched over to the slow mode due to sockmod being
+		 * popped or a module being pushed on top of us.
+		 */
+		putnext(UDP_RD(q), mp);
+	}
+	return;
+
+tossit:
+	freemsg(mp);
+	if (options_mp != NULL)
+		freeb(options_mp);
+	BUMP_MIB(&udp_mib, udpInErrors);
+}
+
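+
[Editor's note] udp_rcv_enqueue(), used on the direct-sockfs path above, is defined elsewhere in this changeset. Conceptually it is a tail-insert onto a per-endpoint receive list with byte accounting, so the synchronous-stream read side can drain datagrams without a STREAMS service routine. A sketch under that assumption -- the lock and field names here are illustrative, not the actual udp_t layout:

	static void
	udp_rcv_enqueue_sketch(queue_t *q, udp_t *udp, mblk_t *mp, uint_t pkt_len)
	{
		mutex_enter(&udp->udp_recv_lock);	/* illustrative lock */
		if (udp->udp_rcv_list_tail == NULL)
			udp->udp_rcv_list_head = mp;
		else
			udp->udp_rcv_list_tail->b_next = mp;
		udp->udp_rcv_list_tail = mp;
		udp->udp_rcv_cnt += pkt_len;		/* bytes now queued */
		mutex_exit(&udp->udp_recv_lock);
		/* wake any thread blocked in the synchronous-stream read */
	}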
+void
+udp_conn_recv(conn_t *connp, mblk_t *mp)
+{
+	_UDP_ENTER(connp, mp, udp_input_wrapper, SQTAG_UDP_FANOUT);
+}
+
+/* ARGSUSED */
+static void
+udp_input_wrapper(void *arg, mblk_t *mp, void *arg2)
+{
+	udp_input((conn_t *)arg, mp);
+	_UDP_EXIT((conn_t *)arg);
 }
 
 /*
@@ -4152,18 +4830,17 @@
 	int			opt_len;	/* Length of IP options */
 	sin_t			*sin;
 	struct T_error_ack	*tea;
-	udp_t			*udp;
 	mblk_t			*options_mp = NULL;
 	in_pktinfo_t		*pinfo;
 	boolean_t		recv_on = B_FALSE;
 	cred_t			*cr = NULL;
+	udp_t			*udp = Q_TO_UDP(q);
 	pid_t			cpid;
 
 	TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_START,
 	    "udp_rput_other: q %p mp %p", q, mp);
 
 	ASSERT(OK_32PTR(mp->b_rptr));
-	udp = (udp_t *)q->q_ptr;
 	rptr = mp->b_rptr;
 
 	switch (mp->b_datap->db_type) {
@@ -4258,7 +4935,7 @@
 			freemsg(mp);
 			return;
 		}
-		putnext(q, mp);
+		putnext(UDP_RD(q), mp);
 		return;
 	}
 
@@ -4323,9 +5000,12 @@
 	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
 	if (udp->udp_recvdstaddr) {
 		udi_size += sizeof (struct T_opthdr) + sizeof (struct in_addr);
-	}
-	if (udp->udp_recvopts && opt_len > 0)
+		UDP_STAT(udp_in_recvdstaddr);
+	}
+	if (udp->udp_recvopts && opt_len > 0) {
 		udi_size += sizeof (struct T_opthdr) + opt_len;
+		UDP_STAT(udp_in_recvopts);
+	}
 
 	/*
 	 * If the IP_RECVSLLA or the IP_RECVIF is set then allocate
@@ -4333,25 +5013,28 @@
 	 */
 	if (udp->udp_recvif && recv_on &&
 	    (pinfo->in_pkt_flags & IPF_RECVIF)) {
-		udi_size += sizeof (struct T_opthdr) +
-		    sizeof (uint_t);
+		udi_size += sizeof (struct T_opthdr) + sizeof (uint_t);
+		UDP_STAT(udp_in_recvif);
 	}
 
 	if (udp->udp_recvslla && recv_on &&
 	    (pinfo->in_pkt_flags & IPF_RECVSLLA)) {
 		udi_size += sizeof (struct T_opthdr) +
 		    sizeof (struct sockaddr_dl);
+		UDP_STAT(udp_in_recvslla);
 	}
 
 	if (udp->udp_recvucred && (cr = DB_CRED(mp)) != NULL) {
 		udi_size += sizeof (struct T_opthdr) + ucredsize;
 		cpid = DB_CPID(mp);
+		UDP_STAT(udp_in_recvucred);
 	}
 	/*
 	 * If IP_RECVTTL is set allocate the appropriate sized buffer
 	 */
 	if (udp->udp_recvttl) {
 		udi_size += sizeof (struct T_opthdr) + sizeof (uint8_t);
+		UDP_STAT(udp_in_recvttl);
 	}
 
 	/* Allocate a message block for the T_UNITDATA_IND structure. */
@@ -4502,7 +5185,34 @@
 	    "udp_rput_other_end: q %p (%S)", q, "end");
 	if (options_mp != NULL)
 		freeb(options_mp);
-	putnext(q, mp);
+
+	if (udp->udp_direct_sockfs) {
+		/*
+		 * There is nothing above us except for the stream head;
+		 * use the read-side synchronous stream interface in
+	 * order to reduce the time spent in the interrupt thread.
+		 */
+		ASSERT(udp->udp_issocket);
+		udp_rcv_enqueue(UDP_RD(q), udp, mp, msgdsize(mp));
+	} else {
+		/*
+		 * Use regular STREAMS interface to pass data upstream
+		 * if this is not a socket endpoint, or if we have
+		 * switched over to the slow mode due to sockmod being
+		 * popped or a module being pushed on top of us.
+		 */
+		putnext(UDP_RD(q), mp);
+	}
+}
+
+/* ARGSUSED */
+static void
+udp_rput_other_wrapper(void *arg, mblk_t *mp, void *arg2)
+{
+	conn_t *connp = arg;
+
+	udp_rput_other(connp->conn_rq, mp);
+	udp_exit(connp);
 }
 
 /*
@@ -4511,7 +5221,7 @@
 static void
 udp_rput_bind_ack(queue_t *q, mblk_t *mp)
 {
-	udp_t	*udp = (udp_t *)q->q_ptr;
+	udp_t	*udp = Q_TO_UDP(q);
 	mblk_t	*mp1;
 	ire_t	*ire;
 	struct T_bind_ack *tba;
@@ -4602,20 +5312,20 @@
 		while (mp != NULL) {
 			mp1 = mp->b_cont;
 			mp->b_cont = NULL;
-			putnext(q, mp);
+			putnext(UDP_RD(q), mp);
 			mp = mp1;
 		}
 		return;
 	}
 	freemsg(mp->b_cont);
 	mp->b_cont = NULL;
-	putnext(q, mp);
+	putnext(UDP_RD(q), mp);
 }
 
 /*
  * return SNMP stuff in buffer in mpdata
  */
-static int
+int
 udp_snmp_get(queue_t *q, mblk_t *mpctl)
 {
 	mblk_t			*mpdata;
@@ -4626,12 +5336,14 @@
 	mblk_t			*mp_conn_tail = NULL;
 	mblk_t			*mp6_conn_tail = NULL;
 	struct opthdr		*optp;
-	IDP			idp;
-	udp_t			*udp;
 	mib2_udpEntry_t		ude;
 	mib2_udp6Entry_t	ude6;
 	int			state;
 	zoneid_t		zoneid;
+	int			i;
+	connf_t			*connfp;
+	conn_t			*connp = Q_TO_CONN(q);
+	udp_t			*udp = connp->conn_udp;
 
 	if (mpctl == NULL ||
 	    (mpdata = mpctl->b_cont) == NULL ||
@@ -4644,8 +5356,7 @@
 	mp_conn_data = mp_conn_ctl->b_cont;
 	mp6_conn_data = mp6_conn_ctl->b_cont;
 
-	udp = (udp_t *)q->q_ptr;
-	zoneid = udp->udp_zoneid;
+	zoneid = connp->conn_zoneid;
 
 	/* fixed length structure for IPv4 and IPv6 counters */
 	SET_MIB(udp_mib.udpEntrySize, sizeof (mib2_udpEntry_t));
@@ -4657,76 +5368,88 @@
 	optp->len = msgdsize(mpdata);
 	qreply(q, mpctl);
 
-	mutex_enter(&udp_g_lock);
-	for (idp = mi_first_ptr(&udp_g_head);
-	    (udp = (udp_t *)idp) != 0;
-	    idp = mi_next_ptr(&udp_g_head, idp)) {
-
-		if (zoneid != udp->udp_zoneid)
-			continue;
-
-		/* Note that the port numbers are sent in host byte order */
-
-		if (udp->udp_state == TS_UNBND)
-			state = MIB2_UDP_unbound;
-		else if (udp->udp_state == TS_IDLE)
-			state = MIB2_UDP_idle;
-		else if (udp->udp_state == TS_DATA_XFER)
-			state = MIB2_UDP_connected;
-		else
-			state = MIB2_UDP_unknown;
-
-		/*
-		 * Create an IPv4 table entry for IPv4 entries and also
-		 * any IPv6 entries which are bound to in6addr_any
-		 * (i.e. anything a IPv4 peer could connect/send to).
-		 */
-		if (udp->udp_ipversion == IPV4_VERSION ||
-		    (udp->udp_state <= TS_IDLE &&
-		    IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src))) {
-			ude.udpEntryInfo.ue_state = state;
-			/* If in6addr_any this will set it to INADDR_ANY */
-			ude.udpLocalAddress = V4_PART_OF_V6(udp->udp_v6src);
-			ude.udpLocalPort = ntohs(udp->udp_port);
-			if (udp->udp_state == TS_DATA_XFER) {
+	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
+		connfp = &ipcl_globalhash_fanout[i];
+		connp = NULL;
+
+		while ((connp = ipcl_get_next_conn(connfp, connp,
+		    IPCL_UDP))) {
+			udp = connp->conn_udp;
+			if (zoneid != connp->conn_zoneid)
+				continue;
+
+			/*
+			 * Note that the port numbers are sent in
+			 * host byte order
+			 */
+
+			if (udp->udp_state == TS_UNBND)
+				state = MIB2_UDP_unbound;
+			else if (udp->udp_state == TS_IDLE)
+				state = MIB2_UDP_idle;
+			else if (udp->udp_state == TS_DATA_XFER)
+				state = MIB2_UDP_connected;
+			else
+				state = MIB2_UDP_unknown;
+
+			/*
+			 * Create an IPv4 table entry for IPv4 entries and also
+			 * any IPv6 entries which are bound to in6addr_any
+			 * (i.e. anything an IPv4 peer could connect/send to).
+			 */
+			if (udp->udp_ipversion == IPV4_VERSION ||
+			    (udp->udp_state <= TS_IDLE &&
+			    IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src))) {
+				ude.udpEntryInfo.ue_state = state;
 				/*
-				 * Can potentially get here for v6 socket
-				 * if another process (say, ping) has just
-				 * done a sendto(), changing the state
-				 * from the TS_IDLE above to TS_DATA_XFER
-				 * by the time we hit this part of the code.
+				 * If in6addr_any this will set it to
+				 * INADDR_ANY
 				 */
-				ude.udpEntryInfo.ue_RemoteAddress =
-				    V4_PART_OF_V6(udp->udp_v6dst);
-				ude.udpEntryInfo.ue_RemotePort =
-				    ntohs(udp->udp_dstport);
-			} else {
-				ude.udpEntryInfo.ue_RemoteAddress = 0;
-				ude.udpEntryInfo.ue_RemotePort = 0;
+				ude.udpLocalAddress =
+				    V4_PART_OF_V6(udp->udp_v6src);
+				ude.udpLocalPort = ntohs(udp->udp_port);
+				if (udp->udp_state == TS_DATA_XFER) {
+					/*
+					 * Can potentially get here for
+					 * v6 socket if another process
+					 * (say, ping) has just done a
+					 * sendto(), changing the state
+					 * from the TS_IDLE above to
+					 * TS_DATA_XFER by the time we hit
+					 * this part of the code.
+					 */
+					ude.udpEntryInfo.ue_RemoteAddress =
+					    V4_PART_OF_V6(udp->udp_v6dst);
+					ude.udpEntryInfo.ue_RemotePort =
+					    ntohs(udp->udp_dstport);
+				} else {
+					ude.udpEntryInfo.ue_RemoteAddress = 0;
+					ude.udpEntryInfo.ue_RemotePort = 0;
+				}
+				(void) snmp_append_data2(mp_conn_data,
+				    &mp_conn_tail, (char *)&ude, sizeof (ude));
 			}
-			(void) snmp_append_data2(mp_conn_data, &mp_conn_tail,
-			    (char *)&ude, sizeof (ude));
-		}
-		if (udp->udp_ipversion == IPV6_VERSION) {
-			ude6.udp6EntryInfo.ue_state  = state;
-			ude6.udp6LocalAddress = udp->udp_v6src;
-			ude6.udp6LocalPort = ntohs(udp->udp_port);
-			ude6.udp6IfIndex = udp->udp_bound_if;
-			if (udp->udp_state == TS_DATA_XFER) {
-				ude6.udp6EntryInfo.ue_RemoteAddress =
-				    udp->udp_v6dst;
-				ude6.udp6EntryInfo.ue_RemotePort =
-				    ntohs(udp->udp_dstport);
-			} else {
-				ude6.udp6EntryInfo.ue_RemoteAddress =
-				    sin6_null.sin6_addr;
-				ude6.udp6EntryInfo.ue_RemotePort = 0;
+			if (udp->udp_ipversion == IPV6_VERSION) {
+				ude6.udp6EntryInfo.ue_state  = state;
+				ude6.udp6LocalAddress = udp->udp_v6src;
+				ude6.udp6LocalPort = ntohs(udp->udp_port);
+				ude6.udp6IfIndex = udp->udp_bound_if;
+				if (udp->udp_state == TS_DATA_XFER) {
+					ude6.udp6EntryInfo.ue_RemoteAddress =
+					    udp->udp_v6dst;
+					ude6.udp6EntryInfo.ue_RemotePort =
+					    ntohs(udp->udp_dstport);
+				} else {
+					ude6.udp6EntryInfo.ue_RemoteAddress =
+					    sin6_null.sin6_addr;
+					ude6.udp6EntryInfo.ue_RemotePort = 0;
+				}
+				(void) snmp_append_data2(mp6_conn_data,
+				    &mp6_conn_tail, (char *)&ude6,
+				    sizeof (ude6));
 			}
-			(void) snmp_append_data2(mp6_conn_data, &mp6_conn_tail,
-			    (char *)&ude6, sizeof (ude6));
-		}
-	}
-	mutex_exit(&udp_g_lock);
+		}
+	}
 
 	/* IPv4 UDP endpoints */
 	optp = (struct opthdr *)&mp_conn_ctl->b_rptr[
@@ -4754,7 +5477,7 @@
  * to do the appropriate locking.
  */
 /* ARGSUSED */
-static int
+int
 udp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
     uchar_t *ptr, int len)
 {
@@ -4789,7 +5512,7 @@
 		state = "UnkState";
 	print_len = snprintf((char *)mp->b_wptr, buf_len,
 	    MI_COL_PTRFMT_STR "%4d %5u %s %s %5u %s\n",
-	    (void *)udp, udp->udp_zoneid, ntohs(udp->udp_port),
+	    (void *)udp, udp->udp_connp->conn_zoneid, ntohs(udp->udp_port),
 	    inet_ntop(AF_INET6, &udp->udp_v6src,
 		addrbuf1, sizeof (addrbuf1)),
 	    inet_ntop(AF_INET6, &udp->udp_v6dst,
@@ -4807,9 +5530,11 @@
 static int
 udp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
 {
-	IDP	idp;
-	udp_t	*udp;
 	zoneid_t zoneid;
+	connf_t	*connfp;
+	conn_t	*connp = Q_TO_CONN(q);
+	udp_t	*udp = connp->conn_udp;
+	int	i;
 
 	/*
 	 * Because of the ndd constraint, at most we can have 64K buffer
@@ -4837,21 +5562,22 @@
 	    " zone lport src addr        dest addr       port  state");
 	/*    1234 12345 xxx.xxx.xxx.xxx xxx.xxx.xxx.xxx 12345 UNBOUND */
 
-	udp = (udp_t *)q->q_ptr;
-	zoneid = udp->udp_zoneid;
-
-	mutex_enter(&udp_g_lock);
-	for (idp = mi_first_ptr(&udp_g_head);
-	    (udp = (udp_t *)idp) != 0;
-	    idp = mi_next_ptr(&udp_g_head, idp)) {
-
-		if (zoneid != GLOBAL_ZONEID &&
-		    zoneid != udp->udp_zoneid)
-			continue;
-
-		udp_report_item(mp->b_cont, udp);
-	}
-	mutex_exit(&udp_g_lock);
+	zoneid = connp->conn_zoneid;
+
+	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
+		connfp = &ipcl_globalhash_fanout[i];
+		connp = NULL;
+
+		while ((connp = ipcl_get_next_conn(connfp, connp,
+		    IPCL_UDP))) {
+			udp = connp->conn_udp;
+			if (zoneid != GLOBAL_ZONEID &&
+			    zoneid != connp->conn_zoneid)
+				continue;
+
+			udp_report_item(mp->b_cont, udp);
+		}
+	}
 	udp_last_ndd_get_info_time = ddi_get_lbolt();
 	return (0);
 }
@@ -4862,32 +5588,44 @@
  * passed in mp.  This message is freed.
  */
 static void
-udp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
+udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr, t_scalar_t destlen,
+    t_scalar_t err)
 {
+	struct T_unitdata_req *tudr;
 	mblk_t	*mp1;
-	struct T_unitdata_req	*tudr = (struct T_unitdata_req *)mp->b_rptr;
-	uchar_t	*destaddr, *optaddr;
-
-	if ((mp->b_wptr < mp->b_rptr) ||
-	    (mp->b_wptr - mp->b_rptr) < sizeof (struct T_unitdata_req)) {
-		goto done;
-	}
-	destaddr = mp->b_rptr + tudr->DEST_offset;
-	if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
-	    destaddr + tudr->DEST_length < mp->b_rptr ||
-	    destaddr + tudr->DEST_length > mp->b_wptr) {
-		goto done;
-	}
-	optaddr = mp->b_rptr + tudr->OPT_offset;
-	if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
-	    optaddr + tudr->OPT_length < mp->b_rptr ||
-	    optaddr + tudr->OPT_length > mp->b_wptr) {
-		goto done;
-	}
-	mp1 = mi_tpi_uderror_ind((char *)destaddr, tudr->DEST_length,
-	    (char *)optaddr, tudr->OPT_length, err);
-	if (mp1)
-		qreply(q, mp1);
+	uchar_t	*optaddr;
+	t_scalar_t optlen;
+
+	if (DB_TYPE(mp) == M_DATA) {
+		ASSERT(destaddr != NULL && destlen != 0);
+		optaddr = NULL;
+		optlen = 0;
+	} else {
+		if ((mp->b_wptr < mp->b_rptr) ||
+		    (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
+			goto done;
+		}
+		tudr = (struct T_unitdata_req *)mp->b_rptr;
+		destaddr = mp->b_rptr + tudr->DEST_offset;
+		if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
+		    destaddr + tudr->DEST_length < mp->b_rptr ||
+		    destaddr + tudr->DEST_length > mp->b_wptr) {
+			goto done;
+		}
+		optaddr = mp->b_rptr + tudr->OPT_offset;
+		if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
+		    optaddr + tudr->OPT_length < mp->b_rptr ||
+		    optaddr + tudr->OPT_length > mp->b_wptr) {
+			goto done;
+		}
+		destlen = tudr->DEST_length;
+		optlen = tudr->OPT_length;
+	}
+
+	mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
+	    (char *)optaddr, optlen, err);
+	if (mp1 != NULL)
+		putnext(UDP_RD(q), mp1);
 
 done:
 	freemsg(mp);
@@ -4900,9 +5638,8 @@
 static void
 udp_unbind(queue_t *q, mblk_t *mp)
 {
-	udp_t	*udp;
-
-	udp = (udp_t *)q->q_ptr;
+	udp_t *udp = Q_TO_UDP(q);
+
 	/* If a bind has not been done, we can't unbind. */
 	if (udp->udp_state == TS_UNBND) {
 		udp_err_ack(q, mp, TOUTSTATE, 0);
@@ -4939,8 +5676,13 @@
 			return;
 		}
 	}
-	/* Pass the unbind to IP */
-	putnext(q, mp);
+	/*
+	 * Pass the unbind to IP; T_UNBIND_REQ is larger than T_OK_ACK
+	 * and therefore ip_unbind must never return NULL.
+	 */
+	mp = ip_unbind(q, mp);
+	ASSERT(mp != NULL);
+	putnext(UDP_RD(q), mp);
 }
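
[Editor's note] The "must never return NULL" claim above rests on the TPI convention of building acks in place inside the request mblk, so no allocation is needed when the buffer already has room. A hedged sketch of that in-place construction (the actual ip_unbind() body is not part of this section):

	/* Build the T_OK_ACK in place when the request buffer has room */
	if (mp->b_rptr + sizeof (struct T_ok_ack) <= DB_LIM(mp)) {
		struct T_ok_ack *oka = (struct T_ok_ack *)mp->b_rptr;

		mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack);
		oka->PRIM_type = T_OK_ACK;
		oka->CORRECT_prim = T_UNBIND_REQ;
		DB_TYPE(mp) = M_PCPROTO;
	}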
 
 /*
@@ -4994,193 +5736,47 @@
 	return (port);
 }
 
-/*
- * This routine handles all messages passed downstream.  It either
- * consumes the message or passes it downstream; it never queues a
- * a message.
- */
-static void
-udp_wput(queue_t *q, mblk_t *mp)
+static mblk_t *
+udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port,
+    uint_t srcid, int *error)
 {
-	uchar_t		*rptr = mp->b_rptr;
-	struct 		datab *db;
-	ipha_t		*ipha;
-	udpha_t		*udpha;
-	mblk_t		*mp1;
-	int		ip_hdr_length;
-#define	tudr ((struct T_unitdata_req *)rptr)
-	uint32_t	ip_len;
-	udp_t		*udp;
-	sin6_t		*sin6;
-	sin_t		*sin;
-	ipaddr_t	v4dst;
-	uint16_t	port;
-	uint_t		srcid;
-
-	TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_START,
-		"udp_wput_start: q %p mp %p", q, mp);
-
-	db = mp->b_datap;
-	switch (db->db_type) {
-	case M_PROTO:
-	case M_PCPROTO:
-		ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
-		if (mp->b_wptr - rptr >= sizeof (struct T_unitdata_req)) {
-			/* Detect valid T_UNITDATA_REQ here */
-			if (((union T_primitives *)rptr)->type
-			    == T_UNITDATA_REQ)
-				break;
-		}
-		/* FALLTHRU */
-	default:
-		qwriter(q, mp, udp_wput_other, PERIM_INNER);
-		return;
-	}
-
-	udp = (udp_t *)q->q_ptr;
-
-	/* Handle UNITDATA_REQ messages here */
-	if (udp->udp_state == TS_UNBND) {
-		/* If a port has not been bound to the stream, fail. */
-		BUMP_MIB(&udp_mib, udpOutErrors);
-		udp_ud_err(q, mp, EPROTO);
-		TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-			"udp_wput_end: q %p (%S)", q, "outstate");
-		return;
-	}
-	mp1 = mp->b_cont;
-	if (mp1 == NULL) {
-		BUMP_MIB(&udp_mib, udpOutErrors);
-		udp_ud_err(q, mp, EPROTO);
-		TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-			"udp_wput_end: q %p (%S)", q, "badaddr");
-		return;
-	}
-
-	if ((rptr + tudr->DEST_offset + tudr->DEST_length) > mp->b_wptr) {
-		BUMP_MIB(&udp_mib, udpOutErrors);
-		udp_ud_err(q, mp, EADDRNOTAVAIL);
-		TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-			"udp_wput_end: q %p (%S)", q, "badaddr");
-		return;
-	}
-
-	switch (udp->udp_family) {
-	case AF_INET6:
-		sin6 = (sin6_t *)&rptr[tudr->DEST_offset];
-		if (!OK_32PTR((char *)sin6) ||
-		    tudr->DEST_length != sizeof (sin6_t) ||
-		    sin6->sin6_family != AF_INET6) {
-			BUMP_MIB(&udp_mib, udpOutErrors);
-			udp_ud_err(q, mp, EADDRNOTAVAIL);
-			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-				"udp_wput_end: q %p (%S)", q, "badaddr");
-			return;
-		}
-
-		if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
-			/*
-			 * Destination is a non-IPv4-compatible IPv6 address.
-			 * Send out an IPv6 format packet.
-			 */
-			udp_wput_ipv6(q, mp, sin6, tudr->OPT_length);
-			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-				"udp_wput_end: q %p (%S)", q, "udp_wput_ipv6");
-			return;
-		}
-		/*
-		 * If the local address is not zero or a mapped address return
-		 * an error.
-		 * I would be possible to send an IPv4 packet but the
-		 * response would never make it back to the application
-		 * since it is bound to a non-mapped address.
-		 */
-		if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src) &&
-		    !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) {
-			BUMP_MIB(&udp_mib, udpOutErrors);
-			udp_ud_err(q, mp, EADDRNOTAVAIL);
-			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-				"udp_wput_end: q %p (%S)", q, "badaddr");
-			return;
-		}
-		/* Send IPv4 packet without modifying udp_ipversion */
-		/* Extract port and ipaddr */
-		port = sin6->sin6_port;
-		IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4dst);
-		srcid = sin6->__sin6_src_id;
-		break;
-
-	case AF_INET:
-		sin = (sin_t *)&rptr[tudr->DEST_offset];
-		if (!OK_32PTR((char *)sin) ||
-		    tudr->DEST_length != sizeof (sin_t) ||
-		    sin->sin_family != AF_INET) {
-			BUMP_MIB(&udp_mib, udpOutErrors);
-			udp_ud_err(q, mp, EADDRNOTAVAIL);
-			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-				"udp_wput_end: q %p (%S)", q, "badaddr");
-			return;
-		}
-		/* Extract port and ipaddr */
-		port = sin->sin_port;
-		v4dst = sin->sin_addr.s_addr;
-		srcid = 0;
-		break;
-	}
-
-
-	/*
-	 * If options passed in, feed it for verification and handling
-	 */
-	if (tudr->OPT_length != 0) {
-		int error;
-
-		if (udp_unitdata_opt_process(q, mp, &error, NULL) < 0) {
-			/* failure */
-			BUMP_MIB(&udp_mib, udpOutErrors);
-			udp_ud_err(q, mp, error);
-			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-			    "udp_wput_end: q %p (%S)", q,
-			    "udp_unitdata_opt_process");
-			return;
-		}
-		ASSERT(error == 0);
-		/*
-		 * Note: success in processing options.
-		 * mp option buffer represented by
-		 * OPT_length/offset now potentially modified
-		 * and contain option setting results
-		 */
-	}
+	udp_t	*udp = connp->conn_udp;
+	queue_t	*q = connp->conn_wq;
+	mblk_t	*mp1 = (DB_TYPE(mp) == M_DATA ? mp : mp->b_cont);
+	mblk_t	*mp2;
+	ipha_t	*ipha;
+	int	ip_hdr_length;
+	uint32_t ip_len;
+	udpha_t	*udpha;
+
+	*error = 0;
+
+	/* mp1 points to the M_DATA mblk carrying the packet */
+	ASSERT(mp1 != NULL && DB_TYPE(mp1) == M_DATA);
 
 	/* Add an IP header */
 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE +
 	    udp->udp_ip_snd_options_len;
 	ipha = (ipha_t *)&mp1->b_rptr[-ip_hdr_length];
-	if ((mp1->b_datap->db_ref != 1) ||
-	    ((uchar_t *)ipha < mp1->b_datap->db_base) ||
+	if (DB_REF(mp1) != 1 || (uchar_t *)ipha < DB_BASE(mp1) ||
 	    !OK_32PTR(ipha)) {
-		uchar_t *wptr;
-
-		mp1 = allocb(ip_hdr_length + udp_wroff_extra, BPRI_LO);
-		if (!mp1) {
-			BUMP_MIB(&udp_mib, udpOutErrors);
-			udp_ud_err(q, mp, ENOMEM);
+		mp2 = allocb(ip_hdr_length + udp_wroff_extra, BPRI_LO);
+		if (mp2 == NULL) {
 			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
-				"udp_wput_end: q %p (%S)", q, "allocbfail2");
-			return;
-		}
-		mp1->b_cont = mp->b_cont;
-		mp->b_cont = mp1;
-		wptr = mp1->b_datap->db_lim;
-		mp1->b_wptr = wptr;
-		ipha = (ipha_t *)(wptr - ip_hdr_length);
-	}
-	mp1->b_rptr = (uchar_t *)ipha;
-
-	ASSERT((uintptr_t)(mp1->b_wptr - (uchar_t *)ipha) <=
-	    (uintptr_t)UINT_MAX);
-
+			    "udp_wput_end: q %p (%S)", q, "allocbfail2");
+			*error = ENOMEM;
+			goto done;
+		}
+		mp2->b_wptr = DB_LIM(mp2);
+		mp2->b_cont = mp1;
+		mp1 = mp2;
+		if (DB_TYPE(mp) != M_DATA)
+			mp->b_cont = mp1;
+		else
+			mp = mp1;
+
+		ipha = (ipha_t *)(mp1->b_wptr - ip_hdr_length);
+	}
 	ip_hdr_length -= UDPH_SIZE;
 #ifdef	_BIG_ENDIAN
 	/* Set version, header length, and tos */
@@ -5206,24 +5802,25 @@
 	if (srcid != 0 && ipha->ipha_src == INADDR_ANY) {
 		in6_addr_t v6src;
 
-		ip_srcid_find_id(srcid, &v6src, udp->udp_zoneid);
+		ip_srcid_find_id(srcid, &v6src, connp->conn_zoneid);
 		IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src);
 	}
 
 	ipha->ipha_fragment_offset_and_flags = 0;
 	ipha->ipha_ident = 0;
 
+	mp1->b_rptr = (uchar_t *)ipha;
+
+	ASSERT((uintptr_t)(mp1->b_wptr - (uchar_t *)ipha) <=
+	    (uintptr_t)UINT_MAX);
+
 	/* Determine length of packet */
 	ip_len = (uint32_t)(mp1->b_wptr - (uchar_t *)ipha);
-	{
-		mblk_t	*mp2;
-		if ((mp2 = mp1->b_cont) != NULL) {
-			do {
-				ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr)
-				    <= (uintptr_t)UINT_MAX);
-				ip_len += (uint32_t)(mp2->b_wptr - mp2->b_rptr);
-			} while ((mp2 = mp2->b_cont) != NULL);
-		}
+	if ((mp2 = mp1->b_cont) != NULL) {
+		do {
+			ASSERT((uintptr_t)MBLKL(mp2) <= (uintptr_t)UINT_MAX);
+			ip_len += (uint32_t)MBLKL(mp2);
+		} while ((mp2 = mp2->b_cont) != NULL);
 	}
 	/*
 	 * If the size of the packet is greater than the maximum allowed by
@@ -5231,19 +5828,18 @@
 	 * the size will have wrapped and be inconsistent with the msg size.
 	 */
 	if (ip_len > IP_MAXPACKET) {
-		BUMP_MIB(&udp_mib, udpOutErrors);
-		udp_ud_err(q, mp, EMSGSIZE);
 		TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
 		    "udp_wput_end: q %p (%S)", q, "IP length exceeded");
-		return;
+		*error = EMSGSIZE;
+		goto done;
 	}
 	ipha->ipha_length = htons((uint16_t)ip_len);
 	ip_len -= ip_hdr_length;
 	ip_len = htons((uint16_t)ip_len);
 	udpha = (udpha_t *)(((uchar_t *)ipha) + ip_hdr_length);
+
 	/*
-	 * Copy in the destination address and port from the T_UNITDATA
-	 * request
+	 * Copy in the destination address
 	 */
 	if (v4dst == INADDR_ANY)
 		ipha->ipha_dst = htonl(INADDR_LOOPBACK);
@@ -5310,41 +5906,648 @@
 	/* Set UDP length and checksum */
 	*((uint32_t *)&udpha->uha_length) = ip_len;
 
-	freeb(mp);
+	if (DB_TYPE(mp) != M_DATA) {
+		ASSERT(mp != mp1);
+		freeb(mp);
+	}
+
+	/* mp has been consumed and we'll return success */
+	ASSERT(*error == 0);
+	mp = NULL;
 
 	/* We're done.  Pass the packet to ip. */
 	BUMP_MIB(&udp_mib, udpOutDatagrams);
 	TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
 		"udp_wput_end: q %p (%S)", q, "end");
-	putnext(q, mp1);
-#undef tudr
+
+	if ((connp->conn_flags & IPCL_CHECK_POLICY) != 0 ||
+	    CONN_OUTBOUND_POLICY_PRESENT(connp) ||
+	    connp->conn_dontroute || connp->conn_xmit_if_ill != NULL ||
+	    connp->conn_nofailover_ill != NULL ||
+	    connp->conn_outgoing_ill != NULL ||
+	    ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION ||
+	    IPP_ENABLED(IPP_LOCAL_OUT) || ip_g_mrouter != NULL) {
+		UDP_STAT(udp_ip_send);
+		ip_output(connp, mp1, connp->conn_wq, IP_WPUT);
+	} else {
+		udp_send_data(udp, connp->conn_wq, mp1, ipha);
+	}
+
+done:
+	if (*error != 0) {
+		ASSERT(mp != NULL);
+		BUMP_MIB(&udp_mib, udpOutErrors);
+	}
+	return (mp);
+}
+
+static void
+udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha)
+{
+	conn_t	*connp = udp->udp_connp;
+	ipaddr_t src, dst;
+	ill_t	*ill;
+	ire_t	*ire;
+	ipif_t	*ipif = NULL;
+	mblk_t	*ire_fp_mp;
+	uint_t	ire_fp_mp_len;
+	uint16_t *up;
+	uint32_t cksum, hcksum_txflags;
+	queue_t	*dev_q;
+	boolean_t retry_caching;
+
+	dst = ipha->ipha_dst;
+	src = ipha->ipha_src;
+	ASSERT(ipha->ipha_ident == 0);
+
+	if (CLASSD(dst)) {
+		int err;
+
+		ipif = conn_get_held_ipif(connp,
+		    &connp->conn_multicast_ipif, &err);
+
+		if (ipif == NULL || ipif->ipif_isv6 ||
+		    (ipif->ipif_ill->ill_phyint->phyint_flags &
+		    PHYI_LOOPBACK)) {
+			if (ipif != NULL)
+				ipif_refrele(ipif);
+			UDP_STAT(udp_ip_send);
+			ip_output(connp, mp, q, IP_WPUT);
+			return;
+		}
+	}
+
+	retry_caching = B_FALSE;
+	mutex_enter(&connp->conn_lock);
+	ire = connp->conn_ire_cache;
+	ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT));
+
+	if (ire == NULL || ire->ire_addr != dst ||
+	    (ire->ire_marks & IRE_MARK_CONDEMNED)) {
+		retry_caching = B_TRUE;
+	} else if (CLASSD(dst) && (ire->ire_type & IRE_CACHE)) {
+		ill_t *stq_ill = (ill_t *)ire->ire_stq->q_ptr;
+
+		ASSERT(ipif != NULL);
+		if (stq_ill != ipif->ipif_ill && (stq_ill->ill_group == NULL ||
+		    stq_ill->ill_group != ipif->ipif_ill->ill_group))
+			retry_caching = B_TRUE;
+	}
+
+	if (!retry_caching) {
+		ASSERT(ire != NULL);
+		IRE_REFHOLD(ire);
+		mutex_exit(&connp->conn_lock);
+	} else {
+		boolean_t cached = B_FALSE;
+
+		connp->conn_ire_cache = NULL;
+		mutex_exit(&connp->conn_lock);
+
+		/* Release the old ire */
+		if (ire != NULL) {
+			IRE_REFRELE_NOTR(ire);
+			ire = NULL;
+		}
+
+		if (CLASSD(dst)) {
+			ASSERT(ipif != NULL);
+			ire = ire_ctable_lookup(dst, 0, 0, ipif,
+			    connp->conn_zoneid, MATCH_IRE_ILL_GROUP);
+		} else {
+			ASSERT(ipif == NULL);
+			ire = ire_cache_lookup(dst, connp->conn_zoneid);
+		}
+
+		if (ire == NULL) {
+			if (ipif != NULL)
+				ipif_refrele(ipif);
+			UDP_STAT(udp_ire_null);
+			ip_output(connp, mp, q, IP_WPUT);
+			return;
+		}
+		IRE_REFHOLD_NOTR(ire);
+
+		mutex_enter(&connp->conn_lock);
+		if (!(connp->conn_state_flags & CONN_CLOSING) &&
+		    connp->conn_ire_cache == NULL) {
+			rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
+			if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
+				connp->conn_ire_cache = ire;
+				cached = B_TRUE;
+			}
+			rw_exit(&ire->ire_bucket->irb_lock);
+		}
+		mutex_exit(&connp->conn_lock);
+
+		/*
+		 * We can continue to use the ire but since it was not
+		 * cached, we should drop the extra reference.
+		 */
+		if (!cached)
+			IRE_REFRELE_NOTR(ire);
+	}
+	ASSERT(ire != NULL && ire->ire_ipversion == IPV4_VERSION);
+	ASSERT(!CLASSD(dst) || ipif != NULL);
+
+	if ((ire->ire_type & (IRE_BROADCAST|IRE_LOCAL|IRE_LOOPBACK)) ||
+	    (ire->ire_flags & RTF_MULTIRT) || ire->ire_stq == NULL ||
+	    ire->ire_max_frag < ntohs(ipha->ipha_length) ||
+	    (ire_fp_mp = ire->ire_fp_mp) == NULL ||
+	    (ire_fp_mp_len = MBLKL(ire_fp_mp)) > MBLKHEAD(mp)) {
+		if (ipif != NULL)
+			ipif_refrele(ipif);
+		UDP_STAT(udp_ip_ire_send);
+		IRE_REFRELE(ire);
+		ip_output(connp, mp, q, IP_WPUT);
+		return;
+	}
+
+	BUMP_MIB(&ip_mib, ipOutRequests);
+
+	ill = ire_to_ill(ire);
+	ASSERT(ill != NULL);
+
+	dev_q = ire->ire_stq->q_next;
+	ASSERT(dev_q != NULL);
+	/*
+	 * If the service thread is already running, or if the driver
+	 * queue is currently flow-controlled, queue this packet.
+	 */
+	if ((q->q_first != NULL || connp->conn_draining) ||
+	    ((dev_q->q_next || dev_q->q_first) && !canput(dev_q))) {
+		if (ip_output_queue) {
+			(void) putq(q, mp);
+		} else {
+			BUMP_MIB(&ip_mib, ipOutDiscards);
+			freemsg(mp);
+		}
+		if (ipif != NULL)
+			ipif_refrele(ipif);
+		IRE_REFRELE(ire);
+		return;
+	}
+
+	ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1);
+#ifndef _BIG_ENDIAN
+	ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8);
+#endif
+
+	if (src == INADDR_ANY && !connp->conn_unspec_src) {
+		if (CLASSD(dst) && !(ire->ire_flags & RTF_SETSRC))
+			src = ipha->ipha_src = ipif->ipif_src_addr;
+		else
+			src = ipha->ipha_src = ire->ire_src_addr;
+	}
+
+	if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
+		ASSERT(ill->ill_hcksum_capab != NULL);
+		hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags;
+	} else {
+		hcksum_txflags = 0;
+	}
+
+	/* pseudo-header checksum (do it in parts for IP header checksum) */
+	cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
+
+	ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION);
+	up = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
+	if (*up != 0) {
+		IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags,
+		    mp, ipha, up, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH,
+		    ntohs(ipha->ipha_length), cksum);
+
+		/* Software checksum? */
+		if (DB_CKSUMFLAGS(mp) == 0) {
+			UDP_STAT(udp_out_sw_cksum);
+			UDP_STAT_UPDATE(udp_out_sw_cksum_bytes,
+			    ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH);
+		}
+	}
+
+	ipha->ipha_fragment_offset_and_flags |=
+	    (uint32_t)htons(ire->ire_frag_flag);
+
+	/* Calculate IP header checksum if hardware isn't capable */
+	if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
+		IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0],
+		    ((uint16_t *)ipha)[4]);
+	}
+
+	if (CLASSD(dst)) {
+		ilm_t *ilm;
+
+		ILM_WALKER_HOLD(ill);
+		ilm = ilm_lookup_ill(ill, dst, ALL_ZONES);
+		ILM_WALKER_RELE(ill);
+		if (ilm != NULL) {
+			ip_multicast_loopback(q, ill, mp,
+			    connp->conn_multicast_loop ? 0 :
+			    IP_FF_NO_MCAST_LOOP, connp->conn_zoneid);
+		}
+
+		/* If multicast TTL is 0 then we are done */
+		if (ipha->ipha_ttl == 0) {
+			if (ipif != NULL)
+				ipif_refrele(ipif);
+			freemsg(mp);
+			IRE_REFRELE(ire);
+			return;
+		}
+	}
+
+	ASSERT(DB_TYPE(ire_fp_mp) == M_DATA);
+	mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len;
+	bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len);
+
+	UPDATE_OB_PKT_COUNT(ire);
+	ire->ire_last_used_time = lbolt;
+
+	if (ILL_POLL_CAPABLE(ill)) {
+		/*
+		 * Send the packet directly to DLD, where it may be queued
+		 * depending on the availability of transmit resources at
+		 * the media layer.
+		 */
+		IP_POLL_ILL_TX(ill, mp);
+	} else {
+		putnext(ire->ire_stq, mp);
+	}
+
+	if (ipif != NULL)
+		ipif_refrele(ipif);
+	IRE_REFRELE(ire);
 }
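
[Editor's note] When the NIC cannot checksum the packet (DB_CKSUMFLAGS(mp) == 0 above), the sum seeded from the pseudo-header is finished in software. For readers new to the idiom, here is a self-contained RFC 1071-style sketch of the computation the fast path is arranging -- host-order arithmetic over big-endian 16-bit words; this is illustrative, not the kernel's IP_CKSUM_XMIT_FAST implementation:

	/* Ones-complement UDP checksum over the IPv4 pseudo-header + payload. */
	static uint16_t
	udp_cksum_sketch(const uint8_t *udp_hdr, size_t len,
	    uint32_t src, uint32_t dst)
	{
		uint32_t sum;

		/* Pseudo-header: src, dst, protocol, and UDP length (= len) */
		sum = (src >> 16) + (src & 0xffff) +
		    (dst >> 16) + (dst & 0xffff) +
		    IPPROTO_UDP + (uint32_t)len;

		/* Sum the UDP header and payload as big-endian 16-bit words */
		while (len > 1) {
			sum += (udp_hdr[0] << 8) | udp_hdr[1];
			udp_hdr += 2;
			len -= 2;
		}
		if (len == 1)
			sum += udp_hdr[0] << 8;	/* pad the odd final byte */

		/* Fold the carries back in and complement */
		while (sum >> 16)
			sum = (sum & 0xffff) + (sum >> 16);
		return ((uint16_t)~sum);
	}

(On the wire a computed checksum of zero is transmitted as 0xffff, since zero means "no checksum" for UDP over IPv4.)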
 
 /*
- * udp_wput_ipv6():
+ * This routine handles all messages passed downstream.  It either
+ * consumes the message or passes it downstream; it never queues
+ * a message.
+ */
+static void
+udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr, socklen_t addrlen)
+{
+	sin6_t		*sin6;
+	sin_t		*sin;
+	ipaddr_t	v4dst;
+	uint16_t	port;
+	uint_t		srcid;
+	queue_t		*q = connp->conn_wq;
+	udp_t		*udp = connp->conn_udp;
+	t_scalar_t	optlen;
+	int		error = 0;
+	struct sockaddr_storage ss;
+
+	TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_START,
+	    "udp_wput_start: connp %p mp %p", connp, mp);
+
+	 * We directly handle several cases here: T_UNITDATA_REQ messages
+	 * coming down as M_PROTO/M_PCPROTO, and M_DATA messages for both
+	 * connected and non-connected sockets.  In the latter case the
+	 * address structure is passed along when this routine gets called.
+	 * address structure along when this routine gets called.
+	 */
+	switch (DB_TYPE(mp)) {
+	case M_DATA:
+		if (!udp->udp_direct_sockfs || udp->udp_state != TS_DATA_XFER) {
+			if (!udp->udp_direct_sockfs ||
+			    addr == NULL || addrlen == 0) {
+				/* Not connected; address is required */
+				BUMP_MIB(&udp_mib, udpOutErrors);
+				UDP_STAT(udp_out_err_notconn);
+				freemsg(mp);
+				TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+				    "udp_wput_end: connp %p (%S)", connp,
+				    "not-connected; address required");
+				return;
+			}
+			ASSERT(udp->udp_issocket);
+			UDP_DBGSTAT(udp_data_notconn);
+			/* Not connected; do some more checks below */
+			optlen = 0;
+			break;
+		}
+		/* M_DATA for connected socket */
+		UDP_DBGSTAT(udp_data_conn);
+		IN6_V4MAPPED_TO_IPADDR(&udp->udp_v6dst, v4dst);
+
+		/* Initialize addr and addrlen as if they're passed in */
+		if (udp->udp_family == AF_INET) {
+			sin = (sin_t *)&ss;
+			sin->sin_family = AF_INET;
+			sin->sin_port = udp->udp_dstport;
+			sin->sin_addr.s_addr = v4dst;
+			addr = (struct sockaddr *)sin;
+			addrlen = sizeof (*sin);
+		} else {
+			sin6 = (sin6_t *)&ss;
+			sin6->sin6_family = AF_INET6;
+			sin6->sin6_port = udp->udp_dstport;
+			sin6->sin6_flowinfo = udp->udp_flowinfo;
+			sin6->sin6_addr = udp->udp_v6dst;
+			sin6->sin6_scope_id = 0;
+			sin6->__sin6_src_id = 0;
+			addr = (struct sockaddr *)sin6;
+			addrlen = sizeof (*sin6);
+		}
+
+		if (udp->udp_family == AF_INET ||
+		    IN6_IS_ADDR_V4MAPPED(&udp->udp_v6dst)) {
+			/*
+			 * Handle both AF_INET and AF_INET6; the latter
+			 * is for IPv4-mapped destination addresses.  Note
+			 * that addr and addrlen describe the corresponding
+			 * structure for the address family of the socket.
+			 */
+			mp = udp_output_v4(connp, mp, v4dst,
+			    udp->udp_dstport, 0, &error);
+		} else {
+			mp = udp_output_v6(connp, mp, sin6, 0, &error);
+		}
+		if (error != 0) {
+			ASSERT(addr != NULL && addrlen != 0);
+			goto ud_error;
+		}
+		return;
+	case M_PROTO:
+	case M_PCPROTO: {
+		struct T_unitdata_req *tudr;
+
+		ASSERT((uintptr_t)MBLKL(mp) <= (uintptr_t)INT_MAX);
+		tudr = (struct T_unitdata_req *)mp->b_rptr;
+
+		/* Handle valid T_UNITDATA_REQ here */
+		if (MBLKL(mp) >= sizeof (*tudr) &&
+		    ((t_primp_t)mp->b_rptr)->type == T_UNITDATA_REQ) {
+			if (mp->b_cont == NULL) {
+				TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+				    "udp_wput_end: q %p (%S)", q, "badaddr");
+				error = EPROTO;
+				goto ud_error;
+			}
+
+			if (!MBLKIN(mp, 0, tudr->DEST_offset +
+			    tudr->DEST_length)) {
+				TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+				    "udp_wput_end: q %p (%S)", q, "badaddr");
+				error = EADDRNOTAVAIL;
+				goto ud_error;
+			}
+			/*
+			 * If a port has not been bound to the stream, fail.
+			 * This is not a problem when sockfs is directly
+			 * above us, because it will ensure that the socket
+			 * is first bound before allowing data to be sent.
+			 */
+			if (udp->udp_state == TS_UNBND) {
+				TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+				    "udp_wput_end: q %p (%S)", q, "outstate");
+				error = EPROTO;
+				goto ud_error;
+			}
+			addr = (struct sockaddr *)
+			    &mp->b_rptr[tudr->DEST_offset];
+			addrlen = tudr->DEST_length;
+			optlen = tudr->OPT_length;
+			if (optlen != 0)
+				UDP_STAT(udp_out_opt);
+			break;
+		}
+		/* FALLTHRU */
+	}
+	default:
+		udp_become_writer(connp, mp, udp_wput_other_wrapper,
+		    SQTAG_UDP_OUTPUT);
+		return;
+	}
+	ASSERT(addr != NULL);
+
+	switch (udp->udp_family) {
+	case AF_INET6:
+		sin6 = (sin6_t *)addr;
+		if (!OK_32PTR((char *)sin6) || addrlen != sizeof (sin6_t) ||
+		    sin6->sin6_family != AF_INET6) {
+			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+			    "udp_wput_end: q %p (%S)", q, "badaddr");
+			error = EADDRNOTAVAIL;
+			goto ud_error;
+		}
+
+		if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+			/*
+			 * Destination is a non-IPv4-compatible IPv6 address.
+			 * Send out an IPv6 format packet.
+			 */
+			mp = udp_output_v6(connp, mp, sin6, optlen, &error);
+			if (error != 0)
+				goto ud_error;
+
+			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+			    "udp_wput_end: q %p (%S)", q, "udp_output_v6");
+			return;
+		}
+		/*
+		 * If the local address is not zero or a mapped address,
+		 * return an error.  It would be possible to send an IPv4
+		 * packet but the response would never make it back to the
+		 * application since it is bound to a non-mapped address.
+		 */
+		if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src) &&
+		    !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) {
+			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+			    "udp_wput_end: q %p (%S)", q, "badaddr");
+			error = EADDRNOTAVAIL;
+			goto ud_error;
+		}
+		/* Send IPv4 packet without modifying udp_ipversion */
+		/* Extract port and ipaddr */
+		port = sin6->sin6_port;
+		IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4dst);
+		srcid = sin6->__sin6_src_id;
+		break;
+
+	case AF_INET:
+		sin = (sin_t *)addr;
+		if (!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t) ||
+		    sin->sin_family != AF_INET) {
+			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+			    "udp_wput_end: q %p (%S)", q, "badaddr");
+			error = EADDRNOTAVAIL;
+			goto ud_error;
+		}
+		/* Extract port and ipaddr */
+		port = sin->sin_port;
+		v4dst = sin->sin_addr.s_addr;
+		srcid = 0;
+		break;
+	}
+
+	/*
+	 * If options passed in, feed it for verification and handling
+	 */
+	if (optlen != 0) {
+		ASSERT(DB_TYPE(mp) != M_DATA);
+		if (udp_unitdata_opt_process(q, mp, &error, NULL) < 0) {
+			/* failure */
+			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+			    "udp_wput_end: q %p (%S)", q,
+			    "udp_unitdata_opt_process");
+			goto ud_error;
+		}
+		/*
+		 * Note: success in processing options.
+		 * mp option buffer represented by
+		 * OPT_length/offset now potentially modified
+		 * and contain option setting results
+		 */
+	}
+	ASSERT(error == 0);
+	mp = udp_output_v4(connp, mp, v4dst, port, srcid, &error);
+	if (error != 0) {
+ud_error:
+		UDP_STAT(udp_out_err_output);
+		ASSERT(mp != NULL);
+		/* mp is freed by the following routine */
+		udp_ud_err(q, mp, (uchar_t *)addr, (t_scalar_t)addrlen,
+		    (t_scalar_t)error);
+	}
+}
+
+/* ARGSUSED */
+static void
+udp_output_wrapper(void *arg, mblk_t *mp, void *arg2)
+{
+	udp_output((conn_t *)arg, mp, NULL, 0);
+	_UDP_EXIT((conn_t *)arg);
+}
+
+static void
+udp_wput(queue_t *q, mblk_t *mp)
+{
+	_UDP_ENTER(Q_TO_CONN(UDP_WR(q)), mp, udp_output_wrapper,
+	    SQTAG_UDP_WPUT);
+}
+
+/*
+ * Allocate and prepare a T_UNITDATA_REQ message.
+ */
+static mblk_t *
+udp_tudr_alloc(struct sockaddr *addr, socklen_t addrlen)
+{
+	struct T_unitdata_req *tudr;
+	mblk_t *mp;
+
+	mp = allocb(sizeof (*tudr) + addrlen, BPRI_MED);
+	if (mp != NULL) {
+		mp->b_wptr += sizeof (*tudr) + addrlen;
+		DB_TYPE(mp) = M_PROTO;
+
+		tudr = (struct T_unitdata_req *)mp->b_rptr;
+		tudr->PRIM_type = T_UNITDATA_REQ;
+		tudr->DEST_length = addrlen;
+		tudr->DEST_offset = (t_scalar_t)sizeof (*tudr);
+		tudr->OPT_length = 0;
+		tudr->OPT_offset = 0;
+		bcopy(addr, tudr+1, addrlen);
+	}
+	return (mp);
+}
+
+/*
+ * Entry point for sockfs when udp is in "direct sockfs" mode.  This mode
+ * is valid when we are directly beneath the stream head, and thus sockfs
+ * is able to bypass STREAMS and directly call us, passing along the sockaddr
+ * structure without the cumbersome T_UNITDATA_REQ interface.  Note that
+ * this is done for both connected and non-connected endpoints.
+ */
+void
+udp_wput_data(queue_t *q, mblk_t *mp, struct sockaddr *addr, socklen_t addrlen)
+{
+	conn_t	*connp;
+	udp_t	*udp;
+
+	q = UDP_WR(q);
+	connp = Q_TO_CONN(q);
+	udp = connp->conn_udp;
+
+	/* udpsockfs should only send down M_DATA for this entry point */
+	ASSERT(DB_TYPE(mp) == M_DATA);
+
+	mutex_enter(&connp->conn_lock);
+	UDP_MODE_ASSERTIONS(udp, UDP_ENTER);
+
+	if (udp->udp_mode != UDP_MT_HOT) {
+		/*
+		 * We can't enter this conn right away because another
+		 * thread is currently executing as writer; therefore we
+		 * need to deposit the message into the squeue to be
+		 * drained later.  If a socket address is present, we
+		 * need to create a T_UNITDATA_REQ message as a placeholder.
+		 */
+		if (addr != NULL && addrlen != 0) {
+			mblk_t *tudr_mp = udp_tudr_alloc(addr, addrlen);
+
+			if (tudr_mp == NULL) {
+				mutex_exit(&connp->conn_lock);
+				BUMP_MIB(&udp_mib, udpOutErrors);
+				UDP_STAT(udp_out_err_tudr);
+				freemsg(mp);
+				return;
+			}
+			/* Tag the packet with T_UNITDATA_REQ */
+			tudr_mp->b_cont = mp;
+			mp = tudr_mp;
+		}
+		mutex_exit(&connp->conn_lock);
+		udp_enter(connp, mp, udp_output_wrapper, SQTAG_UDP_WPUT);
+		return;
+	}
+
+	/* We can execute as reader right away. */
+	UDP_READERS_INCREF(udp);
+	mutex_exit(&connp->conn_lock);
+
+	udp_output(connp, mp, addr, addrlen);
+
+	mutex_enter(&connp->conn_lock);
+	UDP_MODE_ASSERTIONS(udp, UDP_EXIT);
+	UDP_READERS_DECREF(udp);
+	mutex_exit(&connp->conn_lock);
+}
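The UDP_READERS_INCREF()/UDP_READERS_DECREF() pair used above is defined
elsewhere in this changeset; a minimal sketch of the accounting it is
assumed to perform on udp_reader_count (hypothetical reconstruction, not
the actual definitions):

	/* Both assume conn_lock is held, matching the callers above. */
	#define	UDP_READERS_INCREF(udp) {				\
		ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock));	\
		(udp)->udp_reader_count++;				\
	}

	#define	UDP_READERS_DECREF(udp) {				\
		ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock));	\
		ASSERT((udp)->udp_reader_count > 0);			\
		(udp)->udp_reader_count--;				\
	}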
+
+/*
+ * udp_output_v6():
  * Assumes that udp_wput did some sanity checking on the destination
  * address.
  */
-static void
-udp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen)
+static mblk_t *
+udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen,
+    int *error)
 {
-	ip6_t			*ip6h;
-	ip6i_t			*ip6i;	/* mp1->b_rptr even if no ip6i_t */
-	mblk_t			*mp1;
-	int			udp_ip_hdr_len = IPV6_HDR_LEN + UDPH_SIZE;
-	size_t			ip_len;
-	udpha_t			*udph;
-	udp_t			*udp;
-	ip6_pkt_t		ipp_s;	/* For ancillary data options */
-	ip6_pkt_t		*ipp = &ipp_s;
-	ip6_pkt_t		*tipp;	/* temporary ipp */
-	uint32_t		csum = 0;
-	uint_t			ignore = 0;
-	uint_t			option_exists = 0, is_sticky = 0;
-	uint8_t			*cp;
-	uint8_t			*nxthdr_ptr;
-
-	udp = (udp_t *)q->q_ptr;
+	ip6_t		*ip6h;
+	ip6i_t		*ip6i;	/* mp1->b_rptr even if no ip6i_t */
+	mblk_t		*mp1 = (DB_TYPE(mp) == M_DATA ? mp : mp->b_cont);
+	mblk_t		*mp2;
+	int		udp_ip_hdr_len = IPV6_HDR_LEN + UDPH_SIZE;
+	size_t		ip_len;
+	udpha_t		*udph;
+	udp_t		*udp = connp->conn_udp;
+	queue_t		*q = connp->conn_wq;
+	ip6_pkt_t	ipp_s;	/* For ancillary data options */
+	ip6_pkt_t	*ipp = &ipp_s;
+	ip6_pkt_t	*tipp;	/* temporary ipp */
+	uint32_t	csum = 0;
+	uint_t		ignore = 0;
+	uint_t		option_exists = 0, is_sticky = 0;
+	uint8_t		*cp;
+	uint8_t		*nxthdr_ptr;
+
+	*error = 0;
+
+	/* mp1 points to the M_DATA mblk carrying the packet */
+	ASSERT(mp1 != NULL && DB_TYPE(mp1) == M_DATA);
+	ASSERT(tudr_optlen == 0 || DB_TYPE(mp) != M_DATA);
 
 	/*
 	 * If the local address is a mapped address return
@@ -5354,9 +6557,8 @@
 	 * since it is bound to a mapped address.
 	 */
 	if (IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src)) {
-		BUMP_MIB(&udp_mib, udpOutErrors);
-		udp_ud_err(q, mp, EADDRNOTAVAIL);
-		return;
+		*error = EADDRNOTAVAIL;
+		goto done;
 	}
 
 	ipp->ipp_fields = 0;
@@ -5366,17 +6568,12 @@
 	 * If TPI options passed in, feed it for verification and handling
 	 */
 	if (tudr_optlen != 0) {
-		int 		error;
-
-		if (udp_unitdata_opt_process(q, mp, &error,
-		    (void *)ipp) < 0) {
+		if (udp_unitdata_opt_process(q, mp, error, (void *)ipp) < 0) {
 			/* failure */
-			BUMP_MIB(&udp_mib, udpOutErrors);
-			udp_ud_err(q, mp, error);
-			return;
+			goto done;
 		}
 		ignore = ipp->ipp_sticky_ignored;
-		ASSERT(error == 0);
+		ASSERT(*error == 0);
 	}
 
 	if (sin6->sin6_scope_id != 0 &&
@@ -5389,8 +6586,7 @@
 		option_exists |= IPPF_SCOPE_ID;
 	}
 
-	if ((udp->udp_sticky_ipp.ipp_fields == 0) &&
-	    (ipp->ipp_fields == 0)) {
+	if ((udp->udp_sticky_ipp.ipp_fields == 0) && (ipp->ipp_fields == 0)) {
 		/* No sticky options nor ancillary data. */
 		goto no_options;
 	}
@@ -5475,7 +6671,8 @@
 	if (!(ignore & IPPF_USE_MIN_MTU)) {
 		if (ipp->ipp_fields & IPPF_USE_MIN_MTU) {
 			option_exists |= IPPF_USE_MIN_MTU;
-		} else if (udp->udp_sticky_ipp.ipp_fields & IPPF_USE_MIN_MTU) {
+		} else if (udp->udp_sticky_ipp.ipp_fields &
+		    IPPF_USE_MIN_MTU) {
 			option_exists |= IPPF_USE_MIN_MTU;
 			is_sticky |= IPPF_USE_MIN_MTU;
 		}
@@ -5518,26 +6715,28 @@
 		udp_ip_hdr_len += sizeof (ip6i_t);
 
 	/* check/fix buffer config, setup pointers into it */
-	mp1 = mp->b_cont;
 	ip6h = (ip6_t *)&mp1->b_rptr[-udp_ip_hdr_len];
-	if ((mp1->b_datap->db_ref != 1) ||
-	    ((unsigned char *)ip6h < mp1->b_datap->db_base) ||
+	if (DB_REF(mp1) != 1 || ((unsigned char *)ip6h < DB_BASE(mp1)) ||
 	    !OK_32PTR(ip6h)) {
 		/* Try to get everything in a single mblk next time */
 		if (udp_ip_hdr_len > udp->udp_max_hdr_len) {
 			udp->udp_max_hdr_len = udp_ip_hdr_len;
-			(void) mi_set_sth_wroff(RD(q),
+			(void) mi_set_sth_wroff(UDP_RD(q),
 			    udp->udp_max_hdr_len + udp_wroff_extra);
 		}
-		mp1 = allocb(udp_ip_hdr_len + udp_wroff_extra, BPRI_LO);
-		if (!mp1) {
-			BUMP_MIB(&udp_mib, udpOutErrors);
-			udp_ud_err(q, mp, ENOMEM);
-			return;
-		}
-		mp1->b_cont = mp->b_cont;
-		mp->b_cont = mp1;
-		mp1->b_wptr = mp1->b_datap->db_lim;
+		mp2 = allocb(udp_ip_hdr_len + udp_wroff_extra, BPRI_LO);
+		if (mp2 == NULL) {
+			*error = ENOMEM;
+			goto done;
+		}
+		mp2->b_wptr = DB_LIM(mp2);
+		mp2->b_cont = mp1;
+		mp1 = mp2;
+		if (DB_TYPE(mp) != M_DATA)
+			mp->b_cont = mp1;
+		else
+			mp = mp1;
+
 		ip6h = (ip6_t *)(mp1->b_wptr - udp_ip_hdr_len);
 	}
 	mp1->b_rptr = (unsigned char *)ip6h;
@@ -5624,7 +6823,7 @@
 		if (sin6->__sin6_src_id != 0 &&
 		    IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
 			ip_srcid_find_id(sin6->__sin6_src_id,
-			    &ip6h->ip6_src, udp->udp_zoneid);
+			    &ip6h->ip6_src, connp->conn_zoneid);
 		}
 	}
 
@@ -5731,9 +6930,8 @@
 				 * Drop packet - only support Type 0 routing.
 				 * Notify the application as well.
 				 */
-				udp_ud_err(q, mp, EPROTO);
-				BUMP_MIB(&udp_mib, udpOutErrors);
-				return;
+				*error = EPROTO;
+				goto done;
 			}
 
 			/*
@@ -5741,9 +6939,8 @@
 			 * addresses in the header. Thus it must be even.
 			 */
 			if (rth->ip6r_len & 0x1) {
-				udp_ud_err(q, mp, EPROTO);
-				BUMP_MIB(&udp_mib, udpOutErrors);
-				return;
+				*error = EPROTO;
+				goto done;
 			}
 			/*
 			 * Shuffle the routing header and ip6_dst
@@ -5758,9 +6955,8 @@
 			 * for subsequent hops.
 			 */
 			if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) {
-				udp_ud_err(q, mp, EADDRNOTAVAIL);
-				BUMP_MIB(&udp_mib, udpOutErrors);
-				return;
+				*error = EADDRNOTAVAIL;
+				goto done;
 			}
 
 			cp += (rth->ip6r_len + 1)*8;
@@ -5769,14 +6965,11 @@
 
 	/* count up length of UDP packet */
 	ip_len = (mp1->b_wptr - (unsigned char *)ip6h) - IPV6_HDR_LEN;
-	{
-		mblk_t *mp2;
-
-		if ((mp2 = mp1->b_cont) != NULL) {
-			do {
-				ip_len += mp2->b_wptr - mp2->b_rptr;
-			} while ((mp2 = mp2->b_cont) != NULL);
-		}
+	if ((mp2 = mp1->b_cont) != NULL) {
+		do {
+			ASSERT((uintptr_t)MBLKL(mp2) <= (uintptr_t)UINT_MAX);
+			ip_len += (uint32_t)MBLKL(mp2);
+		} while ((mp2 = mp2->b_cont) != NULL);
 	}
 
 	/*
@@ -5785,9 +6978,8 @@
 	 * the size will have wrapped and be inconsistent with the msg size.
 	 */
 	if (ip_len > IP_MAXPACKET) {
-		BUMP_MIB(&udp_mib, udpOutErrors);
-		udp_ud_err(q, mp, EMSGSIZE);
-		return;
+		*error = EMSGSIZE;
+		goto done;
 	}
 
 	/* Store the UDP length. Subtract length of extension hdrs */
@@ -5810,11 +7002,25 @@
 #endif
 	ip6h->ip6_plen = ip_len;
 
-	freeb(mp);
+	if (DB_TYPE(mp) != M_DATA) {
+		ASSERT(mp != mp1);
+		freeb(mp);
+	}
+
+	/* mp has been consumed and we'll return success */
+	ASSERT(*error == 0);
+	mp = NULL;
 
 	/* We're done. Pass the packet to IP */
 	BUMP_MIB(&udp_mib, udpOutDatagrams);
-	putnext(q, mp1);
+	ip_output_v6(connp, mp1, q, IP_WPUT);
+
+done:
+	if (*error != 0) {
+		ASSERT(mp != NULL);
+		BUMP_MIB(&udp_mib, udpOutErrors);
+	}
+	return (mp);
 }
 
 static void
@@ -5823,26 +7029,18 @@
 	uchar_t	*rptr = mp->b_rptr;
 	struct datab *db;
 	struct iocblk *iocp;
-	udp_t	*udp;
 	cred_t	*cr;
+	conn_t	*connp = Q_TO_CONN(q);
+	udp_t	*udp = connp->conn_udp;
 
 	TRACE_1(TR_FAC_UDP, TR_UDP_WPUT_OTHER_START,
 		"udp_wput_other_start: q %p", q);
 
-	udp = (udp_t *)q->q_ptr;
 	db = mp->b_datap;
 
-	cr = DB_CREDDEF(mp, udp->udp_credp);
+	cr = DB_CREDDEF(mp, connp->conn_cred);
 
 	switch (db->db_type) {
-	case M_DATA:
-		/* Not connected */
-		BUMP_MIB(&udp_mib, udpOutErrors);
-		freemsg(mp);
-		TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
-			"udp_wput_other_end: q %p (%S)",
-			q, "not-connected");
-		return;
 	case M_PROTO:
 	case M_PCPROTO:
 		if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
@@ -5852,7 +7050,7 @@
 				q, "protoshort");
 			return;
 		}
-		switch (((union T_primitives *)rptr)->type) {
+		switch (((t_primp_t)rptr)->type) {
 		case T_ADDR_REQ:
 			udp_addr_req(q, mp);
 			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
@@ -5885,7 +7083,7 @@
 			 * be bad.  Valid T_UNITDATA_REQs are handled
 			 * in udp_wput.
 			 */
-			udp_ud_err(q, mp, EADDRNOTAVAIL);
+			udp_ud_err(q, mp, NULL, 0, EADDRNOTAVAIL);
 			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
 				"udp_wput_other_end: q %p (%S)",
 				q, "unitdatareq");
@@ -5897,14 +7095,26 @@
 			return;
 		case T_SVR4_OPTMGMT_REQ:
 			if (!snmpcom_req(q, mp, udp_snmp_set, udp_snmp_get, cr))
-				(void) svr4_optcom_req(q, mp, cr, &udp_opt_obj);
+				/*
+				 * Use upper queue for option processing in
+				 * case the request is not handled at this
+				 * level and needs to be passed down to IP.
+				 */
+				(void) svr4_optcom_req(_WR(UDP_RD(q)),
+				    mp, cr, &udp_opt_obj);
 			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
 			    "udp_wput_other_end: q %p (%S)",
 			    q, "optmgmtreq");
 			return;
 
 		case T_OPTMGMT_REQ:
-			(void) tpi_optcom_req(q, mp, cr, &udp_opt_obj);
+			/*
+			 * Use upper queue for option processing in
+			 * case the request is not handled at this
+			 * level and needs to be passed down to IP.
+			 */
+			(void) tpi_optcom_req(_WR(UDP_RD(q)),
+			    mp, cr, &udp_opt_obj);
 			TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
 				"udp_wput_other_end: q %p (%S)",
 				q, "optmgmtreq");
@@ -5954,10 +7164,9 @@
 				 * don't know the peer's name.
 				 */
 				iocp->ioc_error = ENOTCONN;
-err_ret:;
 				iocp->ioc_count = 0;
 				mp->b_datap->db_type = M_IOCACK;
-				qreply(q, mp);
+				putnext(UDP_RD(q), mp);
 				TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
 					"udp_wput_other_end: q %p (%S)",
 					q, "getpeername");
@@ -5982,13 +7191,45 @@
 			/* nd_getset performs the necessary checking */
 		case ND_GET:
 			if (nd_getset(q, udp_g_nd, mp)) {
-				qreply(q, mp);
+				putnext(UDP_RD(q), mp);
 				TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
 					"udp_wput_other_end: q %p (%S)",
 					q, "get");
 				return;
 			}
 			break;
+		case _SIOCSOCKFALLBACK:
+			/*
+			 * Either sockmod is about to be popped and the
+			 * socket would now be treated as a plain stream,
+			 * or a module is about to be pushed, so we can
+			 * no longer use the read-side synchronous stream.
+			 * Drain any queued data and disable the direct
+			 * sockfs interface from now on.
+			 */
+			if (!udp->udp_issocket) {
+				DB_TYPE(mp) = M_IOCNAK;
+				iocp->ioc_error = EINVAL;
+			} else {
+				udp->udp_issocket = B_FALSE;
+				if (udp->udp_direct_sockfs) {
+					/*
+					 * Disable read-side synchronous
+					 * stream interface and drain any
+					 * queued data.
+					 */
+					udp_rcv_drain(UDP_RD(q), udp,
+					    B_FALSE);
+					ASSERT(!udp->udp_direct_sockfs);
+					UDP_STAT(udp_sock_fallback);
+				}
+				DB_TYPE(mp) = M_IOCACK;
+				iocp->ioc_error = 0;
+			}
+			iocp->ioc_count = 0;
+			iocp->ioc_rval = 0;
+			putnext(UDP_RD(q), mp);
+			return;
 		default:
 			break;
 		}
@@ -6004,7 +7245,15 @@
 	}
 	TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
 		"udp_wput_other_end: q %p (%S)", q, "end");
-	putnext(q, mp);
+	ip_output(connp, mp, q, IP_WPUT);
+}
+
+/* ARGSUSED */
+static void
+udp_wput_other_wrapper(void *arg, mblk_t *mp, void *arg2)
+{
+	udp_wput_other(((conn_t *)arg)->conn_wq, mp);
+	udp_exit((conn_t *)arg);
 }
 
 /*
@@ -6017,11 +7266,11 @@
 	mblk_t	*mp1;
 	STRUCT_HANDLE(strbuf, sb);
 	uint16_t port;
-	udp_t	*udp;
 	in6_addr_t	v6addr;
 	ipaddr_t	v4addr;
 	uint32_t	flowinfo = 0;
 	int		addrlen;
+	udp_t		*udp = Q_TO_UDP(q);
 
 	/* Make sure it is one of ours. */
 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
@@ -6029,9 +7278,11 @@
 	case TI_GETPEERNAME:
 		break;
 	default:
-		putnext(q, mp);
+		ip_output(Q_TO_CONN(q), mp, q, IP_WPUT);
 		return;
 	}
+
+	q = WR(UDP_RD(q));
 	switch (mi_copy_state(q, mp, &mp1)) {
 	case -1:
 		return;
@@ -6068,7 +7319,6 @@
 	 */
 	STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
 	    (void *)mp1->b_rptr);
-	udp = (udp_t *)q->q_ptr;
 	if (udp->udp_family == AF_INET)
 		addrlen = sizeof (sin_t);
 	else
@@ -6113,6 +7363,10 @@
 		port = udp->udp_port;
 		break;
 	case TI_GETPEERNAME:
+		if (udp->udp_state != TS_DATA_XFER) {
+			mi_copy_done(q, mp, ENOTCONN);
+			return;
+		}
 		if (udp->udp_family == AF_INET) {
 			ASSERT(udp->udp_ipversion == IPV4_VERSION);
 			v4addr = V4_PART_OF_V6(udp->udp_v6dst);
@@ -6163,21 +7417,23 @@
 udp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp,
     void *thisdg_attrs)
 {
-	udp_t	*udp;
 	struct T_unitdata_req *udreqp;
 	int is_absreq_failure;
 	cred_t *cr;
-
-	ASSERT(((union T_primitives *)mp->b_rptr)->type);
-
-	udp = (udp_t *)q->q_ptr;
-
-	cr = DB_CREDDEF(mp, udp->udp_credp);
+	conn_t	*connp = Q_TO_CONN(q);
+
+	ASSERT(((t_primp_t)mp->b_rptr)->type);
+
+	cr = DB_CREDDEF(mp, connp->conn_cred);
 
 	udreqp = (struct T_unitdata_req *)mp->b_rptr;
 	*errorp = 0;
 
-	*errorp = tpi_optcom_buf(q, mp, &udreqp->OPT_length,
+	/*
+	 * Use upper queue for option processing since the callback
+	 * routines expect to be called on the UDP instance instead of IP.
+	 */
+	*errorp = tpi_optcom_buf(_WR(UDP_RD(q)), mp, &udreqp->OPT_length,
 	    udreqp->OPT_offset, cr, &udp_opt_obj,
 	    thisdg_attrs, &is_absreq_failure);
 
@@ -6198,7 +7454,6 @@
 	int i;
 
 	UDP6_MAJ = ddi_name_to_major(UDP6);
-	mutex_init(&udp_g_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	udp_max_optsize = optcom_max_optsize(udp_opt_obj.odb_opt_des_arr,
 	    udp_opt_obj.odb_opt_arr_cnt);
@@ -6218,7 +7473,11 @@
 		    NULL);
 	}
 	(void) udp_param_register(udp_param_arr, A_CNT(udp_param_arr));
+
 	udp_kstat_init();
+
+	udp_cache = kmem_cache_create("udp_cache", sizeof (udp_t),
+	    CACHE_ALIGN_SIZE, NULL, NULL, NULL, NULL, NULL, 0);
 }
 
 void
@@ -6228,14 +7487,16 @@
 
 	nd_free(&udp_g_nd);
 
-	mutex_destroy(&udp_g_lock);
 	for (i = 0; i < udp_bind_fanout_size; i++) {
 		mutex_destroy(&udp_bind_fanout[i].uf_lock);
 	}
+
 	kmem_free(udp_bind_fanout, udp_bind_fanout_size *
 	    sizeof (udp_fanout_t));
+
 	udp_kstat_fini();
 
+	kmem_cache_destroy(udp_cache);
 }
 
 static void
@@ -6250,9 +7511,9 @@
 		{ "outErrors",		KSTAT_DATA_UINT32, 0 },
 	};
 
-	udp_mibkp = kstat_create("udp", 0, "udp", "mib2", KSTAT_TYPE_NAMED,
-					NUM_OF_FIELDS(udp_named_kstat_t),
-					0);
+	udp_mibkp = kstat_create(UDP_MOD_NAME, 0, UDP_MOD_NAME,
+	    "mib2", KSTAT_TYPE_NAMED, NUM_OF_FIELDS(udp_named_kstat_t), 0);
+
 	if (udp_mibkp == NULL)
 		return;
 
@@ -6264,12 +7525,24 @@
 	udp_mibkp->ks_update = udp_kstat_update;
 
 	kstat_install(udp_mibkp);
+
+	if ((udp_ksp = kstat_create(UDP_MOD_NAME, 0, "udpstat",
+	    "net", KSTAT_TYPE_NAMED,
+	    sizeof (udp_statistics) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL)) != NULL) {
+		udp_ksp->ks_data = &udp_statistics;
+		kstat_install(udp_ksp);
+	}
 }
 
 static void
 udp_kstat_fini(void)
 {
-	if (udp_mibkp) {
+	if (udp_ksp != NULL) {
+		kstat_delete(udp_ksp);
+		udp_ksp = NULL;
+	}
+	if (udp_mibkp != NULL) {
 		kstat_delete(udp_mibkp);
 		udp_mibkp = NULL;
 	}
@@ -6296,6 +7569,269 @@
 	return (0);
 }
 
+/* ARGSUSED */
+static void
+udp_rput(queue_t *q, mblk_t *mp)
+{
+	/*
+	 * We get here whenever we do qreply() from IP,
+	 * i.e. as part of handling ioctls, etc.
+	 */
+	putnext(q, mp);
+}
+
+/*
+ * Read-side synchronous stream info entry point, called as a
+ * result of handling certain STREAMS ioctl operations.
+ */
+static int
+udp_rinfop(queue_t *q, infod_t *dp)
+{
+	mblk_t	*mp;
+	uint_t	cmd = dp->d_cmd;
+	int	res = 0;
+	int	error = 0;
+	udp_t	*udp = Q_TO_UDP(RD(UDP_WR(q)));
+	struct stdata *stp = STREAM(q);
+
+	mutex_enter(&udp->udp_drain_lock);
+	/* If shutdown on read has happened, return nothing */
+	mutex_enter(&stp->sd_lock);
+	if (stp->sd_flag & STREOF) {
+		mutex_exit(&stp->sd_lock);
+		goto done;
+	}
+	mutex_exit(&stp->sd_lock);
+
+	if ((mp = udp->udp_rcv_list_head) == NULL)
+		goto done;
+
+	ASSERT(DB_TYPE(mp) != M_DATA && mp->b_cont != NULL);
+
+	if (cmd & INFOD_COUNT) {
+		/*
+		 * Return the number of messages.
+		 */
+		dp->d_count += udp->udp_rcv_msgcnt;
+		res |= INFOD_COUNT;
+	}
+	if (cmd & INFOD_BYTES) {
+		/*
+		 * Return size of all data messages.
+		 */
+		dp->d_bytes += udp->udp_rcv_cnt;
+		res |= INFOD_BYTES;
+	}
+	if (cmd & INFOD_FIRSTBYTES) {
+		/*
+		 * Return size of first data message.
+		 */
+		dp->d_bytes = msgdsize(mp);
+		res |= INFOD_FIRSTBYTES;
+		dp->d_cmd &= ~INFOD_FIRSTBYTES;
+	}
+	if (cmd & INFOD_COPYOUT) {
+		mblk_t *mp1 = mp->b_cont;
+		int n;
+		/*
+		 * Return data contents of first message.
+		 */
+		ASSERT(DB_TYPE(mp1) == M_DATA);
+		while (mp1 != NULL && dp->d_uiop->uio_resid > 0) {
+			n = MIN(dp->d_uiop->uio_resid, MBLKL(mp1));
+			if (n != 0 && (error = uiomove((char *)mp1->b_rptr, n,
+			    UIO_READ, dp->d_uiop)) != 0) {
+				goto done;
+			}
+			mp1 = mp1->b_cont;
+		}
+		res |= INFOD_COPYOUT;
+		dp->d_cmd &= ~INFOD_COPYOUT;
+	}
+done:
+	mutex_exit(&udp->udp_drain_lock);
+
+	dp->d_res |= res;
+
+	return (error);
+}
+
+/*
+ * Read-side synchronous stream entry point.  This is called as a result
+ * of a recv/read operation done at sockfs, and is guaranteed to execute
+ * outside of the interrupt thread context.  It returns a single datagram
+ * (b_cont chain of T_UNITDATA_IND plus data) to the upper layer.
+ */
+static int
+udp_rrw(queue_t *q, struiod_t *dp)
+{
+	mblk_t	*mp;
+	udp_t	*udp = Q_TO_UDP(_RD(UDP_WR(q)));
+
+	/* We should never get here when we're in SNMP mode */
+	ASSERT(!(udp->udp_connp->conn_flags & IPCL_UDPMOD));
+
+	/*
+	 * Dequeue datagram from the head of the list and return
+	 * it to caller; also ensure that RSLEEP sd_wakeq flag is
+	 * set/cleared depending on whether or not there's data
+	 * remaining in the list.
+	 */
+	mutex_enter(&udp->udp_drain_lock);
+	if (!udp->udp_direct_sockfs) {
+		mutex_exit(&udp->udp_drain_lock);
+		UDP_STAT(udp_rrw_busy);
+		return (EBUSY);
+	}
+	if ((mp = udp->udp_rcv_list_head) != NULL) {
+		uint_t size = msgdsize(mp);
+
+		/* Last datagram in the list? */
+		if ((udp->udp_rcv_list_head = mp->b_next) == NULL)
+			udp->udp_rcv_list_tail = NULL;
+		mp->b_next = NULL;
+
+		udp->udp_rcv_cnt -= size;
+		udp->udp_rcv_msgcnt--;
+		UDP_STAT(udp_rrw_msgcnt);
+
+		/* No longer flow-controlling? */
+		if (udp->udp_rcv_cnt < udp->udp_rcv_hiwat &&
+		    udp->udp_rcv_msgcnt < udp->udp_rcv_hiwat)
+			udp->udp_drain_qfull = B_FALSE;
+	}
+	if (udp->udp_rcv_list_head == NULL) {
+		/*
+		 * Either we just dequeued the last datagram or
+		 * we get here from sockfs and have nothing to
+		 * return; in this case clear RSLEEP.
+		 */
+		ASSERT(udp->udp_rcv_cnt == 0);
+		ASSERT(udp->udp_rcv_msgcnt == 0);
+		ASSERT(udp->udp_rcv_list_tail == NULL);
+		STR_WAKEUP_CLEAR(STREAM(q));
+	} else {
+		/*
+		 * More data follows; we need udp_rrw() to be
+		 * called in future to pick up the rest.
+		 */
+		STR_WAKEUP_SET(STREAM(q));
+	}
+	mutex_exit(&udp->udp_drain_lock);
+	dp->d_mp = mp;
+	return (0);
+}
+
+/*
+ * Enqueue a completely built T_UNITDATA_IND message into the receive
+ * list; this is typically executed within the interrupt thread context
+ * and so we do things as quickly as possible.
+ */
+static void
+udp_rcv_enqueue(queue_t *q, udp_t *udp, mblk_t *mp, uint_t pkt_len)
+{
+	ASSERT(q == RD(q));
+	ASSERT(pkt_len == msgdsize(mp));
+	ASSERT(mp->b_next == NULL && mp->b_cont != NULL);
+	ASSERT(DB_TYPE(mp) == M_PROTO && DB_TYPE(mp->b_cont) == M_DATA);
+	ASSERT(MBLKL(mp) >= sizeof (struct T_unitdata_ind));
+
+	mutex_enter(&udp->udp_drain_lock);
+	/*
+	 * Wake up and signal the receiving app; it is okay to do this
+	 * before enqueueing the mp because we are holding the drain lock.
+	 * One of the advantages of synchronous stream is the ability for
+	 * us to find out when the application performs a read on the
+	 * socket by way of udp_rrw() entry point being called.  We need
+	 * to generate SIGPOLL/SIGIO for each received data in the case
+	 * of asynchronous socket just as in the strrput() case.  However,
+	 * we only wake the application up when necessary, i.e. during the
+	 * first enqueue.  When udp_rrw() is called, we send up a single
+	 * datagram upstream and call STR_WAKEUP_SET() again when there
+	 * are still data remaining in our receive queue.
+	 */
+	if (udp->udp_rcv_list_head == NULL) {
+		STR_WAKEUP_SET(STREAM(q));
+		udp->udp_rcv_list_head = mp;
+	} else {
+		udp->udp_rcv_list_tail->b_next = mp;
+	}
+	udp->udp_rcv_list_tail = mp;
+	udp->udp_rcv_cnt += pkt_len;
+	udp->udp_rcv_msgcnt++;
+
+	/* Need to flow-control? */
+	if (udp->udp_rcv_cnt >= udp->udp_rcv_hiwat ||
+	    udp->udp_rcv_msgcnt >= udp->udp_rcv_hiwat)
+		udp->udp_drain_qfull = B_TRUE;
+
+	/* Update poll events and send SIGPOLL/SIGIO if necessary */
+	STR_SENDSIG(STREAM(q));
+	mutex_exit(&udp->udp_drain_lock);
+}
+
+/*
+ * Drain the contents of the receive list to the module upstream; we
+ * do this during close or when we fall back to the slow mode because
+ * sockmod has been popped or a module has been pushed on top of us.
+ */
+static void
+udp_rcv_drain(queue_t *q, udp_t *udp, boolean_t closing)
+{
+	mblk_t *mp;
+
+	ASSERT(q == RD(q));
+
+	mutex_enter(&udp->udp_drain_lock);
+	/*
+	 * There is no race with a concurrent udp_input() sending
+	 * up packets using putnext() after we have cleared the
+	 * udp_direct_sockfs flag but before we have completed
+	 * sending up the packets in udp_rcv_list, since we are
+	 * either a writer or we have quiesced the conn.
+	 */
+	udp->udp_direct_sockfs = B_FALSE;
+	mutex_exit(&udp->udp_drain_lock);
+
+	if (udp->udp_rcv_list_head != NULL)
+		UDP_STAT(udp_drain);
+
+	/*
+	 * Send up everything via putnext(); note here that we
+	 * don't need the udp_drain_lock to protect us since
+	 * nothing can enter udp_rrw() and we currently
+	 * have exclusive access to this udp.
+	 */
+	while ((mp = udp->udp_rcv_list_head) != NULL) {
+		udp->udp_rcv_list_head = mp->b_next;
+		mp->b_next = NULL;
+		udp->udp_rcv_cnt -= msgdsize(mp);
+		udp->udp_rcv_msgcnt--;
+		if (closing) {
+			freemsg(mp);
+		} else {
+			putnext(q, mp);
+		}
+	}
+	ASSERT(udp->udp_rcv_cnt == 0);
+	ASSERT(udp->udp_rcv_msgcnt == 0);
+	ASSERT(udp->udp_rcv_list_head == NULL);
+	udp->udp_rcv_list_tail = NULL;
+	udp->udp_drain_qfull = B_FALSE;
+}
+
+static size_t
+udp_set_rcv_hiwat(udp_t *udp, size_t size)
+{
+	/* We add a bit of extra buffering */
+	size += size >> 1;
+	if (size > udp_max_buf)
+		size = udp_max_buf;
+
+	udp->udp_rcv_hiwat = size;
+	return (size);
+}
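A worked example of the sizing above: a 64 KB SO_RCVBUF yields a high
watermark of 65536 + (65536 >> 1) = 98304 bytes, i.e. 1.5x the requested
size, capped by the udp_max_buf tunable:

	/* Assuming udp_max_buf is at least 98304. */
	size_t hiwat = udp_set_rcv_hiwat(udp, 65536);	/* hiwat == 98304 */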
+
 /*
  * Little helper for IPsec's NAT-T processing.
  */
--- a/usr/src/uts/common/inet/udp/udp6ddi.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/udp/udp6ddi.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 1992,1997-2002 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -36,7 +36,13 @@
 #define	INET_DEVMINOR	IPV6_MINOR
 #define	INET_DEVDESC	"UDP6 STREAMS driver %I%"
 #define	INET_STRTAB	udpinfo
-#define	INET_DEVMTFLAGS	IP_DEVMTFLAGS	/* since we're really ip */
+#define	INET_DEVMTFLAGS	IP_DEVMTFLAGS
+/*
+ * We define both synchronous STREAMS and sockfs direct-access
+ * mode for the UDP module instance, because it is autopushed on
+ * top of /dev/ip for the sockets case.
+ */
+#define	INET_MODMTFLAGS	(D_MP|D_SYNCSTR|_D_DIRECT)
 
 #include "../inetddi.c"
 
--- a/usr/src/uts/common/inet/udp/udpddi.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/udp/udpddi.c	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -32,20 +32,23 @@
 #include <sys/modctl.h>
 #include <inet/common.h>
 #include <inet/ip.h>
+#include <inet/udp_impl.h>
 
 #define	INET_NAME	"udp"
 #define	INET_MODDESC	"UDP STREAMS module %I%"
 #define	INET_DEVDESC	"UDP STREAMS driver %I%"
 #define	INET_DEVMINOR	IPV4_MINOR
 #define	INET_STRTAB	udpinfo
-#define	INET_DEVMTFLAGS	IP_DEVMTFLAGS	/* since as a driver we're ip */
-#define	INET_MODMTFLAGS (D_MP | D_MTQPAIR | D_MTPUTSHARED | _D_MTOCSHARED)
+#define	INET_DEVMTFLAGS	IP_DEVMTFLAGS
+/*
+ * We define both synchronous STREAMS and sockfs direct-access
+ * mode for the UDP module instance, because it is autopushed on
+ * top of /dev/ip for the sockets case.
+ */
+#define	INET_MODMTFLAGS (D_MP|D_SYNCSTR|_D_DIRECT)
 
 #include "../inetddi.c"
 
-extern void udp_ddi_init(void);
-extern void udp_ddi_destroy(void);
-
 int
 _init(void)
 {
--- a/usr/src/uts/common/inet/udp_impl.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/udp_impl.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -29,6 +29,13 @@
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
+/*
+ * UDP implementation private declarations.  These interfaces are
+ * used to build the IP module and are not meant to be accessed
+ * by any modules except IP itself.  They are undocumented and are
+ * subject to change without notice.
+ */
+
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -43,32 +50,42 @@
 #include <inet/common.h>
 #include <inet/ip.h>
 
+#define	UDP_MOD_ID		5607
+
+/*
+ * udp_mode: UDP_MT_HOT and UDP_SQUEUE are the stable modes; the
+ * others are transient.
+ */
+typedef enum {
+	UDP_MT_HOT = 0,			/* UDP endpoint is MT HOT */
+	UDP_MT_QUEUED = 1,		/* Messages enqueued in udp_mphead */
+	UDP_QUEUED_SQUEUE = 2,		/* Messages enqueued in conn_sqp */
+	UDP_SQUEUE = 3			/* Single threaded using squeues */
+} udp_mode_t;
+
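The entry points earlier in this patch (udp_wput() and udp_wput_data())
key off this mode; a sketch of the dispatch pattern they follow,
mirroring the udp_wput_data() code above (proc and tag stand in for the
caller's handler and squeue tag):

	mutex_enter(&connp->conn_lock);
	if (udp->udp_mode == UDP_MT_HOT) {
		UDP_READERS_INCREF(udp);	/* run as a concurrent reader */
		mutex_exit(&connp->conn_lock);
		/* ... process mp inline ... */
	} else {
		mutex_exit(&connp->conn_lock);
		udp_enter(connp, mp, proc, tag);	/* drain later */
	}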
 /* Internal udp control structure, one per open stream */
 typedef	struct udp_s {
-	uint32_t 	udp_state;	/* TPI state */
-	in_port_t 	udp_port;	/* Port bound to this stream */
-	in_port_t 	udp_dstport;	/* Connected port */
-	in6_addr_t 	udp_v6src;	/* Source address of this stream */
-	in6_addr_t 	udp_bound_v6src; /* Explicitly bound address */
-	in6_addr_t 	udp_v6dst;	/* Connected destination */
+	uint32_t	udp_state;	/* TPI state */
+	in_port_t	udp_port;	/* Port bound to this stream */
+	in_port_t	udp_dstport;	/* Connected port */
+	in6_addr_t	udp_v6src;	/* Source address of this stream */
+	in6_addr_t	udp_bound_v6src; /* Explicitly bound address */
+	in6_addr_t	udp_v6dst;	/* Connected destination */
 	uint32_t	udp_flowinfo;	/* Connected flow id and tclass */
-	uint32_t 	udp_max_hdr_len; /* For write offset in stream head */
+	uint32_t	udp_max_hdr_len; /* For write offset in stream head */
 	sa_family_t	udp_family;	/* Family from socket() call */
 	/*
 	 * IP format that packets transmitted from this struct should use.
 	 * Value can be IP4_VERSION or IPV6_VERSION.
 	 */
 	ushort_t	udp_ipversion;
-	uint32_t 	udp_ip_snd_options_len; /* Len of IPv4 options */
+	uint32_t	udp_ip_snd_options_len; /* Len of IPv4 options */
 	uchar_t		*udp_ip_snd_options;    /* Ptr to IPv4 options */
-	uint32_t 	udp_ip_rcv_options_len; /* Len of IPv4 options recvd */
+	uint32_t	udp_ip_rcv_options_len; /* Len of IPv4 options recvd */
 	uchar_t		*udp_ip_rcv_options;    /* Ptr to IPv4 options recvd */
-	cred_t		*udp_credp;		/* Credentials at open */
 	uchar_t		udp_multicast_ttl;	/* IP*_MULTICAST_TTL/HOPS */
-	ipaddr_t 	udp_multicast_if_addr;  /* IP_MULTICAST_IF option */
+	ipaddr_t	udp_multicast_if_addr;  /* IP_MULTICAST_IF option */
 	uint_t		udp_multicast_if_index;	/* IPV6_MULTICAST_IF option */
 	int		udp_bound_if;		/* IP*_BOUND_IF option */
 	int		udp_xmit_if;		/* IP_XMIT_IF option */
+	conn_t		*udp_connp;
 	uint32_t
 		udp_debug : 1,		/* SO_DEBUG "socket" option. */
 		udp_dontroute : 1,	/* SO_DONTROUTE "socket" option. */
@@ -76,35 +93,36 @@
 		udp_useloopback : 1,	/* SO_USELOOPBACK "socket" option */
 
 		udp_reuseaddr : 1,	/* SO_REUSEADDR "socket" option. */
-		udp_multicast_loop : 1,	/* IP_MULTICAST_LOOP option */
 		udp_dgram_errind : 1,	/* SO_DGRAM_ERRIND option */
 		udp_recvdstaddr : 1,	/* IP_RECVDSTADDR option */
+		udp_recvopts : 1,	/* IP_RECVOPTS option */
 
-		udp_recvopts : 1,	/* IP_RECVOPTS option */
 		udp_discon_pending : 1,	/* T_DISCON_REQ in progress */
 		udp_unspec_source : 1,	/* IP*_UNSPEC_SRC option */
 		udp_ipv6_recvpktinfo : 1,	/* IPV6_RECVPKTINFO option  */
+		udp_ipv6_recvhoplimit : 1,	/* IPV6_RECVHOPLIMIT option */
 
-		udp_ipv6_recvhoplimit : 1,	/* IPV6_RECVHOPLIMIT option */
 		udp_ipv6_recvhopopts : 1,	/* IPV6_RECVHOPOPTS option */
 		udp_ipv6_recvdstopts : 1,	/* IPV6_RECVDSTOPTS option */
 		udp_ipv6_recvrthdr : 1,		/* IPV6_RECVRTHDR option */
+		udp_ipv6_recvtclass : 1,	/* IPV6_RECVTCLASS */
 
-		udp_ipv6_recvtclass : 1,	/* IPV6_RECVTCLASS */
 		udp_ipv6_recvpathmtu : 1,	/* IPV6_RECVPATHMTU */
 		udp_anon_priv_bind : 1,
 		udp_exclbind : 1,	/* ``exclusive'' binding */
+		udp_recvif : 1,		/* IP_RECVIF option */
 
-		udp_recvif : 1,		/* IP_RECVIF option */
 		udp_recvslla : 1,	/* IP_RECVSLLA option */
 		udp_recvttl : 1,	/* IP_RECVTTL option */
 		udp_recvucred : 1,	/* IP_RECVUCRED option */
+		udp_old_ipv6_recvdstopts : 1,	/* old form of IPV6_DSTOPTS */
 
-		udp_old_ipv6_recvdstopts : 1,	/* old form of IPV6_DSTOPTS */
 		udp_ipv6_recvrthdrdstopts : 1,	/* IPV6_RECVRTHDRDSTOPTS */
+		udp_rcvhdr : 1,		/* UDP_RCVHDR option */
+		udp_issocket : 1,	/* socket mode */
+		udp_direct_sockfs : 1,	/* direct calls to/from sockfs */
 
-		udp_rcvhdr : 1,		/* UDP_RCVHDR option */
-		udp_pad_to_bit_31 : 7;
+		udp_pad_to_bit_31 : 4;
 
 	uint8_t		udp_type_of_service;	/* IP_TOS option */
 	uint8_t		udp_ttl;		/* TTL or hoplimit */
@@ -114,7 +132,20 @@
 	uint_t		udp_sticky_hdrs_len;	/* Incl. ip6h and any ip6i */
 	struct udp_s	*udp_bind_hash; /* Bind hash chain */
 	struct udp_s	**udp_ptpbhn; /* Pointer to previous bind hash next. */
-	zoneid_t	udp_zoneid;	/* ID of owning zone */
+	udp_mode_t	udp_mode;	/* Current mode of operation */
+	mblk_t		*udp_mphead;	/* Head of the queued operations */
+	mblk_t		*udp_mptail;	/* Tail of the queued operations */
+	uint_t		udp_mpcount;	/* Number of messages in the queue */
+	uint_t		udp_reader_count; /* Number of reader threads */
+	uint_t		udp_squeue_count; /* Number of messages in conn_sqp */
+
+	kmutex_t	udp_drain_lock;		/* lock for udp_rcv_list */
+	boolean_t	udp_drain_qfull;	/* drain queue is full */
+	mblk_t		*udp_rcv_list_head;	/* b_next chain of mblks */
+	mblk_t		*udp_rcv_list_tail;	/* last mblk in chain */
+	uint_t		udp_rcv_cnt;		/* total data in rcv_list */
+	uint_t		udp_rcv_msgcnt;		/* total messages in rcv_list */
+	size_t		udp_rcv_hiwat;		/* receive high watermark */
 } udp_t;
 
 /* UDP Protocol header */
@@ -127,6 +158,92 @@
 } udpha_t;
 #define	UDPH_SIZE	8
 
+/* Named Dispatch Parameter Management Structure */
+typedef struct udpparam_s {
+	uint32_t udp_param_min;
+	uint32_t udp_param_max;
+	uint32_t udp_param_value;
+	char	*udp_param_name;
+} udpparam_t;
+
+extern udpparam_t udp_param_arr[];
+
+#define	udp_wroff_extra			udp_param_arr[0].udp_param_value
+#define	udp_ipv4_ttl			udp_param_arr[1].udp_param_value
+#define	udp_ipv6_hoplimit		udp_param_arr[2].udp_param_value
+#define	udp_smallest_nonpriv_port	udp_param_arr[3].udp_param_value
+#define	udp_do_checksum			udp_param_arr[4].udp_param_value
+#define	udp_smallest_anon_port		udp_param_arr[5].udp_param_value
+#define	udp_largest_anon_port		udp_param_arr[6].udp_param_value
+#define	udp_xmit_hiwat			udp_param_arr[7].udp_param_value
+#define	udp_xmit_lowat			udp_param_arr[8].udp_param_value
+#define	udp_recv_hiwat			udp_param_arr[9].udp_param_value
+#define	udp_max_buf			udp_param_arr[10].udp_param_value
+#define	udp_ndd_get_info_interval	udp_param_arr[11].udp_param_value
+
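These remain visible as Named Dispatch variables on /dev/udp; for
example (assuming the stock registrations in udp_param_arr):

	/*
	 * From a shell:
	 *
	 *	# ndd -get /dev/udp udp_xmit_hiwat
	 *	# ndd -set /dev/udp udp_max_buf 4194304
	 */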
+/* Kstats */
+typedef struct {				/* Class "net" kstats */
+	kstat_named_t	udp_ip_send;
+	kstat_named_t	udp_ip_ire_send;
+	kstat_named_t	udp_ire_null;
+	kstat_named_t	udp_drain;
+	kstat_named_t	udp_sock_fallback;
+	kstat_named_t	udp_rrw_busy;
+	kstat_named_t	udp_rrw_msgcnt;
+	kstat_named_t	udp_out_sw_cksum;
+	kstat_named_t	udp_out_sw_cksum_bytes;
+	kstat_named_t	udp_out_opt;
+	kstat_named_t	udp_out_err_notconn;
+	kstat_named_t	udp_out_err_output;
+	kstat_named_t	udp_out_err_tudr;
+	kstat_named_t	udp_in_pktinfo;
+	kstat_named_t	udp_in_recvdstaddr;
+	kstat_named_t	udp_in_recvopts;
+	kstat_named_t	udp_in_recvif;
+	kstat_named_t	udp_in_recvslla;
+	kstat_named_t	udp_in_recvucred;
+	kstat_named_t	udp_in_recvttl;
+	kstat_named_t	udp_in_recvhopopts;
+	kstat_named_t	udp_in_recvhoplimit;
+	kstat_named_t	udp_in_recvdstopts;
+	kstat_named_t	udp_in_recvrtdstopts;
+	kstat_named_t	udp_in_recvrthdr;
+	kstat_named_t	udp_in_recvpktinfo;
+	kstat_named_t	udp_in_recvtclass;
+#ifdef DEBUG
+	kstat_named_t	udp_data_conn;
+	kstat_named_t	udp_data_notconn;
+#endif
+} udp_stat_t;
+
+extern udp_stat_t	udp_statistics;
+
+#define	UDP_STAT(x)		(udp_statistics.x.value.ui64++)
+#define	UDP_STAT_UPDATE(x, n)	(udp_statistics.x.value.ui64 += (n))
+#ifdef DEBUG
+#define	UDP_DBGSTAT(x)		UDP_STAT(x)
+#else
+#define	UDP_DBGSTAT(x)
+#endif /* DEBUG */
+
+extern major_t	UDP6_MAJ;
+
+extern int	udp_opt_default(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
+extern int	udp_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
+extern int	udp_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
+		    uint_t *, uchar_t *, void *, cred_t *, mblk_t *);
+extern int	udp_snmp_get(queue_t *, mblk_t *);
+extern int	udp_snmp_set(queue_t *, t_scalar_t, t_scalar_t, uchar_t *, int);
+extern void	udp_close_free(conn_t *);
+extern void	udp_quiesce_conn(conn_t *);
+extern void	udp_ddi_init(void);
+extern void	udp_ddi_destroy(void);
+extern void	udp_resume_bind(conn_t *, mblk_t *);
+extern void	udp_conn_recv(conn_t *, mblk_t *);
+extern boolean_t udp_compute_checksum(void);
+extern void	udp_wput_data(queue_t *, mblk_t *, struct sockaddr *,
+		    socklen_t);
+
 #endif	/*  _KERNEL */
 
 #ifdef	__cplusplus
--- a/usr/src/uts/common/io/gld.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/io/gld.c	Sat Oct 22 22:50:14 2005 -0700
@@ -3415,6 +3415,8 @@
 			dlhp->hcksum_txflags |= HCKSUM_INET_PARTIAL;
 		if (macinfo->gldm_capabilities & GLD_CAP_CKSUM_FULL_V4)
 			dlhp->hcksum_txflags |= HCKSUM_INET_FULL_V4;
+		if (macinfo->gldm_capabilities & GLD_CAP_CKSUM_FULL_V6)
+			dlhp->hcksum_txflags |= HCKSUM_INET_FULL_V6;
 		if (macinfo->gldm_capabilities & GLD_CAP_CKSUM_IPHDR)
 			dlhp->hcksum_txflags |= HCKSUM_IPHDRCKSUM;
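A GLD driver whose hardware can compute full IPv6 TCP/UDP checksums
would advertise the new capability at attach time; a hypothetical driver
fragment (the combination shown is illustrative):

	macinfo->gldm_capabilities |= GLD_CAP_CKSUM_IPHDR |
	    GLD_CAP_CKSUM_FULL_V4 | GLD_CAP_CKSUM_FULL_V6;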
 
--- a/usr/src/uts/common/io/stream.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/io/stream.c	Sat Oct 22 22:50:14 2005 -0700
@@ -1690,6 +1690,21 @@
 }
 
 /*
+ * Calculate the number of data bytes in a single message block, taking
+ * multidata messages into account.
+ */
+
+#define	ADD_MBLK_SIZE(mp, size) 					\
+	if (DB_TYPE(mp) != M_MULTIDATA) {				\
+		(size) += MBLKL(mp);					\
+	} else {							\
+		uint_t	pinuse;						\
+									\
+		mmd_getsize(mmd_getmultidata(mp), NULL, &pinuse);	\
+		(size) += pinuse;					\
+	}
+
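For M_MULTIDATA the payload lives in attached buffers rather than
between b_rptr and b_wptr, so MBLKL() would undercount; the macro
instead asks the multidata for its in-use payload byte count via
mmd_getsize().  The q_count accounting loops below all follow this
pattern:

	size_t	bytecnt = 0;
	int	mblkcnt = 0;
	mblk_t	*tmp;

	for (tmp = bp; tmp != NULL; tmp = tmp->b_cont) {
		ADD_MBLK_SIZE(tmp, bytecnt);
		mblkcnt++;
	}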
+/*
  * Like getq() but does not backenable.  This is used by the stream
  * head when a putback() is likely.  The caller must call qbackenable()
  * after it is done with accessing the queue.
@@ -1721,7 +1736,7 @@
 
 		/* Get message byte count for q_count accounting */
 		for (tmp = bp; tmp; tmp = tmp->b_cont) {
-			bytecnt += (tmp->b_wptr - tmp->b_rptr);
+			ADD_MBLK_SIZE(tmp, bytecnt);
 			mblkcnt++;
 		}
 
@@ -1941,7 +1956,7 @@
 
 	/* Get the size of the message for q_count accounting */
 	for (tmp = mp; tmp; tmp = tmp->b_cont) {
-		bytecnt += (tmp->b_wptr - tmp->b_rptr);
+		ADD_MBLK_SIZE(tmp, bytecnt);
 		mblkcnt++;
 	}
 
@@ -2433,9 +2448,10 @@
 
 	/* Get message byte count for q_count accounting */
 	for (tmp = bp; tmp; tmp = tmp->b_cont) {
-		bytecnt += (tmp->b_wptr - tmp->b_rptr);
+		ADD_MBLK_SIZE(tmp, bytecnt);
 		mblkcnt++;
 	}
+
 	if (qbp) {
 		qbp->qb_count += bytecnt;
 		qbp->qb_mblkcnt += mblkcnt;
@@ -2617,7 +2633,7 @@
 
 	/* Get message byte count for q_count accounting */
 	for (tmp = bp; tmp; tmp = tmp->b_cont) {
-		bytecnt += (tmp->b_wptr - tmp->b_rptr);
+		ADD_MBLK_SIZE(tmp, bytecnt);
 		mblkcnt++;
 	}
 	if (qbp) {
@@ -2748,7 +2764,7 @@
 
 	/* Get mblk and byte count for q_count accounting */
 	for (tmp = mp; tmp; tmp = tmp->b_cont) {
-		bytecnt += (tmp->b_wptr - tmp->b_rptr);
+		ADD_MBLK_SIZE(tmp, bytecnt);
 		mblkcnt++;
 	}
 
--- a/usr/src/uts/common/io/strsun.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/io/strsun.c	Sat Oct 22 22:50:14 2005 -0700
@@ -37,7 +37,9 @@
 #include <sys/errno.h>
 #include <sys/stream.h>
 #include <sys/stropts.h>
+#include <sys/strsubr.h>
 #include <sys/strsun.h>
+#include <sys/sysmacros.h>
 #include <sys/cmn_err.h>
 
 void
@@ -243,3 +245,63 @@
 	freemsg(datamp);
 	return (0);
 }
+
+/* Copy userdata into a new mblk_t */
+mblk_t *
+mcopyinuio(struct stdata *stp, uio_t *uiop, ssize_t iosize,
+    ssize_t maxblk, int *errorp)
+{
+	mblk_t	*head = NULL, **tail = &head;
+	size_t	offset = stp->sd_wroff;
+
+	if (iosize == INFPSZ || iosize > uiop->uio_resid)
+		iosize = uiop->uio_resid;
+
+	if (maxblk == INFPSZ)
+		maxblk = iosize;
+
+	/* Nothing to do in these cases, so we're done */
+	if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0))
+		goto done;
+
+	if (stp->sd_flag & STRCOPYCACHED)
+		uiop->uio_extflg |= UIO_COPY_CACHED;
+
+	/*
+	 * We will enter the loop below if iosize is 0; it will allocate an
+	 * empty message block and call uiomove(9F), which will just return.
+	 * We could avoid that with an extra check, but it would only
+	 * slow down the much more likely case where iosize is larger
+	 * than 0.
+	 */
+	do {
+		ssize_t blocksize;
+		mblk_t  *mp;
+
+		blocksize = MIN(iosize, maxblk);
+		ASSERT(blocksize >= 0);
+		if ((mp = allocb_cred(offset + blocksize, CRED())) == NULL) {
+			*errorp = ENOMEM;
+			return (head);
+		}
+		mp->b_rptr += offset;
+		mp->b_wptr = mp->b_rptr + blocksize;
+		DB_CPID(mp) = curproc->p_pid;
+
+		*tail = mp;
+		tail = &mp->b_cont;
+
+		/* uiomove(9F) either returns 0 or EFAULT */
+		if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize,
+		    UIO_WRITE, uiop)) != 0) {
+			ASSERT(*errorp != ENOMEM);
+			freemsg(head);
+			return (NULL);
+		}
+
+		iosize -= blocksize;
+	} while (iosize > 0);
+
+done:
+	*errorp = 0;
+	return (head);
+}
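A sketch of the intended calling pattern (hypothetical caller; in the
sendfile changes below, maxblk comes from the peer TCP's MSS).  Note
the asymmetric error handling visible above: EFAULT frees the chain and
returns NULL, while ENOMEM returns the partially built chain:

	int	error;
	mblk_t	*mp;

	mp = mcopyinuio(stp, uiop, INFPSZ, maxblk, &error);
	if (mp == NULL)
		return (error);	/* EFAULT, or ENOMEM on the first block */
	if (error != 0) {
		freemsg(mp);	/* ENOMEM: partial chain was returned */
		return (error);
	}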
--- a/usr/src/uts/common/os/streamio.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/os/streamio.c	Sat Oct 22 22:50:14 2005 -0700
@@ -2642,11 +2642,18 @@
 int
 strwrite(struct vnode *vp, struct uio *uiop, cred_t *crp)
 {
+	return (strwrite_common(vp, uiop, crp, 0));
+}
+
+/* ARGSUSED2 */
+int
+strwrite_common(struct vnode *vp, struct uio *uiop, cred_t *crp, int wflag)
+{
 	struct stdata *stp;
 	struct queue *wqp;
 	ssize_t rmin, rmax;
 	ssize_t iosize;
-	char waitflag;
+	int waitflag;
 	int tempmode;
 	int error = 0;
 	int b_flag;
@@ -2701,7 +2708,7 @@
 	/*
 	 * Do until count satisfied or error.
 	 */
-	waitflag = WRITEWAIT;
+	waitflag = WRITEWAIT | wflag;
 	if (stp->sd_flag & OLDNDELAY)
 		tempmode = uiop->uio_fmode & ~FNDELAY;
 	else
@@ -2803,79 +2810,6 @@
 }
 
 /*
- * kstrwritemp() has very similar semantics as that of strwrite().
- * The main difference is it obtains mblks from the caller and also
- * does not do any copy as done in strwrite() from user buffers to
- * kernel buffers.
- *
- *
- * Currently, this routine is used by sendfile to send data allocated
- * within the kernel without any copying. This interface does not use the
- * synchronous stream interface as synch. stream interface implies
- * copying.
- */
-int
-kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
-{
-	struct stdata *stp;
-	struct queue *wqp;
-	char waitflag;
-	int tempmode;
-	int error;
-	int done = 0;
-
-	ASSERT(vp->v_stream);
-	stp = vp->v_stream;
-
-	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
-		mutex_enter(&stp->sd_lock);
-		error = strwriteable(stp, B_FALSE, B_TRUE);
-		mutex_exit(&stp->sd_lock);
-		if (error != 0)
-			return (error);
-	}
-
-	/*
-	 * First, check for flow control without grabbing the sd_lock.
-	 * If we would block, re-check with the lock. This is similar
-	 * to the logic used by strwrite().
-	 */
-	wqp = stp->sd_wrq;
-	if (canputnext(wqp)) {
-		putnext(wqp, mp);
-		return (0);
-	}
-
-	waitflag = WRITEWAIT;
-	if (stp->sd_flag & OLDNDELAY)
-		tempmode = fmode & ~FNDELAY;
-	else
-		tempmode = fmode;
-
-	mutex_enter(&stp->sd_lock);
-	do {
-		if (canputnext(wqp)) {
-			mutex_exit(&stp->sd_lock);
-			putnext(wqp, mp);
-			return (0);
-		}
-		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
-		    &done);
-	} while (error == 0 && !done);
-
-	mutex_exit(&stp->sd_lock);
-	/*
-	 * EAGAIN tells the application to try again. ENOMEM
-	 * is returned only if the memory allocation size
-	 * exceeds the physical limits of the system. ENOMEM
-	 * can't be true here.
-	 */
-	if (error == ENOMEM)
-		error = EAGAIN;
-	return (error);
-}
-
-/*
  * Stream head write service routine.
  * Its job is to wake up any sleeping writers when a queue
  * downstream needs data (part of the flow control in putq and getq).
--- a/usr/src/uts/common/os/strsubr.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/os/strsubr.c	Sat Oct 22 22:50:14 2005 -0700
@@ -2437,6 +2437,18 @@
 	if (devflag & D_SYNCSTR)
 		qflag |= QSYNCSTR;
 
+	/*
+	 * Private flag used by a transport module to indicate
+	 * to sockfs that it supports direct-access mode without
+	 * having to go through STREAMS.
+	 */
+	if (devflag & _D_DIRECT) {
+		/* Reject unless the module is fully-MT (no perimeter) */
+		if ((qflag & QMT_TYPEMASK) != QMTSAFE)
+			goto bad;
+		qflag |= _QDIRECT;
+	}
+
 	*qflagp = qflag;
 	*sqtypep = sqtype;
 	return (0);
@@ -8236,11 +8248,11 @@
 	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_MULTIDATA);
 	if (mp->b_datap->db_type == M_DATA) {
 		/* Associate values for M_DATA type */
-		mp->b_datap->db_cksumstart = (intptr_t)start;
-		mp->b_datap->db_cksumstuff = (intptr_t)stuff;
-		mp->b_datap->db_cksumend = (intptr_t)end;
-		mp->b_datap->db_struioun.cksum.flags = flags;
-		mp->b_datap->db_cksum16 = (uint16_t)value;
+		DB_CKSUMSTART(mp) = (intptr_t)start;
+		DB_CKSUMSTUFF(mp) = (intptr_t)stuff;
+		DB_CKSUMEND(mp) = (intptr_t)end;
+		DB_CKSUMFLAGS(mp) = flags;
+		DB_CKSUM16(mp) = (uint16_t)value;
 
 	} else {
 		pattrinfo_t pa_info;
@@ -8258,6 +8270,8 @@
 			hck->hcksum_end_offset = end;
 			hck->hcksum_cksum_val.inet_cksum = (uint16_t)value;
 			hck->hcksum_flags = flags;
+		} else {
+			rc = -1;
 		}
 	}
 	return (rc);
@@ -8271,20 +8285,16 @@
 	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_MULTIDATA);
 	if (mp->b_datap->db_type == M_DATA) {
 		if (flags != NULL) {
-			*flags = mp->b_datap->db_struioun.cksum.flags;
+			*flags = DB_CKSUMFLAGS(mp);
 			if (*flags & HCK_PARTIALCKSUM) {
 				if (start != NULL)
-					*start = (uint32_t)
-					    mp->b_datap->db_cksumstart;
+					*start = (uint32_t)DB_CKSUMSTART(mp);
 				if (stuff != NULL)
-					*stuff = (uint32_t)
-					    mp->b_datap->db_cksumstuff;
+					*stuff = (uint32_t)DB_CKSUMSTUFF(mp);
 				if (end != NULL)
-					*end =
-					    (uint32_t)mp->b_datap->db_cksumend;
+					*end = (uint32_t)DB_CKSUMEND(mp);
 				if (value != NULL)
-					*value =
-					    (uint32_t)mp->b_datap->db_cksum16;
+					*value = (uint32_t)DB_CKSUM16(mp);
 			}
 		}
 	} else {
--- a/usr/src/uts/common/sys/conf.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/conf.h	Sat Oct 22 22:50:14 2005 -0700
@@ -24,7 +24,7 @@
 
 
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -219,6 +219,8 @@
 
 #define	D_U64BIT	0x40000	/* Driver supports unsigned 64-bit uio offset */
 
+#define	_D_DIRECT	0x80000	/* Private flag for transport modules */
+
 #endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */
 
 #ifdef	__cplusplus
--- a/usr/src/uts/common/sys/dlpi.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/dlpi.h	Sat Oct 22 22:50:14 2005 -0700
@@ -689,6 +689,8 @@
 					/* ability */
 #define	HCKSUM_INET_FULL_V4	0x04	/* Full 1's complement checksum */
 					/* ability for IPv4 packets. */
+#define	HCKSUM_INET_FULL_V6	0x08	/* Full 1's complement checksum */
+					/* ability for IPv6 packets. */
 #define	HCKSUM_IPHDRCKSUM	0x10	/* IPv4 Header checksum offload */
 					/* capability */
 #ifdef _KERNEL
--- a/usr/src/uts/common/sys/gld.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/gld.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -240,9 +240,12 @@
 #define	GLD_CAP_LINKSTATE	0x00000001 /* will call gld_linkstate() */
 #define	GLD_CAP_CKSUM_IPHDR	0x00000008 /* IP checksum offload	*/
 #define	GLD_CAP_CKSUM_PARTIAL	0x00000010 /* TCP/UDP partial		*/
-#define	GLD_CAP_CKSUM_FULL_V4	0x00000020 /* TCP/UDP full		*/
-#define	GLD_CAP_CKSUM_ANY	0x00000038 /* any or all of the above	*/
+#define	GLD_CAP_CKSUM_FULL_V4	0x00000020 /* TCP/UDP full for IPv4	*/
 #define	GLD_CAP_ZEROCOPY	0x00000040 /* zerocopy */
+#define	GLD_CAP_CKSUM_FULL_V6	0x00000080 /* TCP/UDP full for IPv6	*/
+#define	GLD_CAP_CKSUM_ANY				\
+	(GLD_CAP_CKSUM_IPHDR|GLD_CAP_CKSUM_PARTIAL|	\
+	GLD_CAP_CKSUM_FULL_V4|GLD_CAP_CKSUM_FULL_V6)
 
 /* values of gldm_linkstate, as passed to gld_linkstate() */
 #define	GLD_LINKSTATE_DOWN	-1
--- a/usr/src/uts/common/sys/multidata.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/multidata.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -68,19 +68,24 @@
 /*
  * Multidata packet descriptor information.
  */
-typedef struct pdescinfo_s {
-	uint_t	flags;		/* misc. flags */
-	uchar_t	*hdr_base;	/* start address of header area */
-	uchar_t *hdr_rptr;	/* start address of header data */
-	uchar_t *hdr_wptr;	/* end address of header data */
-	uchar_t	*hdr_lim;	/* end address of header area */
-	uint_t	pld_cnt;	/* number of payload area */
-	struct pld_ary_s {
-		int pld_pbuf_idx;	/* payload buffer index */
-		uchar_t *pld_rptr;	/* start address of payload data */
-		uchar_t *pld_wptr;	/* pointer to end of payload data */
-	} pld_ary[MULTIDATA_MAX_PBUFS];
-} pdescinfo_t;
+struct pld_ary_s {
+	int pld_pbuf_idx;	/* payload buffer index */
+	uchar_t *pld_rptr;	/* start address of payload data */
+	uchar_t *pld_wptr;	/* pointer to end of payload data */
+};
+
+#define	PDESCINFO_STRUCT(elems) 					\
+{									\
+	uint_t	flags;		/* misc. flags */			\
+	uchar_t	*hdr_base;	/* start address of header area */	\
+	uchar_t *hdr_rptr;	/* start address of header data */	\
+	uchar_t *hdr_wptr;	/* end address of header data */	\
+	uchar_t	*hdr_lim;	/* end address of header area */	\
+	uint_t	pld_cnt;	/* number of payload area */		\
+	struct pld_ary_s	pld_ary[(elems)];			\
+}
+
+typedef struct pdescinfo_s PDESCINFO_STRUCT(MULTIDATA_MAX_PBUFS) pdescinfo_t;
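The macro-ized layout lets a consumer declare a layout-compatible
descriptor with fewer payload spans, which is presumably how tcp now
replaces the hand-rolled tcp_pdescinfo_t removed from multidata_impl.h
below; for example:

	/* A two-span variant, layout-compatible with pdescinfo_t. */
	typedef struct tcp_pdescinfo_s PDESCINFO_STRUCT(2) tcp_pdescinfo_t;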
 
 /*
  * Possible values for flags
--- a/usr/src/uts/common/sys/multidata_impl.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/multidata_impl.h	Sat Oct 22 22:50:14 2005 -0700
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -191,21 +191,6 @@
 	uint_t	mmd_pbuf_ref;	/* descriptors referring to payload buffer(s) */
 };
 
-/*
- * Smaller and private version of pdescinfo_t used specifically for tcp,
- * which allows for only two payload spans per packet.  Any changes made
- * to the pdescinfo_t structure must be reflected here as well.
- */
-typedef struct tcp_pdescinfo_s {
-	uint_t	flags;		/* misc. flags */
-	uchar_t	*hdr_base;	/* start address of header area */
-	uchar_t *hdr_rptr;	/* start address of header data */
-	uchar_t *hdr_wptr;	/* end address of header data */
-	uchar_t	*hdr_lim;	/* end address of header area */
-	uint_t	pld_cnt;	/* number of payload area */
-	struct pld_ary_s pld_ary[2];
-} tcp_pdescinfo_t;
-
 #ifdef _KERNEL
 
 extern void mmd_init(void);
--- a/usr/src/uts/common/sys/socketvar.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/socketvar.h	Sat Oct 22 22:50:14 2005 -0700
@@ -100,6 +100,7 @@
 };
 
 typedef struct sonodeops sonodeops_t;
+typedef struct sonode sonode_t;
 
 /*
  * The sonode represents a socket. A sonode never exist in the file system
@@ -364,7 +365,7 @@
 #define	SS_DONEREAD		0x00080000 /* NCAfs: all data read */
 #define	SS_MOREDATA		0x00100000 /* NCAfs: NCA has more data */
 
-#define	SS_TCP_FAST_ACCEPT	0x00200000 /* Use TCP's accept fast-path */
+#define	SS_DIRECT		0x00200000 /* transport is directly below */
 
 #define	SS_LADDR_VALID		0x01000000	/* so_laddr valid for user */
 #define	SS_FADDR_VALID		0x02000000	/* so_faddr valid for user */
@@ -769,8 +770,10 @@
 extern void	so_flush_discon_ind(struct sonode *);
 extern int	sowaitconnected(struct sonode *, int, int);
 
+extern int	sostream_direct(struct sonode *, struct uio *,
+		    mblk_t *, cred_t *);
 extern int	sosend_dgram(struct sonode *, struct sockaddr *,
-			socklen_t, struct uio *, int);
+		    socklen_t, struct uio *, int);
 extern int	sosend_svc(struct sonode *, struct uio *, t_scalar_t, int, int);
 extern void	so_installhooks(struct sonode *);
 extern int	so_strinit(struct sonode *, struct sonode *);
--- a/usr/src/uts/common/sys/sockio.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/sockio.h	Sat Oct 22 22:50:14 2005 -0700
@@ -265,9 +265,9 @@
 #define	SIOCDXARP	_IOW('i', 168, struct xarpreq)	/* delete ARP entry */
 
 /*
- * IOCTL to indicate to the transport that the sockmod is being popped
+ * IOCTL private to sockfs.
  */
-#define	SIOCPOPSOCKFS	_IOW('i', 169, 0)
+#define	_SIOCSOCKFALLBACK _IOW('i', 169, 0)
 
 /*
  * IOCTLs for getting and setting zone associated with an interface, and
--- a/usr/src/uts/common/sys/stream.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/stream.h	Sat Oct 22 22:50:14 2005 -0700
@@ -171,6 +171,8 @@
 #define	_QINSERTING	0x04000000	/* Private, module is being inserted */
 #define	_QREMOVING	0x08000000	/* Private, module is being removed */
 #define	_QASSOCIATED	0x10000000	/* queue is associated with a device */
+#define	_QDIRECT	0x20000000	/* Private; transport module uses */
+					/* direct interface to/from sockfs */
 
 /* queue sqflags (protected by SQLOCK). */
 #define	Q_SQQUEUED	0x01		/* Queue is in the syncq list */
--- a/usr/src/uts/common/sys/strsubr.h	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/sys/strsubr.h	Sat Oct 22 22:50:14 2005 -0700
@@ -1096,6 +1096,8 @@
 extern void strclean(struct vnode *);
 extern void str_cn_clean();	/* XXX hook for consoles signal cleanup */
 extern int strwrite(struct vnode *, struct uio *, cred_t *);
+extern int strwrite_common(struct vnode *, struct uio *, cred_t *, int);
+extern int kstrwritemp(struct vnode *, mblk_t *, ushort_t);
 extern int strread(struct vnode *, struct uio *, cred_t *);
 extern int strioctl(struct vnode *, int, intptr_t, int, int, cred_t *, int *);
 extern int strrput(queue_t *, mblk_t *);
@@ -1180,6 +1182,7 @@
 extern mblk_t *allocb_cred(size_t, cred_t *);
 extern mblk_t *allocb_cred_wait(size_t, uint_t, int *, cred_t *);
 extern mblk_t *allocb_tmpl(size_t, const mblk_t *);
+extern mblk_t *allocb_tryhard(size_t);
 extern void mblk_setcred(mblk_t *, cred_t *);
 extern void strpollwakeup(vnode_t *, short);
 extern int putnextctl_wait(queue_t *, int);
@@ -1188,7 +1191,6 @@
     unsigned char, int, int);
 extern int kstrgetmsg(struct vnode *, mblk_t **, struct uio *,
     unsigned char *, int *, clock_t, rval_t *);
-extern int kstrwritemp(struct vnode *, mblk_t *, ushort_t);
 
 extern void strsetrerror(vnode_t *, int, int, errfunc_t);
 extern void strsetwerror(vnode_t *, int, int, errfunc_t);
@@ -1217,6 +1219,8 @@
 extern void freemsgchain(mblk_t *);
 extern mblk_t *copymsgchain(mblk_t *);
 
+extern mblk_t *mcopyinuio(struct stdata *, uio_t *, ssize_t, ssize_t, int *);
+
 /*
  * shared or externally configured data structures
  */
@@ -1263,6 +1267,19 @@
 extern struct queue *WR(queue_t *);
 extern int SAMESTR(queue_t *);
 
+/*
+ * The following hardware checksum related macros are private
+ * interfaces that are subject to change without notice.
+ */
+#ifdef _KERNEL
+#define	DB_CKSUMSTART(mp)	((mp)->b_datap->db_cksumstart)
+#define	DB_CKSUMEND(mp)		((mp)->b_datap->db_cksumend)
+#define	DB_CKSUMSTUFF(mp)	((mp)->b_datap->db_cksumstuff)
+#define	DB_CKSUMFLAGS(mp)	((mp)->b_datap->db_struioun.cksum.flags)
+#define	DB_CKSUM16(mp)		((mp)->b_datap->db_cksum16)
+#define	DB_CKSUM32(mp)		((mp)->b_datap->db_cksum32)
+#endif	/* _KERNEL */
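Usage mirrors the strsubr.c changes above; e.g. a transport stamping
partial-checksum offload information onto an M_DATA block might do the
following (sketch; the offset variables are illustrative):

	DB_CKSUMSTART(mp) = ip_hdr_len;		/* start summing here */
	DB_CKSUMSTUFF(mp) = ip_hdr_len + cksum_off; /* store result here */
	DB_CKSUMEND(mp) = msg_len;		/* stop summing here */
	DB_CKSUMFLAGS(mp) = HCK_PARTIALCKSUM;
	DB_CKSUM16(mp) = pseudo_cksum;		/* pseudo-header seed */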
+
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/common/syscall/sendfile.c	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/syscall/sendfile.c	Sat Oct 22 22:50:14 2005 -0700
@@ -73,6 +73,89 @@
 extern void nl7c_sendfilev(struct sonode *, u_offset_t, struct sendfilevec *,
 		int);
 
+/*
+ * kstrwritemp() has semantics very similar to those of strwrite().
+ * The main difference is that it obtains its mblks from the caller
+ * and does none of the copying from user buffers to kernel buffers
+ * that strwrite() performs.
+ *
+ * Currently, this routine is used by sendfile to send data allocated
+ * within the kernel without any copying.  This interface does not use
+ * the synchronous stream interface, as that would imply copying.
+ */
+int
+kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
+{
+	struct stdata *stp;
+	struct queue *wqp;
+	char waitflag;
+	int tempmode;
+	int error = 0;
+	int done = 0;
+	struct sonode *so;
+	boolean_t direct;
+
+	ASSERT(vp->v_stream);
+	stp = vp->v_stream;
+
+	so = VTOSO(vp);
+	direct = (so->so_state & SS_DIRECT);
+
+	/*
+	 * This is the sockfs direct fast path. canputnext() need
+	 * not be accurate so we don't grab the sd_lock here. If
+	 * we get flow-controlled, we grab sd_lock just before the
+	 * do..while loop below to emulate what strwrite() does.
+	 */
+	wqp = stp->sd_wrq;
+	if (canputnext(wqp) && direct &&
+	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
+		return (sostream_direct(so, NULL, mp, CRED()));
+	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
+		/* Fast check of flags before acquiring the lock */
+		mutex_enter(&stp->sd_lock);
+		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
+		mutex_exit(&stp->sd_lock);
+		if (error != 0) {
+			if (!(stp->sd_flag & STPLEX) &&
+			    (stp->sd_wput_opt & SW_SIGPIPE)) {
+				tsignal(curthread, SIGPIPE);
+				error = EPIPE;
+			}
+			return (error);
+		}
+	}
+
+	waitflag = WRITEWAIT;
+	if (stp->sd_flag & OLDNDELAY)
+		tempmode = fmode & ~FNDELAY;
+	else
+		tempmode = fmode;
+
+	mutex_enter(&stp->sd_lock);
+	do {
+		if (canputnext(wqp)) {
+			mutex_exit(&stp->sd_lock);
+			putnext(wqp, mp);
+			return (0);
+		}
+		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
+		    &done);
+	} while (error == 0 && !done);
+
+	mutex_exit(&stp->sd_lock);
+	/*
+	 * EAGAIN tells the application to try again. ENOMEM
+	 * is returned only if the memory allocation size
+	 * exceeds the physical limits of the system, so ENOMEM
+	 * cannot happen here.
+	 */
+	if (error == ENOMEM)
+		error = EAGAIN;
+	return (error);
+}
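As the flow-controlled path above shows, mp is not consumed when
strwaitq() fails, so a caller must free it on error; a hypothetical
caller fragment (assuming the direct path behaves the same way):

	error = kstrwritemp(vp, mp, fflag);
	if (error != 0)
		freemsg(mp);	/* not consumed on failure */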
+
 #define	SEND_MAX_CHUNK	16
 
 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
@@ -1045,7 +1128,7 @@
 				goto err;
 			}
 
-			if ((so->so_state & SS_TCP_FAST_ACCEPT) &&
+			if ((so->so_state & SS_DIRECT) &&
 			    (so->so_priv != NULL)) {
 				maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
 			} else {
--- a/usr/src/uts/intel/ia32/ml/modstubs.s	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/intel/ia32/ml/modstubs.s	Sat Oct 22 22:50:14 2005 -0700
@@ -482,6 +482,7 @@
 	NO_UNLOAD_STUB(sockfs, sosendfile64,  	nomod_zero);
 	NO_UNLOAD_STUB(sockfs, sock_getfasync,  nomod_zero);
 	NO_UNLOAD_STUB(sockfs, nl7c_sendfilev,  nomod_zero);
+	NO_UNLOAD_STUB(sockfs, sostream_direct,	nomod_zero);
 	END_MODULE(sockfs);
 #endif
 
@@ -529,12 +530,6 @@
 	END_MODULE(spdsock);
 #endif
 
-#ifndef UDP_MODULE
-	MODULE(udp,drv);
-	WSTUB(udp, udp_compute_checksum, nomod_zero);
-	END_MODULE(udp);
-#endif
-
 #ifndef NATTYMOD_MODULE
 	MODULE(nattymod, strmod);
 	WSTUB(nattymod, nattymod_clean_ipif, nomod_zero);
--- a/usr/src/uts/sparc/ml/modstubs.s	Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/sparc/ml/modstubs.s	Sat Oct 22 22:50:14 2005 -0700
@@ -368,6 +368,7 @@
 	NO_UNLOAD_STUB(sockfs, sosendfile64,  	nomod_zero);
 	NO_UNLOAD_STUB(sockfs, sock_getfasync,  nomod_zero);
 	NO_UNLOAD_STUB(sockfs, nl7c_sendfilev,  nomod_zero);
+	NO_UNLOAD_STUB(sockfs, sostream_direct,	nomod_zero);
 	END_MODULE(sockfs);
 #endif
 
@@ -415,12 +416,6 @@
 	END_MODULE(spdsock);
 #endif
 
-#ifndef UDP_MODULE
-	MODULE(udp,drv);
-	WSTUB(udp, udp_compute_checksum, nomod_zero);
-	END_MODULE(udp);
-#endif
-
 #ifndef NATTYMOD_MODULE
 	MODULE(nattymod, strmod);
 	WSTUB(nattymod, nattymod_clean_ipif, nomod_zero);