--- a/usr/src/uts/common/inet/ip/ip.c Sat Oct 22 11:06:40 2005 -0700
+++ b/usr/src/uts/common/inet/ip/ip.c Sat Oct 22 22:50:14 2005 -0700
@@ -75,9 +75,11 @@
#include <netinet/sctp.h>
#include <inet/ip.h>
+#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
+#include <inet/tcp_impl.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
@@ -110,6 +112,7 @@
#include <inet/ipclassifier.h>
#include <inet/sctp_ip.h>
+#include <inet/udp_impl.h>
/*
* Values for squeue switch:
@@ -122,7 +125,8 @@
/*
* IP statistics.
*/
-#define IP_STAT(x) (ip_statistics.x.value.ui64++)
+#define IP_STAT(x) (ip_statistics.x.value.ui64++)
+#define IP_STAT_UPDATE(x, n) (ip_statistics.x.value.ui64 += (n))
typedef struct ip_stat {
kstat_named_t ipsec_fanout_proto;
@@ -158,42 +162,68 @@
kstat_named_t ip_ire_redirect_timer_expired;
kstat_named_t ip_ire_pmtu_timer_expired;
kstat_named_t ip_input_multi_squeue;
+ kstat_named_t ip_tcp_in_full_hw_cksum_err;
+ kstat_named_t ip_tcp_in_part_hw_cksum_err;
+ kstat_named_t ip_tcp_in_sw_cksum_err;
+ kstat_named_t ip_tcp_out_sw_cksum_bytes;
+ kstat_named_t ip_udp_in_full_hw_cksum_err;
+ kstat_named_t ip_udp_in_part_hw_cksum_err;
+ kstat_named_t ip_udp_in_sw_cksum_err;
+ kstat_named_t ip_udp_out_sw_cksum_bytes;
+ kstat_named_t ip_frag_mdt_pkt_out;
+ kstat_named_t ip_frag_mdt_discarded;
+ kstat_named_t ip_frag_mdt_allocfail;
+ kstat_named_t ip_frag_mdt_addpdescfail;
+ kstat_named_t ip_frag_mdt_allocd;
} ip_stat_t;
static ip_stat_t ip_statistics = {
- { "ipsec_fanout_proto", KSTAT_DATA_UINT64 },
- { "ip_udp_fannorm", KSTAT_DATA_UINT64 },
- { "ip_udp_fanmb", KSTAT_DATA_UINT64 },
- { "ip_udp_fanothers", KSTAT_DATA_UINT64 },
- { "ip_udp_fast_path", KSTAT_DATA_UINT64 },
- { "ip_udp_slow_path", KSTAT_DATA_UINT64 },
- { "ip_udp_input_err", KSTAT_DATA_UINT64 },
- { "ip_tcppullup", KSTAT_DATA_UINT64 },
- { "ip_tcpoptions", KSTAT_DATA_UINT64 },
- { "ip_multipkttcp", KSTAT_DATA_UINT64 },
- { "ip_tcp_fast_path", KSTAT_DATA_UINT64 },
- { "ip_tcp_slow_path", KSTAT_DATA_UINT64 },
- { "ip_tcp_input_error", KSTAT_DATA_UINT64 },
- { "ip_db_ref", KSTAT_DATA_UINT64 },
- { "ip_notaligned1", KSTAT_DATA_UINT64 },
- { "ip_notaligned2", KSTAT_DATA_UINT64 },
- { "ip_multimblk3", KSTAT_DATA_UINT64 },
- { "ip_multimblk4", KSTAT_DATA_UINT64 },
- { "ip_ipoptions", KSTAT_DATA_UINT64 },
- { "ip_classify_fail", KSTAT_DATA_UINT64 },
- { "ip_opt", KSTAT_DATA_UINT64 },
- { "ip_udp_rput_local", KSTAT_DATA_UINT64 },
- { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 },
- { "ip_conn_flputbq", KSTAT_DATA_UINT64 },
- { "ip_conn_walk_drain", KSTAT_DATA_UINT64 },
- { "ip_out_sw_cksum", KSTAT_DATA_UINT64 },
- { "ip_in_sw_cksum", KSTAT_DATA_UINT64 },
- { "ip_trash_ire_reclaim_calls", KSTAT_DATA_UINT64 },
+ { "ipsec_fanout_proto", KSTAT_DATA_UINT64 },
+ { "ip_udp_fannorm", KSTAT_DATA_UINT64 },
+ { "ip_udp_fanmb", KSTAT_DATA_UINT64 },
+ { "ip_udp_fanothers", KSTAT_DATA_UINT64 },
+ { "ip_udp_fast_path", KSTAT_DATA_UINT64 },
+ { "ip_udp_slow_path", KSTAT_DATA_UINT64 },
+ { "ip_udp_input_err", KSTAT_DATA_UINT64 },
+ { "ip_tcppullup", KSTAT_DATA_UINT64 },
+ { "ip_tcpoptions", KSTAT_DATA_UINT64 },
+ { "ip_multipkttcp", KSTAT_DATA_UINT64 },
+ { "ip_tcp_fast_path", KSTAT_DATA_UINT64 },
+ { "ip_tcp_slow_path", KSTAT_DATA_UINT64 },
+ { "ip_tcp_input_error", KSTAT_DATA_UINT64 },
+ { "ip_db_ref", KSTAT_DATA_UINT64 },
+ { "ip_notaligned1", KSTAT_DATA_UINT64 },
+ { "ip_notaligned2", KSTAT_DATA_UINT64 },
+ { "ip_multimblk3", KSTAT_DATA_UINT64 },
+ { "ip_multimblk4", KSTAT_DATA_UINT64 },
+ { "ip_ipoptions", KSTAT_DATA_UINT64 },
+ { "ip_classify_fail", KSTAT_DATA_UINT64 },
+ { "ip_opt", KSTAT_DATA_UINT64 },
+ { "ip_udp_rput_local", KSTAT_DATA_UINT64 },
+ { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 },
+ { "ip_conn_flputbq", KSTAT_DATA_UINT64 },
+ { "ip_conn_walk_drain", KSTAT_DATA_UINT64 },
+ { "ip_out_sw_cksum", KSTAT_DATA_UINT64 },
+ { "ip_in_sw_cksum", KSTAT_DATA_UINT64 },
+ { "ip_trash_ire_reclaim_calls", KSTAT_DATA_UINT64 },
{ "ip_trash_ire_reclaim_success", KSTAT_DATA_UINT64 },
- { "ip_ire_arp_timer_expired", KSTAT_DATA_UINT64 },
+ { "ip_ire_arp_timer_expired", KSTAT_DATA_UINT64 },
{ "ip_ire_redirect_timer_expired", KSTAT_DATA_UINT64 },
- { "ip_ire_pmtu_timer_expired", KSTAT_DATA_UINT64 },
- { "ip_input_multi_squeue", KSTAT_DATA_UINT64 },
+ { "ip_ire_pmtu_timer_expired", KSTAT_DATA_UINT64 },
+ { "ip_input_multi_squeue", KSTAT_DATA_UINT64 },
+ { "ip_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip_tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
+ { "ip_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip_udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
+ { "ip_frag_mdt_pkt_out", KSTAT_DATA_UINT64 },
+ { "ip_frag_mdt_discarded", KSTAT_DATA_UINT64 },
+ { "ip_frag_mdt_allocfail", KSTAT_DATA_UINT64 },
+ { "ip_frag_mdt_addpdescfail", KSTAT_DATA_UINT64 },
+ { "ip_frag_mdt_allocd", KSTAT_DATA_UINT64 },
};
static kstat_t *ip_kstat;
@@ -591,28 +621,12 @@
/* RFC1122 Conformance */
#define IP_FORWARD_DEFAULT IP_FORWARD_NEVER
-#ifdef _BIG_ENDIAN
-#define IP_HDR_CSUM_TTL_ADJUST 256
-#define IP_TCP_CSUM_COMP IPPROTO_TCP
-#define IP_UDP_CSUM_COMP IPPROTO_UDP
-#else
-#define IP_HDR_CSUM_TTL_ADJUST 1
-#define IP_TCP_CSUM_COMP (IPPROTO_TCP << 8)
-#define IP_UDP_CSUM_COMP (IPPROTO_UDP << 8)
-#endif
-
-#define TCP_CHECKSUM_OFFSET 16
-#define UDP_CHECKSUM_OFFSET 6
-
#define ILL_MAX_NAMELEN LIFNAMSIZ
-#define UDPH_SIZE 8
-
/* Leave room for ip_newroute to tack on the src and target addresses */
#define OK_RESOLVER_MP(mp) \
((mp) && ((mp)->b_wptr - (mp)->b_rptr) >= (2 * IP_ADDR_LEN))
-static ipif_t *conn_get_held_ipif(conn_t *, ipif_t **, int *);
static int conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *);
static mblk_t *ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t);
@@ -668,6 +682,8 @@
static boolean_t ip_rput_local_options(queue_t *, mblk_t *, ipha_t *,
ire_t *);
static int ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *);
+static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *,
+ uint16_t *);
int ip_snmp_get(queue_t *, mblk_t *);
static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *);
static mblk_t *ip_snmp_get_mib2_ip6(queue_t *, mblk_t *);
@@ -692,7 +708,6 @@
static boolean_t ip_source_routed(ipha_t *);
static boolean_t ip_source_route_included(ipha_t *);
-static void ip_unbind(queue_t *, mblk_t *);
static void ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t);
static mblk_t *ip_wput_frag_copyhdr(uchar_t *, int, int);
static void ip_wput_local_options(ipha_t *);
@@ -767,6 +782,15 @@
time_t ip_g_frag_timeout = IP_FRAG_TIMEOUT;
clock_t ip_g_frag_timo_ms = IP_FRAG_TIMEOUT * 1000;
+/*
+ * Threshold which determines whether MDT should be used when
+ * generating IP fragments; payload size must be greater than
+ * this threshold for MDT to take place.
+ */
+#define IP_WPUT_FRAG_MDT_MIN 32768
+
+int ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN;
+
/* Protected by ip_mi_lock */
static void *ip_g_head; /* Instance Data List Head */
kmutex_t ip_mi_lock; /* Lock for list of instances */
@@ -1431,7 +1455,7 @@
};
struct module_info ip_mod_info = {
- 5701, "ip", 1, INFPSZ, 65536, 1024
+ IP_MOD_ID, IP_MOD_NAME, 1, INFPSZ, 65536, 1024
};
static struct qinit rinit = {
@@ -1930,6 +1954,8 @@
/* Send out an ICMP packet */
icmph->icmph_checksum = 0;
icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0);
+ if (icmph->icmph_checksum == 0)
+ icmph->icmph_checksum = 0xFFFF;
if (broadcast || CLASSD(ipha->ipha_dst)) {
ipif_t *ipif_chosen;
/*
@@ -3204,6 +3230,8 @@
bcopy(stuff, icmph, len);
icmph->icmph_checksum = 0;
icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
+ if (icmph->icmph_checksum == 0)
+ icmph->icmph_checksum = 0xFFFF;
BUMP_MIB(&icmp_mib, icmpOutMsgs);
put(q, ipsec_mp);
}
@@ -3704,7 +3732,7 @@
ASSERT(!connp->conn_af_isv6);
connp->conn_pkt_isv6 = B_FALSE;
- len = mp->b_wptr - mp->b_rptr;
+ len = MBLKL(mp);
if (len < (sizeof (*tbr) + 1)) {
(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
"ip_bind: bogus msg, len %ld", len);
@@ -3716,7 +3744,7 @@
protocol = *mp->b_wptr & 0xFF;
tbr = (struct T_bind_req *)mp->b_rptr;
/* Reset the message type in preparation for shipping it back. */
- mp->b_datap->db_type = M_PCPROTO;
+ DB_TYPE(mp) = M_PCPROTO;
connp->conn_ulp = (uint8_t)protocol;
@@ -3762,8 +3790,8 @@
*/
mp1 = mp->b_cont;
- ire_requested = (mp1 && mp1->b_datap->db_type == IRE_DB_REQ_TYPE);
- ipsec_policy_set = (mp1 && mp1->b_datap->db_type == IPSEC_POLICY_SET);
+ ire_requested = (mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE);
+ ipsec_policy_set = (mp1 != NULL && DB_TYPE(mp1) == IPSEC_POLICY_SET);
switch (tbr->ADDR_length) {
default:
@@ -4169,7 +4197,7 @@
if (ip_multidata_outbound && !ipsec_policy_set && dst_ire != NULL &&
!(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) &&
(md_ill = ire_to_ill(dst_ire), md_ill != NULL) &&
- (md_ill->ill_capabilities & ILL_CAPAB_MDT)) {
+ ILL_MDT_CAPABLE(md_ill)) {
md_dst_ire = dst_ire;
IRE_REFHOLD(md_dst_ire);
}
@@ -4689,43 +4717,19 @@
}
/*
- * IP has been configured as _D_QNEXTLESS for the client side i.e the driver
- * instance. This implies that
- * 1. IP cannot access the read side q_next pointer directly - it must
- * use routines like putnext and canputnext.
- * 2. ip_close must ensure that all sources of messages being putnext upstream
- * are gone before qprocsoff is called.
- *
- * #2 is handled by having ip_close do the ipcl_hash_remove and wait for
- * conn_ref to drop to zero before calling qprocsoff.
- */
-
-/* ARGSUSED */
-int
-ip_close(queue_t *q, int flags)
-{
- conn_t *connp;
+ * This is called as part of close() for both IP and UDP
+ * in order to quiesce the conn.
+ */
+void
+ip_quiesce_conn(conn_t *connp)
+{
boolean_t drain_cleanup_reqd = B_FALSE;
boolean_t conn_ioctl_cleanup_reqd = B_FALSE;
boolean_t ilg_cleanup_reqd = B_FALSE;
- TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q);
-
- /*
- * Call the appropriate delete routine depending on whether this is
- * a module or device.
- */
- if (WR(q)->q_next != NULL) {
- /* This is a module close */
- return (ip_modclose((ill_t *)q->q_ptr));
- }
-
- connp = Q_TO_CONN(q);
- ASSERT(connp->conn_tcp == NULL);
-
- /*
- * We are being closed as /dev/ip or /dev/ip6.
- *
+ ASSERT(!IPCL_IS_TCP(connp));
+
+ /*
* Mark the conn as closing, and this conn must not be
* inserted in future into any list. Eg. conn_drain_insert(),
* won't insert this conn into the conn_drain_list.
@@ -4736,6 +4740,7 @@
* cannot get set henceforth.
*/
mutex_enter(&connp->conn_lock);
+ ASSERT(!(connp->conn_state_flags & CONN_QUIESCED));
connp->conn_state_flags |= CONN_CLOSING;
if (connp->conn_idl != NULL)
drain_cleanup_reqd = B_TRUE;
@@ -4745,17 +4750,17 @@
ilg_cleanup_reqd = B_TRUE;
mutex_exit(&connp->conn_lock);
+ if (IPCL_IS_UDP(connp))
+ udp_quiesce_conn(connp);
+
if (conn_ioctl_cleanup_reqd)
conn_ioctl_cleanup(connp);
/*
* Remove this conn from any fanout list it is on.
- * Then wait until the number of pending putnexts from
- * the fanout code drops to zero, before calling qprocsoff.
- * This is the guarantee a QNEXTLESS driver provides to
- * STREAMS, and is mentioned at the top of this function.
- */
-
+ * and then wait for any threads currently operating
+ * on this endpoint to finish
+ */
ipcl_hash_remove(connp);
/*
@@ -4776,7 +4781,6 @@
conn_delete_ire(connp, NULL);
-
/*
* Now conn refcnt can increase only thru CONN_INC_REF_LOCKED.
* callers from write side can't be there now because close
@@ -4787,7 +4791,29 @@
connp->conn_state_flags |= CONN_CONDEMNED;
while (connp->conn_ref != 1)
cv_wait(&connp->conn_cv, &connp->conn_lock);
+ connp->conn_state_flags |= CONN_QUIESCED;
mutex_exit(&connp->conn_lock);
+}
+
+/* ARGSUSED */
+int
+ip_close(queue_t *q, int flags)
+{
+ conn_t *connp;
+
+ TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q);
+
+ /*
+ * Call the appropriate delete routine depending on whether this is
+ * a module or device.
+ */
+ if (WR(q)->q_next != NULL) {
+ /* This is a module close */
+ return (ip_modclose((ill_t *)q->q_ptr));
+ }
+
+ connp = q->q_ptr;
+ ip_quiesce_conn(connp);
qprocsoff(q);
@@ -4801,6 +4827,15 @@
* has completed, and service has completed or won't run in
* future.
*/
+ ASSERT(connp->conn_ref == 1);
+
+ /*
+ * A conn which was previously marked as IPCL_UDP cannot
+ * retain the flag because it would have been cleared by
+ * udp_close().
+ */
+ ASSERT(!IPCL_IS_UDP(connp));
+
if (connp->conn_latch != NULL) {
IPLATCH_REFRELE(connp->conn_latch);
connp->conn_latch = NULL;
@@ -4827,6 +4862,83 @@
return (0);
}
+int
+ip_snmpmod_close(queue_t *q)
+{
+ conn_t *connp = Q_TO_CONN(q);
+ ASSERT(connp->conn_flags & (IPCL_TCPMOD | IPCL_UDPMOD));
+
+ qprocsoff(q);
+
+ if (connp->conn_flags & IPCL_UDPMOD)
+ udp_close_free(connp);
+
+ if (connp->conn_cred != NULL) {
+ crfree(connp->conn_cred);
+ connp->conn_cred = NULL;
+ }
+ CONN_DEC_REF(connp);
+ q->q_ptr = WR(q)->q_ptr = NULL;
+ return (0);
+}
+
+/*
+ * Write side put procedure for TCP module or UDP module instance. TCP/UDP
+ * as a module is only used for MIB browsers that push TCP/UDP over IP or ARP.
+ * The only supported primitives are T_SVR4_OPTMGMT_REQ and T_OPTMGMT_REQ.
+ * M_FLUSH messages and ioctls are only passed downstream; we don't flush our
+ * queues as we never enqueue messages there and we don't handle any ioctls.
+ * Everything else is freed.
+ */
+void
+ip_snmpmod_wput(queue_t *q, mblk_t *mp)
+{
+ conn_t *connp = q->q_ptr;
+ pfi_t setfn;
+ pfi_t getfn;
+
+ ASSERT(connp->conn_flags & (IPCL_TCPMOD | IPCL_UDPMOD));
+
+ switch (DB_TYPE(mp)) {
+ case M_PROTO:
+ case M_PCPROTO:
+ if ((MBLKL(mp) >= sizeof (t_scalar_t)) &&
+ ((((union T_primitives *)mp->b_rptr)->type ==
+ T_SVR4_OPTMGMT_REQ) ||
+ (((union T_primitives *)mp->b_rptr)->type ==
+ T_OPTMGMT_REQ))) {
+ /*
+ * This is the only TPI primitive supported. Its
+ * handling does not require tcp_t, but it does require
+ * conn_t to check permissions.
+ */
+ cred_t *cr = DB_CREDDEF(mp, connp->conn_cred);
+
+ if (connp->conn_flags & IPCL_TCPMOD) {
+ setfn = tcp_snmp_set;
+ getfn = tcp_snmp_get;
+ } else {
+ setfn = udp_snmp_set;
+ getfn = udp_snmp_get;
+ }
+ if (!snmpcom_req(q, mp, setfn, getfn, cr)) {
+ freemsg(mp);
+ return;
+ }
+ } else if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, ENOTSUP))
+ != NULL)
+ qreply(q, mp);
+ break;
+ case M_FLUSH:
+ case M_IOCTL:
+ putnext(q, mp);
+ break;
+ default:
+ freemsg(mp);
+ break;
+ }
+}
+
/* Return the IP checksum for the IP header at "iph". */
uint16_t
ip_csum_hdr(ipha_t *ipha)
@@ -5081,7 +5193,7 @@
* Send an ICMP error after patching up the packet appropriately. Returns
* non-zero if the appropriate MIB should be bumped; zero otherwise.
*/
-static int
+static boolean_t
ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags,
uint_t icmp_type, uint_t icmp_code, boolean_t mctl_present, zoneid_t zoneid)
{
@@ -5103,8 +5215,8 @@
* ipsec_check_global_policy() assumes M_DATA as clear
* and M_CTL as secure.
*/
- db_type = mp->b_datap->db_type;
- mp->b_datap->db_type = M_DATA;
+ db_type = DB_TYPE(mp);
+ DB_TYPE(mp) = M_DATA;
secure = B_FALSE;
}
/*
@@ -5119,17 +5231,17 @@
first_mp = ipsec_check_global_policy(first_mp, NULL,
ipha, NULL, mctl_present);
if (first_mp == NULL)
- return (0);
+ return (B_FALSE);
}
if (!mctl_present)
- mp->b_datap->db_type = db_type;
+ DB_TYPE(mp) = db_type;
if (flags & IP_FF_SEND_ICMP) {
if (flags & IP_FF_HDR_COMPLETE) {
if (ip_hdr_complete(ipha, zoneid)) {
freemsg(first_mp);
- return (1);
+ return (B_TRUE);
}
}
if (flags & IP_FF_CKSUM) {
@@ -5152,10 +5264,10 @@
}
} else {
freemsg(first_mp);
- return (0);
- }
-
- return (1);
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
}
#ifdef DEBUG
@@ -5592,7 +5704,7 @@
}
mp->b_datap->db_struioflag |= STRUIO_EAGER;
- mp->b_datap->db_cksumstart = (intptr_t)sqp;
+ DB_CKSUMSTART(mp) = (intptr_t)sqp;
syn_present = B_TRUE;
}
}
@@ -5720,7 +5832,6 @@
boolean_t secure, ipha_t *ipha, uint_t flags, ill_t *recv_ill,
boolean_t ip_policy)
{
- queue_t *rq = connp->conn_rq;
boolean_t mctl_present = (first_mp != NULL);
uint32_t in_flags = 0; /* set to IP_RECVSLLA and/or IP_RECVIF */
uint32_t ill_index;
@@ -5730,7 +5841,7 @@
else
first_mp = mp;
- if (!canputnext(rq)) {
+ if (CONN_UDP_FLOWCTLD(connp)) {
BUMP_MIB(&ip_mib, udpInOverflows);
freemsg(first_mp);
return;
@@ -5776,7 +5887,9 @@
mp = ip_add_info(mp, recv_ill, in_flags);
}
BUMP_MIB(&ip_mib, ipInDelivers);
- putnext(rq, mp);
+
+ /* Send it upstream */
+ CONN_UDP_RECV(connp, mp);
}
/*
@@ -8454,7 +8567,6 @@
return (ip_modopen(q, devp, flag, sflag, credp));
}
-
/*
* We are opening as a device. This is an IP client stream, and we
* allocate an conn_t as the instance data.
@@ -8463,6 +8575,9 @@
connp->conn_upq = q;
q->q_ptr = WR(q)->q_ptr = connp;
+ if (flag & SO_SOCKSTR)
+ connp->conn_flags |= IPCL_SOCKET;
+
/* Minor tells us which /dev entry was opened */
if (geteminor(*devp) == IPV6_MINOR) {
connp->conn_flags |= IPCL_ISV6;
@@ -8474,9 +8589,7 @@
connp->conn_pkt_isv6 = B_FALSE;
}
-
- if ((connp->conn_dev =
- inet_minor_alloc(ip_minor_arena)) == 0) {
+ if ((connp->conn_dev = inet_minor_alloc(ip_minor_arena)) == 0) {
q->q_ptr = WR(q)->q_ptr = NULL;
CONN_DEC_REF(connp);
return (EBUSY);
@@ -10734,381 +10847,455 @@
}
/*
- * Do fragmentation reassembly.
- * returns B_TRUE if successful else B_FALSE.
+ * Fragmentation reassembly. Each ILL has a hash table for
+ * queuing packets undergoing reassembly for all IPIFs
+ * associated with the ILL. The hash is based on the packet
+ * IP ident field. The ILL frag hash table was allocated
+ * as a timer block at the time the ILL was created. Whenever
+ * there is anything on the reassembly queue, the timer will
+ * be running. Returns B_TRUE if successful else B_FALSE;
* frees mp on failure.
*/
static boolean_t
-ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha)
+ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha,
+ uint32_t *cksum_val, uint16_t *cksum_flags)
{
uint32_t frag_offset_flags;
- ill_t *ill = (ill_t *)q->q_ptr;
- mblk_t *mp = *mpp;
- mblk_t *t_mp;
+ ill_t *ill = (ill_t *)q->q_ptr;
+ mblk_t *mp = *mpp;
+ mblk_t *t_mp;
ipaddr_t dst;
+ uint8_t proto = ipha->ipha_protocol;
+ uint32_t sum_val;
+ uint16_t sum_flags;
+ ipf_t *ipf;
+ ipf_t **ipfp;
+ ipfb_t *ipfb;
+ uint16_t ident;
+ uint32_t offset;
+ ipaddr_t src;
+ uint_t hdr_length;
+ uint32_t end;
+ mblk_t *mp1;
+ mblk_t *tail_mp;
+ size_t count;
+ size_t msg_len;
+ uint8_t ecn_info = 0;
+ uint32_t packet_size;
+ boolean_t pruned = B_FALSE;
+
+ if (cksum_val != NULL)
+ *cksum_val = 0;
+ if (cksum_flags != NULL)
+ *cksum_flags = 0;
/*
* Drop the fragmented as early as possible, if
* we don't have resource(s) to re-assemble.
*/
-
if (ip_reass_queue_bytes == 0) {
freemsg(mp);
return (B_FALSE);
}
+ /* Check for fragmentation offset; return if there's none */
+ if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
+ (IPH_MF | IPH_OFFSET)) == 0)
+ return (B_TRUE);
+
+ /*
+ * We utilize hardware computed checksum info only for UDP since
+ * IP fragmentation is a normal occurence for the protocol. In
+ * addition, checksum offload support for IP fragments carrying
+ * UDP payload is commonly implemented across network adapters.
+ */
+ ASSERT(ill != NULL);
+ if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) &&
+ (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
+ mblk_t *mp1 = mp->b_cont;
+ int32_t len;
+
+ /* Record checksum information from the packet */
+ sum_val = (uint32_t)DB_CKSUM16(mp);
+ sum_flags = DB_CKSUMFLAGS(mp);
+
+ /* IP payload offset from beginning of mblk */
+ offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr;
+
+ if ((sum_flags & HCK_PARTIALCKSUM) &&
+ (mp1 == NULL || mp1->b_cont == NULL) &&
+ offset >= DB_CKSUMSTART(mp) &&
+ ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
+ uint32_t adj;
+ /*
+ * Partial checksum has been calculated by hardware
+ * and attached to the packet; in addition, any
+ * prepended extraneous data is even byte aligned.
+ * If any such data exists, we adjust the checksum;
+ * this would also handle any postpended data.
+ */
+ IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
+ mp, mp1, len, adj);
+
+ /* One's complement subtract extraneous checksum */
+ if (adj >= sum_val)
+ sum_val = ~(adj - sum_val) & 0xFFFF;
+ else
+ sum_val -= adj;
+ }
+ } else {
+ sum_val = 0;
+ sum_flags = 0;
+ }
+
+ /* Clear hardware checksumming flag */
+ DB_CKSUMFLAGS(mp) = 0;
+
+ ident = ipha->ipha_ident;
+ offset = (frag_offset_flags << 3) & 0xFFFF;
+ src = ipha->ipha_src;
dst = ipha->ipha_dst;
-
- /* Clear hardware checksumming flag if set */
- mp->b_datap->db_struioun.cksum.flags = 0;
-
- /* Check for fragmentation offset. */
- frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
- (IPH_MF | IPH_OFFSET);
- if (frag_offset_flags) {
- ipf_t *ipf;
- ipf_t **ipfp;
- ipfb_t *ipfb;
- uint16_t ident;
- uint32_t offset;
- ipaddr_t src;
- uint_t hdr_length;
- uint32_t end;
- uint8_t proto;
- mblk_t *mp1;
- mblk_t *tail_mp;
- size_t count;
- size_t msg_len;
- uint8_t ecn_info = 0;
- uint32_t packet_size;
- boolean_t pruned = B_FALSE;
-
- ident = ipha->ipha_ident;
- offset = (frag_offset_flags << 3) & 0xFFFF;
- src = ipha->ipha_src;
- hdr_length = IPH_HDR_LENGTH(ipha);
- end = ntohs(ipha->ipha_length) - hdr_length;
-
- /*
- * if end == 0 then we have a packet with no data, so just
- * free it.
- */
- if (end == 0) {
+ hdr_length = IPH_HDR_LENGTH(ipha);
+ end = ntohs(ipha->ipha_length) - hdr_length;
+
+ /* If end == 0 then we have a packet with no data, so just free it */
+ if (end == 0) {
+ freemsg(mp);
+ return (B_FALSE);
+ }
+
+ /* Record the ECN field info. */
+ ecn_info = (ipha->ipha_type_of_service & 0x3);
+ if (offset != 0) {
+ /*
+ * If this isn't the first piece, strip the header, and
+ * add the offset to the end value.
+ */
+ mp->b_rptr += hdr_length;
+ end += offset;
+ }
+
+ msg_len = MBLKSIZE(mp);
+ tail_mp = mp;
+ while (tail_mp->b_cont != NULL) {
+ tail_mp = tail_mp->b_cont;
+ msg_len += MBLKSIZE(tail_mp);
+ }
+
+ /* If the reassembly list for this ILL will get too big, prune it */
+ if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
+ ip_reass_queue_bytes) {
+ ill_frag_prune(ill,
+ (ip_reass_queue_bytes < msg_len) ? 0 :
+ (ip_reass_queue_bytes - msg_len));
+ pruned = B_TRUE;
+ }
+
+ ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
+ mutex_enter(&ipfb->ipfb_lock);
+
+ ipfp = &ipfb->ipfb_ipf;
+ /* Try to find an existing fragment queue for this packet. */
+ for (;;) {
+ ipf = ipfp[0];
+ if (ipf != NULL) {
+ /*
+ * It has to match on ident and src/dst address.
+ */
+ if (ipf->ipf_ident == ident &&
+ ipf->ipf_src == src &&
+ ipf->ipf_dst == dst &&
+ ipf->ipf_protocol == proto) {
+ /*
+ * If we have received too many
+ * duplicate fragments for this packet
+ * free it.
+ */
+ if (ipf->ipf_num_dups > ip_max_frag_dups) {
+ ill_frag_free_pkts(ill, ipfb, ipf, 1);
+ freemsg(mp);
+ mutex_exit(&ipfb->ipfb_lock);
+ return (B_FALSE);
+ }
+ /* Found it. */
+ break;
+ }
+ ipfp = &ipf->ipf_hash_next;
+ continue;
+ }
+
+ /*
+ * If we pruned the list, do we want to store this new
+ * fragment?. We apply an optimization here based on the
+ * fact that most fragments will be received in order.
+ * So if the offset of this incoming fragment is zero,
+ * it is the first fragment of a new packet. We will
+ * keep it. Otherwise drop the fragment, as we have
+ * probably pruned the packet already (since the
+ * packet cannot be found).
+ */
+ if (pruned && offset != 0) {
+ mutex_exit(&ipfb->ipfb_lock);
freemsg(mp);
return (B_FALSE);
}
- proto = ipha->ipha_protocol;
-
- /*
- * Fragmentation reassembly. Each ILL has a hash table for
- * queuing packets undergoing reassembly for all IPIFs
- * associated with the ILL. The hash is based on the packet
- * IP ident field. The ILL frag hash table was allocated
- * as a timer block at the time the ILL was created. Whenever
- * there is anything on the reassembly queue, the timer will
- * be running.
- */
- ASSERT(ill != NULL);
-
- /* Record the ECN field info. */
- ecn_info = (ipha->ipha_type_of_service & 0x3);
- if (offset != 0) {
- /*
- * If this isn't the first piece, strip the header, and
- * add the offset to the end value.
- */
- mp->b_rptr += hdr_length;
- end += offset;
- }
-
- msg_len = mp->b_datap->db_lim - mp->b_datap->db_base;
- tail_mp = mp;
- while (tail_mp->b_cont != NULL) {
- tail_mp = tail_mp->b_cont;
- msg_len += tail_mp->b_datap->db_lim -
- tail_mp->b_datap->db_base;
- }
-
- /*
- * If the reassembly list for this ILL will get too big
- * prune it.
- */
- if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
- ip_reass_queue_bytes) {
- ill_frag_prune(ill,
- (ip_reass_queue_bytes < msg_len) ? 0 :
- (ip_reass_queue_bytes - msg_len));
- pruned = B_TRUE;
- }
-
- ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
- mutex_enter(&ipfb->ipfb_lock);
-
- ipfp = &ipfb->ipfb_ipf;
- /* Try to find an existing fragment queue for this packet. */
- for (;;) {
- ipf = ipfp[0];
- if (ipf != NULL) {
- /*
- * It has to match on ident and src/dst address.
- */
- if (ipf->ipf_ident == ident &&
- ipf->ipf_src == src &&
- ipf->ipf_dst == dst &&
- ipf->ipf_protocol == proto) {
- /*
- * If we have received too many
- * duplicate fragments for this packet
- * free it.
- */
- if (ipf->ipf_num_dups >
- ip_max_frag_dups) {
- ill_frag_free_pkts(ill, ipfb,
- ipf, 1);
- freemsg(mp);
- mutex_exit(&ipfb->ipfb_lock);
- return (B_FALSE);
- }
- /* Found it. */
- break;
- }
- ipfp = &ipf->ipf_hash_next;
- continue;
- }
-
- /*
- * If we pruned the list, do we want to store this new
- * fragment?. We apply an optimization here based on the
- * fact that most fragments will be received in order.
- * So if the offset of this incoming fragment is zero,
- * it is the first fragment of a new packet. We will
- * keep it. Otherwise drop the fragment, as we have
- * probably pruned the packet already (since the
- * packet cannot be found).
- */
- if (pruned && offset != 0) {
- mutex_exit(&ipfb->ipfb_lock);
- freemsg(mp);
- return (B_FALSE);
- }
-
- if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS) {
- /*
- * Too many fragmented packets in this hash
- * bucket. Free the oldest.
- */
- ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
- 1);
- }
-
- /* New guy. Allocate a frag message. */
- mp1 = allocb(sizeof (*ipf), BPRI_MED);
- if (mp1 == NULL) {
- BUMP_MIB(&ip_mib, ipInDiscards);
- freemsg(mp);
+
+ if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS) {
+ /*
+ * Too many fragmented packets in this hash
+ * bucket. Free the oldest.
+ */
+ ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
+ }
+
+ /* New guy. Allocate a frag message. */
+ mp1 = allocb(sizeof (*ipf), BPRI_MED);
+ if (mp1 == NULL) {
+ BUMP_MIB(&ip_mib, ipInDiscards);
+ freemsg(mp);
reass_done:
- mutex_exit(&ipfb->ipfb_lock);
- return (B_FALSE);
- }
-
-
- BUMP_MIB(&ip_mib, ipReasmReqds);
- mp1->b_cont = mp;
-
- /* Initialize the fragment header. */
- ipf = (ipf_t *)mp1->b_rptr;
- ipf->ipf_mp = mp1;
- ipf->ipf_ptphn = ipfp;
- ipfp[0] = ipf;
- ipf->ipf_hash_next = NULL;
- ipf->ipf_ident = ident;
- ipf->ipf_protocol = proto;
- ipf->ipf_src = src;
- ipf->ipf_dst = dst;
- ipf->ipf_nf_hdr_len = 0;
- /* Record reassembly start time. */
- ipf->ipf_timestamp = gethrestime_sec();
- /* Record ipf generation and account for frag header */
- ipf->ipf_gen = ill->ill_ipf_gen++;
- ipf->ipf_count = mp1->b_datap->db_lim -
- mp1->b_datap->db_base;
- ipf->ipf_last_frag_seen = B_FALSE;
- ipf->ipf_ecn = ecn_info;
- ipf->ipf_num_dups = 0;
- ipfb->ipfb_frag_pkts++;
-
- /*
- * We handle reassembly two ways. In the easy case,
- * where all the fragments show up in order, we do
- * minimal bookkeeping, and just clip new pieces on
- * the end. If we ever see a hole, then we go off
- * to ip_reassemble which has to mark the pieces and
- * keep track of the number of holes, etc. Obviously,
- * the point of having both mechanisms is so we can
- * handle the easy case as efficiently as possible.
- */
- if (offset == 0) {
- /* Easy case, in-order reassembly so far. */
- ipf->ipf_count += msg_len;
- ipf->ipf_tail_mp = tail_mp;
- /*
- * Keep track of next expected offset in
- * ipf_end.
- */
- ipf->ipf_end = end;
- ipf->ipf_nf_hdr_len = hdr_length;
- } else {
- /* Hard case, hole at the beginning. */
- ipf->ipf_tail_mp = NULL;
- /*
- * ipf_end == 0 means that we have given up
- * on easy reassembly.
- */
- ipf->ipf_end = 0;
- /*
- * ipf_hole_cnt is set by ip_reassemble.
- * ipf_count is updated by ip_reassemble.
- * No need to check for return value here
- * as we don't expect reassembly to complete
- * or fail for the first fragment itself.
- */
- (void) ip_reassemble(mp, ipf,
- (frag_offset_flags & IPH_OFFSET) << 3,
- (frag_offset_flags & IPH_MF), ill, msg_len);
- }
- /* Update per ipfb and ill byte counts */
- ipfb->ipfb_count += ipf->ipf_count;
- ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
- ill->ill_frag_count += ipf->ipf_count;
- ASSERT(ill->ill_frag_count > 0); /* Wraparound */
- /* If the frag timer wasn't already going, start it. */
- mutex_enter(&ill->ill_lock);
- ill_frag_timer_start(ill);
- mutex_exit(&ill->ill_lock);
- goto reass_done;
- }
-
- /*
- * We have a new piece of a datagram which is already being
- * reassembled. Update the ECN info if all IP fragments
- * are ECN capable. If there is one which is not, clear
- * all the info. If there is at least one which has CE
- * code point, IP needs to report that up to transport.
- */
- if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
- if (ecn_info == IPH_ECN_CE)
- ipf->ipf_ecn = IPH_ECN_CE;
- } else {
- ipf->ipf_ecn = IPH_ECN_NECT;
- }
- if (offset && ipf->ipf_end == offset) {
- /* The new fragment fits at the end */
- ipf->ipf_tail_mp->b_cont = mp;
- /* Update the byte count */
+ mutex_exit(&ipfb->ipfb_lock);
+ return (B_FALSE);
+ }
+
+
+ BUMP_MIB(&ip_mib, ipReasmReqds);
+ mp1->b_cont = mp;
+
+ /* Initialize the fragment header. */
+ ipf = (ipf_t *)mp1->b_rptr;
+ ipf->ipf_mp = mp1;
+ ipf->ipf_ptphn = ipfp;
+ ipfp[0] = ipf;
+ ipf->ipf_hash_next = NULL;
+ ipf->ipf_ident = ident;
+ ipf->ipf_protocol = proto;
+ ipf->ipf_src = src;
+ ipf->ipf_dst = dst;
+ ipf->ipf_nf_hdr_len = 0;
+ /* Record reassembly start time. */
+ ipf->ipf_timestamp = gethrestime_sec();
+ /* Record ipf generation and account for frag header */
+ ipf->ipf_gen = ill->ill_ipf_gen++;
+ ipf->ipf_count = MBLKSIZE(mp1);
+ ipf->ipf_last_frag_seen = B_FALSE;
+ ipf->ipf_ecn = ecn_info;
+ ipf->ipf_num_dups = 0;
+ ipfb->ipfb_frag_pkts++;
+ ipf->ipf_checksum = 0;
+ ipf->ipf_checksum_flags = 0;
+
+ /* Store checksum value in fragment header */
+ if (sum_flags != 0) {
+ sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+ sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+ ipf->ipf_checksum = sum_val;
+ ipf->ipf_checksum_flags = sum_flags;
+ }
+
+ /*
+ * We handle reassembly two ways. In the easy case,
+ * where all the fragments show up in order, we do
+ * minimal bookkeeping, and just clip new pieces on
+ * the end. If we ever see a hole, then we go off
+ * to ip_reassemble which has to mark the pieces and
+ * keep track of the number of holes, etc. Obviously,
+ * the point of having both mechanisms is so we can
+ * handle the easy case as efficiently as possible.
+ */
+ if (offset == 0) {
+ /* Easy case, in-order reassembly so far. */
ipf->ipf_count += msg_len;
- /* Update per ipfb and ill byte counts */
- ipfb->ipfb_count += msg_len;
- ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
- ill->ill_frag_count += msg_len;
- ASSERT(ill->ill_frag_count > 0); /* Wraparound */
- if (frag_offset_flags & IPH_MF) {
- /* More to come. */
- ipf->ipf_end = end;
- ipf->ipf_tail_mp = tail_mp;
- goto reass_done;
- }
- } else {
- /* Go do the hard cases. */
- int ret;
-
- if (offset == 0)
- ipf->ipf_nf_hdr_len = hdr_length;
-
- /* Save current byte count */
- count = ipf->ipf_count;
- ret = ip_reassemble(mp, ipf,
+ ipf->ipf_tail_mp = tail_mp;
+ /*
+ * Keep track of next expected offset in
+ * ipf_end.
+ */
+ ipf->ipf_end = end;
+ ipf->ipf_nf_hdr_len = hdr_length;
+ } else {
+ /* Hard case, hole at the beginning. */
+ ipf->ipf_tail_mp = NULL;
+ /*
+ * ipf_end == 0 means that we have given up
+ * on easy reassembly.
+ */
+ ipf->ipf_end = 0;
+
+ /* Forget checksum offload from now on */
+ ipf->ipf_checksum_flags = 0;
+
+ /*
+ * ipf_hole_cnt is set by ip_reassemble.
+ * ipf_count is updated by ip_reassemble.
+ * No need to check for return value here
+ * as we don't expect reassembly to complete
+ * or fail for the first fragment itself.
+ */
+ (void) ip_reassemble(mp, ipf,
(frag_offset_flags & IPH_OFFSET) << 3,
(frag_offset_flags & IPH_MF), ill, msg_len);
- /* Count of bytes added and subtracted (freeb()ed) */
- count = ipf->ipf_count - count;
- if (count) {
- /* Update per ipfb and ill byte counts */
- ipfb->ipfb_count += count;
- ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
- ill->ill_frag_count += count;
- ASSERT(ill->ill_frag_count > 0);
- }
- if (ret == IP_REASS_PARTIAL) {
- goto reass_done;
- } else if (ret == IP_REASS_FAILED) {
- /* Reassembly failed. Free up all resources */
- ill_frag_free_pkts(ill, ipfb, ipf, 1);
- for (t_mp = mp; t_mp != NULL;
- t_mp = t_mp->b_cont) {
- IP_REASS_SET_START(t_mp, 0);
- IP_REASS_SET_END(t_mp, 0);
- }
- freemsg(mp);
- goto reass_done;
- }
- /* We will reach here iff 'ret' is IP_REASS_COMPLETE */
- }
- /*
- * We have completed reassembly. Unhook the frag header from
- * the reassembly list.
- *
- * Before we free the frag header, record the ECN info
- * to report back to the transport.
- */
- ecn_info = ipf->ipf_ecn;
- BUMP_MIB(&ip_mib, ipReasmOKs);
- ipfp = ipf->ipf_ptphn;
- mp1 = ipf->ipf_mp;
+ }
+ /* Update per ipfb and ill byte counts */
+ ipfb->ipfb_count += ipf->ipf_count;
+ ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
+ ill->ill_frag_count += ipf->ipf_count;
+ ASSERT(ill->ill_frag_count > 0); /* Wraparound */
+ /* If the frag timer wasn't already going, start it. */
+ mutex_enter(&ill->ill_lock);
+ ill_frag_timer_start(ill);
+ mutex_exit(&ill->ill_lock);
+ goto reass_done;
+ }
+
+ /*
+ * If the packet's flag has changed (it could be coming up
+ * from an interface different than the previous, therefore
+ * possibly different checksum capability), then forget about
+ * any stored checksum states. Otherwise add the value to
+ * the existing one stored in the fragment header.
+ */
+ if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
+ sum_val += ipf->ipf_checksum;
+ sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+ sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+ ipf->ipf_checksum = sum_val;
+ } else if (ipf->ipf_checksum_flags != 0) {
+ /* Forget checksum offload from now on */
+ ipf->ipf_checksum_flags = 0;
+ }
+
+ /*
+ * We have a new piece of a datagram which is already being
+ * reassembled. Update the ECN info if all IP fragments
+ * are ECN capable. If there is one which is not, clear
+ * all the info. If there is at least one which has CE
+ * code point, IP needs to report that up to transport.
+ */
+ if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
+ if (ecn_info == IPH_ECN_CE)
+ ipf->ipf_ecn = IPH_ECN_CE;
+ } else {
+ ipf->ipf_ecn = IPH_ECN_NECT;
+ }
+ if (offset && ipf->ipf_end == offset) {
+ /* The new fragment fits at the end */
+ ipf->ipf_tail_mp->b_cont = mp;
+ /* Update the byte count */
+ ipf->ipf_count += msg_len;
+ /* Update per ipfb and ill byte counts */
+ ipfb->ipfb_count += msg_len;
+ ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
+ ill->ill_frag_count += msg_len;
+ ASSERT(ill->ill_frag_count > 0); /* Wraparound */
+ if (frag_offset_flags & IPH_MF) {
+ /* More to come. */
+ ipf->ipf_end = end;
+ ipf->ipf_tail_mp = tail_mp;
+ goto reass_done;
+ }
+ } else {
+ /* Go do the hard cases. */
+ int ret;
+
+ if (offset == 0)
+ ipf->ipf_nf_hdr_len = hdr_length;
+
+ /* Save current byte count */
count = ipf->ipf_count;
- ipf = ipf->ipf_hash_next;
- if (ipf)
- ipf->ipf_ptphn = ipfp;
- ipfp[0] = ipf;
- ill->ill_frag_count -= count;
- ASSERT(ipfb->ipfb_count >= count);
- ipfb->ipfb_count -= count;
- ipfb->ipfb_frag_pkts--;
- mutex_exit(&ipfb->ipfb_lock);
- /* Ditch the frag header. */
- mp = mp1->b_cont;
-
- freeb(mp1);
-
- /* Restore original IP length in header. */
- packet_size = (uint32_t)msgdsize(mp);
- if (packet_size > IP_MAXPACKET) {
+ ret = ip_reassemble(mp, ipf,
+ (frag_offset_flags & IPH_OFFSET) << 3,
+ (frag_offset_flags & IPH_MF), ill, msg_len);
+ /* Count of bytes added and subtracted (freeb()ed) */
+ count = ipf->ipf_count - count;
+ if (count) {
+ /* Update per ipfb and ill byte counts */
+ ipfb->ipfb_count += count;
+ ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
+ ill->ill_frag_count += count;
+ ASSERT(ill->ill_frag_count > 0);
+ }
+ if (ret == IP_REASS_PARTIAL) {
+ goto reass_done;
+ } else if (ret == IP_REASS_FAILED) {
+ /* Reassembly failed. Free up all resources */
+ ill_frag_free_pkts(ill, ipfb, ipf, 1);
+ for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
+ IP_REASS_SET_START(t_mp, 0);
+ IP_REASS_SET_END(t_mp, 0);
+ }
freemsg(mp);
- BUMP_MIB(&ip_mib, ipInHdrErrors);
+ goto reass_done;
+ }
+ /* We will reach here iff 'ret' is IP_REASS_COMPLETE */
+ }
+ /*
+ * We have completed reassembly. Unhook the frag header from
+ * the reassembly list.
+ *
+ * Before we free the frag header, record the ECN info
+ * to report back to the transport.
+ */
+ ecn_info = ipf->ipf_ecn;
+ BUMP_MIB(&ip_mib, ipReasmOKs);
+ ipfp = ipf->ipf_ptphn;
+
+ /* We need to supply these to caller */
+ if ((sum_flags = ipf->ipf_checksum_flags) != 0)
+ sum_val = ipf->ipf_checksum;
+ else
+ sum_val = 0;
+
+ mp1 = ipf->ipf_mp;
+ count = ipf->ipf_count;
+ ipf = ipf->ipf_hash_next;
+ if (ipf != NULL)
+ ipf->ipf_ptphn = ipfp;
+ ipfp[0] = ipf;
+ ill->ill_frag_count -= count;
+ ASSERT(ipfb->ipfb_count >= count);
+ ipfb->ipfb_count -= count;
+ ipfb->ipfb_frag_pkts--;
+ mutex_exit(&ipfb->ipfb_lock);
+ /* Ditch the frag header. */
+ mp = mp1->b_cont;
+
+ freeb(mp1);
+
+ /* Restore original IP length in header. */
+ packet_size = (uint32_t)msgdsize(mp);
+ if (packet_size > IP_MAXPACKET) {
+ freemsg(mp);
+ BUMP_MIB(&ip_mib, ipInHdrErrors);
+ return (B_FALSE);
+ }
+
+ if (DB_REF(mp) > 1) {
+ mblk_t *mp2 = copymsg(mp);
+
+ freemsg(mp);
+ if (mp2 == NULL) {
+ BUMP_MIB(&ip_mib, ipInDiscards);
return (B_FALSE);
}
-
- if (mp->b_datap->db_ref > 1) {
- mblk_t *mp2;
-
- mp2 = copymsg(mp);
- freemsg(mp);
- if (!mp2) {
- BUMP_MIB(&ip_mib, ipInDiscards);
- return (B_FALSE);
- }
- mp = mp2;
- }
- ipha = (ipha_t *)mp->b_rptr;
-
- ipha->ipha_length = htons((uint16_t)packet_size);
- /* We're now complete, zip the frag state */
- ipha->ipha_fragment_offset_and_flags = 0;
- /* Record the ECN info. */
- ipha->ipha_type_of_service &= 0xFC;
- ipha->ipha_type_of_service |= ecn_info;
- *mpp = mp;
-
- }
+ mp = mp2;
+ }
+ ipha = (ipha_t *)mp->b_rptr;
+
+ ipha->ipha_length = htons((uint16_t)packet_size);
+ /* We're now complete, zip the frag state */
+ ipha->ipha_fragment_offset_and_flags = 0;
+ /* Record the ECN info. */
+ ipha->ipha_type_of_service &= 0xFC;
+ ipha->ipha_type_of_service |= ecn_info;
+ *mpp = mp;
+
+ /* Reassembly is successful; return checksum information if needed */
+ if (cksum_val != NULL)
+ *cksum_val = sum_val;
+ if (cksum_flags != NULL)
+ *cksum_flags = sum_flags;
+
return (B_TRUE);
}
@@ -11156,16 +11343,12 @@
{
uint32_t sum;
uint32_t u1;
- uint32_t u2;
boolean_t mctl_present;
conn_t *connp;
mblk_t *first_mp;
- mblk_t *mp1;
- dblk_t *dp;
uint16_t *up;
ill_t *ill = (ill_t *)q->q_ptr;
- uint32_t ports;
- boolean_t cksum_computed = B_FALSE;
+ uint16_t reass_hck_flags = 0;
#define rptr ((uchar_t *)ipha)
@@ -11182,19 +11365,13 @@
IP_SIMPLE_HDR_LENGTH_IN_WORDS);
/* IP options present */
- if (u1)
+ if (u1 != 0)
goto ipoptions;
-#define IS_IPHDR_HWCKSUM(mctl_present, mp, ill) \
- ((!mctl_present) && (mp->b_datap->db_struioun.cksum.flags & \
- HCK_IPV4_HDRCKSUM) && (ill->ill_capabilities & \
- ILL_CAPAB_HCKSUM) && dohwcksum)
-
/* Check the IP header checksum. */
- if (IS_IPHDR_HWCKSUM(mctl_present, mp, ill)) {
+ if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
/* Clear the IP header h/w cksum flag */
- mp->b_datap->db_struioun.cksum.flags &=
- ~HCK_IPV4_HDRCKSUM;
+ DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
} else {
#define uph ((uint16_t *)ipha)
sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] +
@@ -11207,7 +11384,7 @@
* Don't verify header checksum if this packet is coming
* back from AH/ESP as we already did it.
*/
- if (!mctl_present && (sum && sum != 0xFFFF)) {
+ if (!mctl_present && sum != 0 && sum != 0xFFFF) {
BUMP_MIB(&ip_mib, ipInCksumErrs);
freemsg(first_mp);
return;
@@ -11236,133 +11413,52 @@
/* packet does not contain complete IP & UDP headers */
if ((mp->b_wptr - rptr) < (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE))
goto udppullup;
+
/* up points to UDP header */
up = (uint16_t *)((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH);
#define iphs ((uint16_t *)ipha)
-#define IP_CKSUM_RECV(len, u1, u2, mp, mp1, error, dp) { \
- boolean_t doswcksum = B_TRUE; \
- uint_t hcksumflags = 0; \
- \
- hcksumflags = dp->db_struioun.cksum.flags; \
- \
- /* Clear the hardware checksum flags; they have been consumed */\
- dp->db_struioun.cksum.flags = 0; \
- if (hcksumflags && (ill->ill_capabilities & ILL_CAPAB_HCKSUM) &&\
- dohwcksum) { \
- if (hcksumflags & HCK_FULLCKSUM) { \
- /* \
- * Full checksum has been computed by the \
- * hardware and has been attached. \
- */ \
- doswcksum = B_FALSE; \
- if (!(hcksumflags & HCK_FULLCKSUM_OK) && \
- (dp->db_cksum16 != 0xffff)) { \
- ipcsumdbg("full hwcksumerr\n", mp); \
- goto error; \
- } \
- } else if ((hcksumflags & HCK_PARTIALCKSUM) && \
- (((len = (IP_SIMPLE_HDR_LENGTH - dp->db_cksumstart))\
- & 1) == 0)) { \
- uint32_t tot_len = 0; \
- \
- doswcksum = B_FALSE; \
- /* Partial checksum computed */ \
- u1 += dp->db_cksum16; \
- tot_len = mp->b_wptr - mp->b_rptr; \
- if (!mp1) \
- mp1 = mp; \
- else \
- tot_len += mp1->b_wptr - mp1->b_rptr; \
- if (len > 0) { \
- /* \
- * Prepended extraneous data. Adjust \
- * checksum. \
- */ \
- u2 = IP_BCSUM_PARTIAL((uchar_t *)(rptr +\
- dp->db_cksumstart), (int32_t)len, \
- 0); \
- } else \
- u2 = 0; \
- if ((len = (dp->db_cksumend - tot_len)) > 0) { \
- /* \
- * Postpended extraneous data. Adjust \
- * checksum. \
- */ \
- uint32_t u3; \
- \
- u3 = IP_BCSUM_PARTIAL(mp1->b_wptr, \
- (int32_t)len, 0); \
- if ((uintptr_t)mp1->b_wptr & 1) \
- /* \
- * Postpended extraneous data \
- * was odd byte aligned, so \
- * swap resulting checksum \
- * bytes. \
- */ \
- u2 += ((u3 << 8) & 0xffff) | \
- (u3 >> 8); \
- else \
- u2 += u3; \
- u2 = (u2 & 0xFFFF) + ((int)(u2) >> 16); \
- } \
- /* \
- * One's complement subtract extraneous checksum\
- */ \
- if (u2 >= u1) \
- u1 = ~(u2 - u1) & 0xFFFF; \
- else \
- u1 -= u2; \
- u1 = (u1 & 0xFFFF) + ((int)u1 >> 16); \
- if (~(u1) & 0xFFFF) { \
- ipcsumdbg("partial hwcksumerr\n", mp); \
- goto error; \
- } \
- } \
- } \
- if (doswcksum) { \
- IP_STAT(ip_in_sw_cksum); \
- if ((IP_CSUM(mp, (int32_t)((uchar_t *)up - \
- (uchar_t *)ipha), u1)) != 0) { \
- ipcsumdbg("swcksumerr\n", mp); \
- goto error; \
- } \
- } \
-}
-
- dp = mp->b_datap;
/* if udp hdr cksum != 0, then need to checksum udp packet */
- if (up[3]) {
- cksum_computed = B_TRUE;
- /* multiple mblks of udp data? */
- if ((mp1 = mp->b_cont) != NULL) {
- /* more than two? */
- if (mp1->b_cont)
- goto multipktudp;
- }
+ if (up[3] != 0) {
+ mblk_t *mp1 = mp->b_cont;
+ boolean_t cksum_err;
+ uint16_t hck_flags = 0;
/* Pseudo-header checksum */
u1 = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
iphs[9] + up[2];
- if (!mctl_present) {
- ssize_t len = 0;
-
- IP_CKSUM_RECV(len, u1, u2, mp, mp1, udpcksumerr, dp);
- } else {
-multipktudp:
+
+ /*
+ * Revert to software checksum calculation if the interface
+ * isn't capable of checksum offload or if IPsec is present.
+ */
+ if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
+ hck_flags = DB_CKSUMFLAGS(mp);
+
+ if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
IP_STAT(ip_in_sw_cksum);
- if ((IP_CSUM(mp, (int32_t)((uchar_t *)up -
- (uchar_t *)ipha), u1)) != 0) {
-udpcksumerr:
- ip1dbg(("ip_udp_input: bad udp checksum\n"));
- BUMP_MIB(&ip_mib, udpInCksumErrs);
- freemsg(first_mp);
- return;
- }
- }
- }
-
- /* broadcast IP packet? */
+
+ IP_CKSUM_RECV(hck_flags, u1,
+ (uchar_t *)(rptr + DB_CKSUMSTART(mp)),
+ (int32_t)((uchar_t *)up - rptr),
+ mp, mp1, cksum_err);
+
+ if (cksum_err) {
+ BUMP_MIB(&ip_mib, udpInCksumErrs);
+
+ if (hck_flags & HCK_FULLCKSUM)
+ IP_STAT(ip_udp_in_full_hw_cksum_err);
+ else if (hck_flags & HCK_PARTIALCKSUM)
+ IP_STAT(ip_udp_in_part_hw_cksum_err);
+ else
+ IP_STAT(ip_udp_in_sw_cksum_err);
+
+ freemsg(first_mp);
+ return;
+ }
+ }
+
+ /* Non-fragmented broadcast or multicast packet? */
if (ire->ire_type == IRE_BROADCAST)
goto udpslowpath;
@@ -11371,7 +11467,7 @@
ASSERT(connp->conn_upq != NULL);
IP_STAT(ip_udp_fast_path);
- if (!canputnext(connp->conn_upq)) {
+ if (CONN_UDP_FLOWCTLD(connp)) {
freemsg(mp);
BUMP_MIB(&ip_mib, udpInOverflows);
} else {
@@ -11383,7 +11479,8 @@
*/
if (ip_udp_check(q, connp, recv_ill,
ipha, &mp, &first_mp, mctl_present)) {
- putnext(connp->conn_upq, mp);
+ /* Send it upstream */
+ CONN_UDP_RECV(connp, mp);
}
}
/*
@@ -11416,9 +11513,13 @@
u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
if (u1 & (IPH_MF | IPH_OFFSET)) {
fragmented:
- if (!ip_rput_fragment(q, &mp, ipha)) {
+ /*
+ * "sum" and "reass_hck_flags" are non-zero if the
+ * reassembled packet has a valid hardware computed
+ * checksum information associated with it.
+ */
+ if (!ip_rput_fragment(q, &mp, ipha, &sum, &reass_hck_flags))
goto slow_done;
- }
/*
* Make sure that first_mp points back to mp as
* the mp we came in with could have changed in
@@ -11432,7 +11533,7 @@
/* Now we have a complete datagram, destined for this machine. */
u1 = IPH_HDR_LENGTH(ipha);
/* Pull up the UDP header, if necessary. */
- if ((mp->b_wptr - mp->b_rptr) < (u1 + UDPH_SIZE)) {
+ if ((MBLKL(mp)) < (u1 + UDPH_SIZE)) {
udppullup:
if (!pullupmsg(mp, u1 + UDPH_SIZE)) {
BUMP_MIB(&ip_mib, ipInDiscards);
@@ -11441,30 +11542,43 @@
}
ipha = (ipha_t *)mp->b_rptr;
}
- /*
- * Validate the checksum. This code is a bit funny looking
- * but may help out the compiler in this crucial spot.
+
+ /*
+ * Validate the checksum for the reassembled packet; for the
+ * pullup case we calculate the payload checksum in software.
*/
up = (uint16_t *)((uchar_t *)ipha + u1 + UDP_PORTS_OFFSET);
- if (!cksum_computed && up[3]) {
- IP_STAT(ip_in_sw_cksum);
- sum = IP_CSUM(mp, (int32_t)((uchar_t *)up - (uchar_t *)ipha),
- IP_UDP_CSUM_COMP + iphs[6] +
- iphs[7] + iphs[8] +
- iphs[9] + up[2]);
- if (sum != 0) {
- ip1dbg(("ip_udp_input: bad udp checksum\n"));
- BUMP_MIB(&ip_mib, udpInCksumErrs);
- freemsg(first_mp);
- goto slow_done;
+ if (up[3] != 0) {
+ boolean_t cksum_err;
+
+ if ((reass_hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
+ IP_STAT(ip_in_sw_cksum);
+
+ IP_CKSUM_RECV_REASS(reass_hck_flags,
+ (int32_t)((uchar_t *)up - (uchar_t *)ipha),
+ IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
+ iphs[9] + up[2], sum, cksum_err);
+
+ if (cksum_err) {
+ BUMP_MIB(&ip_mib, udpInCksumErrs);
+
+ if (reass_hck_flags & HCK_FULLCKSUM)
+ IP_STAT(ip_udp_in_full_hw_cksum_err);
+ else if (reass_hck_flags & HCK_PARTIALCKSUM)
+ IP_STAT(ip_udp_in_part_hw_cksum_err);
+ else
+ IP_STAT(ip_udp_in_sw_cksum_err);
+
+ freemsg(first_mp);
+ goto slow_done;
}
}
udpslowpath:
- ports = *(uint32_t *)up;
- /* Clear hardware checksum flag */
- mp->b_datap->db_struioun.cksum.flags = 0;
- ip_fanout_udp(q, first_mp, ill, ipha, ports,
+ /* Clear hardware checksum flag to be safe */
+ DB_CKSUMFLAGS(mp) = 0;
+
+ ip_fanout_udp(q, first_mp, ill, ipha, *(uint32_t *)up,
(ire->ire_type == IRE_BROADCAST),
IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_IP6INFO,
mctl_present, B_TRUE, recv_ill, ire->ire_zoneid);
@@ -11473,6 +11587,7 @@
IP_STAT(ip_udp_slow_path);
return;
+#undef iphs
#undef rptr
}
@@ -11485,17 +11600,17 @@
conn_t *connp;
uint32_t sum;
uint32_t u1;
- uint32_t u2;
uint16_t *up;
int offset;
ssize_t len;
mblk_t *mp1;
- dblk_t *dp;
boolean_t syn_present = B_FALSE;
tcph_t *tcph;
uint_t ip_hdr_len;
ill_t *ill = (ill_t *)q->q_ptr;
zoneid_t zoneid = ire->ire_zoneid;
+ boolean_t cksum_err;
+ uint16_t hck_flags = 0;
#define rptr ((uchar_t *)ipha)
@@ -11514,10 +11629,9 @@
goto ipoptions;
} else {
/* Check the IP header checksum. */
- if (IS_IPHDR_HWCKSUM(mctl_present, mp, ill)) {
+ if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
/* Clear the IP header h/w cksum flag */
- mp->b_datap->db_struioun.cksum.flags &=
- ~HCK_IPV4_HDRCKSUM;
+ DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
} else {
#define uph ((uint16_t *)ipha)
sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
@@ -11596,30 +11710,32 @@
#endif
u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9];
-
- /*
- * If the packet has gone through AH/ESP, do the checksum here
- * itself.
- *
- * If it has not gone through IPSEC processing and not a duped
- * mblk, then look for driver checksummed mblk. We validate or
- * postpone the checksum to TCP for single copy checksum.
- *
- * Note that we only honor HW cksum in the fastpath.
- */
- dp = mp->b_datap;
- if (!mctl_present) {
- IP_CKSUM_RECV(len, u1, u2, mp, mp1, tcpcksumerr, dp);
- } else {
+ /*
+ * Revert to software checksum calculation if the interface
+ * isn't capable of checksum offload or if IPsec is present.
+ */
+ if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
+ hck_flags = DB_CKSUMFLAGS(mp);
+
+ if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
IP_STAT(ip_in_sw_cksum);
- if ((IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr),
- u1)) != 0) {
-tcpcksumerr:
- BUMP_MIB(&ip_mib, tcpInErrs);
- ip1dbg(("ip_tcp_input: bad tcp checksum \n"));
- freemsg(first_mp);
- goto slow_done;
- }
+
+ IP_CKSUM_RECV(hck_flags, u1,
+ (uchar_t *)(rptr + DB_CKSUMSTART(mp)),
+ (int32_t)((uchar_t *)up - rptr),
+ mp, mp1, cksum_err);
+
+ if (cksum_err) {
+ BUMP_MIB(&ip_mib, tcpInErrs);
+
+ if (hck_flags & HCK_FULLCKSUM)
+ IP_STAT(ip_tcp_in_full_hw_cksum_err);
+ else if (hck_flags & HCK_PARTIALCKSUM)
+ IP_STAT(ip_tcp_in_part_hw_cksum_err);
+ else
+ IP_STAT(ip_tcp_in_sw_cksum_err);
+
+ goto error;
}
try_again:
@@ -11654,7 +11770,7 @@
if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) {
if (IPCL_IS_TCP(connp)) {
mp->b_datap->db_struioflag |= STRUIO_EAGER;
- mp->b_datap->db_cksumstart =
+ DB_CKSUMSTART(mp) =
(intptr_t)ip_squeue_get(ill_ring);
if (IPCL_IS_FULLY_BOUND(connp) && !mctl_present &&
!CONN_INBOUND_POLICY_PRESENT(connp)) {
@@ -11800,7 +11916,7 @@
u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
if (u1 & (IPH_MF | IPH_OFFSET)) {
fragmented:
- if (!ip_rput_fragment(q, &mp, ipha)) {
+ if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) {
if (mctl_present)
freeb(first_mp);
goto slow_done;
@@ -11876,9 +11992,10 @@
* ICMP's back, then this flag may need to be cleared in
* other places as well.
*/
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
up = (uint16_t *)(rptr + u1 + TCP_PORTS_OFFSET);
+
u1 = (uint32_t)(len - u1); /* TCP datagram length. */
#ifdef _BIG_ENDIAN
u1 += IPPROTO_TCP;
@@ -11890,7 +12007,7 @@
* Not M_DATA mblk or its a dup, so do the checksum now.
*/
IP_STAT(ip_in_sw_cksum);
- if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1)) {
+ if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1) != 0) {
BUMP_MIB(&ip_mib, tcpInErrs);
goto error;
}
@@ -11937,12 +12054,12 @@
goto ipoptions;
} else {
/* Check the IP header checksum. */
- if (IS_IPHDR_HWCKSUM(mctl_present, mp, ill)) {
+ if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
/*
* Since there is no SCTP h/w cksum support yet, just
* clear the flag.
*/
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
} else {
#define uph ((uint16_t *)ipha)
sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
@@ -12031,7 +12148,7 @@
return;
ipoptions:
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
if (!ip_options_cksum(q, first_mp, ipha, ire))
goto slow_done;
@@ -12041,7 +12158,7 @@
u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
if (u1 & (IPH_MF | IPH_OFFSET)) {
fragmented:
- if (!ip_rput_fragment(q, &mp, ipha))
+ if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL))
goto slow_done;
/*
* Make sure that first_mp points back to mp as
@@ -12183,7 +12300,7 @@
* Clear the indication that this may have a hardware checksum
* as we are not using it
*/
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
/*
* Now hand the packet to ip_newroute.
@@ -12351,7 +12468,7 @@
* Clear the indication that this may have
* hardware checksum as we are not using it.
*/
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
icmp_unreachable(q, mp,
ICMP_SOURCE_ROUTE_FAILED);
ire_refrele(ire);
@@ -12361,7 +12478,7 @@
}
/* Packet is being forwarded. Turning off hwcksum flag. */
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
if (ip_g_send_redirects) {
/*
* Check whether the incoming interface and outgoing
@@ -12435,15 +12552,17 @@
{
queue_t *q;
ire_t *ire;
+ uint16_t hcksumflags;
q = *qp;
ire = *irep;
/*
* Clear the indication that this may have hardware
- * checksum as we are not using it.
- */
- mp->b_datap->db_struioun.cksum.flags = 0;
+ * checksum as we are not using it for forwarding.
+ */
+ hcksumflags = DB_CKSUMFLAGS(mp);
+ DB_CKSUMFLAGS(mp) = 0;
/*
* Directed broadcast forwarding: if the packet came in over a
@@ -12613,6 +12732,9 @@
}
*irep = ire;
+
+ /* Restore any hardware checksum flags */
+ DB_CKSUMFLAGS(mp) = hcksumflags;
return (B_FALSE);
}
@@ -12632,7 +12754,7 @@
* Clear the indication that this may have hardware
* checksum as we are not using it.
*/
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
retval = ip_mforward(ill, ipha, mp);
/* ip_mforward updates mib variables if needed */
/* clear b_prev - used by ip_mroute_decap */
@@ -12951,7 +13073,7 @@
/*
* Also SIOC[GS]TUN* ioctls can come here.
*/
- ip_ioctl_freemsg(mp);
+ inet_freemsg(mp);
TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
"ip_input_end: q %p (%S)", q, "uninit");
return;
@@ -13300,9 +13422,20 @@
continue;
}
- /* broadcast? */
+ /*
+ * Broadcast IRE may indicate either broadcast or
+ * multicast packet
+ */
if (ire->ire_type == IRE_BROADCAST) {
- if (ip_rput_process_broadcast(&q, mp, &ire, ipha, ill,
+ /*
+ * Skip broadcast checks if packet is UDP multicast;
+ * we'd rather not enter ip_rput_process_broadcast()
+ * unless the packet is broadcast for real, since
+ * that routine is a no-op for multicast.
+ */
+ if ((ipha->ipha_protocol != IPPROTO_UDP ||
+ !CLASSD(ipha->ipha_dst)) &&
+ ip_rput_process_broadcast(&q, mp, &ire, ipha, ill,
dst, cgtp_flt_pkt, ll_multicast)) {
continue;
}
@@ -13533,24 +13666,6 @@
}
/*
- * This function is used to free a message that has gone through
- * mi_copyin processing which modifies the M_IOCTL mblk's b_next
- * and b_prev pointers. We use this function to set b_next/b_prev
- * to NULL and free them.
- */
-void
-ip_ioctl_freemsg(mblk_t *mp)
-{
- mblk_t *bp = mp;
-
- for (; bp != NULL; bp = bp->b_cont) {
- bp->b_prev = NULL;
- bp->b_next = NULL;
- }
- freemsg(mp);
-}
-
-/*
* Handling of DLPI messages that require exclusive access to the ipsq.
*
* Need to do ill_pending_mp_release on ioctl completion, which could
@@ -14483,7 +14598,7 @@
mp->b_cont->b_prev =
mp1->b_cont->b_prev;
}
- ip_ioctl_freemsg(mp1);
+ inet_freemsg(mp1);
ASSERT(ipsq->ipsq_current_ipif != NULL);
ASSERT(connp != NULL);
ip_ioctl_finish(CONNP_TO_WQ(connp), mp,
@@ -14515,7 +14630,7 @@
mp->b_cont->b_prev =
mp1->b_cont->b_prev;
}
- ip_ioctl_freemsg(mp1);
+ inet_freemsg(mp1);
if (iocp->ioc_error == 0)
mp->b_datap->db_type = M_IOCDATA;
ASSERT(connp != NULL);
@@ -14596,7 +14711,7 @@
mp->b_cont->b_prev =
mp1->b_cont->b_prev;
}
- ip_ioctl_freemsg(mp1);
+ inet_freemsg(mp1);
if (iocp->ioc_error == 0)
iocp->ioc_error = EINVAL;
ASSERT(connp != NULL);
@@ -15321,7 +15436,7 @@
*/
ASSERT(!mctl_present);
ASSERT(first_mp == mp);
- if (!ip_rput_fragment(q, &mp, ipha)) {
+ if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) {
return;
}
/*
@@ -15337,7 +15452,7 @@
* Clear hardware checksumming flag as it is currently only
* used by TCP and UDP.
*/
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
/* Now we have a complete datagram, destined for this machine. */
u1 = IPH_HDR_LENGTH(ipha);
@@ -15839,7 +15954,7 @@
bad_src_route:
q = WR(q);
/* make sure we clear any indication of a hardware checksum */
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED);
return (B_FALSE);
@@ -16022,14 +16137,14 @@
param_prob:
q = WR(q);
/* make sure we clear any indication of a hardware checksum */
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
icmp_param_problem(q, mp, (uint8_t)code);
return (-1);
bad_src_route:
q = WR(q);
/* make sure we clear any indication of a hardware checksum */
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED);
return (-1);
}
@@ -17571,7 +17686,7 @@
* upper level protocol. We remove this conn from any fanout hash list it is
* on, and zero out the bind information. No reply is expected up above.
*/
-static void
+mblk_t *
ip_unbind(queue_t *q, mblk_t *mp)
{
conn_t *connp = Q_TO_CONN(q);
@@ -17591,7 +17706,7 @@
* original message.
*/
if (mp == NULL)
- return;
+ return (NULL);
/*
* Don't bzero the ports if its TCP since TCP still needs the
@@ -17601,7 +17716,7 @@
if (!IPCL_IS_TCP(connp))
bzero(&connp->u_port, sizeof (connp->u_port));
- qreply(q, mp);
+ return (mp);
}
/*
@@ -17657,7 +17772,9 @@
/* is queue flow controlled? */
if ((q->q_first != NULL || connp->conn_draining) &&
(caller == IP_WPUT)) {
- goto doputq;
+ ASSERT(!need_decref);
+ (void) putq(q, mp);
+ return;
}
/* Multidata transmit? */
@@ -17992,11 +18109,6 @@
CONN_DEC_REF(connp);
return;
-doputq:
- ASSERT(!need_decref);
- (void) putq(q, mp);
- return;
-
qnext:
/*
* Upper Level Protocols pass down complete IP datagrams
@@ -18933,7 +19045,7 @@
* the ipif refcnt has gone to zero and holding the ill_g_lock and ill_lock
* the above holds.
*/
-static ipif_t *
+ipif_t *
conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err)
{
ipif_t *ipif;
@@ -19414,7 +19526,6 @@
boolean_t multirt_send = B_FALSE;
int err;
zoneid_t zoneid;
- boolean_t iphdrhwcksum = B_FALSE;
TRACE_1(TR_FAC_IP, TR_IP_WPUT_IRE_START,
"ip_wput_ire_start: q %p", q);
@@ -19749,102 +19860,6 @@
/* pseudo checksum (do it in parts for IP header checksum) */
cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
-#define FRAGMENT_NEEDED(mtu, size) \
- (((mtu) < (unsigned int)(size)) ? B_TRUE : B_FALSE)
-
-#define IS_FASTPATH(ire, bp) \
- ((ire)->ire_fp_mp != NULL && \
- (MBLKHEAD((bp)) >= (MBLKL((ire)->ire_fp_mp)))) \
-
-#define IPH_UDPH_CHECKSUMP(ipha, hlen) \
- ((uint16_t *)(((uchar_t *)ipha)+(hlen + UDP_CHECKSUM_OFFSET)))
-#define IPH_TCPH_CHECKSUMP(ipha, hlen) \
- ((uint16_t *)(((uchar_t *)ipha)+(hlen+TCP_CHECKSUM_OFFSET)))
-
-#define IP_CKSUM_XMIT(ill, ire, mp, up, proto, hlen, max_frag, \
- ipsec_len) { \
- uint32_t sum; \
- uint32_t xmit_capab = HCKSUM_INET_FULL_V4 | \
- HCKSUM_INET_PARTIAL | HCKSUM_IPHDRCKSUM; \
- boolean_t cksum_offload = B_FALSE; \
- \
- /* \
- * The ire fp mp can change due to the arrival of a \
- * DL_NOTE_FASTPATH_FLUSH in the case of IRE_BROADCAST \
- * and IRE_MIPRTUN. Hence the ire_fp_mp has to be accessed \
- * only under the ire_lock in such cases. \
- */ \
- LOCK_IRE_FP_MP(ire); \
- if ((ill) && (ill->ill_capabilities & ILL_CAPAB_HCKSUM) && \
- (ill->ill_hcksum_capab->ill_hcksum_txflags & \
- xmit_capab) && (!FRAGMENT_NEEDED(max_frag, \
- (LENGTH + ipsec_len))) && (!(ire->ire_flags & \
- RTF_MULTIRT)) && (ipsec_len == 0) && \
- IS_FASTPATH((ire), (mp)) && (dohwcksum)) { \
- /* \
- * Underlying interface supports hardware checksumming. \
- * So postpone the checksum to the interface driver \
- */ \
- \
- if ((hlen) == IP_SIMPLE_HDR_LENGTH) { \
- if (ill->ill_hcksum_capab->ill_hcksum_txflags & \
- HCKSUM_IPHDRCKSUM) { \
- mp->b_datap->db_struioun.cksum.flags |= \
- HCK_IPV4_HDRCKSUM; \
- /* seed the cksum field to 0 */ \
- ipha->ipha_hdr_checksum = 0; \
- iphdrhwcksum = B_TRUE; \
- } \
- /* \
- * If underlying h/w supports full h/w checksumming \
- * and no IP options are present, then offload \
- * full checksumming to the hardware. \
- * \
- * If h/w can do partial checksumming then offload \
- * unless the startpoint offset, including mac-header, \
- * is too big for the interface to some of our \
- * hardware (CE and ERI) which have 6 bit fields. \
- * Sigh. \
- * Unhappily we don't have the mac-header size here \
- * so punt for any options. \
- */ \
- if (ill->ill_hcksum_capab->ill_hcksum_txflags & \
- HCKSUM_INET_FULL_V4) { \
- UNLOCK_IRE_FP_MP(ire); \
- /* Seed the checksum field to 0 */ \
- *up = 0; \
- mp->b_datap->db_struioun.cksum.flags |= \
- HCK_FULLCKSUM; \
- cksum_offload = B_TRUE; \
- } else if (ill->ill_hcksum_capab->ill_hcksum_txflags & \
- HCKSUM_INET_PARTIAL) { \
- UNLOCK_IRE_FP_MP(ire); \
- sum = *up + cksum + proto; \
- sum = (sum & 0xFFFF) + (sum >> 16); \
- *up = (sum & 0xFFFF) + (sum >> 16); \
- /* \
- * All offsets are relative to the beginning \
- * of the IP header. \
- */ \
- mp->b_datap->db_cksumstart = hlen; \
- mp->b_datap->db_cksumstuff = \
- (PROTO == IPPROTO_UDP) ? \
- (hlen) + UDP_CHECKSUM_OFFSET : \
- (hlen) + TCP_CHECKSUM_OFFSET; \
- mp->b_datap->db_cksumend = ipha->ipha_length; \
- mp->b_datap->db_struioun.cksum.flags |= \
- HCK_PARTIALCKSUM; \
- cksum_offload = B_TRUE; \
- } \
- } \
- } \
- if (!cksum_offload) { \
- UNLOCK_IRE_FP_MP(ire); \
- IP_STAT(ip_out_sw_cksum); \
- (sum) = IP_CSUM((mp), (hlen), cksum + proto); \
- *(up) = (uint16_t)((sum) ? (sum) : ~(sum)); \
- } \
-}
if (!IP_FLOW_CONTROLLED_ULP(PROTO)) {
queue_t *dev_q = stq->q_next;
@@ -19856,10 +19871,16 @@
(ip_hdr_included != IP_HDR_INCLUDED)) {
hlen = (V_HLEN & 0xF) << 2;
up = IPH_UDPH_CHECKSUMP(ipha, hlen);
- if (*up) {
- IP_CKSUM_XMIT(ill, ire, mp, up,
- IP_UDP_CSUM_COMP, hlen, max_frag,
- ipsec_len);
+ if (*up != 0) {
+ IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO,
+ hlen, LENGTH, max_frag, ipsec_len, cksum);
+ /* Software checksum? */
+ if (DB_CKSUMFLAGS(mp) == 0) {
+ IP_STAT(ip_out_sw_cksum);
+ IP_STAT_UPDATE(
+ ip_udp_out_sw_cksum_bytes,
+ LENGTH - hlen);
+ }
}
}
} else if (ip_hdr_included != IP_HDR_INCLUDED) {
@@ -19873,8 +19894,14 @@
* replicated via several interfaces, and not all of
* them may have this capability.
*/
- IP_CKSUM_XMIT(ill, ire, mp, up,
- IP_TCP_CSUM_COMP, hlen, max_frag, ipsec_len);
+ IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, hlen,
+ LENGTH, max_frag, ipsec_len, cksum);
+ /* Software checksum? */
+ if (DB_CKSUMFLAGS(mp) == 0) {
+ IP_STAT(ip_out_sw_cksum);
+ IP_STAT_UPDATE(ip_tcp_out_sw_cksum_bytes,
+ LENGTH - hlen);
+ }
} else {
sctp_hdr_t *sctph;
@@ -19904,7 +19931,7 @@
cksum += ttl_protocol;
/* fragment the packet */
- if (FRAGMENT_NEEDED(max_frag, (LENGTH + ipsec_len)))
+ if (max_frag < (uint_t)(LENGTH + ipsec_len))
goto fragmentit;
/*
* Don't use frag_flag if packet is pre-built or source
@@ -19918,8 +19945,8 @@
ipha->ipha_fragment_offset_and_flags |=
htons(ire->ire_frag_flag);
- if (!iphdrhwcksum) {
- /* checksum */
+ if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
+ /* calculate IP header checksum */
cksum += ipha->ipha_ident;
cksum += (v_hlen_tos_len >> 16)+(v_hlen_tos_len & 0xFFFF);
cksum += ipha->ipha_fragment_offset_and_flags;
@@ -20258,7 +20285,11 @@
hlen = (V_HLEN & 0xF) << 2;
up = IPH_TCPH_CHECKSUMP(ipha, hlen);
IP_STAT(ip_out_sw_cksum);
+ IP_STAT_UPDATE(ip_tcp_out_sw_cksum_bytes,
+ LENGTH - hlen);
*up = IP_CSUM(mp, hlen, cksum + IP_TCP_CSUM_COMP);
+ if (*up == 0)
+ *up = 0xFFFF;
} else if (PROTO == IPPROTO_SCTP &&
(ip_hdr_included != IP_HDR_INCLUDED)) {
sctp_hdr_t *sctph;
@@ -20338,17 +20369,18 @@
*/
hlen = (V_HLEN & 0xF) << 2;
up = IPH_UDPH_CHECKSUMP(ipha, hlen);
- if (*up) {
- uint_t sum;
-
- /*
- * NOTE: watch out for compiler high
- * bits
- */
- IP_STAT(ip_out_sw_cksum);
- sum = IP_CSUM(mp, hlen,
- cksum + IP_UDP_CSUM_COMP);
- *up = (uint16_t)(sum ? sum : ~sum);
+ max_frag = ire->ire_max_frag;
+ if (*up != 0) {
+ IP_CKSUM_XMIT(ire_ill, ire, mp, ipha,
+ up, PROTO, hlen, LENGTH, max_frag,
+ ipsec_len, cksum);
+ /* Software checksum? */
+ if (DB_CKSUMFLAGS(mp) == 0) {
+ IP_STAT(ip_out_sw_cksum);
+ IP_STAT_UPDATE(
+ ip_udp_out_sw_cksum_bytes,
+ LENGTH - hlen);
+ }
}
}
}
@@ -20369,9 +20401,7 @@
conn_multicast_loop));
/* Forget header checksum offload */
- mp->b_datap->db_struioun.cksum.flags &=
- ~HCK_IPV4_HDRCKSUM;
- iphdrhwcksum = B_FALSE;
+ DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
/*
* Local loopback of multicasts? Check the
@@ -20459,10 +20489,8 @@
}
max_frag = ire->ire_max_frag;
cksum += ttl_protocol;
- if (!FRAGMENT_NEEDED(max_frag, (LENGTH + ipsec_len))) {
+ if (max_frag >= (uint_t)(LENGTH + ipsec_len)) {
/* No fragmentation required for this one. */
- /* Complete the IP header checksum. */
- cksum += ipha->ipha_ident;
/*
* Don't use frag_flag if packet is pre-built or source
* routed or if multicast (since multicast packets do
@@ -20475,26 +20503,32 @@
ipha->ipha_fragment_offset_and_flags |=
htons(ire->ire_frag_flag);
- cksum += (v_hlen_tos_len >> 16)+
- (v_hlen_tos_len & 0xFFFF);
- cksum += ipha->ipha_fragment_offset_and_flags;
- hlen = (V_HLEN & 0xF) - IP_SIMPLE_HDR_LENGTH_IN_WORDS;
- if (hlen) {
- checksumoptions:
- /*
- * Account for the IP Options in the IP
- * header checksum.
- */
- up = (uint16_t *)(rptr+IP_SIMPLE_HDR_LENGTH);
- do {
- cksum += up[0];
- cksum += up[1];
- up += 2;
- } while (--hlen);
- }
- cksum = ((cksum & 0xFFFF) + (cksum >> 16));
- cksum = ~(cksum + (cksum >> 16));
- ipha->ipha_hdr_checksum = (uint16_t)cksum;
+ if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
+ /* Complete the IP header checksum. */
+ cksum += ipha->ipha_ident;
+ cksum += (v_hlen_tos_len >> 16)+
+ (v_hlen_tos_len & 0xFFFF);
+ cksum += ipha->ipha_fragment_offset_and_flags;
+ hlen = (V_HLEN & 0xF) -
+ IP_SIMPLE_HDR_LENGTH_IN_WORDS;
+ if (hlen) {
+ checksumoptions:
+ /*
+ * Account for the IP Options in the IP
+ * header checksum.
+ */
+ up = (uint16_t *)(rptr+
+ IP_SIMPLE_HDR_LENGTH);
+ do {
+ cksum += up[0];
+ cksum += up[1];
+ up += 2;
+ } while (--hlen);
+ }
+ cksum = ((cksum & 0xFFFF) + (cksum >> 16));
+ cksum = ~(cksum + (cksum >> 16));
+ ipha->ipha_hdr_checksum = (uint16_t)cksum;
+ }
if (ipsec_len != 0) {
ipsec_out_process(q, first_mp, ire, ill_index);
if (!next_mp) {
@@ -20991,6 +21025,298 @@
}
/*
+ * Check if ip_wput_frag_mdt() and ip_wput_frag_mdt_v6() can handle a message
+ * block chain. We could rewrite to handle arbitrary message block chains but
+ * that would make the code complicated and slow. Right now there three
+ * restrictions:
+ *
+ * 1. The first message block must contain the complete IP header and
+ * at least 1 byte of payload data.
+ * 2. At most MULTIDATA_MAX_PBUFS non-empty message blocks are allowed
+ * so that we can use a single Multidata message.
+ * 3. No frag must be distributed over two or more message blocks so
+ * that we don't need more than two packet descriptors per frag.
+ *
+ * The above restrictions allow us to support userland applications (which
+ * will send down a single message block) and NFS over UDP (which will
+ * send down a chain of at most three message blocks).
+ *
+ * We also don't use MDT for payloads with less than or equal to
+ * ip_wput_frag_mdt_min bytes because it would cause too much overhead.
+ */
+boolean_t
+ip_can_frag_mdt(mblk_t *mp, ssize_t hdr_len, ssize_t len)
+{
+ int blocks;
+ ssize_t total, missing, size;
+
+ ASSERT(mp != NULL);
+ ASSERT(hdr_len > 0);
+
+ size = MBLKL(mp) - hdr_len;
+ if (size <= 0)
+ return (B_FALSE);
+
+ /* The first mblk contains the header and some payload. */
+ blocks = 1;
+ total = size;
+ size %= len;
+ missing = (size == 0) ? 0 : (len - size);
+ mp = mp->b_cont;
+
+ while (mp != NULL) {
+ /*
+ * Give up if we encounter a zero length message block.
+ * In practice, this should rarely happen and therefore
+ * not worth the trouble of freeing and re-linking the
+ * mblk from the chain to handle such case.
+ */
+ if ((size = MBLKL(mp)) == 0)
+ return (B_FALSE);
+
+ /* Too many payload buffers for a single Multidata message? */
+ if (++blocks > MULTIDATA_MAX_PBUFS)
+ return (B_FALSE);
+
+ total += size;
+ /* Is a frag distributed over two or more message blocks? */
+ if (missing > size)
+ return (B_FALSE);
+ size -= missing;
+
+ size %= len;
+ missing = (size == 0) ? 0 : (len - size);
+
+ mp = mp->b_cont;
+ }
+
+ return (total > ip_wput_frag_mdt_min);
+}
+
+/*
+ * Outbound IPv4 fragmentation routine using MDT.
+ */
+static void
+ip_wput_frag_mdt(ire_t *ire, mblk_t *mp, ip_pkt_t pkt_type, int len,
+ uint32_t frag_flag, int offset)
+{
+ ipha_t *ipha_orig;
+ int i1, ip_data_end;
+ uint_t pkts, wroff, hdr_chunk_len, pbuf_idx;
+ mblk_t *hdr_mp, *md_mp = NULL;
+ unsigned char *hdr_ptr, *pld_ptr;
+ multidata_t *mmd;
+ ip_pdescinfo_t pdi;
+
+ ASSERT(DB_TYPE(mp) == M_DATA);
+ ASSERT(MBLKL(mp) > sizeof (ipha_t));
+
+ ipha_orig = (ipha_t *)mp->b_rptr;
+ mp->b_rptr += sizeof (ipha_t);
+
+ /* Calculate how many packets we will send out */
+ i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp);
+ pkts = (i1 + len - 1) / len;
+ ASSERT(pkts > 1);
+
+ /* Allocate a message block which will hold all the IP Headers. */
+ wroff = ip_wroff_extra;
+ hdr_chunk_len = wroff + IP_SIMPLE_HDR_LENGTH;
+
+ i1 = pkts * hdr_chunk_len;
+ /*
+ * Create the header buffer, Multidata and destination address
+ * and SAP attribute that should be associated with it.
+ */
+ if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL ||
+ ((hdr_mp->b_wptr += i1),
+ (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) ||
+ !ip_md_addr_attr(mmd, NULL, ire->ire_dlureq_mp)) {
+ freemsg(mp);
+ if (md_mp == NULL) {
+ freemsg(hdr_mp);
+ } else {
+free_mmd: IP_STAT(ip_frag_mdt_discarded);
+ freemsg(md_mp);
+ }
+ IP_STAT(ip_frag_mdt_allocfail);
+ UPDATE_MIB(&ip_mib, ipOutDiscards, pkts);
+ return;
+ }
+ IP_STAT(ip_frag_mdt_allocd);
+
+ /*
+ * Add a payload buffer to the Multidata; this operation must not
+ * fail, or otherwise our logic in this routine is broken. There
+ * is no memory allocation done by the routine, so any returned
+ * failure simply tells us that we've done something wrong.
+ *
+ * A failure tells us that either we're adding the same payload
+ * buffer more than once, or we're trying to add more buffers than
+ * allowed. None of the above cases should happen, and we panic
+ * because either there's horrible heap corruption, and/or
+ * programming mistake.
+ */
+ if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+ goto pbuf_panic;
+
+ hdr_ptr = hdr_mp->b_rptr;
+ pld_ptr = mp->b_rptr;
+
+ /* Establish the ending byte offset, based on the starting offset. */
+ offset <<= 3;
+ ip_data_end = offset + ntohs(ipha_orig->ipha_length) -
+ IP_SIMPLE_HDR_LENGTH;
+
+ pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF;
+
+ while (pld_ptr < mp->b_wptr) {
+ ipha_t *ipha;
+ uint16_t offset_and_flags;
+ uint16_t ip_len;
+ int error;
+
+ ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr);
+ ipha = (ipha_t *)(hdr_ptr + wroff);
+ ASSERT(OK_32PTR(ipha));
+ *ipha = *ipha_orig;
+
+ if (ip_data_end - offset > len) {
+ offset_and_flags = IPH_MF;
+ } else {
+ /*
+ * Last frag. Set len to the length of this last piece.
+ */
+ len = ip_data_end - offset;
+ /* A frag of a frag might have IPH_MF non-zero */
+ offset_and_flags =
+ ntohs(ipha->ipha_fragment_offset_and_flags) &
+ IPH_MF;
+ }
+ offset_and_flags |= (uint16_t)(offset >> 3);
+ offset_and_flags |= (uint16_t)frag_flag;
+ /* Store the offset and flags in the IP header. */
+ ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags);
+
+ /* Store the length in the IP header. */
+ ip_len = (uint16_t)(len + IP_SIMPLE_HDR_LENGTH);
+ ipha->ipha_length = htons(ip_len);
+
+ /*
+ * Set the IP header checksum. Note that mp is just
+ * the header, so this is easy to pass to ip_csum.
+ */
+ ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
+
+ /*
+ * Record offset and size of header and data of the next packet
+ * in the multidata message.
+ */
+ PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, IP_SIMPLE_HDR_LENGTH, 0);
+ PDESC_PLD_INIT(&pdi);
+ i1 = MIN(mp->b_wptr - pld_ptr, len);
+ ASSERT(i1 > 0);
+ PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1);
+ if (i1 == len) {
+ pld_ptr += len;
+ } else {
+ i1 = len - i1;
+ mp = mp->b_cont;
+ ASSERT(mp != NULL);
+ ASSERT(MBLKL(mp) >= i1);
+ /*
+ * Attach the next payload message block to the
+ * multidata message.
+ */
+ if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+ goto pbuf_panic;
+ PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1);
+ pld_ptr = mp->b_rptr + i1;
+ }
+
+ if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error,
+ KM_NOSLEEP)) == NULL) {
+ /*
+ * Any failure other than ENOMEM indicates that we
+ * have passed in invalid pdesc info or parameters
+ * to mmd_addpdesc, which must not happen.
+ *
+ * EINVAL is a result of failure on boundary checks
+ * against the pdesc info contents. It should not
+ * happen, and we panic because either there's
+ * horrible heap corruption, and/or programming
+ * mistake.
+ */
+ if (error != ENOMEM) {
+ cmn_err(CE_PANIC, "ip_wput_frag_mdt: "
+ "pdesc logic error detected for "
+ "mmd %p pinfo %p (%d)\n",
+ (void *)mmd, (void *)&pdi, error);
+ /* NOTREACHED */
+ }
+ IP_STAT(ip_frag_mdt_addpdescfail);
+ /* Free unattached payload message blocks as well */
+ md_mp->b_cont = mp->b_cont;
+ goto free_mmd;
+ }
+
+ /* Advance fragment offset. */
+ offset += len;
+
+ /* Advance to location for next header in the buffer. */
+ hdr_ptr += hdr_chunk_len;
+
+ /* Did we reach the next payload message block? */
+ if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) {
+ mp = mp->b_cont;
+ /*
+ * Attach the next message block with payload
+ * data to the multidata message.
+ */
+ if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+ goto pbuf_panic;
+ pld_ptr = mp->b_rptr;
+ }
+ }
+
+ ASSERT(hdr_mp->b_wptr == hdr_ptr);
+ ASSERT(mp->b_wptr == pld_ptr);
+
+ /* Update IP statistics */
+ UPDATE_MIB(&ip_mib, ipFragCreates, pkts);
+ BUMP_MIB(&ip_mib, ipFragOKs);
+ IP_STAT_UPDATE(ip_frag_mdt_pkt_out, pkts);
+
+ if (pkt_type == OB_PKT) {
+ ire->ire_ob_pkt_count += pkts;
+ if (ire->ire_ipif != NULL)
+ atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts);
+ } else {
+ /*
+ * The type is IB_PKT in the forwarding path and in
+ * the mobile IP case when the packet is being reverse-
+ * tunneled to the home agent.
+ */
+ ire->ire_ib_pkt_count += pkts;
+ ASSERT(!IRE_IS_LOCAL(ire));
+ if (ire->ire_type & IRE_BROADCAST)
+ atomic_add_32(&ire->ire_ipif->ipif_ib_pkt_count, pkts);
+ else
+ atomic_add_32(&ire->ire_ipif->ipif_fo_pkt_count, pkts);
+ }
+ ire->ire_last_used_time = lbolt;
+ /* Send it down */
+ putnext(ire->ire_stq, md_mp);
+ return;
+
+pbuf_panic:
+ cmn_err(CE_PANIC, "ip_wput_frag_mdt: payload buffer logic "
+ "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp,
+ pbuf_idx);
+ /* NOTREACHED */
+}
+
+/*
* Outbound IP fragmentation routine.
*
* NOTE : This routine does not ire_refrele the ire that is passed in
@@ -21000,29 +21326,30 @@
ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
uint32_t frag_flag)
{
- int i1;
- mblk_t *ll_hdr_mp;
- int ll_hdr_len;
- int hdr_len;
- mblk_t *hdr_mp;
- ipha_t *ipha;
- int ip_data_end;
- int len;
- mblk_t *mp = mp_orig;
- int offset;
- queue_t *q;
+ int i1;
+ mblk_t *ll_hdr_mp;
+ int ll_hdr_len;
+ int hdr_len;
+ mblk_t *hdr_mp;
+ ipha_t *ipha;
+ int ip_data_end;
+ int len;
+ mblk_t *mp = mp_orig;
+ int offset;
+ queue_t *q;
uint32_t v_hlen_tos_len;
- mblk_t *first_mp;
- boolean_t mctl_present;
- mblk_t *xmit_mp;
- mblk_t *carve_mp;
- ire_t *ire1 = NULL;
- ire_t *save_ire = NULL;
- mblk_t *next_mp = NULL;
- boolean_t last_frag = B_FALSE;
- boolean_t multirt_send = B_FALSE;
- ire_t *first_ire = NULL;
- irb_t *irb = NULL;
+ mblk_t *first_mp;
+ boolean_t mctl_present;
+ ill_t *ill;
+ mblk_t *xmit_mp;
+ mblk_t *carve_mp;
+ ire_t *ire1 = NULL;
+ ire_t *save_ire = NULL;
+ mblk_t *next_mp = NULL;
+ boolean_t last_frag = B_FALSE;
+ boolean_t multirt_send = B_FALSE;
+ ire_t *first_ire = NULL;
+ irb_t *irb = NULL;
TRACE_0(TR_FAC_IP, TR_IP_WPUT_FRAG_START,
"ip_wput_frag_start:");
@@ -21036,6 +21363,7 @@
mctl_present = B_FALSE;
}
+ ASSERT(MBLKL(mp) >= sizeof (ipha_t));
ipha = (ipha_t *)mp->b_rptr;
/*
@@ -21079,8 +21407,37 @@
}
hdr_len = (V_HLEN & 0xF) << 2;
+
ipha->ipha_hdr_checksum = 0;
+ /*
+ * Establish the number of bytes maximum per frag, after putting
+ * in the header.
+ */
+ len = (max_frag - hdr_len) & ~7;
+
+ /* Check if we can use MDT to send out the frags. */
+ ASSERT(!IRE_IS_LOCAL(ire));
+ if (hdr_len == IP_SIMPLE_HDR_LENGTH && ip_multidata_outbound &&
+ !(ire->ire_flags & RTF_MULTIRT) && !IPP_ENABLED(IPP_LOCAL_OUT) &&
+ (ill = ire_to_ill(ire)) != NULL && ILL_MDT_CAPABLE(ill) &&
+ IP_CAN_FRAG_MDT(mp, IP_SIMPLE_HDR_LENGTH, len)) {
+ ASSERT(ill->ill_mdt_capab != NULL);
+ if (!ill->ill_mdt_capab->ill_mdt_on) {
+ /*
+ * If MDT has been previously turned off in the past,
+ * and we currently can do MDT (due to IPQoS policy
+ * removal, etc.) then enable it for this interface.
+ */
+ ill->ill_mdt_capab->ill_mdt_on = 1;
+ ip1dbg(("ip_wput_frag: enabled MDT for interface %s\n",
+ ill->ill_name));
+ }
+ ip_wput_frag_mdt(ire, mp, pkt_type, len, frag_flag,
+ offset);
+ return;
+ }
+
/* Get a copy of the header for the trailing frags */
hdr_mp = ip_wput_frag_copyhdr((uchar_t *)ipha, hdr_len, offset);
if (!hdr_mp) {
@@ -21100,12 +21457,6 @@
offset <<= 3;
ip_data_end = offset + ntohs(ipha->ipha_length) - hdr_len;
- /*
- * Establish the number of bytes maximum per frag, after putting
- * in the header.
- */
- len = (max_frag - hdr_len) & ~7;
-
/* Store the length of the first fragment in the IP header. */
i1 = len + hdr_len;
ASSERT(i1 <= IP_MAXPACKET);
@@ -22565,8 +22916,6 @@
zoneid_t zoneid;
uint32_t cksum;
uint16_t *up;
- /* Hack until the UDP merge into IP happens. */
- extern boolean_t udp_compute_checksum(void);
#ifdef _BIG_ENDIAN
#define LENGTH (v_hlen_tos_len & 0xFFFF)
#else
@@ -22741,6 +23090,8 @@
offset = IP_SIMPLE_HDR_LENGTH + UDP_CHECKSUM_OFFSET;
IP_STAT(ip_out_sw_cksum);
+ IP_STAT_UPDATE(ip_udp_out_sw_cksum_bytes,
+ ntohs(htons(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH));
#define iphs ((uint16_t *)ipha)
cksum = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
iphs[9] + ntohs(htons(ipha->ipha_length) -
@@ -23790,10 +24141,10 @@
void
ip_resume_tcp_bind(void *arg, mblk_t *mp, void *arg2)
{
- conn_t *connp = (conn_t *)arg;
+ conn_t *connp = arg;
tcp_t *tcp;
- ASSERT(connp != NULL && connp->conn_tcp != NULL);
+ ASSERT(connp != NULL && IPCL_IS_TCP(connp) && connp->conn_tcp != NULL);
tcp = connp->conn_tcp;
if (connp->conn_tcp->tcp_state == TCPS_CLOSED)
@@ -23801,7 +24152,6 @@
else
tcp_rput_other(tcp, mp);
CONN_OPER_PENDING_DONE(connp);
-
}
/* Called from ip_wput for all non data messages */
@@ -24031,31 +24381,48 @@
case T_BIND_REQ: {
/* Request can get queued in bind */
ASSERT(connp != NULL);
+ /*
+ * Both TCP and UDP call ip_bind_{v4,v6}() directly
+ * instead of going through this path. We only get
+ * here in the following cases:
+ *
+ * a. Bind retries, where ipsq is non-NULL.
+ * b. T_BIND_REQ is issued from non TCP/UDP
+ * transport, e.g. icmp for raw socket,
+ * in which case ipsq will be NULL.
+ */
+ ASSERT(ipsq != NULL ||
+ (!IPCL_IS_TCP(connp) && !IPCL_IS_UDP(connp)));
+
/* Don't increment refcnt if this is a re-entry */
if (ipsq == NULL)
CONN_INC_REF(connp);
- mp = connp->conn_af_isv6 ?
- ip_bind_v6(q, mp, connp, NULL) :
- ip_bind_v4(q, mp, connp);
- if (mp != NULL) {
- tcp_t *tcp;
-
- tcp = connp->conn_tcp;
- if (tcp != NULL) {
- if (ipsq == NULL) {
- tcp_rput_other(tcp, mp);
- } else {
- CONN_INC_REF(connp);
- squeue_fill(connp->conn_sqp, mp,
- ip_resume_tcp_bind,
- connp, SQTAG_TCP_RPUTOTHER);
- return;
- }
- } else {
- qreply(q, mp);
- }
- CONN_OPER_PENDING_DONE(connp);
- }
+ mp = connp->conn_af_isv6 ? ip_bind_v6(q, mp,
+ connp, NULL) : ip_bind_v4(q, mp, connp);
+ if (mp == NULL)
+ return;
+ if (IPCL_IS_TCP(connp)) {
+ /*
+ * In the case of TCP endpoint we
+ * come here only for bind retries
+ */
+ ASSERT(ipsq != NULL);
+ CONN_INC_REF(connp);
+ squeue_fill(connp->conn_sqp, mp,
+ ip_resume_tcp_bind, connp,
+ SQTAG_BIND_RETRY);
+ return;
+ } else if (IPCL_IS_UDP(connp)) {
+ /*
+ * In the case of UDP endpoint we
+ * come here only for bind retries
+ */
+ ASSERT(ipsq != NULL);
+ udp_resume_bind(connp, mp);
+ return;
+ }
+ qreply(q, mp);
+ CONN_OPER_PENDING_DONE(connp);
return;
}
case T_SVR4_OPTMGMT_REQ:
@@ -24111,7 +24478,8 @@
}
return;
case T_UNBIND_REQ:
- ip_unbind(q, mp);
+ mp = ip_unbind(q, mp);
+ qreply(q, mp);
return;
default:
/*