1361 Add support for socket options TCP_KEEPCNT, TCP_KEEPIDLE, TCP_KEEPINTVL
author Garrett D'Amore <garrett@nexenta.com>
Wed, 17 Aug 2011 16:31:10 -0700
changeset 13435 3185061eadee
parent 13434 af0bf36c290c
child 13436 a6bc41319a2d
1361 Add support for socket options TCP_KEEPCNT, TCP_KEEPIDLE, TCP_KEEPINTVL Reviewed by: Pavan <[email protected]> Reviewed by: Dan McDonald <danmcd at nexenta.com> Reviewed by: Garrett D'Amore <[email protected]> Approved by: Garrett D'Amore <[email protected]>
usr/src/man/man7p/tcp.7p
usr/src/uts/common/inet/tcp.h
usr/src/uts/common/inet/tcp/tcp.c
usr/src/uts/common/inet/tcp/tcp_opt_data.c
usr/src/uts/common/inet/tcp/tcp_timers.c
usr/src/uts/common/netinet/tcp.h
--- a/usr/src/man/man7p/tcp.7p	Mon Aug 15 19:42:37 2011 +0100
+++ b/usr/src/man/man7p/tcp.7p	Wed Aug 17 16:31:10 2011 -0700
@@ -1,5 +1,6 @@
 '\" te
 .\" Copyright (c) 2006, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
 .\" Copyright 1989 AT&T
 .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License").  You may not use this file except in compliance with the License.
 .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.  See the License for the specific language governing permissions and limitations under the License.
@@ -344,6 +345,14 @@
 out and abort the connection when probing. The system default is controlled by
 the TCP ndd parameter tcp_keepalive_abort_interval. The default is eight
 minutes.
+.sp
+.LP
+Socket options TCP_KEEPIDLE, TCP_KEEPCNT and TCP_KEEPINTVL are also supported
+for compatibility with other Unix flavors. The TCP_KEEPIDLE option specifies
+the interval in seconds before sending out the first keep-alive probe.
+TCP_KEEPCNT specifies the number of keep-alive probes to be sent before
+aborting the connection in the event of no response from the peer.
+TCP_KEEPINTVL specifies the interval in seconds between successive probes.
 .SH SEE ALSO
 .sp
 .LP
@@ -385,7 +394,7 @@
 \fB\fBEISCONN\fR\fR
 .ad
 .RS 17n
-.rt  
+.rt
 A \fBconnect()\fR operation was attempted on a socket on which a
 \fBconnect()\fR operation had already been performed.
 .RE
@@ -397,7 +406,7 @@
 \fB\fBETIMEDOUT\fR\fR
 .ad
 .RS 17n
-.rt  
+.rt
 A connection was dropped due to excessive retransmissions.
 .RE
 
@@ -408,7 +417,7 @@
 \fB\fBECONNRESET\fR\fR
 .ad
 .RS 17n
-.rt  
+.rt
 The remote peer forced the connection to be closed (usually because the remote
 machine has lost state information about the connection due to a crash).
 .RE
@@ -420,7 +429,7 @@
 \fB\fBECONNREFUSED\fR\fR
 .ad
 .RS 17n
-.rt  
+.rt
 The remote peer actively refused connection establishment (usually because no
 process is listening to the port).
 .RE
@@ -432,7 +441,7 @@
 \fB\fBEADDRINUSE\fR\fR
 .ad
 .RS 17n
-.rt  
+.rt
 A \fBbind()\fR operation was attempted on a socket with a network address/port
 pair that has already been bound to another socket.
 .RE
@@ -444,7 +453,7 @@
 \fB\fBEADDRNOTAVAIL\fR\fR
 .ad
 .RS 17n
-.rt  
+.rt
 A \fBbind()\fR operation was attempted on a socket with a network address for
 which no network interface exists.
 .RE
@@ -456,7 +465,7 @@
 \fB\fBEACCES\fR\fR
 .ad
 .RS 17n
-.rt  
+.rt
 A \fBbind()\fR operation was attempted with a "reserved" port number and the
 effective user \fBID\fR of the process was not the privileged user.
 .RE
@@ -468,7 +477,7 @@
 \fB\fBENOBUFS\fR\fR
 .ad
 .RS 17n
-.rt  
+.rt
 The system ran out of memory for internal data structures.
 .RE
 
--- a/usr/src/uts/common/inet/tcp.h	Mon Aug 15 19:42:37 2011 +0100
+++ b/usr/src/uts/common/inet/tcp.h	Wed Aug 17 16:31:10 2011 -0700
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  */
 /* Copyright (c) 1990 Mentat Inc. */
 
@@ -334,11 +335,26 @@
 	} tcp_conn;
 	uint32_t tcp_syn_rcvd_timeout;	/* How many SYN_RCVD timeout in q0 */
 
-	/* TCP Keepalive Timer members */
+	/*
+	 * TCP Keepalive Timer members.
+	 * All keepalive timer intervals are in milliseconds.
+	 */
 	int32_t	tcp_ka_last_intrvl;	/* Last probe interval */
 	timeout_id_t tcp_ka_tid;	/* Keepalive timer ID */
 	uint32_t tcp_ka_interval;	/* Keepalive interval */
+
+	/*
+	 * TCP connection is terminated if we don't hear back from the peer
+	 * for tcp_ka_abort_thres milliseconds after the first keepalive probe.
+	 * tcp_ka_rinterval is the interval in milliseconds between successive
+	 * keepalive probes. tcp_ka_cnt is the number of keepalive probes to
+	 * be sent before terminating the connection, if we don't hear back from
+	 * the peer.
+	 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt
+	 */
+	uint32_t tcp_ka_rinterval;	/* keepalive retransmit interval */
 	uint32_t tcp_ka_abort_thres;	/* Keepalive abort threshold */
+	uint32_t tcp_ka_cnt;		/* count of keepalive probes */
 
 	int32_t	tcp_client_errno;	/* How the client screwed up */
 
--- a/usr/src/uts/common/inet/tcp/tcp.c	Mon Aug 15 19:42:37 2011 +0100
+++ b/usr/src/uts/common/inet/tcp/tcp.c	Wed Aug 17 16:31:10 2011 -0700
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, Joyent Inc. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  */
 /* Copyright (c) 1990 Mentat Inc. */
 
@@ -2354,6 +2355,8 @@
 
 		tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
 		tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
+		tcp->tcp_ka_cnt = 0;
+		tcp->tcp_ka_rinterval = 0;
 
 		/*
 		 * Default value of tcp_init_cwnd is 0, so no need to set here
--- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c	Mon Aug 15 19:42:37 2011 +0100
+++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c	Wed Aug 17 16:31:10 2011 -0700
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -117,6 +118,12 @@
 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
 	sizeof (int), 0	},
 
+{ TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+
+{ TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+
+{ TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+
 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
 	sizeof (int), 0	},
 
@@ -403,6 +410,25 @@
 		case TCP_KEEPALIVE_THRESHOLD:
 			*i1 = tcp->tcp_ka_interval;
 			return (sizeof (int));
+
+		/*
+		 * TCP_KEEPIDLE expects value in seconds, but
+		 * tcp_ka_interval is in milliseconds.
+		 */
+		case TCP_KEEPIDLE:
+			*i1 = tcp->tcp_ka_interval / 1000;
+			return (sizeof (int));
+		case TCP_KEEPCNT:
+			*i1 = tcp->tcp_ka_cnt;
+			return (sizeof (int));
+
+		/*
+		 * TCP_KEEPINTVL expects value in seconds, but
+		 * tcp_ka_rinterval is in milliseconds.
+		 */
+		case TCP_KEEPINTVL:
+			*i1 = tcp->tcp_ka_rinterval / 1000;
+			return (sizeof (int));
 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
 			*i1 = tcp->tcp_ka_abort_thres;
 			return (sizeof (int));
@@ -682,6 +708,18 @@
 			}
 			tcp->tcp_init_cwnd = val;
 			break;
+
+		/*
+		 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
+		 * is in milliseconds. TCP_KEEPIDLE is introduced for
+		 * compatibility with other Unix flavors.
+		 * We can fall through to the TCP_KEEPALIVE_THRESHOLD logic
+		 * after converting the input to milliseconds.
+		 */
+		case TCP_KEEPIDLE:
+			*i1 *= 1000;
+			/* fall through */
+
 		case TCP_KEEPALIVE_THRESHOLD:
 			if (checkonly)
 				break;
@@ -708,6 +746,66 @@
 				}
 			}
 			break;
+
+		/*
+		 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
+		 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
+		 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
+		 * tcp_ka_cnt.
+		 */
+		case TCP_KEEPCNT:
+			if (checkonly)
+				break;
+
+			if (*i1 == 0) {
+				return (EINVAL);
+			} else if (tcp->tcp_ka_rinterval == 0) {
+				if ((tcp->tcp_ka_abort_thres / *i1) <
+				    tcp->tcp_rto_min ||
+				    (tcp->tcp_ka_abort_thres / *i1) >
+				    tcp->tcp_rto_max)
+					return (EINVAL);
+
+				tcp->tcp_ka_rinterval =
+				    tcp->tcp_ka_abort_thres / *i1;
+			} else {
+				if ((*i1 * tcp->tcp_ka_rinterval) <
+				    tcps->tcps_keepalive_abort_interval_low ||
+				    (*i1 * tcp->tcp_ka_rinterval) >
+				    tcps->tcps_keepalive_abort_interval_high)
+					return (EINVAL);
+				tcp->tcp_ka_abort_thres =
+				    (*i1 * tcp->tcp_ka_rinterval);
+			}
+			tcp->tcp_ka_cnt = *i1;
+			break;
+		case TCP_KEEPINTVL:
+			/*
+			 * TCP_KEEPINTVL is specified in seconds, but
+			 * tcp_ka_rinterval is in milliseconds.
+			 */
+
+			if (checkonly)
+				break;
+
+			if ((*i1 * 1000) < tcp->tcp_rto_min ||
+			    (*i1 * 1000) > tcp->tcp_rto_max)
+				return (EINVAL);
+
+			if (tcp->tcp_ka_cnt == 0) {
+				tcp->tcp_ka_cnt =
+				    tcp->tcp_ka_abort_thres / (*i1 * 1000);
+			} else {
+				if ((*i1 * tcp->tcp_ka_cnt * 1000) <
+				    tcps->tcps_keepalive_abort_interval_low ||
+				    (*i1 * tcp->tcp_ka_cnt * 1000) >
+				    tcps->tcps_keepalive_abort_interval_high)
+					return (EINVAL);
+				tcp->tcp_ka_abort_thres =
+				    (*i1 * tcp->tcp_ka_cnt * 1000);
+			}
+			tcp->tcp_ka_rinterval = *i1 * 1000;
+			break;
 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
 			if (!checkonly) {
 				if (*i1 <
@@ -718,6 +816,8 @@
 					return (EINVAL);
 				}
 				tcp->tcp_ka_abort_thres = *i1;
+				tcp->tcp_ka_cnt = 0;
+				tcp->tcp_ka_rinterval = 0;
 			}
 			break;
 		case TCP_CORK:
--- a/usr/src/uts/common/inet/tcp/tcp_timers.c	Mon Aug 15 19:42:37 2011 +0100
+++ b/usr/src/uts/common/inet/tcp/tcp_timers.c	Wed Aug 17 16:31:10 2011 -0700
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -390,6 +391,11 @@
  * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
  * kill the connection unless the keepalive abort threshold is 0.  In
  * that case, we will probe "forever."
+ * If tcp_ka_cnt and tcp_ka_rinterval are non-zero, then we do not follow
+ * the exponential backoff, but send probes tcp_ka_cnt times at regular
+ * intervals of tcp_ka_rinterval milliseconds until we hear back from the
+ * peer. Kill the connection if we don't hear back from the peer after
+ * tcp_ka_cnt probes are sent.
  */
 void
 tcp_keepalive_timer(void *arg)
@@ -455,7 +461,9 @@
 			if (mp != NULL) {
 				tcp_send_data(tcp, mp);
 				TCPS_BUMP_MIB(tcps, tcpTimKeepaliveProbe);
-				if (tcp->tcp_ka_last_intrvl != 0) {
+				if (tcp->tcp_ka_rinterval) {
+					firetime = tcp->tcp_ka_rinterval;
+				} else if (tcp->tcp_ka_last_intrvl != 0) {
 					int max;
 					/*
 					 * We should probe again at least
--- a/usr/src/uts/common/netinet/tcp.h	Mon Aug 15 19:42:37 2011 +0100
+++ b/usr/src/uts/common/netinet/tcp.h	Wed Aug 17 16:31:10 2011 -0700
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  */
 
 /*
@@ -125,6 +126,9 @@
 /* gap for expansion of ``standard'' options */
 #define	TCP_ANONPRIVBIND		0x20	/* for internal use only  */
 #define	TCP_EXCLBIND			0x21	/* for internal use only  */
+#define	TCP_KEEPIDLE			0x22
+#define	TCP_KEEPCNT			0x23
+#define	TCP_KEEPINTVL			0x24
 
 #ifdef	__cplusplus
 }