6934730 rpcbind wedged up due to stale conn_idl
authormeem <Peter.Memishian@Sun.COM>
Sat, 27 Mar 2010 02:33:20 -0400
changeset 12020 61024838b665
parent 12019 cfe7a7d0d7f0
child 12021 e3ff6e69ccff
6934730 rpcbind wedged up due to stale conn_idl 6937207 stale conn drain logic and comments
usr/src/uts/common/inet/ip.h
usr/src/uts/common/inet/ip/ip.c
usr/src/uts/common/inet/ip/ip_attr.c
usr/src/uts/common/inet/ipclassifier.h
--- a/usr/src/uts/common/inet/ip.h	Fri Mar 26 20:19:35 2010 -0400
+++ b/usr/src/uts/common/inet/ip.h	Sat Mar 27 02:33:20 2010 -0400
@@ -828,15 +828,11 @@
 } iulp_t;
 
 /*
- * The conn drain list structure (idl_t).
- * The list is protected by idl_lock. Each conn_t inserted in the list
- * points back at this idl_t using conn_idl. IP primes the draining of the
- * conns queued in these lists, by qenabling the 1st conn of each list. This
- * occurs when STREAMS backenables ip_wsrv on the IP module. Each conn instance
- * of ip_wsrv successively qenables the next conn in the list.
- * idl_lock protects all other members of idl_t and conn_drain_next
- * and conn_drain_prev of conn_t. The conn_lock protects IPCF_DRAIN_DISABLED
- * flag of the conn_t and conn_idl.
+ * The conn drain list structure (idl_t), protected by idl_lock.  Each conn_t
+ * inserted in the list points back at this idl_t using conn_idl, and is
+ * chained by conn_drain_next and conn_drain_prev, which are also protected by
+ * idl_lock.  When flow control is relieved, either ip_wsrv() (STREAMS) or
+ * ill_flow_enable() (non-STREAMS) will call conn_drain().
  *
  * The conn drain list, idl_t, itself is part of tx cookie list structure.
  * A tx cookie list points to a blocked Tx ring and contains the list of
@@ -860,10 +856,6 @@
 struct idl_s {
 	conn_t		*idl_conn;		/* Head of drain list */
 	kmutex_t	idl_lock;		/* Lock for this list */
-	uint32_t
-		idl_repeat : 1,			/* Last conn must re-enable */
-						/* drain list again */
-		idl_unused : 31;
 	idl_tx_list_t	*idl_itl;
 };
 
--- a/usr/src/uts/common/inet/ip/ip.c	Fri Mar 26 20:19:35 2010 -0400
+++ b/usr/src/uts/common/inet/ip/ip.c	Sat Mar 27 02:33:20 2010 -0400
@@ -503,50 +503,36 @@
  * |idl_tx_list[n]|-> ...
  * ----------------
  *
- * When mac_tx() returns a cookie, the cookie is used to hash into a
- * idl_tx_list in ips_idl_tx_list[] array. Then conn_drain_insert() is
- * called passing idl_tx_list. The connp gets inserted in a drain list
- * pointed to by idl_tx_list. conn_drain_list() asserts flow control for
- * the sockets (non stream based) and sets QFULL condition on the conn_wq
- * of streams sockets, or the su_txqfull for non-streams sockets.
- * connp->conn_direct_blocked will be set to indicate the blocked
- * condition.
- *
- * GLDv3 mac layer calls ill_flow_enable() when flow control is relieved.
- * A cookie is passed in the call to ill_flow_enable() that identifies the
- * blocked Tx ring. This cookie is used to get to the idl_tx_list that
- * contains the blocked connp's. conn_walk_drain() uses the idl_tx_list_t
- * and goes through each conn in the drain list and calls conn_idl_remove
- * for the conn to clear the qfull condition for the conn, as well as to
- * remove the conn from the idl list. In addition, streams based sockets
- * will have the conn_wq enabled, causing ip_wsrv to run for the
- * conn. ip_wsrv drains the queued messages, and removes the conn from the
- * drain list, if all messages were drained. It also notifies the
- * conn_upcalls for the conn to signal that flow-control has opened up.
- *
- * In reality the drain list is not a single list, but a configurable number
- * of lists. conn_walk_drain() in the IP module, notifies the conn_upcalls for
- * each conn in the list. conn_drain_insert and conn_drain_tail are the only
- * functions that manipulate this drain list. conn_drain_insert is called in
- * from the protocol layer when conn_ip_output returns EWOULDBLOCK.
- * (as opposed to from ip_wsrv context for STREAMS
- * case -- see below). The synchronization between drain insertion and flow
- * control wakeup is handled by using idl_txl->txl_lock.
- *
- * Flow control using STREAMS:
- * When ILL_DIRECT_CAPABLE() is not TRUE, STREAMS flow control mechanism
- * is used. On the send side, if the packet cannot be sent down to the
- * driver by IP, because of a canput failure, ip_xmit drops the packet
- * and returns EWOULDBLOCK to the caller, who may then invoke
- * ixa_check_drain_insert to insert the conn on the 0'th drain list.
- * When ip_wsrv runs on the ill_wq because flow control has been relieved, the
- * blocked conns in the * 0'th drain list is drained as with the
- * non-STREAMS case.
- *
- * In both the STREAMS and non-STREAMS case, the sockfs upcall to set
- * qfull is done when the conn is inserted into the drain list
- * (conn_drain_insert()) and cleared when the conn is removed from the drain
- * list (conn_idl_remove()).
+ * When mac_tx() returns a cookie, the cookie is hashed into an index into
+ * ips_idl_tx_list[], and conn_drain_insert() is called with the idl_tx_list
+ * to insert the conn onto.  conn_drain_insert() asserts flow control for the
+ * sockets via su_txq_full() (non-STREAMS) or QFULL on conn_wq (STREAMS).
+ * Further, conn_blocked is set to indicate that the conn is blocked.
+ *
+ * GLDv3 calls ill_flow_enable() when flow control is relieved.  The cookie
+ * passed in the call to ill_flow_enable() identifies the blocked Tx ring and
+ * is again hashed to locate the appropriate idl_tx_list, which is then
+ * drained via conn_walk_drain().  conn_walk_drain() goes through each conn in
+ * the drain list and calls conn_drain_remove() to clear flow control (via
+ * calling su_txq_full() or clearing QFULL), and remove the conn from the
+ * drain list.
+ *
+ * Note that the drain list is not a single list but a (configurable) array of
+ * lists (8 elements by default).  Synchronization between drain insertion and
+ * flow control wakeup is handled by using idl_txl->txl_lock, and only
+ * conn_drain_insert() and conn_drain_remove() manipulate the drain list.
+ *
+ * Flow control via STREAMS is used when ILL_DIRECT_CAPABLE() returns FALSE.
+ * On the send side, if the packet cannot be sent down to the driver by IP
+ * (canput() fails), ip_xmit() drops the packet and returns EWOULDBLOCK to the
+ * caller, who may then invoke ixa_check_drain_insert() to insert the conn on
+ * the 0'th drain list.  When ip_wsrv() runs on the ill_wq because flow
+ * control has been relieved, the blocked conns in the 0'th drain list are
+ * drained as in the non-STREAMS case.
+ *
+ * In both the STREAMS and non-STREAMS cases, the sockfs upcall to set QFULL
+ * is done when the conn is inserted into the drain list (conn_drain_insert())
+ * and cleared when the conn is removed from the it (conn_drain_remove()).
  *
  * IPQOS notes:
  *
@@ -727,7 +713,7 @@
 
 static void	conn_drain_init(ip_stack_t *);
 static void	conn_drain_fini(ip_stack_t *);
-static void	conn_drain_tail(conn_t *connp, boolean_t closing);
+static void	conn_drain(conn_t *connp, boolean_t closing);
 
 static void	conn_walk_drain(ip_stack_t *, idl_tx_list_t *);
 static void	conn_walk_sctp(pfv_t, void *, zoneid_t, netstack_t *);
@@ -4192,7 +4178,7 @@
 	 */
 	if (drain_cleanup_reqd && connp->conn_idl != NULL) {
 		mutex_enter(&connp->conn_idl->idl_lock);
-		conn_drain_tail(connp, B_TRUE);
+		conn_drain(connp, B_TRUE);
 		mutex_exit(&connp->conn_idl->idl_lock);
 	}
 
@@ -13010,15 +12996,11 @@
 }
 
 /*
- * Note: For an overview of how flowcontrol is handled in IP please see the
- * IP Flowcontrol notes at the top of this file.
- *
- * Flow control has blocked us from proceeding. Insert the given conn in one
- * of the conn drain lists. These conn wq's will be qenabled later on when
- * STREAMS flow control does a backenable. conn_walk_drain will enable
- * the first conn in each of these drain lists. Each of these qenabled conns
- * in turn enables the next in the list, after it runs, or when it closes,
- * thus sustaining the drain process.
+ * Flow control has blocked us from proceeding.  Insert the given conn in one
+ * of the conn drain lists.  When flow control is unblocked, either ip_wsrv()
+ * (STREAMS) or ill_flow_enable() (direct) will be called back, which in turn
+ * will call conn_walk_drain().  See the flow control notes at the top of this
+ * file for more details.
  */
 void
 conn_drain_insert(conn_t *connp, idl_tx_list_t *tx_list)
@@ -13049,6 +13031,8 @@
 		if (index == ipst->ips_conn_drain_list_cnt)
 			index = 0;
 		tx_list->txl_drain_index = index;
+	} else {
+		ASSERT(connp->conn_idl->idl_itl == tx_list);
 	}
 	mutex_exit(&connp->conn_lock);
 
@@ -13094,18 +13078,13 @@
 }
 
 static void
-conn_idl_remove(conn_t *connp)
+conn_drain_remove(conn_t *connp)
 {
 	idl_t *idl = connp->conn_idl;
 
 	if (idl != NULL) {
 		/*
-		 * Remove ourself from the drain list, if we did not do
-		 * a putq, or if the conn is closing.
-		 * Note: It is possible that q->q_first is non-null. It means
-		 * that these messages landed after we did a enableok() in
-		 * ip_wsrv. Thus STREAMS will call ip_wsrv once again to
-		 * service them.
+		 * Remove ourself from the drain list.
 		 */
 		if (connp->conn_drain_next == connp) {
 			/* Singleton in the list */
@@ -13119,6 +13098,16 @@
 			if (idl->idl_conn == connp)
 				idl->idl_conn = connp->conn_drain_next;
 		}
+
+		/*
+		 * NOTE: because conn_idl is associated with a specific drain
+		 * list which in turn is tied to the index the TX ring
+		 * (txl_cookie) hashes to, and because the TX ring can change
+		 * over the lifetime of the conn_t, we must clear conn_idl so
+		 * a subsequent conn_drain_insert() will set conn_idl again
+		 * based on the latest txl_cookie.
+		 */
+		connp->conn_idl = NULL;
 	}
 	connp->conn_drain_next = NULL;
 	connp->conn_drain_prev = NULL;
@@ -13139,7 +13128,7 @@
  * inform the sockfs upcalls about the change in flow-control.
  */
 static void
-conn_drain_tail(conn_t *connp, boolean_t closing)
+conn_drain(conn_t *connp, boolean_t closing)
 {
 	idl_t *idl;
 	conn_t *next_connp;
@@ -13157,42 +13146,31 @@
 	ASSERT(!closing || connp == NULL || connp->conn_idl != NULL);
 
 	/*
-	 * If connp->conn_idl is null, the conn has not been inserted into any
-	 * drain list even once since creation of the conn. Just return.
-	 */
-	if (connp == NULL || connp->conn_idl == NULL)
+	 * If the conn doesn't exist or is not on a drain list, bail.
+	 */
+	if (connp == NULL || connp->conn_idl == NULL ||
+	    connp->conn_drain_prev == NULL) {
 		return;
-
-	if (connp->conn_drain_prev == NULL) {
-		/* This conn is currently not in the drain list.  */
-		return;
-	}
+	}
+
 	idl = connp->conn_idl;
 	if (!closing) {
-		/*
-		 * This conn is the current drainer. If this is the last conn
-		 * in the drain list, we need to do more checks, in the 'if'
-		 * below. Otherwwise we need to just qenable the next conn,
-		 * to sustain the draining, and is handled in the 'else'
-		 * below.
-		 */
 		next_connp = connp->conn_drain_next;
 		while (next_connp != connp) {
 			conn_t *delconnp = next_connp;
 
 			next_connp = next_connp->conn_drain_next;
-			conn_idl_remove(delconnp);
+			conn_drain_remove(delconnp);
 		}
 		ASSERT(connp->conn_drain_next == idl->idl_conn);
 	}
-	conn_idl_remove(connp);
-
+	conn_drain_remove(connp);
 }
 
 /*
  * Write service routine. Shared perimeter entry point.
  * The device queue's messages has fallen below the low water mark and STREAMS
- * has backenabled the ill_wq. Send sockfs notification about flow-control onx
+ * has backenabled the ill_wq. Send sockfs notification about flow-control on
  * each waiting conn.
  */
 void
@@ -13244,13 +13222,8 @@
 }
 
 /*
- * Flowcontrol has relieved, and STREAMS has backenabled us. For each list
- * of conns that need to be drained, check if drain is already in progress.
- * If so set the idl_repeat bit, indicating that the last conn in the list
- * needs to reinitiate the drain once again, for the list. If drain is not
- * in progress for the list, initiate the draining, by qenabling the 1st
- * conn in the list. The drain is self-sustaining, each qenabled conn will
- * in turn qenable the next conn, when it is done/blocked/closing.
+ * Flow control has been relieved and STREAMS has backenabled us; drain
+ * all the conn lists on `tx_list'.
  */
 static void
 conn_walk_drain(ip_stack_t *ipst, idl_tx_list_t *tx_list)
@@ -13263,7 +13236,7 @@
 	for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) {
 		idl = &tx_list->txl_drain_list[i];
 		mutex_enter(&idl->idl_lock);
-		conn_drain_tail(idl->idl_conn, B_FALSE);
+		conn_drain(idl->idl_conn, B_FALSE);
 		mutex_exit(&idl->idl_lock);
 	}
 }
@@ -13376,7 +13349,10 @@
 			}
 		}
 	}
-	connp->conn_direct_blocked = B_FALSE;
+
+	mutex_enter(&connp->conn_lock);
+	connp->conn_blocked = B_FALSE;
+	mutex_exit(&connp->conn_lock);
 }
 
 /*
--- a/usr/src/uts/common/inet/ip/ip_attr.c	Fri Mar 26 20:19:35 2010 -0400
+++ b/usr/src/uts/common/inet/ip/ip_attr.c	Sat Mar 27 02:33:20 2010 -0400
@@ -1307,35 +1307,49 @@
 
 	idd = &(ill)->ill_dld_capab->idc_direct;
 	idl_txl = &ixa->ixa_ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
-	if (cookie == 0) {
-		/*
-		 * ip_xmit failed the canputnext check
-		 */
-		connp->conn_did_putbq = 1;
-		ASSERT(cookie == 0);
-		conn_drain_insert(connp, idl_txl);
-		if (!IPCL_IS_NONSTR(connp))
-			noenable(connp->conn_wq);
-		return (B_TRUE);
-	}
+	mutex_enter(&idl_txl->txl_lock);
+
+	/*
+	 * If `cookie' is zero, ip_xmit() -> canputnext() failed -- i.e., flow
+	 * control is asserted on an ill that does not support direct calls.
+	 * Jump to insert.
+	 */
+	if (cookie == 0)
+		goto tryinsert;
+
 	ASSERT(ILL_DIRECT_CAPABLE(ill));
-	mutex_enter(&idl_txl->txl_lock);
-	if (connp->conn_direct_blocked ||
-	    (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, cookie) == 0)) {
-		DTRACE_PROBE1(ill__tx__not__blocked, boolean,
-		    connp->conn_direct_blocked);
+
+	if (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, cookie) == 0) {
+		DTRACE_PROBE1(ill__tx__not__blocked, uintptr_t, cookie);
 	} else if (idl_txl->txl_cookie != NULL &&
 	    idl_txl->txl_cookie != ixa->ixa_cookie) {
-		DTRACE_PROBE2(ill__send__tx__collision, uintptr_t, cookie,
+		DTRACE_PROBE2(ill__tx__cookie__collision, uintptr_t, cookie,
 		    uintptr_t, idl_txl->txl_cookie);
-		/* bump kstat for cookie collision */
+		/* TODO: bump kstat for cookie collision */
 	} else {
-		connp->conn_direct_blocked = B_TRUE;
-		idl_txl->txl_cookie = cookie;
-		conn_drain_insert(connp, idl_txl);
-		if (!IPCL_IS_NONSTR(connp))
-			noenable(connp->conn_wq);
-		inserted = B_TRUE;
+		/*
+		 * Check/set conn_blocked under conn_lock.  Note that txl_lock
+		 * will not suffice since two separate UDP threads may be
+		 * racing to send to different destinations that are
+		 * associated with different cookies and thus may not be
+		 * holding the same txl_lock.  Further, since a given conn_t
+		 * can only be on a single drain list, the conn_t will be
+		 * enqueued on whichever thread wins this race.
+		 */
+tryinsert:	mutex_enter(&connp->conn_lock);
+		if (connp->conn_blocked) {
+			DTRACE_PROBE1(ill__tx__conn__already__blocked,
+			    conn_t *, connp);
+			mutex_exit(&connp->conn_lock);
+		} else {
+			connp->conn_blocked = B_TRUE;
+			mutex_exit(&connp->conn_lock);
+			idl_txl->txl_cookie = cookie;
+			conn_drain_insert(connp, idl_txl);
+			if (!IPCL_IS_NONSTR(connp))
+				noenable(connp->conn_wq);
+			inserted = B_TRUE;
+		}
 	}
 	mutex_exit(&idl_txl->txl_lock);
 	return (inserted);
--- a/usr/src/uts/common/inet/ipclassifier.h	Fri Mar 26 20:19:35 2010 -0400
+++ b/usr/src/uts/common/inet/ipclassifier.h	Sat Mar 27 02:33:20 2010 -0400
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -276,28 +276,26 @@
 		conn_reuseaddr : 1,		/* SO_REUSEADDR state */
 		conn_keepalive : 1,		/* SO_KEEPALIVE state */
 		conn_multi_router : 1,		/* Wants all multicast pkts */
-		conn_did_putbq : 1,		/* ip_wput did a putbq */
+		conn_unspec_src : 1,		/* IP_UNSPEC_SRC */
 
-		conn_unspec_src : 1,		/* IP_UNSPEC_SRC */
 		conn_policy_cached : 1,		/* Is policy cached/latched ? */
 		conn_in_enforce_policy : 1,	/* Enforce Policy on inbound */
 		conn_out_enforce_policy : 1,	/* Enforce Policy on outbound */
+		conn_debug : 1,			/* SO_DEBUG */
 
-		conn_debug : 1,			/* SO_DEBUG */
 		conn_ipv6_v6only : 1,		/* IPV6_V6ONLY */
 		conn_oobinline : 1, 		/* SO_OOBINLINE state */
 		conn_dgram_errind : 1,		/* SO_DGRAM_ERRIND state */
+		conn_exclbind : 1,		/* SO_EXCLBIND state */
 
-		conn_exclbind : 1,		/* SO_EXCLBIND state */
 		conn_mdt_ok : 1,		/* MDT is permitted */
 		conn_allzones : 1,		/* SO_ALLZONES */
 		conn_ipv6_recvpathmtu : 1,	/* IPV6_RECVPATHMTU */
-
 		conn_mcbc_bind : 1,		/* Bound to multi/broadcast */
 
-		conn_pad_to_bit_31 : 11;
+		conn_pad_to_bit_31 : 12;
 
-	boolean_t conn_direct_blocked;		/* conn is flow-controlled */
+	boolean_t	conn_blocked;		/* conn is flow-controlled */
 
 	squeue_t	*conn_initial_sqp;	/* Squeue at open time */
 	squeue_t	*conn_final_sqp;	/* Squeue after connect */