components/open-fabrics/rds-tools/patches/001-rds-tools-solaris_port.patch
author Rich Burridge <rich.burridge@oracle.com>
Tue, 02 May 2017 17:33:26 -0700
changeset 7964 d9801318ed3d
parent 5826 9c90e4a8156c
permissions -rw-r--r--
25981468 Build ilmbase and openexr with the GNU compilers

# This patch was developed both in-house and from outside. We plan to submit it
# upstream, but do not yet have a target date for doing so.
#
# HG changeset patch
# Parent  1afa90e87b4b0fcc3ccef29f1029fa7d1628f389

diff -r 1afa90e87b4b Makefile.in
--- a/Makefile.in	Tue Feb 23 22:30:57 2016 -0800
+++ b/Makefile.in	Tue Feb 23 22:30:58 2016 -0800
@@ -4,18 +4,22 @@
 mandir		= $(DESTDIR)@mandir@
 incdir		= $(DESTDIR)@includedir@
 
+CC=gcc
+
 all: all-programs
 
-CFLAGS = -O2 -Wall -Iinclude
-CPPFLAGS = -DDEBUG_EXE -DRDS_VERSION=\"@VERSION@\" -MD -MP -MF $(@D)/.$(basename $(@F)).d
+CFLAGS += -O2 -Wall -Iinclude
+CPPFLAGS += -D_XOPEN_SOURCE=600 -D__EXTENSIONS__ \
+	-DDEBUG_EXE -DRDS_VERSION=\"@VERSION@\" -MD -MP -MF $(@D)/.$(basename $(@F)).d
+LDFLAGS += -libverbs -lsocket -lnsl -llgrp
 
 HEADERS = kernel-list.h pfhack.h include/rds.h
-COMMON_SOURCES = pfhack.c
+COMMON_SOURCES = pfhack.c rds-vendor.c
 SOURCES = $(addsuffix .c,$(PROGRAMS)) $(COMMON_SOURCES)
 CLEAN_OBJECTS = $(addsuffix .o,$(PROGRAMS)) $(subst .c,.o,$(COMMON_SOURCES))
 
 # This is the default
-DYNAMIC_PF_RDS = true
+#DYNAMIC_PF_RDS = true
 
 ifneq ($(DYNAMIC_PF_RDS),)
 CPPFLAGS += -DDYNAMIC_PF_RDS
@@ -29,14 +33,14 @@
 all-programs: $(PROGRAMS)
 
 install: $(PROGRAMS)
-	install -d $(bindir)
-	install -m 555 -s $(PROGRAMS) $(bindir)
-	install -d $(mandir)/man1
-	install -d $(mandir)/man7
-	install -m 644 *.1 $(mandir)/man1
-	install -m 644 *.7 $(mandir)/man7
-	install -d $(incdir)/net
-	install -m 444 include/rds.h $(incdir)/net
+	$(INSTALL) -d $(bindir)
+	$(INSTALL) -m 755 -s $(PROGRAMS) $(bindir)
+	$(INSTALL) -d $(mandir)/man1
+	$(INSTALL) -d $(mandir)/man7
+	$(INSTALL) -m 644 *.1 $(mandir)/man1
+	$(INSTALL) -m 644 *.7 $(mandir)/man7
+	$(INSTALL) -d $(incdir)/net
+	$(INSTALL) -m 444 include/rds.h $(incdir)/net
 
 clean:
 	rm -f $(PROGRAMS) $(CLEAN_OBJECTS)
@@ -47,7 +51,7 @@
 
 
 $(PROGRAMS) : % : %.o $(COMMON_OBJECTS)
-	gcc $(CFLAGS) $(LDFLAGS) -o $@ $^
+	$(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -o $@ $^
 
 LOCAL_DFILES := $(wildcard .*.d)
 ifneq ($(LOCAL_DFILES),)
@@ -72,8 +76,6 @@
 		configure \
 		README \
 		rds-tools.txt \
-		stap/rds.stp \
-		stap/README \
 		docs/rds-architecture.txt \
 		examples/Makefile \
 		examples/rds-sample.c \
diff -r 1afa90e87b4b configure
--- a/configure	Tue Feb 23 22:30:57 2016 -0800
+++ b/configure	Tue Feb 23 22:30:58 2016 -0800
@@ -1215,7 +1215,7 @@
 
 
 
-VERSION=2.0.4
+VERSION=2.0.7
 RELEASE=1
 
 
diff -r 1afa90e87b4b configure.in
--- a/configure.in	Tue Feb 23 22:30:57 2016 -0800
+++ b/configure.in	Tue Feb 23 22:30:58 2016 -0800
@@ -1,7 +1,7 @@
 AC_PREREQ(2.55)
 AC_INIT()
 
-VERSION=2.0.4
+VERSION=2.0.7
 RELEASE=1
 
 AC_SUBST(VERSION)
diff -r 1afa90e87b4b examples/Makefile
--- a/examples/Makefile	Tue Feb 23 22:30:57 2016 -0800
+++ b/examples/Makefile	Tue Feb 23 22:30:58 2016 -0800
@@ -1,6 +1,12 @@
+CC=gcc
+LIBS = -lsocket -lnsl
+CPPFLAGS += -D_XOPEN_SOURCE=600 -D__EXTENSIONS__
 
 all: rds-sample
 
-rds-sample: rds-sample.o
+rds-sample: rds-sample.c
+	$(CC) $(CPPFLAGS) $(CFLAGS) -o rds-sample rds-sample.c $(LIBS)
 
-CFLAGS = -I ../include
+clean:
+	rm -rf rds-sample
+
diff -r 1afa90e87b4b examples/rds-sample.c
--- a/examples/rds-sample.c	Tue Feb 23 22:30:57 2016 -0800
+++ b/examples/rds-sample.c	Tue Feb 23 22:30:58 2016 -0800
@@ -26,6 +26,7 @@
 #include <string.h>
 #include <stdlib.h>
 
+#if !(defined(__SVR4) && defined(__sun))
 /* FIXME - this is a hack to getaround RDS not exporting any header files.
  * This is a local copy of the file found at net/rds/
  */
@@ -33,6 +34,9 @@
 /* These are defined in rds.h....but that file is not happily included */
 #define SOL_RDS		272
 #define PF_RDS		28
+#else
+#include <sys/rds.h>
+#endif
 
 
 #define TESTPORT	4000
@@ -107,12 +111,12 @@
 	cmsg->cmsg_type = RDS_CMSG_RDMA_ARGS;
 	cmsg->cmsg_len = CMSG_LEN(sizeof(struct rds_rdma_args));
 
-	iov.addr = (uint64_t) buf;
+	iov.addr = (uint64_t)(uintptr_t)buf;
 	iov.bytes = sizeof(struct rdss_message);
 
 	args->remote_vec.addr = 0;
 	args->remote_vec.bytes = sizeof(struct rdss_message);
-	args->local_vec_addr = (uint64_t) &iov;
+	args->local_vec_addr = (uint64_t)(uintptr_t)&iov;
 	args->nr_local = 1;
 	args->flags = remote_flags ? (RDS_RDMA_READWRITE | RDS_RDMA_FENCE) : 0;
 	args->flags |= RDS_RDMA_NOTIFY_ME;
@@ -244,9 +248,9 @@
 	void *ctlbuf;
 	struct iovec *iov;
 
-	mr_args.vec.addr = (uint64_t) buf;
+	mr_args.vec.addr = (uint64_t)(uintptr_t)buf;
 	mr_args.vec.bytes = sizeof(struct rdss_message);
-	mr_args.cookie_addr = (uint64_t) cookie;
+	mr_args.cookie_addr = (uint64_t)(uintptr_t)cookie;
 	mr_args.flags = RDS_RDMA_USE_ONCE;
 
 	ctlbuf = calloc(1, CMSG_SPACE(sizeof(mr_args)));
diff -r 1afa90e87b4b include/rds.h
--- a/include/rds.h	Tue Feb 23 22:30:57 2016 -0800
+++ b/include/rds.h	Tue Feb 23 22:30:58 2016 -0800
@@ -84,6 +84,8 @@
 #define RDS_CMSG_CONG_UPDATE		5
 #define RDS_CMSG_ATOMIC_FADD		6
 #define RDS_CMSG_ATOMIC_CSWP		7
+#define	RDS_CMSG_MASKED_ATOMIC_FADD	8
+#define	RDS_CMSG_MASKED_ATOMIC_CSWP	9
 
 #define RDS_INFO_FIRST			10000
 #define RDS_INFO_COUNTERS		10000
@@ -252,8 +254,25 @@
 	rds_rdma_cookie_t cookie;
 	uint64_t 	local_addr;
 	uint64_t 	remote_addr;
-	uint64_t	swap_add;
-	uint64_t	compare;
+	union {
+		struct {
+			uint64_t	compare;
+			uint64_t	swap;
+		} cswp;
+		struct {
+			uint64_t	add;
+		} fadd;
+		struct {
+			uint64_t	compare;
+			uint64_t	swap;
+			uint64_t	compare_mask;
+			uint64_t	swap_mask;
+		} m_cswp;
+		struct {
+			uint64_t	add;
+			uint64_t	nocarry_mask;
+		} m_fadd;
+	};
 	u_int64_t	flags;
 	u_int64_t	user_token;
 };
@@ -278,5 +297,6 @@
 #define RDS_RDMA_USE_ONCE	0x0008	/* free MR after use */
 #define RDS_RDMA_DONTWAIT	0x0010	/* Don't wait in SET_BARRIER */
 #define RDS_RDMA_NOTIFY_ME	0x0020	/* Notify when operation completes */
+#define RDS_RDMA_SILENT		0x0040	/* Do not interrupt remote */
 
 #endif /* IB_RDS_H */
diff -r 1afa90e87b4b pfhack.h
--- a/pfhack.h	Tue Feb 23 22:30:57 2016 -0800
+++ b/pfhack.h	Tue Feb 23 22:30:58 2016 -0800
@@ -44,9 +44,11 @@
 #ifndef __PF_HACK_H
 #define __PF_HACK_H
 
+#if !((defined(__SVR4) && defined(__sun)))
 #define PF_RDS		21
 #define AF_RDS		21
 #define SOL_RDS		276
+#endif
 
 extern int discover_pf_rds();
 extern int discover_sol_rds();
diff -r 1afa90e87b4b rds-info.1
--- a/rds-info.1	Tue Feb 23 22:30:57 2016 -0800
+++ b/rds-info.1	Tue Feb 23 22:30:58 2016 -0800
@@ -1,162 +1,160 @@
-.Dd October 30, 2006
-.Dt RDS-INFO 1
-.Os
-.Sh NAME
-.Nm rds-info
-.Nd display information from the RDS kernel module
-.Pp
-.Sh SYNOPSIS
-.Nm rds-info
-.Op Fl v
-.Bk -words
-.Op Fl cknrstIT
+.TH RDS-INFO 1 "October 30, 2006"
+.SH "NAME"
+rds-info - display information from the RDS kernel module
+.SH SYNOPSIS
+.B rds-info [-v] [-cknrstIT]
 
-.Sh DESCRIPTION
-The
-.Nm
-utility presents various sources of information that
+.SH DESCRIPTION
+.PP
+The utility presents various sources of information that
 the RDS kernel module maintains.  When run without any optional arguments
-.Nm
 will output all the information it knows of.  When options are specified then
 only the information associated with those options is displayed.
 
 The options are as follows:
-.Bl -tag -width Ds
-.It Fl v
+.SH OPTIONS
+.PP
+.TP 7
+\fB\-v\fR
 Requests verbose output. When this option is given, some classes of information
 will display additional data.
 
-.It Fl c
+.TP
+\fB\-c\fR
 Display global counters.  Each counter increments as its event
 occurs.  The counters may not be reset.  The set of supported counters
 may change over time.
 
-.Bl -tag -width 4
-.It CounterName
+.IP	CounterName
 The name of the counter.  These names come from the kernel and can change
 depending on the capability of the kernel module.
-.It Value
+.IP 	Value
 The number of times that the counter has been incremented since the kernel
 module was loaded.
-.El
 
-.It Fl k
+.TP
+\fB\-k\fR
 Display all the RDS sockets in the system.  There will always be one socket
 listed that is neither bound to nor connected to any addresses because
-.Nm
 itself uses an unbound socket to collect information.
 
-.Bl -tag -width 4
-.It BoundAddr, BPort
+.IP	BoundAddr, BPort
 The IP address and port that the socket is bound to.  0.0.0.0 0 indicates that
 the socket has not been bound.
-.It ConnAddr, CPort
+.IP	ConnAddr, CPort
 The IP address and port that the socket is connected to.  0.0.0.0 0 indicates
 that the socket has not been connected.
-.It SndBuf, RcvBuf
+.IP	SndBuf, RcvBuf
 The number of bytes of message payload which can be queued for sending or
 receiving on the socket, respectively.
-.It Inode
+.IP	Inode
 The number of the inode object associated with the socket. Can be used to
 locate the process owning a given socket by searching /proc/*/fd for
 open files referencing a socket with this inode number.
-.El
 
-.It Fl n
+.TP
+\fB\-n\fR
 Display all RDS connections.  RDS connections are maintained between
 nodes by transports.  
 
-.Bl -tag -width 4
-.It LocalAddr
+.IP	LocalAddr
 The IP address of this node.  For connections that originate and terminate on
 the same node the local address indicates which address initiated the
 connection establishment.
-.It RemoteAddr
+.IP	RemoteAddr
 The IP address of the remote end of the connection.  
-.It NextTX
+.IP	Tos
+The type of service value for this connection.
+.IP	NextTX
 The sequence number that will be given to the next message that is sent
 over the connection.
-.It NextRX
+.IP	NextRX
 The sequence number that is expected from the next message to arrive over
 the connection.  Any incoming messages with sequence numbers less than this
 will be dropped.
-.It Flg
+.IP	Flg
 Flags which indicate the state of the connection. 
-.Bl -tag -width 4
-.It s
-A process is currently sending a message down the connection.
-.It c
-The transport is attempting to connect to the remote address.
-.It C
-The connection to the remote host is connected and active.
-.El
-.El
 
-.It Fl r, Fl s, Fl t
+.IP 		s
+	A process is currently sending a message down
+	the connection.
+.IP 		c
+	The transport is attempting to connect to the
+	remote address.
+.IP 		C
+	The connection to the remote host is connected
+	and active.
+.IP 		E
+	The connection to the remote host is in error.
+
+.TP
+\fB\-r\fR, \fB\-s\fR, \fB\-t\fR
 Display the messages in the receive, send, or retransmit queues respectively.
-.Bl -tag -width 4
-.It LocalAddr, LPort
+
+.IP	LocalAddr, LPort
 The local IP address and port on this node associated with the message. For
 sent messages this is the source address, for receive messages it is the
 destination address.
-.It RemoteAddr, RPort
+.IP	RemoteAddr, RPort
 The remote IP address and port associated with the message. For sent messages
 this is the destination address, for receive messages it is the source address.
-.It Seq
+.IP	Tos
+The type of service for this message.
+.IP	Seq
 The sequence number of the message.
-.It Bytes
+.IP	Bytes
 The number of bytes in the message payload.
-.El
 
+
+.PP
 The following information sources are dependent on specific transports which
 may not always be available. 
 
-.It Fl I
+.TP 7
+\fB\-I\fR
 Display the IB connections which the IB transport is using to provide
 RDS connections.
 
-.Bl -tag -width 4
-.It LocalAddr
+.IP	LocalAddr
 The local IP address of this connection.
-.It RemoteAddr
+.IP	RemoteAddr
 The remote IP address of this connection.
-.It LocalDev
+.IP	Tos
+The type of service value for this connection.
+.IP	SL
+The QoS Service Level for this connection.
+.IP	LocalDev
 The local IB Global Identifier, printed in IPv6 address syntax.
-.It RemoteDev
+.IP	RemoteDev
 The remote IB Global Identifier, printed in IPv6 address syntax.
-.El
 
 If verbose output is requested, per-connection settings such as the
 maximum number of send and receive work requests will be displayed
 in addition.
 
-.It Fl T
+.TP 7
+\fB\-T\fR
 Display the TCP sockets which the TCP transport is using to provide
 RDS connections.
 
-.Bl -tag -width 4
-.It LocalAddr, LPort
+.IP	LocalAddr, LPort
 The local IP address and port of this socket.
-.It RemoteAddr, RPort
+.IP	RemoteAddr, RPort
 The remote IP address and port that this socket is connected to.
-.It HdrRemain
+.IP	HdrRemain
 The number of bytes that must be read off the socket to complete the next
 full RDS header.
-.It DataRemain
+.IP	DataRemain
 The number of bytes that must be read off the socket to complete the data
 payload of the message which is being received.
-.It SentNxt
+.IP	SentNxt
 The TCP sequence number of the first byte of the last message that we sent
 down the connection.
-.It ExpectedUna
+.IP	ExpectedUna
 The TCP sequence number of the byte past the last byte of the last message
 that we sent down the connection.  When we see that the remote side has
 acked up to this byte then we know that the remote side has received all
 our RDS messages.
-.It SeenUna
+.IP	SeenUna
 The TCP sequence number of the byte past the last byte which has been
 acked by the remote host.
-.El
-
-.El
-.Pp
diff -r 1afa90e87b4b rds-info.c
--- a/rds-info.c	Tue Feb 23 22:30:57 2016 -0800
+++ b/rds-info.c	Tue Feb 23 22:30:58 2016 -0800
@@ -42,16 +42,28 @@
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <errno.h>
+#if defined(__SVR4) && defined(__sun)
+#include <strings.h>
+#else
 #include <string.h>
+#endif
 #include <inttypes.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
 
+#if defined(__SVR4) && defined(__sun)
+#include <sys/rds.h>
+#include <infiniband/ofa_solaris.h>
+#else
 #include "rds.h"
+#endif
 #include "pfhack.h"
 
+/* The struct rds_info_connection layout changed between the RDS releases in OFED 1.4 and 1.5, so the
+   flag bits are tested in both conn.flags and conn.transport[15]. TODO: drop the transport[15] test once pre-1.5 OFED is extinct. */
 #define rds_conn_flag(conn, flag, letter) \
-	(conn.flags & RDS_INFO_CONNECTION_FLAG_##flag ? letter : '-')
+	(conn.flags & RDS_INFO_CONNECTION_FLAG_##flag \
+	|| conn.transport[15] & RDS_INFO_CONNECTION_FLAG_##flag ? letter : '-')
 
 #define min(a, b) (a < b ? a : b)
 #define array_size(foo) (sizeof(foo) / sizeof(foo[0]))
@@ -76,6 +88,10 @@
 
 char *progname = "rds-info";
 
+#if defined(__SVR4) && defined(__sun)
+int sol_ioctl(int, int, struct rds_info_arg *, socklen_t *, int *);
+#endif
+
 /* Like inet_ntoa, but can be re-entered several times without clobbering
  * the previously returned string. */
 static const char *paddr(int af, const void *addrp)
@@ -134,18 +150,20 @@
 {
 	struct rds_info_connection conn;
 
-	printf("\nRDS Connections:\n%15s %15s %16s %16s %3s\n",
-		"LocalAddr", "RemoteAddr", "NextTX", "NextRX", "Flg");
+	printf("\nRDS Connections:\n%15s %15s %4s %16s %16s %4s\n",
+		"LocalAddr", "RemoteAddr", "Tos", "NextTX", "NextRX", "Flgs");
 	
 	for_each(conn, data, each, len) {
-		printf("%15s %15s %16"PRIu64" %16"PRIu64" %c%c%c\n",
+		printf("%15s %15s %4u %16"PRIu64" %16"PRIu64" %c%c%c%c\n",
 			ipv4addr(conn.laddr),
 			ipv4addr(conn.faddr),
+			conn.tos,
 			conn.next_tx_seq,
 			conn.next_rx_seq,
 			rds_conn_flag(conn, SENDING, 's'),
 			rds_conn_flag(conn, CONNECTING, 'c'),
-			rds_conn_flag(conn, CONNECTED, 'C'));
+			rds_conn_flag(conn, CONNECTED, 'C'),
+			rds_conn_flag(conn, ERROR, 'E'));
 	}
 }
 
@@ -153,16 +171,17 @@
 {
 	struct rds_info_message msg;
 
-	printf("\n%s Message Queue:\n%15s %5s %15s %5s %16s %10s\n",
+	printf("\n%s Message Queue:\n%15s %5s %15s %5s %4s %16s %10s\n",
 		(char *)extra,
-		"LocalAddr", "LPort", "RemoteAddr", "RPort", "Seq", "Bytes");
+		"LocalAddr", "LPort", "RemoteAddr", "RPort", "Tos","Seq", "Bytes");
 	
 	for_each(msg, data, each, len) {
-		printf("%15s %5u %15s %5u %16"PRIu64" %10u\n",
+		printf("%15s %5u %15s %5u %4u %16"PRIu64" %10u\n",
 			ipv4addr(msg.laddr),
 			ntohs(msg.lport),
 			ipv4addr(msg.faddr),
 			ntohs(msg.fport),
+			msg.tos,
 			msg.seq, msg.len);
 	}
 }
@@ -191,13 +210,14 @@
 {
 	struct rds_info_rdma_connection ic;
 
-	printf("\nRDS IB Connections:\n%15s %15s %32s %32s\n",
-		"LocalAddr", "RemoteAddr", "LocalDev", "RemoteDev");
+	printf("\nRDS IB Connections:\n%15s %15s %4s %3s %32s %32s\n",
+		"LocalAddr", "RemoteAddr", "Tos", "SL", "LocalDev", "RemoteDev");
 
 	for_each(ic, data, each, len) {
-		printf("%15s %15s %32s %32s",
+		printf("%15s %15s %4u %3u %32s %32s",
 			ipv4addr(ic.src_addr),
 			ipv4addr(ic.dst_addr),
+			ic.tos,ic.sl,
 			ipv6addr(ic.src_gid),
 			ipv6addr(ic.dst_gid));
 
@@ -234,8 +254,10 @@
 		print_msgs, "Send", 0 },
 	['t'] = { RDS_INFO_RETRANS_MESSAGES, "retransmit queue messages",
 		  print_msgs, "Retransmit", 0 },
+#if !(defined(__SVR4) && defined(__sun))
 	['T'] = { RDS_INFO_TCP_SOCKETS, "TCP transport sockets",
 		  print_tcp_socks, NULL, 0 },
+#endif
 	['I'] = { RDS_INFO_IB_CONNECTIONS, "IB transport connections",
 		  print_ib_conns, NULL, 0 },
 };
@@ -266,6 +288,9 @@
 	char optstring[258] = "v+";
 	int given_options = 0;
 	socklen_t len = 0;
+#if defined(__SVR4) && defined(__sun)
+	struct rds_info_arg arg = { 0 };	/* zeroed: arg.datap is free()d after the loop */
+#endif
 	void *data = NULL;
 	int fd;
 	int each;
@@ -322,6 +347,7 @@
 		    (given_options && !infos[i].option_given))
 			continue;
 
+#if !(defined(__SVR4) && defined(__sun))
 		/* read in the info until we get a full snapshot */
 		while ((each = getsockopt(fd, sol, infos[i].opt_val, data,
 				   &len)) < 0) {
@@ -345,15 +371,47 @@
 				return 1;
 			}
 		}
+#else
+		int retcode;
+
+		/* On Solaris the info comes from sol_ioctl(), not getsockopt(). */
+		retcode = sol_ioctl(fd, infos[i].opt_val, &arg, &len, &each);
+		if (retcode == 2) {
+			/* presumably an allocation failure inside sol_ioctl()
+			 * -- confirm against its definition */
+			fprintf(stderr, "%s: Unable to allocate memory"
+			    " for %u bytes of info: %s\n",
+			    progname, len, strerror(errno));
+			return 1;
+		}
+
+		if (retcode != 0) {
+			/* all other failures share one message */
+			fprintf(stderr, "%s: Unable to get statistics: "
+			    "%s\n", progname, strerror(errno));
+			return 1;
+		}
+
+		/* No data at the driver */
+		if (len == 0)
+			invalid_opt = 1;
+#endif
 
 		if (invalid_opt)
 			continue;
 
+#if !(defined(__SVR4) && defined(__sun))
 		infos[i].print(data, each, len, infos[i].extra);
-
+#else
+		infos[i].print((void *)(uintptr_t)arg.datap, each, len,
+		    infos[i].extra);
+#endif
 		if (given_options && --given_options == 0)
 			break;
 	}
+#if defined(__SVR4) && defined(__sun)
+	free((void *)(uintptr_t)arg.datap);
+#endif
 
 	return 0;
 }
diff -r 1afa90e87b4b rds-ping.1
--- a/rds-ping.1	Tue Feb 23 22:30:57 2016 -0800
+++ b/rds-ping.1	Tue Feb 23 22:30:58 2016 -0800
@@ -1,69 +1,63 @@
-.Dd Apr 22, 2008
-.Dt RDS-PING 1
-.Os
-.Sh NAME
-.Nm rds-ping
-.Nd test reachability of remote node over RDS
-.Pp
-.Sh SYNOPSIS
-.Nm rds-ping
-.Bk -words
-.Op Fl c Ar count
-.Op Fl i Ar interval
-.Op Fl I Ar local_addr
-.Ar remote_addr
+.TH RDS-PING 1 "Apr 22, 2008"
+.SH NAME
+rds-ping - test reachability of remote node over RDS
 
-.Sh DESCRIPTION
-.Nm rds-ping
-is used to test whether a remote node is reachable over RDS.
-Its interface is designed to operate pretty much the standard
-.Xr ping 8
+.SH SYNOPSIS
+.HP
+.nf
+rds-ping [-c count] [-Q tos] [-i interval] [-I local_addr]
+    remote_addr
+.fi
+
+.SH DESCRIPTION
+.PP
+rds-ping is used to test whether a remote node is reachable over RDS.
+Its interface is designed to operate pretty much the standard ping(8) 
 utility, even though the way it works is pretty different.
-.Pp
-.Nm rds-ping
-opens several RDS sockets and sends packets to port 0 on
+.PP
+rds-ping opens several RDS sockets and sends packets to port 0 on
 the indicated host. This is a special port number to which
 no socket is bound; instead, the kernel processes incoming
 packets and responds to them.
-.Sh OPTIONS
+.SH OPTIONS
 The following options are available for use on the command line:
-.Bl -tag -width Ds
-.It Fl c Ar count
-Causes
-.Nm rds-ping
-to exit after sending (and receiving) the specified number of
+.PP
+.TP 7
+\fB\-c count
+Causes rds-ping to exit after sending (and receiving) the specified number of
 packets.
-.It Fl I Ar address
-By default,
-.Nm rds-ping
-will pick the local source address for the RDS socket based
+.TP
+\fB\-Q tos
+By default, rds-ping sends the ping requests on base (tos = 0) RDS connection.
+With this option, the requests are sent on RDS connection with the specified tos
+value.  Valid values are 0-255.
+.TP
+\fB\-I address
+By default, rds-ping will pick the local source address for the RDS socket based
 on routing information for the destination address (i.e. if
 packets to the given destination would be routed through interface
-.Nm ib0 ,
+ib0 ,
 then it will use the IP address of
-.Nm ib0
+ib0
 as source address).
 Using the
 .Fl I
 option, you can override this choice.
-.It Fl i Ar timeout
-By default,
-.Nm rds-ping
-will wait for one second between sending packets. Use this option
+.TP
+\fB\-i timeout
+By default, rds-ping will wait for one second between sending packets. Use this option
 to specified a different interval. The timeout value is given in
 seconds, and can be a floating point number. Optionally, append
-.Nm msec
+msec
 or
-.Nm usec
+usec
 to specify a timeout in milliseconds or microseconds, respectively.
-.It
+.IP
 Specifying a timeout considerably smaller than the packet round-trip
 time will produce unexpected results.
-.El
-.Sh AUTHORS
-.Nm rds-ping
+
+.SH AUTHORS
+rds-ping
 was written by Olaf Kirch <[email protected]>.
-.Sh SEE ALSO
-.Xr rds 7 ,
-.Xr rds-info 1 ,
-.Xr rds-stress 1 .
+.SH SEE ALSO
+rds(7), rds-info(1), rds-stress(1).
diff -r 1afa90e87b4b rds-ping.c
--- a/rds-ping.c	Tue Feb 23 22:30:57 2016 -0800
+++ b/rds-ping.c	Tue Feb 23 22:30:58 2016 -0800
@@ -48,7 +48,12 @@
 #include <sys/poll.h>
 #include <fcntl.h>
 #include <getopt.h>
+#include <sys/ioctl.h>
+#if defined(__SVR4) && defined(__sun)
+#include <sys/rds.h>
+#else
 #include "rds.h"
+#endif
 
 #include "pfhack.h"
 
@@ -67,6 +72,7 @@
 static unsigned long	opt_count;
 static struct in_addr	opt_srcaddr;
 static struct in_addr	opt_dstaddr;
+static uint8_t		opt_tos = 0;
 
 /* For reasons of simplicity, RDS ping does not use a packet
  * payload that is being echoed, the way ICMP does.
@@ -91,13 +97,14 @@
 static int	parse_timeval(const char *, struct timeval *);
 static int	parse_long(const char *ptr, unsigned long *);
 static int	parse_addr(const char *ptr, struct in_addr *);
+static unsigned long long	parse_ull(char *ptr, unsigned long long max);
 
 int
 main(int argc, char **argv)
 {
 	int c;
 
-	while ((c = getopt(argc, argv, "c:i:I:")) != -1) {
+	while ((c = getopt(argc, argv, "c:i:I:Q:")) != -1) {
 		switch (c) {
 		case 'c':
 			if (!parse_long(optarg, &opt_count))
@@ -114,6 +121,9 @@
 				die("Bad wait time <%s>\n", optarg);
 			break;
 
+		case 'Q':
+			opt_tos = parse_ull(optarg, 255);
+			break;
 		default:
 			usage("Unknown option");
 		}
@@ -142,6 +152,9 @@
 	struct timeval	next_ts;
 	struct socket	socket[NSOCKETS];
 	struct pollfd	pfd[NSOCKETS];
+#if !(defined(__SVR4) && defined(__sun))
+	int             pending[NSOCKETS];
+#endif
 	int		i, next = 0;
 
 	for (i = 0; i < NSOCKETS; ++i) {
@@ -152,10 +165,18 @@
 		socket[i].fd = fd;
 		pfd[i].fd = fd;
 		pfd[i].events = POLLIN;
+#if !(defined(__SVR4) && defined(__sun))
+		pending[i] = 0;
+#endif
 	}
 
 	memset(&sin, 0, sizeof(sin));
+#if defined(__SVR4) && defined(__sun)
+	sin.sin_family = AF_INET_OFFLOAD;
+#else
 	sin.sin_family = AF_INET;
+#endif
+
 	sin.sin_addr = opt_dstaddr;
 
 	gettimeofday(&next_ts, NULL);
@@ -180,13 +201,31 @@
 			if (opt_count && sent >= opt_count)
 				break;
 
-			timeradd(&next_ts, &opt_wait, &next_ts);
-			if (sendto(sp->fd, NULL, 0, 0, (struct sockaddr *) &sin, sizeof(sin)))
-				err = errno;
-			sp->sent_id = ++sent;
-			sp->sent_ts = now;
-			sp->nreplies = 0;
-			next = (next + 1) % NSOCKETS;
+			timeradd(&now, &opt_wait, &next_ts);
+#if !(defined(__SVR4) && defined(__sun))
+			if (!pending[next]) {
+#endif
+				memset(&sin, 0, sizeof(sin));
+#if defined(__SVR4) && defined(__sun)
+				sin.sin_family = AF_INET_OFFLOAD;
+#else
+				sin.sin_family = AF_INET;
+#endif
+				sin.sin_addr = opt_dstaddr;
+
+				if (sendto(sp->fd, NULL, 0, 0, (struct sockaddr *) &sin, sizeof(sin)) < 0)
+					err = errno;
+				sp->sent_id = ++sent;
+				sp->sent_ts = now;
+				sp->nreplies = 0;
+#if !(defined(__SVR4) && defined(__sun))
+				if (!err)
+					pending[next] = 1;
+#endif
+				next = (next + 1) % NSOCKETS;
+#if !(defined(__SVR4) && defined(__sun))
+			}
+#endif
 
 			if (err) {
 				static unsigned int nerrs = 0;
@@ -223,6 +262,9 @@
 					report_packet(sp, &now, NULL, errno);
 			} else {
 				report_packet(sp, &now, &from.sin_addr, 0);
+#if !(defined(__SVR4) && defined(__sun))
+				pending[i] = 0;
+#endif
 				recv++;
 			}
 		}
@@ -258,7 +300,11 @@
 	int pf;
 
 	memset(&sin, 0, sizeof(sin));
+#if defined(__SVR4) && defined(__sun)
+	sin.sin_family = AF_INET_OFFLOAD;
+#else
 	sin.sin_family = AF_INET;
+#endif
 
 #ifdef DYNAMIC_PF_RDS
         pf = discover_pf_rds();
@@ -278,6 +324,9 @@
 		if (ufd < 0)
 			die_errno("unable to create UDP socket");
 		sin.sin_addr = *dst;
+#if defined(__SVR4) && defined(__sun)
+		sin.sin_family = AF_INET;
+#endif
 		sin.sin_port = htons(1);
 		if (connect(ufd, (struct sockaddr *) &sin, sizeof(sin)) < 0)
 			die_errno("unable to connect to %s",
@@ -289,6 +338,9 @@
 
 		*src = sin.sin_addr;
 		close(ufd);
+#if defined(__SVR4) && defined(__sun)
+		sin.sin_family = AF_INET_OFFLOAD;
+#endif
 	}
 
 	sin.sin_addr = *src;
@@ -297,6 +349,9 @@
 	if (bind(fd, (struct sockaddr *) &sin, sizeof(sin)))
 		die_errno("bind() failed");
 
+	if (opt_tos && ioctl(fd, SIOCRDSSETTOS, &opt_tos))
+		die_errno("ERROR: failed to set TOS");
+
 	return fd;
 }
 
@@ -309,7 +364,8 @@
 		"%s\nUsage: rds-ping [options] dst_addr\n"
 		"Options:\n"
 		" -c count      limit packet count\n"
-		" -I interface  source IP address\n",
+		" -I interface  source IP address\n"
+		" -Q tos	type of service\n",
 		complaint);
 	exit(1);
 }
@@ -384,3 +440,31 @@
 	return 0;
 }
 
+/* Parse a number with optional K/M/G suffix; die() unless it is <= max. */
+static unsigned long long parse_ull(char *ptr, unsigned long long max)
+{
+	unsigned long long val;
+	unsigned int shift = 0;
+	char *endptr;
+
+	val = strtoull(ptr, &endptr, 0);
+	/* endptr == ptr: no digits were consumed, so reject inputs such as
+	 * "" or a bare "k" instead of treating them as 0. */
+	if (endptr != ptr) {
+		switch (*endptr) {
+		case 'k': case 'K': shift = 10; break;
+		case 'm': case 'M': shift = 20; break;
+		case 'g': case 'G': shift = 30; break;
+		}
+		if (shift)
+			endptr++;	/* step over the suffix */
+
+		/* refuse values that would wrap when scaled */
+		if (!*endptr && val <= (~0ULL >> shift) &&
+		    (val << shift) <= max)
+			return val << shift;
+	}
+
+	/* NOTE(review): relies on die() not returning -- confirm */
+	die("invalid number '%s'\n", ptr);
+}
diff -r 1afa90e87b4b rds-stress.1
--- a/rds-stress.1	Tue Feb 23 22:30:57 2016 -0800
+++ b/rds-stress.1	Tue Feb 23 22:30:58 2016 -0800
@@ -1,119 +1,131 @@
-.Dd May 15, 2007
-.Dt RDS-STRESS 1
-.Os
-.Sh NAME
-.Nm rds-stress
-.Nd send messages between processes over RDS sockets
-.Pp
-.Sh SYNOPSIS
-.Nm rds-stress
-.Bk -words
-.Op Fl p Ar port_number
-.Op Fl r Ar receive_address
-.Op Fl s Ar send_address
-.Op Fl a Ar ack_bytes
-.Op Fl q Ar request_bytes
-.Op Fl D Ar rdma_bytes
-.Op Fl d Ar queue_depth
-.Op Fl t Ar nr_tasks
-.Op Fl c
-.Op Fl R
-.Op Fl V
-.Op Fl v
+.TH RDS-STRESS 1 " May 15, 2007"
+.SH "NAME"
+rds-stress - send messages between processes over RDS sockets
+.PP
+.SH SYNOPSIS
+.HP
+.nf
+rds-stress [-p port_number] [-r receive_address] [-s send_address]
+      [-Q tos] [-a ack_bytes] [-q request_bytes] [-D rdma_bytes]
+      [-d queue_depth] [-t nr_tasks] [-T time] [-c] [-R] [-V] [-v]
+.fi
 
-.Sh DESCRIPTION
-.Nm rds-stress
+
+.SH DESCRIPTION
+.PP
+.B
+rds-stress
 sends messages between groups tasks, usually running on seperate
 machines.
-.Pp
+.PP
 First a passive receiving instance is started.
-.Pp
-.Dl $ rds-stress
-.Pp
+.RS 12
+
+	$ rds-stress
+.RE
+.PP
 Then an active sending instance is started, giving it
 the address and port at which it will find a listening
 passive receiver.  In addition, it is given configuration options which
 both instances will use.
-.Pp
-.Dl $ rds-stress -s recvhost -p 4000 -t 1 -d 1
-.Pp
+.PP
+.RS 12
+	$ rds-stress -s recvhost -p 4000 -t 1 -d 1
+.RE
+.PP
 The active sender will parse the options, connect to the passive receiver, and
 send the options over this connection.  From this point on both instances
 exhibit the exact same behaviour.
-.Pp
+.PP
 They will create a number of child tasks as specified by the -t option.  Once
 the children are created the parent sleeps for a second at a time, printing a
 summary of statistics at each interval. 
-.Pp
+.PP
 Each child will open an RDS socket, each binding to a port number in order
 after the port number given on the command line.  The first child would bind to
 port 4001 in our example.  Each child sets the send and receive buffers to
 exactly fit the number of messages, requests and acks, that will be in flight
 as determind by the command line arguments.
-.Pp
+.PP
 The children then enter their loop.  They will keep a number of sent messages
 outstanding as specified by the -d option.  When they reach this limit they
 will wait to receive acks which will allow them to send again.  As they receive
 messages from their peers they immediately send acks.
-.Pp
+.PP
 Every second, the parent process will display statistics of the ongoing
 stress test. The output is described in section OUTPUT below.
-.Pp
+.PP
 If the -T option is given, the test will terminate after the specified time,
 and a summary is printed.
-.Pp
+.PP
 Each child maintains outstanding messages to all other children of the other instance.
 They do not send to their siblings.
-.Sh OPTIONS
+.SH OPTIONS
+.PP
 The following options are available for use on the command line:
-.Bl -tag -width Ds
-.It Fl p Ar port_number
+
+.TP 7
+\fB\-p  port_number
 Each parent binds a TCP socket to this port number and their respective
 address.  They will trade the negotiated options over this socket.  Each
 child will bind an RDS socket to the range of ports immediately following
 this port number, for as many children as there are.
-.It Fl s Ar send_address
+.TP
+\fB\-s send_address
 A connection attempt is made to this address.  Once its complete and the
 options are sent over it then children will be created and work will proceed.
-.It Fl r Ar receive_address
+.TP
+\fB\-r receive_address
 This specifies the address that messages will be sent from.  If -s is not
 specified then rds-stress waits for a connection on this address before
 proceeding.
-.Pp
+
 If this option is not given, rds-stress will choose an appropriate address.
 The passive process will accept connections on all local interfaces, and
 obtain the address once the control connection is established.
 The active process will choose a local address based on the interface through
 which it connects to the destination address.
-.It Fl a Ar ack_bytes
+.TP
+\fB\-Q tos
+Uses the RDS connection between IP addresses with the specified tos value. By 
+default, the base (tos = 0) RDS connection is used.  Valid values are 0-255.
+.TP
+\fB\-a ack_bytes
 This specifies the size of the ack messages, in bytes. There is a minimum size
 which depends on the format of the ack messages, which may change over time.
 See section "Message Sizes" below.
-.It Fl q Ar request_bytes
+.TP
+\fB\-q request_bytes
 This specifies the size of the request messages, in bytes.
 It also has a minimum size which may change over time.
 See section "Message Sizes" below.
-.It Fl D Ar rdma_bytes
+.TP
+\fB\-D rdma_bytes
 RDSv3 is capable of transmitting part of a message via RDMA directly from
 application buffer to application buffer. This option enables RDMA support
 in rds-stress: request packets include parameters for an RDMA READ or WRITE
 operation, which the receiving process executes at the time the ACK packet
 is sent.
 See section "Message Sizes" below.
-.It Fl d Ar queue_depth
+.TP
+\fB\-d queue_depth
 Each child will try to maintain this many sent messages outstanding to each
 of its peers on the remote address.
-.It Fl t Ar nr_tasks
+.TP
+\fB\-t nr_tasks
 Each parent will create this many children tasks.
-.It Fl T Ar seconds
+.TP
+\fB\-T seconds
 Specify the duration of the test run. After the specified number of seconds,
 all processes on both ends of the connection will terminate, and the
 active instance will print a summary. By default, rds-stress will keep
 on sending and receiving messages.
-.It Fl z
+.TP
+\fB\-z
 This flag can be used in conjunction with -T. It suppresses the ongoing
 display of statistics, and prints a summary only.
-.It Fl c
+.TP
+\fB\-c
 This causes rds-stress to create child tasks which just consume CPU cycles.
 One task is created for each CPU in the system.  First each child observes the
 maximum rate at which it can consume cycles.  This means that this option
@@ -121,54 +133,70 @@
 use of the system by observing the lesser rate at which the children consume
 cycles.  This option is *not* shared between the active and passive instances.
 It must be specified on each rds-stress command line.
-.It Fl R
+.TP
+\fB\-R
 This tells the rds-stress parent process to run with SCHED_RR priority,
 giving it precedence over the child processes. This is useful when running
 with lots of tasks, where there is a risk of the child processes starving
 the parent, and skewing the results.
-.It Fl v
+.TP
+\fB\-v
 With this option enabled, packets are filled with a pattern that is
 verified by the receiver. This check can help detect data corruption
 occuring under high load.
-.El
-.Pp
+.TP
+.PP
 
-.Ss Message Sizes
+.SS Message Sizes
 Options which set a message size (such as -a) specify a number of bytes
 by default. By appending \fBK\fP, \fBM\fP, or \fBG\fP, you can specify the size
 in kilobytes, megabytes or gigabytes, respectively. For instance,
 the following will run rds-stress with a message and ACK size of 1024
 bytes, and an RDMA message size of 1048576 bytes:
-.Pp
-.Dl rds-stress ... -q 1K -a 1K -D 1M
-.Pp
-.Pp
-.Sh OUTPUT
+.PP
+.RS 12
+rds-stress ... -q 1K -a 1K -D 1M
+.RE
+.PP
+.PP
+.SH OUTPUT
 Each parent outputs columns of statistics at a regular interval:
-.Bl -tag -width Ds
-.It tsks
+.TP 8
+tsks
 The number of child tasks which are running.
-.It tx/s
+.TP
+tx/s
 The number of sendmsg() calls that all children are executing, per second. 
-.It tx+rx K/s
+.TP
+rx/s
+The number of recvmsg() calls that all children are executing, per second. 
+.TP
+tx+rx K/s
 The total number of bytes that are flowing through sendmsg() and recvmsg() for all children.
 This includes both request and ack messages.
-.It rw+rr K/s
-The total number of bytes that are being transferred via RDMA READs and
+.TP
+mbi K/s
+The total number of bytes that are being received via RDMA READs and
 WRITEs for all children.
-.It tx us/c
+.TP
+mbo K/s
+The total number of bytes that are being transmitted via RDMA READs and
+WRITEs for all children.
+.TP
+tx us/c
 The average number of microseconds spent in sendmsg() calls.
-.It rtt us
+.TP
+rtt us
 The average round trip time for a request and ack message pair.  This measures
 the total time between when a task sends a request and when it finally receives
 the ack for that message.  Because it includes the time it takes for the
 receiver to wake up, receive the message, and send an ack, it can grow to be
 quite large under load. 
-.It cpu %
+.TP
+cpu %
 This is the percentage of available CPU resources on this machine that are being
 consumed since rds-stress started running.  It will show -1.00 if -c is not
 given.  It is calculated based on the amount of CPU resources that CPU soaking
 tasks are able to consume.  This lets it measure CPU use by the system, say in
 interrupt handlers, that task-based CPU accounting does not include.
 For this to work rds-stress must be started with -c on an idle system.
-.El
diff -r 1afa90e87b4b rds-stress.c
--- a/rds-stress.c	Tue Feb 23 22:30:57 2016 -0800
+++ b/rds-stress.c	Tue Feb 23 22:30:58 2016 -0800
@@ -15,17 +15,38 @@
 #include <sys/time.h>
 #include <time.h>
 #include <inttypes.h>
+#if defined(__SVR4) && defined(__sun)
+#include <sys/syscall.h>
+#include <signal.h>
+#include <sys/lgrp_user.h>
+#else
 #include <syscall.h>
+#endif
 #include <sys/stat.h>
 #include <sys/poll.h>
 #include <ctype.h>
 #include <fcntl.h>
 #include <sched.h>
 #include <getopt.h>
+#include <sys/ioctl.h>
+#if !(defined(__SVR4) && defined(__sun))
 #include <byteswap.h>
 #include "rds.h"
-
+#else
+#include <sys/rds.h>
+#endif
 #include "pfhack.h"
+#if defined(__SVR4) && defined(__sun)
+#include <infiniband/ofa_solaris.h>
+#endif
+
+#if defined(__SVR4) && defined(__sun)
+/*
+ * This definition is forward looking and is not yet present
+ * in Solaris rds.h file
+ */
+#define RDS_CMSG_RDMA_SEND_STATUS RDS_CMSG_RDMA_STATUS
+#endif
 
 /*
  *
@@ -45,8 +66,9 @@
         M_RDMA_READ_ONLY,
         M_RDMA_WRITE_ONLY
 };
-
-struct options {
+#define VERSION_MAX_LEN 16 
+
+struct options_2_0_6 {
 	uint32_t	req_depth;
 	uint32_t	req_size;
 	uint32_t	ack_size;
@@ -76,8 +98,67 @@
 	uint32_t	connect_retries;
 } __attribute__((packed));
 
+struct options {
+	char		version[VERSION_MAX_LEN];	/* NOTE(review): presumably this side's RDS_VERSION string (cf. peer_version); confirm */
+        uint32_t        req_depth;
+        uint32_t        req_size;
+        uint32_t        ack_size;
+        uint32_t        rdma_size;
+        uint32_t        send_addr;
+        uint32_t        receive_addr;
+        uint16_t        starting_port;
+        uint16_t        nr_tasks;
+        uint32_t        run_time;
+        uint8_t         summary_only;
+        uint8_t         rtprio;
+        uint8_t         tracing;
+        uint8_t         verify;
+        uint8_t         show_params;
+        uint8_t         show_perfdata;
+        uint8_t         use_cong_monitor;
+        uint8_t         rdma_use_once;
+        uint8_t         rdma_use_get_mr;
+        uint8_t         rdma_use_fence;
+        uint8_t         rdma_cache_mrs;
+        uint8_t         rdma_key_o_meter;
+        uint8_t         suppress_warnings;
+        uint8_t         simplex;
+        uint8_t         rw_mode;
+        uint32_t        rdma_vector;
+        uint32_t        rdma_alignment;
+        uint32_t        connect_retries;
+        uint8_t         tos;	/* -Q value; when non-zero it is applied with ioctl(SIOCRDSSETTOS) */
+        uint8_t         async;	/* non-zero: sends request completion notification and may be retried */
+} __attribute__((packed));
+
+#define MAX_BUCKETS 16
+
 static struct options	opt;
 static int		control_fd;
+static uint64_t         rtt_threshold;
+static int              show_histogram;
+static int		reset_connection;
+static char		peer_version[VERSION_MAX_LEN];
+
+/*
+ * Map a round-trip time to a latency_histogram[] slot: floor(log2(rtt_time)),
+ * with 0 mapping to slot 0.  The result is clamped to MAX_BUCKETS - 1 so an
+ * RTT of 2^MAX_BUCKETS or more lands in the last slot instead of indexing
+ * past the end of the MAX_BUCKETS-entry array.
+ */
+static int get_bucket(uint64_t rtt_time)
+{
+  int bucket = 0;
+
+  /* Every right shift that still leaves a non-zero value is one more bit. */
+  while (rtt_time >>= 1)
+    bucket++;
+
+  if (bucket >= MAX_BUCKETS)
+    bucket = MAX_BUCKETS - 1;
+
+  return bucket;
+}
 
 struct counter {
 	uint64_t	nr;
@@ -102,6 +183,10 @@
 
 #define NR_STATS S__LAST
 
+#if defined(__SVR4) && defined(__sun)
+int sol_ioctl(int, int, struct rds_info_arg *, socklen_t *, int *);
+#endif
+
 /*
  * Parents share a mapped array of these with their children.  Each child
  * gets one.  It's used to communicate between the child and the parent
@@ -110,9 +195,11 @@
 struct child_control {
 	pid_t pid;
 	int ready;
+	int stopping;
 	struct timeval start;
 	struct counter cur[NR_STATS];
 	struct counter last[NR_STATS];
+        uint64_t       latency_histogram[MAX_BUCKETS];
 } __attribute__((aligned (256))); /* arbitrary */
 
 struct soak_control {
@@ -132,6 +219,7 @@
  */
 #define OP_REQ		1
 #define OP_ACK		2
+#define OP_DUMP		3
 
 #define RDMA_OP_READ	1
 #define RDMA_OP_WRITE	2
@@ -148,7 +236,7 @@
 	uint16_t	from_port;
 	uint16_t	to_port;
 	uint16_t	index;
-	uint8_t		op;
+	uint8_t         op;
 
 	/* RDMA related.
 	 * rdma_op must be the first field, because we
@@ -162,12 +250,21 @@
 	uint32_t	rdma_size;
 	uint32_t        rdma_vector;
 
-	uint8_t		data[0];
+	/* Async send related. */
+	uint8_t         retry;
+	uint8_t         rdma_remote_err;
+	uint8_t         pending;
+
+	uint8_t         data[0];
 } __attribute__((packed));
 
 #define MIN_MSG_BYTES		(sizeof(struct header))
 #define BASIC_HEADER_SIZE	(size_t)(&((struct header *) 0)->rdma_op)
 
+#define print_outlier(...) do {         \
+        fprintf(stderr, __VA_ARGS__);   \
+} while (0)
+
 #define die(fmt...) do {		\
 	fprintf(stderr, fmt);		\
 	exit(1);			\
@@ -254,6 +351,19 @@
 
 	die("invalid host name or dotted quad '%s'\n", ptr);
 }
+#if defined(__SVR4) && defined(__sun)
+static lgrp_id_t lgrp_id = -1;
+
+static void
+set_my_lgrp(void)
+{
+	/* lgrp_id still at its -1 default: nothing to bind to. */
+	if (lgrp_id == -1)
+		return;
+	lgrp_affinity_set(P_LWPID, P_MYID, lgrp_id, LGRP_AFF_STRONG);
+	yield(); /* force a context switch */
+}
+#endif
 
 static void usage(void)
 {
@@ -273,6 +383,7 @@
 	" -d [depth, 1]     request pipeline depth, nr outstanding\n"
 	" -t [nr, 1]        number of child tasks\n"
 	" -T [seconds, 0]   runtime of test, 0 means infinite\n"
+	" -Q [tos, 0]       Type of Service\n"
 	" -D [bytes]        RDMA: size\n"
 	" -I [iovecs, 1]    RDMA: number of user buffers to target (max 512)\n"
         " -M [nr, 0]        RDMA: mode (0=readwrite,1=readonly,2=writeonly)\n"
@@ -281,6 +392,9 @@
 	" -c                measure cpu use with per-cpu soak processes\n"
 	" -V                trace execution\n"
 	" -z                print a summary at end of test only\n"
+#if defined(__SVR4) && defined(__sun)
+	" -g [lgrpid]       bind the process to the specified lgrp\n"
+#endif
 	"\n"
 	"Example:\n"
 	"  recv$ rds-stress\n"
@@ -310,7 +424,7 @@
 static void check_parent(pid_t pid)
 {
 	if (pid != getppid())
-		die("parent %u exited\n", pid);
+		die("parent %u exited\n", (unsigned int)pid);
 }
 
 /*
@@ -334,6 +448,7 @@
 		msg_pattern[i] = k;
 }
 
+#if !(defined(__SVR4) && defined(__sun))
 #if __BYTE_ORDER == __LITTLE_ENDIAN
 #define htonll(x)	bswap_64(x)
 #define ntohll(x)	bswap_64(x)
@@ -341,6 +456,7 @@
 #define htonll(x)	(x)
 #define ntohll(x)	(x)
 #endif
+#endif /* Not sun */
 
 static void encode_hdr(struct header *dst, const struct header *hdr)
 {
@@ -361,6 +477,7 @@
 	dst->rdma_key = htonll(hdr->rdma_key);
 	dst->rdma_size = htonl(hdr->rdma_size);
 	dst->rdma_vector = htonl(hdr->rdma_vector);
+	dst->retry = hdr->retry;
 }
 
 static void decode_hdr(struct header *dst, const struct header *hdr)
@@ -382,6 +499,7 @@
 	dst->rdma_key = ntohll(hdr->rdma_key);
 	dst->rdma_size = ntohl(hdr->rdma_size);
 	dst->rdma_vector = ntohl(hdr->rdma_vector);
+	dst->retry = hdr->retry;
 }
 
 static void fill_hdr(void *message, uint32_t bytes, struct header *hdr)
@@ -412,11 +530,19 @@
  * Compare incoming message header with expected header. All header fields
  * are in host byte order except for address and port fields.
  */
-static int check_hdr(void *message, uint32_t bytes, const struct header *hdr)
+static int check_hdr(void *message, uint32_t bytes, struct header *hdr, struct options *opts)
 {
 	struct header msghdr;
+	uint32_t	inc_seq;
+	uint32_t	my_seq;
 
 	decode_hdr(&msghdr, message);
+	inc_seq = msghdr.seq;
+	my_seq = hdr->seq;
+
+	if (msghdr.retry && (inc_seq < my_seq))
+		return -1;
+
 	if (memcmp(&msghdr, hdr, BASIC_HEADER_SIZE)) {
 #define bleh(var, disp)					\
 		disp(hdr->var),				\
@@ -428,7 +554,7 @@
 		 * with stdout() and we don't get things stomping on each
 		 * other
 		 */
-		printf( "An incoming message had a header which\n"
+		printf( "An incoming message had a %s header which\n"
 			"didn't contain the fields we expected:\n"
 			"    member        expected eq             got\n"
 			"       seq %15u %s %15u\n"
@@ -438,6 +564,7 @@
 			"   to_port %15u %s %15u\n"
 			"     index %15u %s %15u\n"
 			"        op %15u %s %15u\n",
+			(msghdr.retry) ? "RETRY" : "",
 			bleh(seq, /**/),
 			bleh(from_addr, inet_ntoa_32),
 			bleh(from_port, ntohs),
@@ -569,6 +696,9 @@
 
 	fcntl(fd, F_SETFL, O_NONBLOCK);
 
+	if (opts->tos && ioctl(fd, SIOCRDSSETTOS, &opts->tos)) 
+		die_errno("ERROR: failed to set TOS\n");
+
 	return fd;
 }
 
@@ -584,7 +714,11 @@
 	if (opts->receive_addr == 0)
 		return 1;
 
+#if defined(__SVR4) && defined(__sun)
+	sin.sin_family = AF_INET_OFFLOAD;
+#else
 	sin.sin_family = AF_INET;
+#endif
 	sin.sin_port = htons(opts->starting_port);
 	sin.sin_addr.s_addr = htonl(opts->receive_addr);
 
@@ -639,7 +773,7 @@
 	mr_args.flags = RDS_FREE_MR_ARGS_INVALIDATE;
 #endif
 	if (setsockopt(fd, sol, RDS_FREE_MR, &mr_args, sizeof(mr_args)))
-		die_errno("setsockopt(RDS_FREE_MR) failed");
+		return;
 	mrs_allocated--;
 }
 
@@ -677,7 +811,11 @@
 	size = sizeof(struct rdma_key_o_meter)
 			+ 2 * nr_tasks * sizeof(*kt)
 			+ 2 * RDMA_MAX_TRACKED_KEYS * sizeof(*ks);
+#if defined(__SVR4) && defined(__sun)
+	base = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, -1, 0);
+#else
 	base = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, 0, 0);
+#endif
 	if (base == MAP_FAILED)
 		die_errno("alloc_rdma_buffers: mmap failed");
 
@@ -828,13 +966,20 @@
 	}
 
 	if (!failed)
-		trace("compare pass pattern %Lx addr %p\n",
+		trace("compare pass pattern 0x%Lx addr %p\n",
 			(unsigned long long) pattern, addr);
 }
 
+struct retry_entry {
+	uint32_t	retries;
+	uint32_t	seq;
+	int		status;
+};
+
 struct task {
 	unsigned int		nr;
 	unsigned int		pending;
+	int			trace;
 	unsigned int		unacked;
 	struct sockaddr_in	src_addr;	/* same for all tasks */
 	struct sockaddr_in	dst_addr;
@@ -846,6 +991,13 @@
 	uint16_t		recv_index;
 	struct timeval *	send_time;
 	struct header *		ack_header;
+	struct header *         ack2_header;
+	struct header *         req_header;
+	uint64_t *		retry_token;
+	uint32_t		retries;
+	uint32_t            	last_retry_seq;
+	uint32_t		retry_index;
+
 
 	/* RDMA related stuff */
 	uint64_t **		local_buf;
@@ -865,7 +1017,11 @@
 	/* We use mmap here rather than malloc, because it is always
 	 * page aligned. */
 	len = 2 * opts->nr_tasks * opts->req_depth * (opts->rdma_vector * opts->rdma_size) + sys_page_size;
+#if defined(__SVR4) && defined(__sun)
+	base = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+#else	
 	base = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
+#endif
 	if (base == MAP_FAILED)
 		die_errno("alloc_rdma_buffers: mmap failed");
 	memset(base, 0x2f, len);
@@ -915,17 +1071,16 @@
 	if (RDMA_OP_READ == hdr->rdma_op) {
 		if (opt.verify)
 			rds_fill_buffer(rdma_addr, rdma_size, hdr->rdma_pattern);
-		trace("Requesting RDMA read for pattern %Lx "
-				"local addr to rdma read %p\n",
-				(unsigned long long) hdr->rdma_pattern,
+		trace("Requesting RDMA read for pattern 0x%llx "
+				"local addr to rdma read 0x%p\n",
+				(unsigned long long) hdr->rdma_pattern,
+				rdma_addr);
 	} else {
 		if (opt.verify)
 			rds_fill_buffer(rdma_addr, rdma_size, 0);
-		trace("Requesting RDMA write for pattern %Lx "
-				"local addr to rdma write %p\n",
-				(unsigned long long) hdr->rdma_pattern,
-				rdma_addr);
+
+		trace("Requesting RDMA write for pattern 0x%llx\n",
+				(unsigned long long) hdr->rdma_pattern);
 	}
 }
 
@@ -947,7 +1102,7 @@
 		die("Unexpected RDMA op %u in request\n", in_hdr->rdma_op);
 
 
-	trace("RDS received request to issue rdma %s len %lu rva %Lx key %Lx pattern %Lx\n",
+	trace("RDS received request to issue rdma %s len %lu rva 0x%Lx key 0x%Lx pattern 0x%Lx\n",
 		in_hdr->rdma_op == RDMA_OP_WRITE? "write to" : "read from",
 		rdma_size,
 		(unsigned long long) in_hdr->rdma_addr,
@@ -966,20 +1121,32 @@
 	hdr->rdma_vector = in_hdr->rdma_vector;
 }
 
-static inline unsigned int rdma_user_token(struct task *t, unsigned int qindex)
+static inline uint64_t rdma_user_token(struct task *t, unsigned int qindex,  unsigned int type, uint32_t seq)
 {
-	return t->nr * opt.req_depth + qindex;
+	uint64_t tmp = seq;
+	return (tmp << 32) | ((t->nr * opt.req_depth + qindex) << 2 | type);
 }
 
-static void rdma_mark_completed(struct task *tasks, unsigned int token, int status)
+static void rdma_mark_completed(struct task *tasks, uint64_t token, int status, struct options *opts)
 {
 	struct task *t;
 	unsigned int i;
-
-	trace("RDS rdma completion for token %x\n", token);
-
-	t = &tasks[token / opt.req_depth];
-	i = token % opt.req_depth;
+	struct header *hdr = NULL;
+	uint32_t seq = token >> 32;
+	unsigned int type = token & 0x03;
+	unsigned int index = (token & 0xFFFFFFFF) >> 2;
+
+	trace("RDS rdma completion for token 0x%" PRIx64 "\n", token);
+
+	t = &tasks[index / opt.req_depth];
+	i = index % opt.req_depth;
+
+	if (opts->async) {
+		if (type == OP_REQ)
+			hdr = &t->req_header[i];
+		else
+			hdr = &t->ack2_header[i];
+	}
 
 	if (status) {
 		const char *errmsg;
@@ -987,26 +1154,59 @@
 		switch (status) {
 		case RDS_RDMA_REMOTE_ERROR:
 			errmsg = "remote error"; break;
-		case RDS_RDMA_CANCELED:
+		case RDS_RDMA_SEND_DROPPED:
+			errmsg = "operation was dropped"; break;
+		case RDS_RDMA_SEND_CANCELED:
 			errmsg = "operation was cancelled"; break;
-		case RDS_RDMA_DROPPED:
-			errmsg = "operation was dropped"; break;
-		case RDS_RDMA_OTHER_ERROR:
+		case RDS_RDMA_SEND_OTHER_ERROR:
 			errmsg = "other error"; break;
 		default:
 			errmsg = "unknown error"; break;
 		}
 
-		printf("%s:%u: RDMA op %u failed: %s\n",
+		trace("%s:%u: %s failed: %s\n",
 				inet_ntoa(t->dst_addr.sin_addr),
 				ntohs(t->dst_addr.sin_port),
-				i, errmsg);
+				type ? "SEND" : "RDMA",
+				errmsg);
+
+		if (hdr &&
+			(status == RDS_RDMA_SEND_DROPPED ||
+			 status == RDS_RDMA_REMOTE_ERROR)) {
+
+			if (hdr->seq == seq) {
+				hdr->retry = 1;
+				if (hdr->seq > t->last_retry_seq) {
+					if (status == RDS_RDMA_REMOTE_ERROR)
+						hdr->rdma_remote_err = 1;
+					t->retry_token[t->retry_index] = token;
+					t->retry_index = (t->retry_index + 1) %
+						(2 * opts->req_depth);
+					t->retries += 1;
+					t->last_retry_seq = hdr->seq;
+					if (t->retries > 2 * opts->req_depth)
+						die("Exceeded MAX retry entries..\n");
+				}
+			} else
+				die("SEQ Out-Of-Sync: %u/%u\n", hdr->seq, seq);
+		} else if (hdr) {
+			hdr->pending = 0;
+			hdr->retry = 0;
+			hdr->rdma_remote_err = 0;
+		}
+	} else if (hdr) {
+		hdr->pending = 0;
+		hdr->retry = 0;
+		hdr->rdma_remote_err = 0;
 	}
 
 	t->rdma_inflight[i] = 0;
 	t->drain_rdmas = 0;
 }
 
+#if defined(__SVR4) && defined(__sun)
+#undef MSG_MAXIOVLEN
+#endif
 #define MSG_MAXIOVLEN 2
 
 /*
@@ -1018,11 +1218,14 @@
 	static char ctlbuf[1024];
 	struct cmsghdr *cmsg;
 
-	msg->msg_control = ctlbuf;
-	msg->msg_controllen = CMSG_SPACE(size);
-
-	cmsg = CMSG_FIRSTHDR(msg);
-	cmsg->cmsg_level = sol;
+	if (!msg->msg_control) {
+		/* First cmsg on this message: start at the head of ctlbuf. */
+		msg->msg_control = ctlbuf;
+		msg->msg_controllen = 0;
+	}
+	cmsg = (struct cmsghdr *)((char *)msg->msg_control + msg->msg_controllen);
+	msg->msg_controllen += CMSG_SPACE(size);
+	cmsg->cmsg_level = sol;
 	cmsg->cmsg_type = type;
 	cmsg->cmsg_len = CMSG_LEN(size);
 	memcpy(CMSG_DATA(cmsg), ptr, size);
@@ -1034,7 +1237,7 @@
  * the ACK packet.
  */
 static void rdma_build_cmsg_xfer(struct msghdr *msg, const struct header *hdr,
-		unsigned int user_token, void *local_buf)
+		uint64_t user_token, void *local_buf)
 {
 
 #define RDS_MAX_IOV 512 /* FIX_ME - put this into rds.h or use socket max ?*/
@@ -1048,7 +1251,7 @@
 	rdma_size = hdr->rdma_size;
 	rdma_vector = hdr->rdma_vector;
 
-	trace("RDS issuing rdma for token %x key %Lx len %u local_buf %p vector %u\n",
+	trace("RDS issuing rdma for token 0x%" PRIx64 " key 0x%llx len %u local_buf %p vector %u\n",
 			user_token,
 			(unsigned long long) hdr->rdma_key,
 			rdma_size, local_buf,
@@ -1102,6 +1305,15 @@
 	rdma_put_cmsg(msg, RDS_CMSG_RDMA_ARGS, &args, sizeof(args));
 }
 
+static void build_cmsg_async_send(struct msghdr *msg, uint64_t user_token)
+{
+	struct rds_asend_args  args;
+	memset(&args, 0, sizeof(args));	/* stack object: clear every byte before it is copied into the cmsg */
+	args.flags = RDS_SEND_NOTIFY_ME;	/* plain assignment; there are no earlier flags to preserve */
+	args.user_token = user_token;
+	rdma_put_cmsg(msg, RDS_CMSG_ASYNC_SEND, &args, sizeof(args));
+}
+
 static void rdma_build_cmsg_dest(struct msghdr *msg, rds_rdma_cookie_t rdma_dest)
 {
 	rdma_put_cmsg(msg, RDS_CMSG_RDMA_DEST, &rdma_dest, sizeof(rdma_dest));
@@ -1174,19 +1386,17 @@
 	hdr->index = qindex;
 }
 
-static int send_packet(int fd, struct task *t,
-		struct header *hdr, unsigned int size)
+static int send_msg(int fd, struct task *t, struct header *hdr,
+		    unsigned int size, struct options *opts, 
+		    struct child_control *ctl)
 {
-	unsigned char buf[size], *rdma_flight_recorder = NULL;
+	unsigned char buf[size];
+	uint8_t *rdma_flight_recorder = NULL;
 	rds_rdma_cookie_t cookie = 0;
 	struct msghdr msg;
 	struct iovec iov;
 	ssize_t ret;
 
-	/* Make sure we always have the current sequence number.
-	 * When we send ACK packets, the seq that gets filled in is
-	 * stale. */
-	hdr->seq = t->send_seq;
 	fill_hdr(buf, size, hdr);
 
 	memset(&msg, 0, sizeof(msg));
@@ -1198,27 +1408,10 @@
 	iov.iov_base = buf;
 	iov.iov_len = size;
 
-	/* If this is a REQ packet in which we pass the MR to the
-	 * peer, extract the RDMA cookie and pass it on in the control
-	 * message for now. */
-	if (hdr->op == OP_REQ && hdr->rdma_op != 0) {
-		if (hdr->rdma_key != 0) {
-			/* We used GET_MR to obtain a key */
-			rdma_build_cmsg_dest(&msg, hdr->rdma_key);
-			cookie = hdr->rdma_key;
-			hdr->rdma_key = 0;
-		} else {
-			/* Use the RDMA_MAP cmsg to have sendmsg do the
-			 * mapping on the fly. */
-			rdma_build_cmsg_map(&msg, hdr->rdma_addr,
-					    hdr->rdma_size * hdr->rdma_vector,
-					    &cookie);
-		}
-	}
 
 	/* If this is an ACK packet with RDMA, build the cmsg
-	 * header that goes with it. */
-	if (hdr->op == OP_ACK && hdr->rdma_op != 0) {
+	   * header that goes with it. */
+	if (hdr->op == OP_ACK && hdr->rdma_op != 0 && !hdr->rdma_remote_err) {
 		unsigned int qindex = hdr->index;
 
 		if (t->rdma_inflight[qindex] != 0) {
@@ -1230,14 +1423,33 @@
 			 *
 			 * We return one of the more obscure error messages,
 			 * which we recognize and handle in the top loop. */
-			trace("Drain RDMA 0x%x\n", rdma_user_token(t, qindex));
+			trace("Drain RDMA 0x%" PRIx64 "\n", rdma_user_token(t, qindex, 0, hdr->seq));
 			errno = EBADSLT;
 			return -1;
 		}
 		rdma_build_cmsg_xfer(&msg, hdr,
-				rdma_user_token(t, qindex),
+				rdma_user_token(t, qindex, 0, hdr->seq),
 				t->local_buf[qindex]);
 		rdma_flight_recorder = &t->rdma_inflight[qindex];
+	} else if (opts->async) {
+		if (hdr->op == OP_REQ)
+			build_cmsg_async_send(&msg,
+				rdma_user_token(t, hdr->index, OP_REQ, hdr->seq));
+		else
+			build_cmsg_async_send(&msg,
+				rdma_user_token(t, hdr->index, OP_ACK, hdr->seq));
+	}
+
+	if (hdr->op == OP_REQ && hdr->rdma_op != 0) {
+		if (hdr->rdma_key != 0) {
+			rdma_build_cmsg_dest(&msg, hdr->rdma_key);
+			cookie = hdr->rdma_key;
+			hdr->rdma_key = 0;
+		} else {
+			rdma_build_cmsg_map(&msg, hdr->rdma_addr,
+					hdr->rdma_size * hdr->rdma_vector,
+					&cookie);
+		}
 	}
 
 	ret = sendmsg(fd, &msg, 0);
@@ -1256,22 +1468,57 @@
 		 * lower 32bit of the cookie */
 		rdma_key_o_meter_add(cookie);
 	}
+
+	hdr->pending = 1;
+
+	return ret;
+}
+
+static int send_packet(int fd, struct task *t,
+		struct header *hdr, unsigned int size,
+		struct options *opts, struct child_control *ctl)
+{
+	ssize_t ret;
+
+	/* Make sure we always have the current sequence number.
+	 * When we send ACK packets, the seq that gets filled in is
+	 * stale. */
+	hdr->seq = t->send_seq;
+
+	ret = send_msg(fd, t, hdr, size, opts, ctl);
+	if (ret < 0) return ret;
+
 	t->send_seq++;
 	return ret;
 }
 
+/*
+ * Send hdr again through send_msg() exactly as it stands.  Unlike
+ * send_packet(), this neither rewrites hdr->seq nor advances t->send_seq.
+ */
+static int resend_packet(int fd, struct task *t,
+		struct header *hdr, unsigned int size,
+		struct options *opts, struct child_control *ctl)
+{
+	return send_msg(fd, t, hdr, size, opts, ctl);
+}
+
 static int send_one(int fd, struct task *t,
 		struct options *opts,
 		struct child_control *ctl)
 {
 	struct timeval start;
 	struct timeval stop;
-	struct header hdr;
+	struct header *hdr = &t->req_header[t->send_index]; 
 	int ret;
 
-	build_header(t, &hdr, OP_REQ, t->send_index);
+	if (opts->async && hdr->pending) {
+		return -1;
+	}
+
+	build_header(t, hdr, OP_REQ, t->send_index);
 	if (opts->rdma_size && t->send_seq > 10)
-		rdma_build_req(fd, &hdr, t,
+		rdma_build_req(fd, hdr, t,
 				opts->rdma_size,
 				opts->req_depth,
 				opts->rw_mode,
@@ -1279,7 +1526,7 @@
 
 
 	gettimeofday(&start, NULL);
-	ret = send_packet(fd, t, &hdr, opts->req_size);
+	ret = send_packet(fd, t, hdr, opts->req_size, opts, ctl);
 	gettimeofday(&stop, NULL);
 
 	if (ret < 0)
@@ -1302,10 +1549,15 @@
 		struct child_control *ctl)
 {
 	struct header *hdr = &t->ack_header[qindex];
+	struct header *hdr2 = &t->ack2_header[qindex];
 	ssize_t ret;
 
+	if (opts->async && hdr2->pending) {
+		return -1;
+	}
+
 	/* send an ack in response to the req we just got */
-	ret = send_packet(fd, t, hdr, opts->ack_size);
+	ret = send_packet(fd, t, hdr, opts->ack_size, opts, ctl);
 	if (ret < 0)
 		return ret;
 	if (ret != opts->ack_size)
@@ -1324,6 +1576,8 @@
 		break;
 	}
 
+	memcpy(hdr2, hdr, sizeof(struct header));
+
 	return ret;
 }
 
@@ -1354,8 +1608,49 @@
 			struct child_control *ctl,
 			int can_send, int do_work)
 {
+	struct header *hdr;
+	unsigned int index;
+	int req_size;
+	int num_retries = t->retries;
+	uint64_t token;
+	unsigned int type;
+	unsigned int index2;
+	unsigned int i;
+
+	while (opts->async && num_retries > 0) {
+		index = (t->retry_index - num_retries +
+			(2 * opts->req_depth)) % (2 * opts->req_depth);
+
+		token = t->retry_token[index];
+		type = token & 0x03;
+		index2 = (token & 0xFFFFFFFF) >> 2;
+		i = index2 % opts->req_depth;
+
+		if (type == OP_REQ)
+			hdr = &t->req_header[i];
+		else
+			hdr = &t->ack2_header[i];
+
+		if (!hdr->retry)
+			goto next;
+
+		if (hdr->op == OP_REQ)
+			req_size = opts->req_size;
+		else
+			req_size = opts->ack_size;
+
+		if (resend_packet(fd, t, hdr, req_size, opts, ctl) < 0) {
+			return -1;
+		}
+		hdr->retry = 0;
+next:
+		num_retries--;
+	}
+	t->last_retry_seq = t->retries = 0;
+
 	if (ack_anything(fd, t, opts, ctl, can_send) < 0)
 		return -1;
+
 	while (do_work && t->pending < opts->req_depth) {
 		if (!can_send)
 			goto eagain;
@@ -1375,7 +1670,8 @@
 		rds_rdma_cookie_t *cookie,
 		struct sockaddr_in *sin,
 		struct timeval *tstamp,
-		struct task *tasks)
+		struct task *tasks,
+		struct options *opts)
 {
 	struct cmsghdr *cmsg;
 	char cmsgbuf[256];
@@ -1398,15 +1694,16 @@
 
 	if (ret < 0)
 		return ret;
-	if (ret && ret < sizeof(struct header))
+	if (ret && !strcmp(RDS_VERSION, peer_version) &&
+		ret < sizeof(struct header))
 		die("recvmsg() returned short data: %zd", ret);
-	if (msg.msg_namelen < sizeof(struct sockaddr_in))
+	if (ret && msg.msg_namelen < sizeof(struct sockaddr_in))
 		die("socklen = %d < sizeof(sin) (%zu)\n",
 		    msg.msg_namelen, sizeof(struct sockaddr_in));
 
 	/* See if the message comes with a RDMA destination */
 	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
-		struct rds_rdma_notify notify;
+		struct rds_rdma_send_notify notify;
 
 		if (cmsg->cmsg_level != sol)
 			continue;
@@ -1432,11 +1729,11 @@
 			memcpy(cookie, CMSG_DATA(cmsg), sizeof(*cookie));
 			break;
 
-		case RDS_CMSG_RDMA_STATUS:
+		case RDS_CMSG_RDMA_SEND_STATUS:
 			if (cmsg->cmsg_len < CMSG_LEN(sizeof(notify)))
 				die("RDS_CMSG_RDMA_DEST data too small");
 			memcpy(&notify, CMSG_DATA(cmsg), sizeof(notify));
-			rdma_mark_completed(tasks, notify.user_token, notify.status);
+			rdma_mark_completed(tasks, notify.user_token, notify.status, opts);
 			break;
 		}
 	}
@@ -1445,7 +1742,8 @@
 
 static int recv_one(int fd, struct task *tasks,
 			struct options *opts,
-		struct child_control *ctl)
+		struct child_control *ctl,
+		struct child_control *all_ctl)
 {
 	char buf[max(opts->req_size, opts->ack_size)];
 	rds_rdma_cookie_t rdma_dest = 0;
@@ -1456,15 +1754,18 @@
 	uint16_t expect_index;
 	int task_index;
 	ssize_t ret;
-
-	ret = recv_message(fd, buf, sizeof(buf), &rdma_dest, &sin, &tstamp, tasks);
+	int	check_status;
+
+
+	ret = recv_message(fd, buf, sizeof(buf), &rdma_dest, &sin, &tstamp, tasks, opts);
 	if (ret < 0)
 		return ret;
 
 	/* If we received only RDMA completions or cong updates,
 	 * ret will be 0 */
-	if (ret == 0)
+	if (ret == 0) {
 		return 0;
+	}
 
 	/* check the incoming sequence number */
 	task_index = ntohs(sin.sin_port) - opts->starting_port - 1;
@@ -1508,15 +1809,31 @@
 	hdr.to_port = t->src_addr.sin_port;
 	hdr.index = expect_index;
 
-	if (check_hdr(buf, ret, &hdr))
-		die("header from %s:%u to id %u bogus\n",
-		    inet_ntoa(sin.sin_addr), htons(sin.sin_port),
-		    ntohs(t->src_addr.sin_port));
+	check_status = check_hdr(buf, ret, &hdr, opts);
+	if (check_status) {
+		if (check_status > 0) {
+			die("header from %s:%u to id %u bogus\n",
+		    	inet_ntoa(sin.sin_addr), htons(sin.sin_port),
+		    	ntohs(t->src_addr.sin_port));
+		} else
+			return 0;
+	}
 
 	if (hdr.op == OP_ACK) {
-		stat_inc(&ctl->cur[S_RTT_USECS],
-			 usec_sub(&tstamp, &t->send_time[expect_index]));
-		t->pending -= 1;
+                uint64_t rtt_time = 
+                  usec_sub(&tstamp, &t->send_time[expect_index]);
+
+		stat_inc(&ctl->cur[S_RTT_USECS], rtt_time);
+                if (rtt_time > rtt_threshold)
+			print_outlier("Found RTT = 0x%" PRIx64 "\n", rtt_time);
+
+                if (show_histogram)
+                {
+                  ctl->latency_histogram[get_bucket(rtt_time)]++;
+                }
+
+		if (t->pending > 0)
+			t->pending -= 1;
 
 		if (in_hdr.rdma_key)
 			rdma_process_ack(fd, &in_hdr, ctl);
@@ -1549,6 +1866,7 @@
 }
 
 static void run_child(pid_t parent_pid, struct child_control *ctl,
+			struct child_control *all_ctl,
 		      struct options *opts, uint16_t id, int active)
 {
 	struct sockaddr_in sin;
@@ -1559,8 +1877,15 @@
 	struct task tasks[opts->nr_tasks];
 	struct timeval start;
         int do_work = opts->simplex ? active : 1;
-
+	int j;
+
+
+#if defined(__SVR4) && defined(__sun)
+	set_my_lgrp();
+	sin.sin_family = AF_INET_OFFLOAD;
+#else
 	sin.sin_family = AF_INET;
+#endif
 	sin.sin_port = htons(opts->starting_port + 1 + id);
 	sin.sin_addr.s_addr = htonl(opts->receive_addr);
 
@@ -1572,7 +1897,11 @@
 	for (i = 0; i < opts->nr_tasks; i++) {
 		tasks[i].nr = i;
 		tasks[i].src_addr = sin;
+#if defined(__SVR4) && defined(__sun)
+		tasks[i].dst_addr.sin_family = AF_INET_OFFLOAD;
+#else
 		tasks[i].dst_addr.sin_family = AF_INET;
+#endif
 		tasks[i].dst_addr.sin_addr.s_addr = htonl(opts->send_addr);
 		tasks[i].dst_addr.sin_port = htons(opts->starting_port + 1 + i);
 		tasks[i].send_time = alloca(opts->req_depth * sizeof(struct timeval));
@@ -1581,6 +1910,15 @@
 		tasks[i].rdma_buf = alloca(opts->req_depth * sizeof(uint64_t *));
 		tasks[i].local_buf = alloca(opts->req_depth * sizeof(uint64_t *));
 		tasks[i].ack_header = alloca(opts->req_depth * sizeof(struct header));
+		tasks[i].ack2_header = alloca(opts->req_depth * sizeof(struct header));
+		for (j=0;j<opts->req_depth;j++)
+			tasks[i].ack2_header[j].pending = 0;
+
+		tasks[i].req_header = alloca(opts->req_depth * sizeof(struct header));
+		for (j=0;j<opts->req_depth;j++)
+			tasks[i].req_header[j].pending = 0;
+
+		tasks[i].retry_token = alloca(2 * opts->req_depth * sizeof(uint64_t));
 		tasks[i].rdma_next_op = (i & 1)? RDMA_OP_READ : RDMA_OP_WRITE;
 	}
 
@@ -1611,7 +1949,7 @@
 
 		check_parent(parent_pid);
 
-		ret = poll(&pfd, 1, -1);
+		ret = poll(&pfd, 1, 1000);
 		if (ret < 0) {
 			if (errno == EINTR)
 				continue;
@@ -1621,10 +1959,14 @@
 		pfd.events = POLLIN;
 
 		if (pfd.revents & POLLIN) {
-			while (recv_one(fd, tasks, opts, ctl) >= 0)
+			while (recv_one(fd, tasks, opts, ctl, all_ctl) >= 0)
 				;
 		}
 
+		/* stop sending if in shutdown phase */
+		if (ctl->stopping)
+			continue;
+
 		/* keep the pipeline full */
 		can_send = !!(pfd.revents & POLLOUT);
 		for (i = 0, t = tasks; i < opts->nr_tasks; i++, t++) {
@@ -1633,6 +1975,7 @@
 			if (t->drain_rdmas)
 				continue;
 			if (send_anything(fd, t, opts, ctl, can_send, do_work) < 0) {
+
 				pfd.events |= POLLOUT;
 
 				/* If the send queue is full, we will see EAGAIN.
@@ -1665,8 +2008,12 @@
 	uint32_t i;
 
 	len = opts->nr_tasks * sizeof(*ctl);
+#if defined(__SVR4) && defined(__sun)
+	ctl = (struct child_control *)mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, -1, 0);
+#else
 	ctl = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED,
 		   0, 0);
+#endif
 	if (ctl == MAP_FAILED)
 		die("mmap of %u child control structs failed", opts->nr_tasks);
 
@@ -1688,7 +2035,7 @@
 				control_fd = -1;
 			}
 			rdma_key_o_meter_set_self(i);
-			run_child(parent, ctl + i, opts, i, active);
+			run_child(parent, ctl + i, ctl, opts, i, active);
 			exit(0);
 		}
 		ctl[i].pid = pid;
@@ -1699,7 +2046,7 @@
 			continue;
 		pid = waitpid(-1, NULL, WNOHANG);
 		if (pid)
-			die("child %u (pid %u) exited\n", i, pid);
+			die("child %u (pid %u) exited\n", i, (unsigned int)pid);
 		sleep(1);
 		i--; /* try this child again */
 	}
@@ -1823,6 +2170,7 @@
 
 	if (disable)
 		return;
+#if !(defined(__SVR4) && defined(__sun))
 	if ((fp = fopen("/proc/stat", "r")) == NULL) {
 		fprintf(stderr, "Cannot open /proc/stat (%s) - "
 				"not printing cpu stats\n",
@@ -1856,10 +2204,37 @@
 		}
 	}
 	fclose(fp);
+#else
+#define NSEC_TO_TICK(v)		(v * sysconf(_SC_CLK_TCK)/1000000000)
+	sol_cpu_stats_t		stats;	
+
+	solaris_init();
+	if ((sol_get_cpu_stats(&stats)) < 0) {
+		disable = 1;
+		return;
+	}
+	solaris_fini();
+	current.times[0] = stats.t_user;
+	current.times[1] = 0;
+	current.times[2] = stats.t_kernel;
+	current.times[3] = stats.t_idle;
+	current.times[4] = stats.t_iowait;
+	current.times[5] = 0;
+	current.times[6] = 0;
+	current.times[7] = 0;
+	current.intr = NSEC_TO_TICK(stats.t_intr);	/* NSEC_TO_TICK */
+
+#endif
 
 	if (initialize) {
+#if !(defined(__SVR4) && defined(__sun))
 		printf(",user:percent,system:percent,idle:percent"
 		       ",irq:percent,intr:count");
+#else
+		/* Solaris kstat doesn't provide irq/softirq info. */
+		printf(",user:percent,system:percent,idle:percent"
+		       ",intr:count");
+#endif
 	} else {
 		struct sys_stats sys;
 		unsigned long sum = 0;
@@ -1884,12 +2259,21 @@
 		 *  5	irq
 		 *  6	softirq
 		 */
+#if !(defined(__SVR4) && defined(__sun))
 		printf(",%f,%f,%f,%f,%Lu",
 			(sys.times[0] + sys.times[1]) * scale,
 			sys.times[2] * scale,
 			(sys.times[3] + sys.times[4]) * scale,
 			(sys.times[5] + sys.times[6]) * scale,
 			sys.intr);
+#else
+		/* Solaris kstat doesn't provide irq/softirq info. */
+		printf(",%f,%f,%f,%Lu",
+			(sys.times[0] + sys.times[1]) * scale,
+			sys.times[2] * scale,
+			(sys.times[3] + sys.times[4]) * scale,
+			sys.intr);
+#endif
 	}
 	prev = current;
 }
@@ -1903,6 +2287,10 @@
 	static socklen_t buflen = 0;
 	static int sock_fd = -1;
 	int i, count, item_size;
+#if defined(__SVR4) && defined(__sun)
+	socklen_t len;
+	struct rds_info_arg arg;
+#endif
 
 	if (sock_fd < 0) {
 		sock_fd = socket(pf, SOCK_SEQPACKET, 0);
@@ -1912,6 +2300,7 @@
 
 	/* We should only loop once on the first call; after that the
 	 * buffer requirements for RDS counters should not change. */
+#if !(defined(__SVR4) && defined(__sun))
 	while ((item_size = getsockopt(sock_fd, sol, RDS_INFO_COUNTERS, curr, &buflen)) < 0) {
 		if (errno != ENOSPC)
 			die_errno("getsockopt(RDS_INFO_COUNTERS) failed");
@@ -1919,6 +2308,28 @@
 		if (!curr)
 			die_errno("Cannot allocate buffer for stats counters");
 	}
+#else
+	int retcode;
+
+	retcode = sol_ioctl(
+	    sock_fd, RDS_INFO_COUNTERS, &arg, &buflen, &item_size);
+	if (retcode != 0) {
+		if (retcode == 1) {
+			die_errno("ioctl(RDS_INFO_COUNTERS) failed");
+		} else if (retcode == 2) {
+			fprintf(stderr, "%s: Unable to allocate memory "
+			    "for %u bytes of info: %s\n",
+			    "rds-stress", buflen, strerror(errno));
+			return;
+		} else {
+			fprintf(stderr, "%s: Unable to access "
+			    "RDS_INFO_COUNTERS statistics: %s\n",
+			    "rds-stress", strerror(errno));
+			return;
+		}
+	}
+
+#endif
 
 	if (item_size > sizeof(*ctr))
 		die("Bad counter item size in RDS_INFO_COUNTERS (got %d, max %zd)\n",
@@ -1932,8 +2343,11 @@
 	}
 
 	for (i = 0; i < count; ++i)
+#if !(defined(__SVR4) && defined(__sun))
 		memcpy(ctr + i, curr + i * item_size, item_size);
-
+#else
+		memcpy(ctr + i, ((void *)(uintptr_t)arg.datap) + i * item_size, item_size);
+#endif
 	gettimeofday(&now, NULL);
 
 	if (initialize) {
@@ -1957,6 +2371,10 @@
 	memcpy(prev, ctr, count * sizeof(*ctr));
 	last_ts = now;
 
+#if defined(__SVR4) && defined(__sun)
+	free((void *)(uintptr_t)arg.datap);
+#endif
+
 	get_stats(initialize);
 }
 
@@ -1967,7 +2385,7 @@
 
 	pid = waitpid(-1, &status, wflags);
 	if (pid < 0)
-		die("waitpid returned %u", pid);
+		die("waitpid returned %u", (int)pid);
 	if (pid == 0)
 		return 0;
 
@@ -1975,15 +2393,15 @@
 		if (WEXITSTATUS(status) == 0)
 			return 1;
 		die("child pid %u exited with status %d\n",
-				pid, WEXITSTATUS(status));
+				(int)pid, WEXITSTATUS(status));
 	}
 	if (WIFSIGNALED(status)) {
 		if (WTERMSIG(status) == SIGTERM)
 			return 1;
 		die("child pid %u exited with signal %d\n",
-				pid, WTERMSIG(status));
+				(int)pid, WTERMSIG(status));
 	}
-	die("child pid %u wait status %d\n", pid, status);
+	die("child pid %u wait status %d\n", (int)pid, status);
 }
 
 static void release_children_and_wait(struct options *opts,
@@ -1995,8 +2413,12 @@
 	struct counter summary[NR_STATS];
 	struct timeval start, end, now, first_ts, last_ts;
 	double cpu_total = 0;
-	uint16_t i, cpu_samples = 0;
+	uint16_t i, j, cpu_samples = 0;
 	uint16_t nr_running;
+        uint64_t latency_histogram[MAX_BUCKETS];
+
+	if (show_histogram) 
+	        memset(latency_histogram, 0, sizeof(latency_histogram));
 
 	gettimeofday(&start, NULL);
 	start.tv_sec += 2;
@@ -2139,6 +2561,11 @@
 	control_fd = -1;
 
 	if (nr_running) {
+		/* let everything gracefully stop before we kill the chillins */
+		for (i = 0; i < opts->nr_tasks; i++)
+			ctl[i].stopping = 1;
+		sleep(1);
+
 		for (i = 0; i < opts->nr_tasks; i++)
 			kill(ctl[i].pid, SIGTERM);
 		stop_soakers(soak_arr);
@@ -2167,6 +2594,19 @@
 			avg(&summary[S_SENDMSG_USECS]),
 			avg(&summary[S_RTT_USECS]),
 			soak_arr? scale * cpu_total : -1.0);
+
+		if (show_histogram) 
+		{
+			for (i = 0; i < opts->nr_tasks; i++)
+			  for (j=0;j < MAX_BUCKETS; j++)
+			    latency_histogram[j] += ctl[i].latency_histogram[j];
+			    
+			printf("\nRTT histogram\n");
+			printf("RTT (us)        \t\t    Count\n");
+			for (i=0;i < MAX_BUCKETS; i++)
+			  printf("[%6u - %6u] \t\t %8u\n", 1 << i, 1 << (i+1), 
+			         (unsigned int)latency_histogram[i]);
+		}
 	}
 }
 
@@ -2220,6 +2660,21 @@
 {
 	ssize_t ret;
 
+	if (size == sizeof(struct options)) {
+		memset(ptr, 0, size);
+		ret = read(fd, peer_version, VERSION_MAX_LEN);
+		if (ret != VERSION_MAX_LEN)
+			die_errno("Failed to read version");
+
+		if (strcmp(peer_version, RDS_VERSION)) {
+			ptr += ret;
+			memcpy(ptr, peer_version, VERSION_MAX_LEN);
+			size = sizeof(struct options_2_0_6) - ret;
+		} else
+			size -= ret;
+		ptr += ret;
+	}
+
 	while (size) {
 		ret = read(fd, ptr, size);
 		if (ret < 0)
@@ -2233,6 +2688,7 @@
 
 static void encode_options(struct options *dst, const struct options *src)
 {
+	memcpy(dst->version, src->version, VERSION_MAX_LEN);
 	dst->req_depth = htonl(src->req_depth);
 	dst->req_size = htonl(src->req_size);
 	dst->ack_size = htonl(src->ack_size);
@@ -2262,10 +2718,13 @@
         dst->simplex = src->simplex;                    /* byte sized */
         dst->rw_mode = src->rw_mode;                    /* byte sized */
         dst->rdma_vector = htonl(src->rdma_vector);
+	dst->tos = src->tos;
+	dst->async = src->async;
 }
 
 static void decode_options(struct options *dst, const struct options *src)
 {
+	memcpy(dst->version, src->version, VERSION_MAX_LEN);
 	dst->req_depth = ntohl(src->req_depth);
 	dst->req_size = ntohl(src->req_size);
 	dst->ack_size = ntohl(src->ack_size);
@@ -2295,6 +2754,8 @@
         dst->simplex = src->simplex;                    /* byte sized */
         dst->rw_mode = src->rw_mode;                    /* byte sized */
 	dst->rdma_vector = ntohl(src->rdma_vector);
+	dst->tos = src->tos;
+	dst->async = src->async;
 }
 
 static void verify_option_encdec(const struct options *opts)
@@ -2316,6 +2777,25 @@
 		die("encode/decode check of options struct failed");
 }
 
+/* Ask RDS to reset the receive_addr -> send_addr connection for opts->tos. */
+static void reset_conn(struct options *opts)
+{
+	/* zero-filled so no uninitialized bytes reach the bind or setsockopt() */
+	struct rds_reset val = { 0 };
+	struct sockaddr_in sin = { 0 };
+	int fd;
+
+	sin.sin_family = AF_INET;
+	sin.sin_port = htons(opts->starting_port);
+	sin.sin_addr.s_addr = htonl(opts->receive_addr);
+	fd = bound_socket(pf, SOCK_SEQPACKET, 0, &sin);
+	val.tos = opts->tos;
+	val.src.s_addr = htonl(opts->receive_addr);
+	val.dst.s_addr = htonl(opts->send_addr);
+	if (setsockopt(fd, sol, RDS_CONN_RESET, &val, sizeof(val)))
+		die_errno("setsockopt RDS_CONN_RESET failed");
+	close(fd);	/* the socket was only needed to carry the request */
+}
+
 static int active_parent(struct options *opts, struct soak_control *soak_arr)
 {
 	struct options enc_options;
@@ -2324,6 +2804,11 @@
 	int fd;
 	uint8_t ok;
 
+	if (reset_connection) {
+		reset_conn(opts);
+		return 0;
+	}
+
 	if (opts->show_params) {
 		unsigned int k;
 
@@ -2387,7 +2872,11 @@
 	 * We just tell the peer what options to use.
 	 */
 	encode_options(&enc_options, opts);
-	peer_send(fd, &enc_options, sizeof(struct options));
+	if (opts->tos || opts->async)
+		peer_send(fd, &enc_options, sizeof(struct options));
+	else
+		peer_send(fd, &enc_options.req_depth,
+				sizeof(struct options_2_0_6));
 
 	printf("negotiated options, tasks will start in 2 seconds\n");
 	ctl = start_children(opts, 1);
@@ -2517,7 +3006,11 @@
 	/* an extra terminating entry which will be all 0s */
 	len = (nr_soak + 1) * sizeof(struct soak_control);
 	soak_arr = mmap(NULL, len, PROT_READ|PROT_WRITE,
+#if defined(__SVR4) && defined(__sun)
+			MAP_ANONYMOUS|MAP_SHARED, -1, 0);
+#else
 			MAP_ANONYMOUS|MAP_SHARED, 0, 0);
+#endif
 	if (soak_arr == MAP_FAILED)
 		die("mmap of %ld soak control structs failed", nr_soak);
 
@@ -2572,6 +3065,10 @@
 	OPT_CONNECT_RETRIES,
 	OPT_USE_CONG_MONITOR,
 	OPT_PERFDATA,
+        OPT_SHOW_OUTLIERS,
+        OPT_SHOW_HISTOGRAM,
+	OPT_RESET,
+	OPT_ASYNC,
 };
 
 static struct option long_options[] = {
@@ -2584,11 +3081,13 @@
 { "send-addr",		required_argument,	NULL,	's'	},
 { "port",		required_argument,	NULL,	'p'	},
 { "time",		required_argument,	NULL,	'T'	},
+{ "tos",                required_argument,      NULL,   'Q'     },
 { "report-cpu",		no_argument,		NULL,	'c'	},
 { "report-summary",	no_argument,		NULL,	'z'	},
 { "rtprio",		no_argument,		NULL,	'R'	},
 { "verify",		no_argument,		NULL,	'v'	},
 { "trace",		no_argument,		NULL,	'V'	},
+{ "lgrpid",		required_argument,	NULL,	'g'	},
 
 { "rdma-use-once",	required_argument,	NULL,	OPT_RDMA_USE_ONCE },
 { "rdma-use-get-mr",	required_argument,	NULL,	OPT_RDMA_USE_GET_MR },
@@ -2601,6 +3100,10 @@
 { "show-perfdata",	no_argument,		NULL,	OPT_PERFDATA },
 { "connect-retries",	required_argument,	NULL,	OPT_CONNECT_RETRIES },
 { "use-cong-monitor",	required_argument,	NULL,	OPT_USE_CONG_MONITOR },
+{ "show-outliers",      required_argument,      NULL,   OPT_SHOW_OUTLIERS    },
+{ "show-histogram",     no_argument,            NULL,   OPT_SHOW_HISTOGRAM   },
+{ "reset",              no_argument,            NULL,   OPT_RESET },
+{ "async",              no_argument,            NULL,   OPT_ASYNC },
 
 { NULL }
 };
@@ -2640,6 +3143,8 @@
 	opts.use_cong_monitor = 1;
 	opts.rdma_use_fence = 1;
 	opts.rdma_cache_mrs = 0;
+	opts.rdma_use_once = 0;
+	opts.rdma_use_get_mr = 0;
 	opts.rdma_alignment = 0;
 	opts.rdma_key_o_meter = 0;
 	opts.show_params = 0;
@@ -2648,11 +3153,17 @@
         opts.simplex = 0;
         opts.rw_mode = 0;
 	opts.rdma_vector = 1;
+        rtt_threshold = ~0U;
+        show_histogram = 0;
+	opts.tos = 0;
+	reset_connection = 0;
+	opts.async = 0;
+	strcpy(opts.version, RDS_VERSION);
 
 	while(1) {
 		int c, index;
 
-		c = getopt_long(argc, argv, "+a:cD:d:hI:M:op:q:Rr:s:t:T:vVz",
+		c = getopt_long(argc, argv, "+a:cD:d:hI:M:op:q:Rr:s:t:T:Q:vVg:z",
 				long_options, &index);
 		if (c == -1)
 			break;
@@ -2702,6 +3213,9 @@
 			case 'T':
 				opts.run_time = parse_ull(optarg, (uint32_t)~0);
 				break;
+			case 'Q':
+				opts.tos = parse_ull(optarg, (uint8_t)~0);
+				break;
 			case 'z':
 				opts.summary_only = 1;
 				break;
@@ -2711,9 +3225,25 @@
 			case 'V':
 				opts.tracing = 1;
 				break;
+			case 'g':
+				lgrp_id = (lgrp_id_t)parse_ull(optarg,
+				    (uint32_t)~0);
+				break;
+                        case OPT_SHOW_OUTLIERS:
+                                rtt_threshold = parse_ull(optarg, ~0U);
+                                break;
+                        case OPT_SHOW_HISTOGRAM:
+                                show_histogram = 1;
+                                break;
 			case OPT_USE_CONG_MONITOR:
 				opts.use_cong_monitor = parse_ull(optarg, 1);
 				break;
+			case OPT_RESET:
+				reset_connection = 1;
+				break;
+			case OPT_ASYNC:
+				opts.async = 1;
+				break;
 			case OPT_RDMA_USE_ONCE:
 				opts.rdma_use_once = parse_ull(optarg, 1);
 				break;
@@ -2786,6 +3316,7 @@
 	if (opts.rdma_size && 0)
 		opts.rdma_size = (opts.rdma_size + 4095) & ~4095;
 
+	set_my_lgrp();
 	opt = opts;
 	return active_parent(&opts, soak_arr);
 }
diff -r 1afa90e87b4b rds.7
--- a/rds.7	Tue Feb 23 22:30:57 2016 -0800
+++ b/rds.7	Tue Feb 23 22:30:58 2016 -0800
@@ -6,6 +6,7 @@
 .nf
 .B #include <sys/socket.h>
 .B #include <netinet/in.h>
+.B #include <sys/rds.h>
 .fi
 .SH DESCRIPTION
 This is an implementation of the RDS socket API. It provides reliable,
@@ -14,18 +15,13 @@
 Currently, RDS can be transported over Infiniband, and loopback.
 RDS over TCP is disabled, but will be re-enabled in the near future.
 .PP
-RDS uses standard
-.B AF_INET
-addresses as described in
-.BR ip (7)
+RDS uses the
+.BR AF_INET_OFFLOAD " address family"
 to identify end points.
 .\"------------------------------------------------------------------
 .SS Socket Creation
 RDS is still in development and as such does not have a reserved protocol
-family constant. Applications must read the string representation of the
-protocol family value from the
-.B pf_rds
-sysctl parameter file described below.
+family constant.  Applications should use AF_INET_OFFLOAD.
 .PP
 .nf
 .B rds_socket = socket(pf_rds, SOCK_SEQPACKET, 0);
@@ -58,9 +54,6 @@
 .BR SOL_RDS ).
 Just as with the RDS protocol family, an official value has not been
 assigned yet, so the kernel will assign a value dynamically.
-The assigned value can be retrieved from the
-.B sol_rds
-sysctl parameter file.
 .PP
 RDS specific socket options will be described in a separate section
 below.
@@ -77,7 +70,7 @@
 .PP
 For instance, when binding to the address of an Infiniband interface
 such as
-.BR ib0 ,
+.BR ibd0 ,
 the socket will use the Infiniband transport. If RDS is not able
 to associate a transport with the given address, it will return
 .BR EADDRNOTAVAIL .
@@ -394,47 +387,6 @@
 be delivered in the order they're sent. Messages sent from different
 sockets, or to different destinations, may be delivered in any order.
 .\"------------------------------------------------------------------
-.SH SYSCTL VALUES
-These parameteres may only be accessed through their files in
-.BR /proc/sys/net/rds .
-Access through
-.BR sysctl (2)
-is not supported.
-.TP
-.B pf_rds
-This file contains the string representation of the protocol family
-constant passed to
-.BR socket (2)
-to create a new RDS socket.
-.TP
-.B sol_rds
-This file contains the string representation of the socket level parameter
-that is passed to
-.BR getsockopt (2)
-and
-.BR setsockopt (2)
-to manipulate RDS socket options.
-.TP
-.BR max_unacked_bytes " and " max_unacked_packets
-These parameters are used to tune the generation of acknowledgements. By
-default, the system receiving RDS messages does not send back explicit
-acknowledgements unless it transmits a message of its own (in which
-case the ACK is piggybacked onto the outgoing message), or when the sending
-system requests an ACK.
-.IP
-However, the sender needs to see an ACK from time to time so that it
-can purge old messages from the send queue. The unacked bytes and
-packet counters are used to keep track of how much data has been
-sent without requesting an ACK. The default is to request an acknowledgement
-every 16 packets, or every 16 MB, whichever comes first.
-.TP
-.BR reconnect_delay_min_ms " and " reconnect_delay_max_ms
-RDS uses host-to-host connections to transport RDS messages (both for the TCP
-and the Infiniband transport). If this connection breaks, RDS will try to
-re-establish the connection. Because this reconnect may be triggered by
-both hosts at the same time and fail, RDS uses a random backoff before attempting
-a reconnect. These two parameters specify the minimum and maximum delay in
-milliseconds. The default values are 1 and 1000, respectively.
 .SH SEE ALSO
 .BR rds-rdma (7),
 .BR socket (2),