15695368 SUNBT7017250 man page from rds-stress has incorect entry for rdma bytes transmi s11-update
authorBoris Chiu <boris.chiu@oracle.com>
Tue, 01 Jul 2014 14:49:32 -0700
branchs11-update
changeset 3195 cf6a5a756b74
parent 3193 e45380d8d511
child 3196 4c06db2d9388
15695368 SUNBT7017250 man page from rds-stress has incorect entry for rdma bytes transmi 17489343 Update solaris rds-tools to 2.0.7-1.12 18382333 libibverbs ibv_cmd_get_context() fails when mcxnex driver UAR space is exhausted 18422470 modify librdmacm on solaris to avoid opening device context per hca on startup 18728045 Include local copy of rds.h file to build NRM rds-tools
components/open-fabrics/libibverbs/solaris_compatibility.c
components/open-fabrics/librdmacm/patches/base.patch
components/open-fabrics/rds-tools/Makefile
components/open-fabrics/rds-tools/patches/base.patch
components/open-fabrics/rds-tools/rds.h
--- a/components/open-fabrics/libibverbs/solaris_compatibility.c	Thu Jun 26 11:51:15 2014 -0700
+++ b/components/open-fabrics/libibverbs/solaris_compatibility.c	Tue Jul 01 14:49:32 2014 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2010, 2014, Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -56,6 +56,8 @@
 #include <alloca.h>
 #include "../include/infiniband/arch.h"
 #include "../include/infiniband/verbs.h"
+#include "../include/infiniband/kern-abi.h"
+#include "../include/infiniband/driver.h"
 #include <errno.h>
 #include <pthread.h>
 #include <kstat.h>
@@ -207,6 +209,12 @@
 void __attribute__((constructor))solaris_init(void);
 void __attribute__((destructor))solaris_fini(void);
 
+int sol_ibv_query_device(struct ibv_device *device,
+    struct ibv_device_attr *device_attr);
+
+int sol_ibv_query_port(struct ibv_context *, uint8_t, struct ibv_port_attr *);
+
+
 void
 solaris_init(void)
 {
@@ -283,7 +291,6 @@
 {
 	ibdev_cache_info_t	info;
 	struct ibv_device	**root_dev_list, **dev_list = NULL;
-	struct ibv_context	*ctx = NULL;
 	struct ibv_device_attr	device_attr;
 	int			i, num_dev, dev_num, ret = 1;
 	uint64_t		guid;
@@ -298,17 +305,12 @@
 
 	for (i = 0; i < num_dev; i++, dev_list++) {
 
-		if (!(ctx = ibv_open_device(*dev_list))) {
-			fprintf(stderr, "failed to open device %p\n",
+		if (sol_ibv_query_device(*dev_list, &device_attr)) {
+			fprintf(stderr, "failed to query device %p\n",
 			    *dev_list);
 			goto error_exit2;
 		}
 
-		if (ibv_query_device(ctx, &device_attr)) {
-			fprintf(stderr, "failed to query device %p\n", ctx);
-			goto error_exit3;
-		}
-
 		guid = ntohll(device_attr.node_guid);
 		sprintf(info.ibd_node_guid_str, "%04x:%04x:%04x:%04x",
 		    (unsigned)(guid >> 48) & 0xffff,
@@ -339,7 +341,7 @@
 			p = ibdev + (strlen("mlx4_"));
 		} else {
 			fprintf(stderr, "Invalid device %s\n", ibdev);
-			goto error_exit3;
+			goto error_exit2;
 		}
 		dev_num = atoi(p);
 		(void) strcpy(info.ibd_name, ibdev);
@@ -349,16 +351,13 @@
 		if (ibdev_cache_add(dev_num, &info)) {
 			fprintf(stderr, "failed to add dev %d to ibdev cache\n",
 			    dev_num);
-			goto error_exit3;
+			goto error_exit2;
 		}
 	}
 
 	ret = 0;
 
 	/* clean up and Return */
-error_exit3:
-	if (ctx)
-		ibv_close_device(ctx);
 error_exit2:
 	if (root_dev_list)
 		ibv_free_device_list(root_dev_list);
@@ -747,10 +746,11 @@
     uint16_t **pkey_table)
 {
 	struct ibv_device 	**root_dev_list, **dev_list = NULL;
-	struct ibv_context 	*ctx = NULL;
+	struct ibv_context	ctx;
 	union ibv_gid 		*gids = NULL;
 	uint16_t		*pkeys = NULL;
 	int 			i, num_dev, rv, ret = 1;
+	char			uverbs_devpath[MAXPATHLEN];
 
 	root_dev_list = dev_list = ibv_get_device_list(&num_dev);
 	if (!dev_list) {
@@ -769,14 +769,17 @@
 		goto error_exit2;
 	}
 
-	if (!(ctx = ibv_open_device(*dev_list))) {
-		fprintf(stderr, "failed to open device %p\n", *dev_list);
+	snprintf(uverbs_devpath, MAXPATHLEN, "%s/%s", IB_OFS_DEVPATH_PREFIX,
+	    (*dev_list)->dev_name);
+
+	ctx.device = *dev_list;
+
+	if ((ctx.cmd_fd = open(uverbs_devpath, O_RDWR)) < 0)
 		goto error_exit2;
-	}
 
-	if (ibv_query_port(ctx, port_num, port_attr)) {
+	if (sol_ibv_query_port(&ctx, port_num, port_attr)) {
 		fprintf(stderr, "failed to query dev %p, port %d\n",
-		    ctx, port_num);
+		    &ctx, port_num);
 		goto error_exit3;
 	}
 
@@ -789,7 +792,7 @@
 		 * set high bit of port_num to get all gids in one shot.
 		 */
 		port_num |= 0x80;
-		rv = sol_ibv_query_gid(ctx, port_num, port_attr->gid_tbl_len,
+		rv = sol_ibv_query_gid(&ctx, port_num, port_attr->gid_tbl_len,
 		    gids);
 		if (rv != 0)
 			goto error_exit4;
@@ -808,7 +811,7 @@
 		 * set high bit of port_num to get all pkeys in one shot.
 		 */
 		port_num |= 0x80;
-		rv = sol_ibv_query_pkey(ctx, port_num, port_attr->pkey_tbl_len,
+		rv = sol_ibv_query_pkey(&ctx, port_num, port_attr->pkey_tbl_len,
 		    pkeys);
 		if (rv != 0)
 			goto error_exit5;
@@ -829,8 +832,8 @@
 	if (gids)
 		free(gids);
 error_exit3:
-	if (ctx)
-		ibv_close_device(ctx);
+	if (ctx.cmd_fd > 0)
+		close(ctx.cmd_fd);
 error_exit2:
 	if (root_dev_list)
 		ibv_free_device_list(root_dev_list);
@@ -1613,10 +1616,8 @@
 sol_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index,
     union ibv_gid *gid)
 {
-	char uverbs_devpath[MAXPATHLEN];
-	int uverbs_fd;
-	int count, start;
-	sol_uverbs_gid_t *uverbs_gidp;
+	int			count, start;
+	sol_uverbs_gid_t	*uverbs_gidp;
 
 	/*
 	 * Not exported via sysfs, use ioctl.
@@ -1625,12 +1626,6 @@
 	    ((port_num & 0x80) && (index == 0)))
 		return (-1);
 
-	snprintf(uverbs_devpath, MAXPATHLEN, "%s/%s", IB_OFS_DEVPATH_PREFIX,
-	    context->device->dev_name);
-
-	if ((uverbs_fd = open(uverbs_devpath, O_RDWR)) < 0)
-		return (-1);
-
 	if (port_num & 0x80) {
 		start = 0;
 		count = index;
@@ -1642,7 +1637,6 @@
 	uverbs_gidp = (sol_uverbs_gid_t *)malloc(count *
 	    sizeof (union ibv_gid) + sizeof (sol_uverbs_gid_t));
 	if (uverbs_gidp == NULL) {
-		close(uverbs_fd);
 		return (-1);
 	}
 
@@ -1650,7 +1644,7 @@
 	uverbs_gidp->uverbs_gid_cnt = count;
 	uverbs_gidp->uverbs_gid_start_index = start;
 
-	if (ioctl(uverbs_fd, UVERBS_IOCTL_GET_GIDS, uverbs_gidp) != 0) {
+	if (ioctl(context->cmd_fd, UVERBS_IOCTL_GET_GIDS, uverbs_gidp) != 0) {
 #ifdef	DEBUG
 		fprintf(stderr, "UVERBS_IOCTL_GET_GIDS failed: %s\n",
 		    strerror(errno));
@@ -1669,12 +1663,10 @@
 	}
 	memcpy(gid, uverbs_gidp->uverbs_gids, sizeof (union ibv_gid) * count);
 	free(uverbs_gidp);
-	close(uverbs_fd);
 	return (0);
 
 gid_error_exit:
 	free(uverbs_gidp);
-	close(uverbs_fd);
 	return (-1);
 }
 
@@ -1682,10 +1674,8 @@
 sol_ibv_query_pkey(struct ibv_context *context, uint8_t port_num,
     int index, uint16_t *pkey)
 {
-	char uverbs_devpath[MAXPATHLEN];
-	int uverbs_fd;
-	int count, start;
-	sol_uverbs_pkey_t *uverbs_pkeyp;
+	int			count, start;
+	sol_uverbs_pkey_t	*uverbs_pkeyp;
 
 	/*
 	 * Not exported via sysfs, use ioctl.
@@ -1694,9 +1684,7 @@
 	    ((port_num & 0x80) && (index == 0)))
 		return (-1);
 
-	snprintf(uverbs_devpath, MAXPATHLEN, "%s/%s", IB_OFS_DEVPATH_PREFIX,
-	    context->device->dev_name);
-	if ((uverbs_fd = open(uverbs_devpath, O_RDWR)) < 0)
+	if (context->cmd_fd < 0)
 		return (-1);
 
 	if (port_num & 0x80) {
@@ -1710,7 +1698,6 @@
 	uverbs_pkeyp = (sol_uverbs_pkey_t *)malloc(count *
 	    sizeof (uint16_t) + sizeof (sol_uverbs_pkey_t));
 	if (uverbs_pkeyp == NULL) {
-		close(uverbs_fd);
 		return (-1);
 	}
 
@@ -1718,7 +1705,7 @@
 	uverbs_pkeyp->uverbs_pkey_cnt = count;
 	uverbs_pkeyp->uverbs_pkey_start_index = start;
 
-	if (ioctl(uverbs_fd, UVERBS_IOCTL_GET_PKEYS, uverbs_pkeyp) != 0) {
+	if (ioctl(context->cmd_fd, UVERBS_IOCTL_GET_PKEYS, uverbs_pkeyp) != 0) {
 #ifdef	DEBUG
 		fprintf(stderr, "UVERBS_IOCTL_GET_PKEYS failed: %s\n",
 		    strerror(errno));
@@ -1737,12 +1724,57 @@
 	}
 	memcpy(pkey, uverbs_pkeyp->uverbs_pkey, sizeof (uint16_t) * count);
 	free(uverbs_pkeyp);
-	close(uverbs_fd);
 	return (0);
 
 pkey_error_exit:
 	free(uverbs_pkeyp);
-	close(uverbs_fd);
 	return (-1);
 }
+
+int
+sol_ibv_query_device(struct ibv_device *device, struct ibv_device_attr *attr)
+{
+	struct ibv_query_device cmd;
+	struct ibv_context	context;
+	char			uverbs_devpath[MAXPATHLEN];
+	int			uverbs_fd, ret;
+	uint64_t		raw_fw_ver;
+	unsigned		major, minor, sub_minor;
+
+	context.device = device;
+
+	if (!device || !attr)
+		return (-1);
+
+	snprintf(uverbs_devpath, MAXPATHLEN, "%s/%s", IB_OFS_DEVPATH_PREFIX,
+	    device->dev_name);
+
+	if ((context.cmd_fd = open(uverbs_devpath, O_RDWR)) <  0)
+		return (-1);
+
+	ret = ibv_cmd_query_device(&context, attr, &raw_fw_ver, &cmd,
+	    sizeof (cmd));
+
+	if (ret)
+		return (ret);
+
+	major	  = (raw_fw_ver >> 32) & 0xffff;
+	minor	  = (raw_fw_ver >> 16) & 0xffff;
+	sub_minor = raw_fw_ver & 0xffff;
+
+	snprintf(attr->fw_ver, sizeof (attr->fw_ver),
+	    "%d.%d.%03d", major, minor, sub_minor);
+
+	close(context.cmd_fd);
+	return (0);
+}
+
+int
+sol_ibv_query_port(struct ibv_context *context, uint8_t port,
+    struct ibv_port_attr *attr)
+{
+	struct	ibv_query_port cmd;
+
+	return (ibv_cmd_query_port(context, port, attr, &cmd, sizeof (cmd)));
+}
 #endif
--- a/components/open-fabrics/librdmacm/patches/base.patch	Thu Jun 26 11:51:15 2014 -0700
+++ b/components/open-fabrics/librdmacm/patches/base.patch	Tue Jul 01 14:49:32 2014 -0700
@@ -1,6 +1,6 @@
 diff -r -u /tmp/librdmacm-1.0.14.1/configure librdmacm-1.0.14.1/configure
 --- /tmp/librdmacm-1.0.14.1/configure	Tue Feb 15 17:12:14 2011
-+++ librdmacm-1.0.14.1/configure	Thu Feb 24 08:39:24 2011
++++ librdmacm-1.0.14.1/configure	Wed Apr 30 10:52:05 2014
 @@ -7625,6 +7625,7 @@
   	esac ;;
        esac
@@ -11,7 +11,7 @@
      sunos4*)
 diff -r -u /tmp/librdmacm-1.0.14.1/Makefile.in librdmacm-1.0.14.1/Makefile.in
 --- /tmp/librdmacm-1.0.14.1/Makefile.in	Tue Feb 15 17:12:13 2011
-+++ librdmacm-1.0.14.1/Makefile.in	Mon Mar 28 16:49:13 2011
++++ librdmacm-1.0.14.1/Makefile.in	Wed Apr 30 10:52:05 2014
 @@ -69,7 +69,7 @@
  	"$(DESTDIR)$(man1dir)" "$(DESTDIR)$(man3dir)" \
  	"$(DESTDIR)$(man7dir)" "$(DESTDIR)$(infinibandincludedir)" \
@@ -32,7 +32,7 @@
  examples_mckey_OBJECTS = $(am_examples_mckey_OBJECTS)
 diff -r -u /tmp/librdmacm-1.0.14.1/src/cma.h librdmacm-1.0.14.1/src/cma.h
 --- /tmp/librdmacm-1.0.14.1/src/cma.h	Mon Oct  4 17:00:18 2010
-+++ librdmacm-1.0.14.1/src/cma.h	Fri Feb 11 04:08:57 2011
++++ librdmacm-1.0.14.1/src/cma.h	Tue May  6 11:32:19 2014
 @@ -40,8 +40,10 @@
  
  #include <stdlib.h>
@@ -59,9 +59,18 @@
  #define min(a, b) (a < b ? a : b)
  
  static inline int ERR(int err)
[email protected]@ -74,7 +68,7 @@
+ 	return -1;
+ }
+ 
+-int ucma_init();
++int ucma_init(void);
+ extern int af_ib_support;
+ 
+ #define RAI_ROUTEONLY		0x01000000
 diff -r -u /tmp/librdmacm-1.0.14.1/src/cma.c librdmacm-1.0.14.1/src/cma.c
 --- /tmp/librdmacm-1.0.14.1/src/cma.c	Fri Dec 10 12:05:34 2010
-+++ librdmacm-1.0.14.1/src/cma.c	Mon Mar 28 16:44:55 2011
++++ librdmacm-1.0.14.1/src/cma.c	Tue May  6 11:30:54 2014
 @@ -46,12 +46,18 @@
  #include <poll.h>
  #include <unistd.h>
@@ -81,7 +90,217 @@
  #include <infiniband/driver.h>
  #include <infiniband/marshall.h>
  #include <rdma/rdma_cma.h>
[email protected]@ -354,9 +360,18 @@
[email protected]@ -100,6 +106,8 @@
+ 	struct ibv_pd	   *pd;
+ 	uint64_t	    guid;
+ 	int		    port_cnt;
++	int		    refcnt;
++	int		    max_qpsize;
+ 	uint8_t		    max_initiator_depth;
+ 	uint8_t		    max_responder_resources;
+ };
[email protected]@ -143,6 +151,7 @@
+ 
+ static struct cma_device *cma_dev_array;
+ static int cma_dev_cnt;
++static int cma_init_cnt;
+ static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER;
+ static int abi_ver = RDMA_USER_CM_MAX_ABI_VERSION;
+ int af_ib_support;
[email protected]@ -156,12 +165,16 @@
+ 
+ 	if (cma_dev_cnt) {
+ 		while (cma_dev_cnt--) {
+-			ibv_dealloc_pd(cma_dev_array[cma_dev_cnt].pd);
++			if (!cma_dev_array[cma_dev_cnt].verbs)
++				continue;
++
++			if (cma_dev_array[cma_dev_cnt].refcnt)
++				ibv_dealloc_pd(cma_dev_array[cma_dev_cnt].pd);
+ 			ibv_close_device(cma_dev_array[cma_dev_cnt].verbs);
++			cma_init_cnt--;
+ 		}
+ 
+ 		free(cma_dev_array);
+-		cma_dev_cnt = 0;
+ 	}
+ }
+ 
[email protected]@ -228,9 +241,7 @@
+ int ucma_init(void)
+ {
+ 	struct ibv_device **dev_list = NULL;
+-	struct cma_device *cma_dev;
+-	struct ibv_device_attr attr;
+-	int i, ret, dev_cnt, ib;
++	int i, ret, dev_cnt;
+ 
+ 	/* Quick check without lock to see if we're already initialized */
+ 	if (cma_dev_cnt)
[email protected]@ -253,46 +264,20 @@
+ 		goto err1;
+ 	}
+ 
+-	cma_dev_array = malloc(sizeof *cma_dev * dev_cnt);
++	if (!dev_cnt) {
++		printf("CMA: no RDMA devices found\n");
++		ret = ERR(ENODEV);
++		goto err2;
++	}
++	cma_dev_array = calloc(dev_cnt, sizeof *cma_dev_array);
+ 	if (!cma_dev_array) {
+ 		ret = ERR(ENOMEM);
+ 		goto err2;
+ 	}
+ 
+-	for (i = 0, ib = 0; dev_list[i];) {
+-		cma_dev = &cma_dev_array[i];
++	for (i = 0; dev_list[i]; i++)
++		cma_dev_array[i].guid = ibv_get_device_guid(dev_list[i]);
+ 
+-		cma_dev->guid = ibv_get_device_guid(dev_list[i]);
+-		cma_dev->verbs = ibv_open_device(dev_list[i]);
+-		if (!cma_dev->verbs) {
+-			printf("CMA: unable to open RDMA device\n");
+-			ret = ERR(ENODEV);
+-			goto err3;
+-		}
+-
+-		cma_dev->pd = ibv_alloc_pd(cma_dev->verbs);
+-		if (!cma_dev->pd) {
+-			ibv_close_device(cma_dev->verbs);
+-			ret = ERR(ENOMEM);
+-			goto err3;
+-		}
+-
+-		i++;
+-		ret = ibv_query_device(cma_dev->verbs, &attr);
+-		if (ret) {
+-			printf("CMA: unable to query RDMA device\n");
+-			ret = ERR(ret);
+-			goto err3;
+-		}
+-
+-		cma_dev->port_cnt = attr.phys_port_cnt;
+-		cma_dev->max_initiator_depth = (uint8_t) attr.max_qp_init_rd_atom;
+-		cma_dev->max_responder_resources = (uint8_t) attr.max_qp_rd_atom;
+-		ib += (cma_dev->verbs->device->transport_type == IBV_TRANSPORT_IB);
+-	}
+-
+-	if (ib)
+-		ucma_ib_init();
+ 	cma_dev_cnt = dev_cnt;
+ 	ucma_set_af_ib_support();
+ 	pthread_mutex_unlock(&mut);
[email protected]@ -299,12 +284,6 @@
+ 	ibv_free_device_list(dev_list);
+ 	return 0;
+ 
+-err3:
+-	while (i--) {
+-		ibv_dealloc_pd(cma_dev_array[i].pd);
+-		ibv_close_device(cma_dev_array[i].verbs);
+-	}
+-	free(cma_dev_array);
+ err2:
+ 	ibv_free_device_list(dev_list);
+ err1:
[email protected]@ -312,12 +291,93 @@
+ 	return ret;
+ }
+ 
++static struct ibv_context *ucma_open_device(uint64_t guid)
++{
++	struct ibv_device **dev_list;
++	struct ibv_context *verbs = NULL;
++	int i;
++
++	dev_list = ibv_get_device_list(NULL);
++	if (!dev_list) {
++		fprintf(stderr, PFX "Fatal: unable to get RDMA device list\n");
++		return NULL;
++	}
++
++	for (i = 0; dev_list[i]; i++) {
++		if (ibv_get_device_guid(dev_list[i]) == guid) {
++			verbs = ibv_open_device(dev_list[i]);
++			break;
++		}
++	}
++
++	if (!verbs)
++		fprintf(stderr, PFX "Fatal: unable to open RDMA device\n");
++
++	ibv_free_device_list(dev_list);
++	return verbs;
++}
++
++static int ucma_init_device(struct cma_device *cma_dev)
++{
++	struct ibv_device_attr attr;
++	int ret;
++
++	if (cma_dev->verbs)
++		return 0;
++
++	cma_dev->verbs = ucma_open_device(cma_dev->guid);
++	if (!cma_dev->verbs)
++		return ERR(ENODEV);
++
++	ret = ibv_query_device(cma_dev->verbs, &attr);
++	if (ret) {
++		fprintf(stderr, PFX "Fatal: unable to query RDMA device\n");
++		ret = ERR(ret);
++		goto err;
++	}
++
++	cma_dev->port_cnt = attr.phys_port_cnt;
++	cma_dev->max_qpsize = attr.max_qp_wr;
++	cma_dev->max_initiator_depth = (uint8_t) attr.max_qp_init_rd_atom;
++	cma_dev->max_responder_resources = (uint8_t) attr.max_qp_rd_atom;
++	cma_init_cnt++;
++	return 0;
++
++err:
++	ibv_close_device(cma_dev->verbs);
++	cma_dev->verbs = NULL;
++	return ret;
++}
++
++static int ucma_init_all(void)
++{
++	int i, ret = 0;
++
++	if (!cma_dev_cnt) {
++		ret = ucma_init();
++		if (ret)
++			return ret;
++	}
++
++	if (cma_init_cnt == cma_dev_cnt)
++		return 0;
++
++	pthread_mutex_lock(&mut);
++	for (i = 0; i < cma_dev_cnt; i++) {
++		ret = ucma_init_device(&cma_dev_array[i]);
++		if (ret)
++			break;
++	}
++	pthread_mutex_unlock(&mut);
++	return ret;
++}
++
+ struct ibv_context **rdma_get_devices(int *num_devices)
+ {
+ 	struct ibv_context **devs = NULL;
+ 	int i;
+ 
+-	if (ucma_init())
++	if (ucma_init_all())
+ 		goto out;
+ 
+ 	devs = malloc(sizeof *devs * (cma_dev_cnt + 1));
[email protected]@ -354,9 +414,18 @@
  	if (!channel)
  		return NULL;
  
@@ -100,7 +319,48 @@
  		goto err;
  	}
  	return channel;
[email protected]@ -1186,6 +1201,10 @@
[email protected]@ -374,18 +443,33 @@
+ static int ucma_get_device(struct cma_id_private *id_priv, uint64_t guid)
+ {
+ 	struct cma_device *cma_dev;
+-	int i;
++	int i, ret;
+ 
+ 	for (i = 0; i < cma_dev_cnt; i++) {
+ 		cma_dev = &cma_dev_array[i];
+-		if (cma_dev->guid == guid) {
+-			id_priv->cma_dev = cma_dev;
+-			id_priv->id.verbs = cma_dev->verbs;
+-			return 0;
+-		}
++		if (cma_dev->guid == guid)
++			goto match;
+ 	}
+-
++ 
+ 	return ERR(ENODEV);
++match:
++	pthread_mutex_lock(&mut);
++	if ((ret = ucma_init_device(cma_dev)))
++		goto out;
++ 
++	if (!cma_dev->refcnt++) {
++		cma_dev->pd = ibv_alloc_pd(cma_dev->verbs);
++		if (!cma_dev->pd) {
++			cma_dev->refcnt--;
++			ret = ERR(ENOMEM);
++			goto out;
++		}
++	}
++	id_priv->cma_dev = cma_dev;
++	id_priv->id.verbs = cma_dev->verbs;
++out:
++	pthread_mutex_unlock(&mut);
++	return ret;
+ }
+ 
+ static void ucma_free_id(struct cma_id_private *id_priv)
[email protected]@ -1186,6 +1270,10 @@
  	if (ret)
  		return ret;
  
@@ -111,7 +371,7 @@
  	qp = ibv_create_qp(pd, qp_init_attr);
  	if (!qp) {
  		ret = ERR(ENOMEM);
[email protected]@ -1787,6 +1806,9 @@
[email protected]@ -1787,6 +1875,9 @@
  
  	CMA_CREATE_MSG_CMD(msg, cmd, UCMA_CMD_ACCEPT, size);
  	cmd->id = id_priv->handle;
@@ -121,7 +381,7 @@
  
  	ret = write(id_priv->id.channel->fd, msg, size);
  	if (ret != size) {
[email protected]@ -2051,8 +2073,20 @@
[email protected]@ -2051,8 +2142,20 @@
  
  	CMA_CREATE_MSG_CMD_RESP(msg, cmd, resp, UCMA_CMD_MIGRATE_ID, size);
  	cmd->id = id_priv->handle;
@@ -144,7 +404,7 @@
  		return (ret >= 0) ? ERR(ENODATA) : -1;
 diff -r -u /tmp/librdmacm-1.0.14.1/man/rdma_create_id.3 librdmacm-1.0.14.1/man/rdma_create_id.3
 --- /tmp/librdmacm-1.0.14.1/man/rdma_create_id.3	Mon Oct  4 17:00:18 2010
-+++ librdmacm-1.0.14.1/man/rdma_create_id.3	Mon Mar 28 03:11:48 2011
++++ librdmacm-1.0.14.1/man/rdma_create_id.3	Wed Apr 30 10:52:04 2014
 @@ -31,9 +31,7 @@
  explicitly binding to a specified RDMA device before communication
  can occur, and most operations are asynchronous in nature.  Asynchronous
@@ -158,7 +418,7 @@
  .P
 diff -r -u /tmp/librdmacm-1.0.14.1/man/rdma_create_qp.3 librdmacm-1.0.14.1/man/rdma_create_qp.3
 --- /tmp/librdmacm-1.0.14.1/man/rdma_create_qp.3	Fri Dec 10 12:05:34 2010
-+++ librdmacm-1.0.14.1/man/rdma_create_qp.3	Mon Mar 28 03:11:48 2011
++++ librdmacm-1.0.14.1/man/rdma_create_qp.3	Wed Apr 30 11:48:07 2014
 @@ -33,8 +33,7 @@
  the rdma_cm_id will be created using a default protection domain.  One
  default protection domain is allocated per RDMA device.
@@ -171,7 +431,7 @@
  channels.  Completion channels and CQ data created by the rdma_cm are
 diff -r -u /tmp/librdmacm-1.0.14.1/man/rdma_cm.7 librdmacm-1.0.14.1/man/rdma_cm.7
 --- /tmp/librdmacm-1.0.14.1/man/rdma_cm.7	Mon Oct  4 17:00:18 2010
-+++ librdmacm-1.0.14.1/man/rdma_cm.7	Mon Mar 28 03:11:47 2011
++++ librdmacm-1.0.14.1/man/rdma_cm.7	Wed Apr 30 11:48:07 2014
 @@ -19,7 +19,7 @@
  API defined by the libibverbs library.  The libibverbs library provides the
  underlying interfaces needed to send and receive data.
@@ -238,7 +498,7 @@
  rdma_resolve_route(3),
 diff -r -u /tmp/librdmacm-1.0.14.1/include/infiniband/ib.h librdmacm-1.0.14.1/include/infiniband/ib.h
 --- /tmp/librdmacm-1.0.14.1/include/infiniband/ib.h	Mon Oct  4 17:00:18 2010
-+++ librdmacm-1.0.14.1/include/infiniband/ib.h	Fri Feb 11 04:08:56 2011
++++ librdmacm-1.0.14.1/include/infiniband/ib.h	Wed Apr 30 10:52:05 2014
 @@ -33,7 +33,11 @@
  #if !defined(_RDMA_IB_H)
  #define _RDMA_IB_H
@@ -253,7 +513,7 @@
  #ifndef AF_IB
 diff -r -u /tmp/librdmacm-1.0.14.1/include/rdma/rdma_cma_abi.h librdmacm-1.0.14.1/include/rdma/rdma_cma_abi.h
 --- /tmp/librdmacm-1.0.14.1/include/rdma/rdma_cma_abi.h	Mon Oct  4 17:00:18 2010
-+++ librdmacm-1.0.14.1/include/rdma/rdma_cma_abi.h	Fri Feb 11 04:08:48 2011
++++ librdmacm-1.0.14.1/include/rdma/rdma_cma_abi.h	Wed Apr 30 10:52:05 2014
 @@ -104,6 +104,9 @@
  	__u64 response;
  	struct sockaddr_in6 addr;
@@ -276,7 +536,7 @@
  struct ucma_abi_join_mcast {
 diff -r -u /tmp/librdmacm-1.0.14.1/examples/udaddy.c librdmacm-1.0.14.1/examples/udaddy.c
 --- /tmp/librdmacm-1.0.14.1/examples/udaddy.c	Mon Oct  4 17:00:18 2010
-+++ librdmacm-1.0.14.1/examples/udaddy.c	Fri Feb 11 04:08:48 2011
++++ librdmacm-1.0.14.1/examples/udaddy.c	Wed Apr 30 11:48:05 2014
 @@ -40,7 +40,9 @@
  #include <netinet/in.h>
  #include <sys/socket.h>
@@ -289,7 +549,7 @@
  #include <rdma/rdma_cma.h>
 diff -r -u /tmp/librdmacm-1.0.14.1/examples/mckey.c librdmacm-1.0.14.1/examples/mckey.c
 --- /tmp/librdmacm-1.0.14.1/examples/mckey.c	Mon Oct  4 17:00:18 2010
-+++ librdmacm-1.0.14.1/examples/mckey.c	Fri Feb 11 04:08:48 2011
++++ librdmacm-1.0.14.1/examples/mckey.c	Wed Apr 30 11:48:05 2014
 @@ -41,7 +41,9 @@
  #include <arpa/inet.h>
  #include <sys/socket.h>
@@ -348,7 +608,7 @@
  
 diff -r -u /tmp/librdmacm-1.0.14.1/examples/cmatose.c librdmacm-1.0.14.1/examples/cmatose.c
 --- /tmp/librdmacm-1.0.14.1/examples/cmatose.c	Mon Oct  4 17:00:18 2010
-+++ librdmacm-1.0.14.1/examples/cmatose.c	Fri Feb 11 04:08:48 2011
++++ librdmacm-1.0.14.1/examples/cmatose.c	Wed Apr 30 11:48:03 2014
 @@ -40,7 +40,9 @@
  #include <netinet/in.h>
  #include <sys/socket.h>
@@ -361,7 +621,7 @@
  #include <rdma/rdma_cma.h>
 diff -r -u /tmp/librdmacm-1.0.14.1/examples/rping.c librdmacm-1.0.14.1/examples/rping.c
 --- /tmp/librdmacm-1.0.14.1/examples/rping.c	Tue Feb 15 17:10:48 2011
-+++ librdmacm-1.0.14.1/examples/rping.c	Wed May 25 14:46:30 2011
++++ librdmacm-1.0.14.1/examples/rping.c	Wed Apr 30 11:48:04 2014
 @@ -40,11 +40,17 @@
  #include <netinet/in.h>
  #include <sys/socket.h>
--- a/components/open-fabrics/rds-tools/Makefile	Thu Jun 26 11:51:15 2014 -0700
+++ b/components/open-fabrics/rds-tools/Makefile	Tue Jul 01 14:49:32 2014 -0700
@@ -18,7 +18,7 @@
 #
 # CDDL HEADER END
 #
-# Copyright (c) 2011, 2012, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2011, 2014, Oracle and/or its affiliates. All rights reserved.
 #
 include ../../../make-rules/shared-macros.mk
 
@@ -40,6 +40,8 @@
 LIBS +=	-lsocket -lnsl -llgrp -lkstat -libverbs
 
 COMPONENT_PREP_ACTION = \
+	mkdir -p $(PROTO_DIR)/usr/include/sys ; \
+	cp rds.h $(PROTO_DIR)/usr/include/sys/ ; \
 	cp rds-vendor.c $(@D)/
 
 CLEANUP_DIRS =	$(PROTOUSRBINDIR)/net
--- a/components/open-fabrics/rds-tools/patches/base.patch	Thu Jun 26 11:51:15 2014 -0700
+++ b/components/open-fabrics/rds-tools/patches/base.patch	Tue Jul 01 14:49:32 2014 -0700
@@ -15,10 +15,11 @@
  #include <sys/stat.h>
  #include <sys/poll.h>
  #include <ctype.h>
[email protected]@ -22,10 +28,16 @@
[email protected]@ -22,12 +28,27 @@
  #include <fcntl.h>
  #include <sched.h>
  #include <getopt.h>
++#include <sys/ioctl.h>
 +#if !(defined(__SVR4) && defined(__sun))
  #include <byteswap.h>
  #include "rds.h"
@@ -31,9 +32,99 @@
 +#include <infiniband/ofa_solaris.h>
 +#endif
  
++#if defined(__SVR4) && defined(__sun)
  /*
++ * This definition is forward looking and is not yet present
++ * in Solaris rds.h file
++ */
++#define RDS_CMSG_RDMA_SEND_STATUS RDS_CMSG_RDMA_STATUS
++#endif
++
++/*
   *
[email protected]@ -102,6 +114,10 @@
+  * TODO
+  *  - checksum the data some day.
[email protected]@ -45,8 +66,9 @@
+         M_RDMA_READ_ONLY,
+         M_RDMA_WRITE_ONLY
+ };
++#define VERSION_MAX_LEN 16 
+ 
+-struct options {
++struct options_2_0_6 {
+ 	uint32_t	req_depth;
+ 	uint32_t	req_size;
+ 	uint32_t	ack_size;
[email protected]@ -76,9 +98,68 @@
+ 	uint32_t	connect_retries;
+ } __attribute__((packed));
+ 
++struct options {
++	char		version[VERSION_MAX_LEN];
++        uint32_t        req_depth;
++        uint32_t        req_size;
++        uint32_t        ack_size;
++        uint32_t        rdma_size;
++        uint32_t        send_addr;
++        uint32_t        receive_addr;
++        uint16_t        starting_port;
++        uint16_t        nr_tasks;
++        uint32_t        run_time;
++        uint8_t         summary_only;
++        uint8_t         rtprio;
++        uint8_t         tracing;
++        uint8_t         verify;
++        uint8_t         show_params;
++        uint8_t         show_perfdata;
++        uint8_t         use_cong_monitor;
++        uint8_t         rdma_use_once;
++        uint8_t         rdma_use_get_mr;
++        uint8_t         rdma_use_fence;
++        uint8_t         rdma_cache_mrs;
++        uint8_t         rdma_key_o_meter;
++        uint8_t         suppress_warnings;
++        uint8_t         simplex;
++        uint8_t         rw_mode;
++        uint32_t        rdma_vector;
++        uint32_t        rdma_alignment;
++        uint32_t        connect_retries;
++        uint8_t         tos;
++        uint8_t         async;
++} __attribute__((packed));
++
++#define MAX_BUCKETS 16
++
+ static struct options	opt;
+ static int		control_fd;
++static uint64_t         rtt_threshold;
++static int              show_histogram;
++static int		reset_connection;
++static char		peer_version[VERSION_MAX_LEN];
+ 
++static int get_bucket(uint64_t rtt_time)
++{
++  int i;
++  uint64_t l_rtt_time = rtt_time;
++
++  if (!l_rtt_time)
++    i = 0;
++  else
++  {
++    i = -1;
++    while (l_rtt_time)
++    {
++      i++;
++      l_rtt_time = (l_rtt_time >> 1);
++    }
++  }
++
++  return i;
++}
++
+ struct counter {
+ 	uint64_t	nr;
+ 	uint64_t	sum;
[email protected]@ -102,6 +183,10 @@
  
  #define NR_STATS S__LAST
  
@@ -44,7 +135,7 @@
  /*
   * Parents share a mapped array of these with their children.  Each child
   * gets one.  It's used to communicate between the child and the parent
[email protected]@ -110,6 +126,7 @@
[email protected]@ -110,9 +195,11 @@
  struct child_control {
  	pid_t pid;
  	int ready;
@@ -52,7 +143,51 @@
  	struct timeval start;
  	struct counter cur[NR_STATS];
  	struct counter last[NR_STATS];
[email protected]@ -254,7 +271,20 @@
++        uint64_t       latency_histogram[MAX_BUCKETS];
+ } __attribute__((aligned (256))); /* arbitrary */
+ 
+ struct soak_control {
[email protected]@ -132,6 +219,7 @@
+  */
+ #define OP_REQ		1
+ #define OP_ACK		2
++#define OP_DUMP		3
+ 
+ #define RDMA_OP_READ	1
+ #define RDMA_OP_WRITE	2
[email protected]@ -148,7 +236,7 @@
+ 	uint16_t	from_port;
+ 	uint16_t	to_port;
+ 	uint16_t	index;
+-	uint8_t		op;
++	uint8_t         op;
+ 
+ 	/* RDMA related.
+ 	 * rdma_op must be the first field, because we
[email protected]@ -162,12 +250,21 @@
+ 	uint32_t	rdma_size;
+ 	uint32_t        rdma_vector;
+ 
+-	uint8_t		data[0];
++	/* Async send related. */
++	uint8_t         retry;
++	uint8_t         rdma_remote_err;
++	uint8_t         pending;
++
++	uint8_t         data[0];
+ } __attribute__((packed));
+ 
+ #define MIN_MSG_BYTES		(sizeof(struct header))
+ #define BASIC_HEADER_SIZE	(size_t)(&((struct header *) 0)->rdma_op)
+ 
++#define print_outlier(...) do {         \
++        fprintf(stderr, __VA_ARGS__);   \
++} while (0)
++
+ #define die(fmt...) do {		\
+ 	fprintf(stderr, fmt);		\
+ 	exit(1);			\
[email protected]@ -254,7 +351,20 @@
  
  	die("invalid host name or dotted quad '%s'\n", ptr);
  }
@@ -73,7 +208,15 @@
  static void usage(void)
  {
          fprintf(stderr, "rds-stress version %s\n", RDS_VERSION);
[email protected]@ -281,6 +311,9 @@
[email protected]@ -273,6 +383,7 @@
+ 	" -d [depth, 1]     request pipeline depth, nr outstanding\n"
+ 	" -t [nr, 1]        number of child tasks\n"
+ 	" -T [seconds, 0]   runtime of test, 0 means infinite\n"
++	" -Q [tos, 0]       Type of Service\n"
+ 	" -D [bytes]        RDMA: size\n"
+ 	" -I [iovecs, 1]    RDMA: number of user buffers to target (max 512)\n"
+         " -M [nr, 0]        RDMA: mode (0=readwrite,1=readonly,2=writeonly)\n"
[email protected]@ -281,6 +392,9 @@
  	" -c                measure cpu use with per-cpu soak processes\n"
  	" -V                trace execution\n"
  	" -z                print a summary at end of test only\n"
@@ -83,7 +226,7 @@
  	"\n"
  	"Example:\n"
  	"  recv$ rds-stress\n"
[email protected]@ -310,7 +343,7 @@
[email protected]@ -310,7 +424,7 @@
  static void check_parent(pid_t pid)
  {
  	if (pid != getppid())
@@ -92,7 +235,7 @@
  }
  
  /*
[email protected]@ -334,6 +367,7 @@
[email protected]@ -334,6 +448,7 @@
  		msg_pattern[i] = k;
  }
  
@@ -100,7 +243,7 @@
  #if __BYTE_ORDER == __LITTLE_ENDIAN
  #define htonll(x)	bswap_64(x)
  #define ntohll(x)	bswap_64(x)
[email protected]@ -341,6 +375,7 @@
[email protected]@ -341,6 +456,7 @@
  #define htonll(x)	(x)
  #define ntohll(x)	(x)
  #endif
@@ -108,7 +251,71 @@
  
  static void encode_hdr(struct header *dst, const struct header *hdr)
  {
[email protected]@ -584,7 +619,11 @@
[email protected]@ -361,6 +477,7 @@
+ 	dst->rdma_key = htonll(hdr->rdma_key);
+ 	dst->rdma_size = htonl(hdr->rdma_size);
+ 	dst->rdma_vector = htonl(hdr->rdma_vector);
++	dst->retry = hdr->retry;
+ }
+ 
+ static void decode_hdr(struct header *dst, const struct header *hdr)
[email protected]@ -382,6 +499,7 @@
+ 	dst->rdma_key = ntohll(hdr->rdma_key);
+ 	dst->rdma_size = ntohl(hdr->rdma_size);
+ 	dst->rdma_vector = ntohl(hdr->rdma_vector);
++	dst->retry = hdr->retry;
+ }
+ 
+ static void fill_hdr(void *message, uint32_t bytes, struct header *hdr)
[email protected]@ -412,11 +530,19 @@
+  * Compare incoming message header with expected header. All header fields
+  * are in host byte order except for address and port fields.
+  */
+-static int check_hdr(void *message, uint32_t bytes, const struct header *hdr)
++static int check_hdr(void *message, uint32_t bytes, struct header *hdr, struct options *opts)
+ {
+ 	struct header msghdr;
++	uint32_t	inc_seq;
++	uint32_t	my_seq;
+ 
+ 	decode_hdr(&msghdr, message);
++	inc_seq = msghdr.seq;
++	my_seq = hdr->seq;
++
++	if (msghdr.retry && (inc_seq < my_seq))
++		return -1;
++
+ 	if (memcmp(&msghdr, hdr, BASIC_HEADER_SIZE)) {
+ #define bleh(var, disp)					\
+ 		disp(hdr->var),				\
[email protected]@ -428,7 +554,7 @@
+ 		 * with stdout() and we don't get things stomping on each
+ 		 * other
+ 		 */
+-		printf( "An incoming message had a header which\n"
++		printf( "An incoming message had a %s header which\n"
+ 			"didn't contain the fields we expected:\n"
+ 			"    member        expected eq             got\n"
+ 			"       seq %15u %s %15u\n"
[email protected]@ -438,6 +564,7 @@
+ 			"   to_port %15u %s %15u\n"
+ 			"     index %15u %s %15u\n"
+ 			"        op %15u %s %15u\n",
++			(msghdr.retry) ? "RETRY" : "",
+ 			bleh(seq, /**/),
+ 			bleh(from_addr, inet_ntoa_32),
+ 			bleh(from_port, ntohs),
[email protected]@ -569,6 +696,9 @@
+ 
+ 	fcntl(fd, F_SETFL, O_NONBLOCK);
+ 
++	if (opts->tos && ioctl(fd, SIOCRDSSETTOS, &opts->tos)) 
++		die_errno("ERROR: failed to set TOS\n");
++
+ 	return fd;
+ }
+ 
[email protected]@ -584,7 +714,11 @@
  	if (opts->receive_addr == 0)
  		return 1;
  
@@ -120,7 +327,16 @@
  	sin.sin_port = htons(opts->starting_port);
  	sin.sin_addr.s_addr = htonl(opts->receive_addr);
  
[email protected]@ -677,7 +716,11 @@
[email protected]@ -639,7 +773,7 @@
+ 	mr_args.flags = RDS_FREE_MR_ARGS_INVALIDATE;
+ #endif
+ 	if (setsockopt(fd, sol, RDS_FREE_MR, &mr_args, sizeof(mr_args)))
+-		die_errno("setsockopt(RDS_FREE_MR) failed");
++		return;
+ 	mrs_allocated--;
+ }
+ 
[email protected]@ -677,7 +811,11 @@
  	size = sizeof(struct rdma_key_o_meter)
  			+ 2 * nr_tasks * sizeof(*kt)
  			+ 2 * RDMA_MAX_TRACKED_KEYS * sizeof(*ks);
@@ -132,7 +348,7 @@
  	if (base == MAP_FAILED)
  		die_errno("alloc_rdma_buffers: mmap failed");
  
[email protected]@ -828,7 +871,7 @@
[email protected]@ -828,13 +966,20 @@
  	}
  
  	if (!failed)
@@ -141,7 +357,35 @@
  			(unsigned long long) pattern, addr);
  }
  
[email protected]@ -865,7 +908,11 @@
++struct retry_entry {
++	uint32_t	retries;
++	uint32_t	seq;
++	int		status;
++};
++
+ struct task {
+ 	unsigned int		nr;
+ 	unsigned int		pending;
++	int			trace;
+ 	unsigned int		unacked;
+ 	struct sockaddr_in	src_addr;	/* same for all tasks */
+ 	struct sockaddr_in	dst_addr;
[email protected]@ -846,7 +991,14 @@
+ 	uint16_t		recv_index;
+ 	struct timeval *	send_time;
+ 	struct header *		ack_header;
++	struct header *         ack2_header;
++	struct header *         req_header;
++	uint64_t *		retry_token;
++	uint32_t		retries;
++	uint32_t            	last_retry_seq;
++	uint32_t		retry_index;
+ 
++
+ 	/* RDMA related stuff */
+ 	uint64_t **		local_buf;
+ 	uint64_t **		rdma_buf;
[email protected]@ -865,7 +1017,11 @@
  	/* We use mmap here rather than malloc, because it is always
  	 * page aligned. */
  	len = 2 * opts->nr_tasks * opts->req_depth * (opts->rdma_vector * opts->rdma_size) + sys_page_size;
@@ -153,7 +397,7 @@
  	if (base == MAP_FAILED)
  		die_errno("alloc_rdma_buffers: mmap failed");
  	memset(base, 0x2f, len);
[email protected]@ -915,17 +962,16 @@
[email protected]@ -915,17 +1071,16 @@
  	if (RDMA_OP_READ == hdr->rdma_op) {
  		if (opt.verify)
  			rds_fill_buffer(rdma_addr, rdma_size, hdr->rdma_pattern);
@@ -177,7 +421,7 @@
  	}
  }
  
[email protected]@ -947,7 +993,7 @@
[email protected]@ -947,7 +1102,7 @@
  		die("Unexpected RDMA op %u in request\n", in_hdr->rdma_op);
  
  
@@ -186,7 +430,104 @@
  		in_hdr->rdma_op == RDMA_OP_WRITE? "write to" : "read from",
  		rdma_size,
  		(unsigned long long) in_hdr->rdma_addr,
[email protected]@ -1007,6 +1053,9 @@
[email protected]@ -966,21 +1121,33 @@
+ 	hdr->rdma_vector = in_hdr->rdma_vector;
+ }
+ 
+-static inline unsigned int rdma_user_token(struct task *t, unsigned int qindex)
++static inline uint64_t rdma_user_token(struct task *t, unsigned int qindex,  unsigned int type, uint32_t seq)
+ {
+-	return t->nr * opt.req_depth + qindex;
++	uint64_t tmp = seq;
++	return (tmp << 32) | ((t->nr * opt.req_depth + qindex) << 2 | type);
+ }
+ 
+-static void rdma_mark_completed(struct task *tasks, unsigned int token, int status)
++static void rdma_mark_completed(struct task *tasks, uint64_t token, int status, struct options *opts)
+ {
+ 	struct task *t;
+ 	unsigned int i;
++	struct header *hdr = NULL;
++	uint32_t seq = token >> 32;
++	unsigned int type = token & 0x03;
++	unsigned int index = (token & 0xFFFFFFFF) >> 2;
+ 
+-	trace("RDS rdma completion for token %x\n", token);
++	trace("RDS rdma completion for token 0x%lx\n", token);
+ 
+-	t = &tasks[token / opt.req_depth];
+-	i = token % opt.req_depth;
++	t = &tasks[index / opt.req_depth];
++	i = index % opt.req_depth;
+ 
++	if (opts->async) {
++		if (type == OP_REQ)
++			hdr = &t->req_header[i];
++		else
++			hdr = &t->ack2_header[i];
++	}
++
+ 	if (status) {
+ 		const char *errmsg;
+ 
[email protected]@ -987,20 +1154,50 @@
+ 		switch (status) {
+ 		case RDS_RDMA_REMOTE_ERROR:
+ 			errmsg = "remote error"; break;
+-		case RDS_RDMA_CANCELED:
+-			errmsg = "operation was cancelled"; break;
+-		case RDS_RDMA_DROPPED:
++		case RDS_RDMA_SEND_DROPPED:
+ 			errmsg = "operation was dropped"; break;
+-		case RDS_RDMA_OTHER_ERROR:
++		case RDS_RDMA_SEND_CANCELED:
++			errmsg = "operation was cancelled"; break;
++		case RDS_RDMA_SEND_OTHER_ERROR:
+ 			errmsg = "other error"; break;
+ 		default:
+ 			errmsg = "unknown error"; break;
+ 		}
+ 
+-		printf("%s:%u: RDMA op %u failed: %s\n",
++		trace("%s:%u: %s failed: %s\n",
+ 				inet_ntoa(t->dst_addr.sin_addr),
+ 				ntohs(t->dst_addr.sin_port),
+-				i, errmsg);
++				type ? "SEND" : "RDMA",
++				errmsg);
++
++		if (hdr &&
++			(status == RDS_RDMA_SEND_DROPPED ||
++			 status == RDS_RDMA_REMOTE_ERROR)) {
++
++			if (hdr->seq == seq) {
++				hdr->retry = 1;
++				if (hdr->seq > t->last_retry_seq) {
++					if (status == RDS_RDMA_REMOTE_ERROR)
++						hdr->rdma_remote_err = 1;
++					t->retry_token[t->retry_index] = token;
++					t->retry_index = (t->retry_index + 1) %
++						(2 * opts->req_depth);
++					t->retries += 1;
++					t->last_retry_seq = hdr->seq;
++					if (t->retries > 2 * opts->req_depth)
++						die("Exceeded MAX retry entries..\n");
++				}
++			} else
++				die("SEQ Out-Of-Sync: %u/%u\n", hdr->seq, seq);
++		} else if (hdr) {
++			hdr->pending = 0;
++			hdr->retry = 0;
++			hdr->rdma_remote_err = 0;
++		}
++	} else if (hdr) {
++		hdr->pending = 0;
++		hdr->retry = 0;
++		hdr->rdma_remote_err = 0;
+ 	}
+ 
+ 	t->rdma_inflight[i] = 0;
[email protected]@ -1007,6 +1204,9 @@
  	t->drain_rdmas = 0;
  }
  
@@ -196,10 +537,430 @@
  #define MSG_MAXIOVLEN 2
  
  /*
[email protected]@ -1560,7 +1609,12 @@
[email protected]@ -1018,11 +1218,14 @@
+ 	static char ctlbuf[1024];
+ 	struct cmsghdr *cmsg;
+ 
+-	msg->msg_control = ctlbuf;
+-	msg->msg_controllen = CMSG_SPACE(size);
+-
+-	cmsg = CMSG_FIRSTHDR(msg);
+-	cmsg->cmsg_level = sol;
++	if (!msg->msg_control) {
++		msg->msg_control = ctlbuf;
++		msg->msg_controllen = CMSG_SPACE(size);
++		cmsg = CMSG_FIRSTHDR(msg);
++	} else {
++		cmsg = (struct cmsghdr *)((char *)msg->msg_control + msg->msg_controllen);
++		msg->msg_controllen += CMSG_SPACE(size);
++	}cmsg->cmsg_level = sol;
+ 	cmsg->cmsg_type = type;
+ 	cmsg->cmsg_len = CMSG_LEN(size);
+ 	memcpy(CMSG_DATA(cmsg), ptr, size);
[email protected]@ -1034,7 +1237,7 @@
+  * the ACK packet.
+  */
+ static void rdma_build_cmsg_xfer(struct msghdr *msg, const struct header *hdr,
+-		unsigned int user_token, void *local_buf)
++		uint64_t user_token, void *local_buf)
+ {
+ 
+ #define RDS_MAX_IOV 512 /* FIX_ME - put this into rds.h or use socket max ?*/
[email protected]@ -1048,7 +1251,7 @@
+ 	rdma_size = hdr->rdma_size;
+ 	rdma_vector = hdr->rdma_vector;
+ 
+-	trace("RDS issuing rdma for token %x key %Lx len %u local_buf %p vector %u\n",
++	trace("RDS issuing rdma for token 0x%lx key 0x%llx len %d local_buf %p vector %d\n",
+ 			user_token,
+ 			(unsigned long long) hdr->rdma_key,
+ 			rdma_size, local_buf,
[email protected]@ -1102,6 +1305,15 @@
+ 	rdma_put_cmsg(msg, RDS_CMSG_RDMA_ARGS, &args, sizeof(args));
+ }
+ 
++static void build_cmsg_async_send(struct msghdr *msg, uint64_t user_token)
++{
++	struct rds_asend_args  args;
++
++	args.flags |= RDS_SEND_NOTIFY_ME;
++	args.user_token = user_token;
++	rdma_put_cmsg(msg, RDS_CMSG_ASYNC_SEND, &args, sizeof(args));
++}
++
+ static void rdma_build_cmsg_dest(struct msghdr *msg, rds_rdma_cookie_t rdma_dest)
+ {
+ 	rdma_put_cmsg(msg, RDS_CMSG_RDMA_DEST, &rdma_dest, sizeof(rdma_dest));
[email protected]@ -1174,19 +1386,17 @@
+ 	hdr->index = qindex;
+ }
+ 
+-static int send_packet(int fd, struct task *t,
+-		struct header *hdr, unsigned int size)
++static int send_msg(int fd, struct task *t, struct header *hdr,
++		    unsigned int size, struct options *opts, 
++		    struct child_control *ctl)
+ {
+-	unsigned char buf[size], *rdma_flight_recorder = NULL;
++	unsigned char buf[size];
++	uint8_t *rdma_flight_recorder = NULL;
+ 	rds_rdma_cookie_t cookie = 0;
+ 	struct msghdr msg;
+ 	struct iovec iov;
+ 	ssize_t ret;
+ 
+-	/* Make sure we always have the current sequence number.
+-	 * When we send ACK packets, the seq that gets filled in is
+-	 * stale. */
+-	hdr->seq = t->send_seq;
+ 	fill_hdr(buf, size, hdr);
+ 
+ 	memset(&msg, 0, sizeof(msg));
[email protected]@ -1198,27 +1408,10 @@
+ 	iov.iov_base = buf;
+ 	iov.iov_len = size;
+ 
+-	/* If this is a REQ packet in which we pass the MR to the
+-	 * peer, extract the RDMA cookie and pass it on in the control
+-	 * message for now. */
+-	if (hdr->op == OP_REQ && hdr->rdma_op != 0) {
+-		if (hdr->rdma_key != 0) {
+-			/* We used GET_MR to obtain a key */
+-			rdma_build_cmsg_dest(&msg, hdr->rdma_key);
+-			cookie = hdr->rdma_key;
+-			hdr->rdma_key = 0;
+-		} else {
+-			/* Use the RDMA_MAP cmsg to have sendmsg do the
+-			 * mapping on the fly. */
+-			rdma_build_cmsg_map(&msg, hdr->rdma_addr,
+-					    hdr->rdma_size * hdr->rdma_vector,
+-					    &cookie);
+-		}
+-	}
+ 
+ 	/* If this is an ACK packet with RDMA, build the cmsg
+-	 * header that goes with it. */
+-	if (hdr->op == OP_ACK && hdr->rdma_op != 0) {
++	   * header that goes with it. */
++	if (hdr->op == OP_ACK && hdr->rdma_op != 0 && !hdr->rdma_remote_err) {
+ 		unsigned int qindex = hdr->index;
+ 
+ 		if (t->rdma_inflight[qindex] != 0) {
[email protected]@ -1230,16 +1423,35 @@
+ 			 *
+ 			 * We return one of the more obscure error messages,
+ 			 * which we recognize and handle in the top loop. */
+-			trace("Drain RDMA 0x%x\n", rdma_user_token(t, qindex));
++			trace("Drain RDMA 0x%lx\n", rdma_user_token(t, qindex, 0, hdr->seq));
+ 			errno = EBADSLT;
+ 			return -1;
+ 		}
+ 		rdma_build_cmsg_xfer(&msg, hdr,
+-				rdma_user_token(t, qindex),
++				rdma_user_token(t, qindex, 0, hdr->seq),
+ 				t->local_buf[qindex]);
+ 		rdma_flight_recorder = &t->rdma_inflight[qindex];
++	} else if (opts->async) {
++		if (hdr->op == OP_REQ)
++			build_cmsg_async_send(&msg,
++				rdma_user_token(t, hdr->index, OP_REQ, hdr->seq));
++		else
++			build_cmsg_async_send(&msg,
++				rdma_user_token(t, hdr->index, OP_ACK, hdr->seq));
+ 	}
+ 
++	if (hdr->op == OP_REQ && hdr->rdma_op != 0) {
++		if (hdr->rdma_key != 0) {
++			rdma_build_cmsg_dest(&msg, hdr->rdma_key);
++			cookie = hdr->rdma_key;
++			hdr->rdma_key = 0;
++		} else {
++			rdma_build_cmsg_map(&msg, hdr->rdma_addr,
++					hdr->rdma_size * hdr->rdma_vector,
++					&cookie);
++		}
++	}
++
+ 	ret = sendmsg(fd, &msg, 0);
+ 	if (ret < 0) {
+ 		if (errno != EAGAIN && errno != ENOBUFS)
[email protected]@ -1256,10 +1468,41 @@
+ 		 * lower 32bit of the cookie */
+ 		rdma_key_o_meter_add(cookie);
+ 	}
++
++	hdr->pending = 1;
++
++	return ret;
++}
++
++static int send_packet(int fd, struct task *t,
++		struct header *hdr, unsigned int size,
++		struct options *opts, struct child_control *ctl)
++{
++	ssize_t ret;
++
++	/* Make sure we always have the current sequence number.
++	 * When we send ACK packets, the seq that gets filled in is
++	 * stale. */
++	hdr->seq = t->send_seq;
++
++	ret = send_msg(fd, t, hdr, size, opts, ctl);
++	if (ret < 0) return ret;
++
+ 	t->send_seq++;
+ 	return ret;
+ }
+ 
++static int resend_packet(int fd, struct task *t,
++		struct header *hdr, unsigned int size,
++		struct options *opts, struct child_control *ctl)
++{
++	ssize_t ret;
++
++	ret = send_msg(fd, t, hdr, size, opts, ctl);
++
++	return ret;
++}
++
+ static int send_one(int fd, struct task *t,
+ 		struct options *opts,
+ 		struct child_control *ctl)
[email protected]@ -1266,12 +1509,16 @@
+ {
+ 	struct timeval start;
+ 	struct timeval stop;
+-	struct header hdr;
++	struct header *hdr = &t->req_header[t->send_index]; 
+ 	int ret;
+ 
+-	build_header(t, &hdr, OP_REQ, t->send_index);
++	if (opts->async && hdr->pending) {
++		return -1;
++	}
++
++	build_header(t, hdr, OP_REQ, t->send_index);
+ 	if (opts->rdma_size && t->send_seq > 10)
+-		rdma_build_req(fd, &hdr, t,
++		rdma_build_req(fd, hdr, t,
+ 				opts->rdma_size,
+ 				opts->req_depth,
+ 				opts->rw_mode,
[email protected]@ -1279,7 +1526,7 @@
+ 
+ 
+ 	gettimeofday(&start, NULL);
+-	ret = send_packet(fd, t, &hdr, opts->req_size);
++	ret = send_packet(fd, t, hdr, opts->req_size, opts, ctl);
+ 	gettimeofday(&stop, NULL);
+ 
+ 	if (ret < 0)
[email protected]@ -1302,10 +1549,15 @@
+ 		struct child_control *ctl)
+ {
+ 	struct header *hdr = &t->ack_header[qindex];
++	struct header *hdr2 = &t->ack2_header[qindex];
+ 	ssize_t ret;
+ 
++	if (opts->async && hdr2->pending) {
++		return -1;
++	}
++
+ 	/* send an ack in response to the req we just got */
+-	ret = send_packet(fd, t, hdr, opts->ack_size);
++	ret = send_packet(fd, t, hdr, opts->ack_size, opts, ctl);
+ 	if (ret < 0)
+ 		return ret;
+ 	if (ret != opts->ack_size)
[email protected]@ -1324,6 +1576,8 @@
+ 		break;
+ 	}
+ 
++	memcpy(hdr2, hdr, sizeof(struct header));
++
+ 	return ret;
+ }
+ 
[email protected]@ -1354,8 +1608,49 @@
+ 			struct child_control *ctl,
+ 			int can_send, int do_work)
+ {
++	struct header *hdr;
++	unsigned int index;
++	int req_size;
++	int num_retries = t->retries;
++	uint64_t token;
++	unsigned int type;
++	unsigned int index2;
++	unsigned int i;
++
++	while (opts->async && num_retries > 0) {
++		index = (t->retry_index - num_retries +
++			(2 * opts->req_depth)) % (2 * opts->req_depth);
++
++		token = t->retry_token[index];
++		type = token & 0x03;
++		index2 = (token & 0xFFFFFFFF) >> 2;
++		i = index2 % opts->req_depth;
++
++		if (type == OP_REQ)
++			hdr = &t->req_header[i];
++		else
++			hdr = &t->ack2_header[i];
++
++		if (!hdr->retry)
++			goto next;
++
++		if (hdr->op == OP_REQ)
++			req_size = opts->req_size;
++		else
++			req_size = opts->ack_size;
++
++		if (resend_packet(fd, t, hdr, req_size, opts, ctl) < 0) {
++			return -1;
++		}
++		hdr->retry = 0;
++next:
++		num_retries--;
++	}
++	t->last_retry_seq = t->retries = 0;
++
+ 	if (ack_anything(fd, t, opts, ctl, can_send) < 0)
+ 		return -1;
++
+ 	while (do_work && t->pending < opts->req_depth) {
+ 		if (!can_send)
+ 			goto eagain;
[email protected]@ -1375,7 +1670,8 @@
+ 		rds_rdma_cookie_t *cookie,
+ 		struct sockaddr_in *sin,
+ 		struct timeval *tstamp,
+-		struct task *tasks)
++		struct task *tasks,
++		struct options *opts)
+ {
+ 	struct cmsghdr *cmsg;
+ 	char cmsgbuf[256];
[email protected]@ -1398,15 +1694,16 @@
+ 
+ 	if (ret < 0)
+ 		return ret;
+-	if (ret && ret < sizeof(struct header))
++	if (ret && !strcmp(RDS_VERSION, peer_version) &&
++		ret < sizeof(struct header))
+ 		die("recvmsg() returned short data: %zd", ret);
+-	if (msg.msg_namelen < sizeof(struct sockaddr_in))
++	if (ret && msg.msg_namelen < sizeof(struct sockaddr_in))
+ 		die("socklen = %d < sizeof(sin) (%zu)\n",
+ 		    msg.msg_namelen, sizeof(struct sockaddr_in));
+ 
+ 	/* See if the message comes with a RDMA destination */
+ 	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+-		struct rds_rdma_notify notify;
++		struct rds_rdma_send_notify notify;
+ 
+ 		if (cmsg->cmsg_level != sol)
+ 			continue;
[email protected]@ -1432,11 +1729,11 @@
+ 			memcpy(cookie, CMSG_DATA(cmsg), sizeof(*cookie));
+ 			break;
+ 
+-		case RDS_CMSG_RDMA_STATUS:
++		case RDS_CMSG_RDMA_SEND_STATUS:
+ 			if (cmsg->cmsg_len < CMSG_LEN(sizeof(notify)))
+ 				die("RDS_CMSG_RDMA_DEST data too small");
+ 			memcpy(&notify, CMSG_DATA(cmsg), sizeof(notify));
+-			rdma_mark_completed(tasks, notify.user_token, notify.status);
++			rdma_mark_completed(tasks, notify.user_token, notify.status, opts);
+ 			break;
+ 		}
+ 	}
[email protected]@ -1445,7 +1742,8 @@
+ 
+ static int recv_one(int fd, struct task *tasks,
+ 			struct options *opts,
+-		struct child_control *ctl)
++		struct child_control *ctl,
++		struct child_control *all_ctl)
+ {
+ 	char buf[max(opts->req_size, opts->ack_size)];
+ 	rds_rdma_cookie_t rdma_dest = 0;
[email protected]@ -1456,15 +1754,18 @@
+ 	uint16_t expect_index;
+ 	int task_index;
+ 	ssize_t ret;
++	int	check_status;
+ 
+-	ret = recv_message(fd, buf, sizeof(buf), &rdma_dest, &sin, &tstamp, tasks);
++
++	ret = recv_message(fd, buf, sizeof(buf), &rdma_dest, &sin, &tstamp, tasks, opts);
+ 	if (ret < 0)
+ 		return ret;
+ 
+ 	/* If we received only RDMA completions or cong updates,
+ 	 * ret will be 0 */
+-	if (ret == 0)
++	if (ret == 0) {
+ 		return 0;
++	}
+ 
+ 	/* check the incoming sequence number */
+ 	task_index = ntohs(sin.sin_port) - opts->starting_port - 1;
[email protected]@ -1508,16 +1809,32 @@
+ 	hdr.to_port = t->src_addr.sin_port;
+ 	hdr.index = expect_index;
+ 
+-	if (check_hdr(buf, ret, &hdr))
+-		die("header from %s:%u to id %u bogus\n",
+-		    inet_ntoa(sin.sin_addr), htons(sin.sin_port),
+-		    ntohs(t->src_addr.sin_port));
++	check_status = check_hdr(buf, ret, &hdr, opts);
++	if (check_status) {
++		if (check_status > 0) {
++			die("header from %s:%u to id %u bogus\n",
++		    	inet_ntoa(sin.sin_addr), htons(sin.sin_port),
++		    	ntohs(t->src_addr.sin_port));
++		} else
++			return 0;
++	}
+ 
+ 	if (hdr.op == OP_ACK) {
+-		stat_inc(&ctl->cur[S_RTT_USECS],
+-			 usec_sub(&tstamp, &t->send_time[expect_index]));
+-		t->pending -= 1;
++                uint64_t rtt_time = 
++                  usec_sub(&tstamp, &t->send_time[expect_index]);
+ 
++		stat_inc(&ctl->cur[S_RTT_USECS], rtt_time);
++                if (rtt_time > rtt_threshold)
++			print_outlier("Found RTT = 0x%lx\n", rtt_time);
++
++                if (show_histogram)
++                {
++                  ctl->latency_histogram[get_bucket(rtt_time)]++;
++                }
++
++		if (t->pending > 0)
++			t->pending -= 1;
++
+ 		if (in_hdr.rdma_key)
+ 			rdma_process_ack(fd, &in_hdr, ctl);
+ 	} else {
[email protected]@ -1549,6 +1866,7 @@
+ }
+ 
+ static void run_child(pid_t parent_pid, struct child_control *ctl,
++			struct child_control *all_ctl,
+ 		      struct options *opts, uint16_t id, int active)
+ {
+ 	struct sockaddr_in sin;
[email protected]@ -1559,8 +1877,15 @@
+ 	struct task tasks[opts->nr_tasks];
  	struct timeval start;
          int do_work = opts->simplex ? active : 1;
++	int j;
  
++
 +#if defined(__SVR4) && defined(__sun)
 +	set_my_lgrp();
 +	sin.sin_family = AF_INET_OFFLOAD;
@@ -209,7 +970,7 @@
  	sin.sin_port = htons(opts->starting_port + 1 + id);
  	sin.sin_addr.s_addr = htonl(opts->receive_addr);
  
[email protected]@ -1572,7 +1626,11 @@
[email protected]@ -1572,7 +1897,11 @@
  	for (i = 0; i < opts->nr_tasks; i++) {
  		tasks[i].nr = i;
  		tasks[i].src_addr = sin;
@@ -221,7 +982,37 @@
  		tasks[i].dst_addr.sin_addr.s_addr = htonl(opts->send_addr);
  		tasks[i].dst_addr.sin_port = htons(opts->starting_port + 1 + i);
  		tasks[i].send_time = alloca(opts->req_depth * sizeof(struct timeval));
[email protected]@ -1625,6 +1683,10 @@
[email protected]@ -1581,6 +1910,15 @@
+ 		tasks[i].rdma_buf = alloca(opts->req_depth * sizeof(uint64_t *));
+ 		tasks[i].local_buf = alloca(opts->req_depth * sizeof(uint64_t *));
+ 		tasks[i].ack_header = alloca(opts->req_depth * sizeof(struct header));
++		tasks[i].ack2_header = alloca(opts->req_depth * sizeof(struct header));
++		for (j=0;j<opts->req_depth;j++)
++			tasks[i].ack2_header[j].pending = 0;
++
++		tasks[i].req_header = alloca(opts->req_depth * sizeof(struct header));
++		for (j=0;j<opts->req_depth;j++)
++			tasks[i].req_header[j].pending = 0;
++
++		tasks[i].retry_token = alloca(2 * opts->req_depth * sizeof(uint64_t));
+ 		tasks[i].rdma_next_op = (i & 1)? RDMA_OP_READ : RDMA_OP_WRITE;
+ 	}
+ 
[email protected]@ -1611,7 +1949,7 @@
+ 
+ 		check_parent(parent_pid);
+ 
+-		ret = poll(&pfd, 1, -1);
++		ret = poll(&pfd, 1, 1000);
+ 		if (ret < 0) {
+ 			if (errno == EINTR)
+ 				continue;
[email protected]@ -1621,10 +1959,14 @@
+ 		pfd.events = POLLIN;
+ 
+ 		if (pfd.revents & POLLIN) {
+-			while (recv_one(fd, tasks, opts, ctl) >= 0)
++			while (recv_one(fd, tasks, opts, ctl, all_ctl) >= 0)
  				;
  		}
  
@@ -232,7 +1023,15 @@
  		/* keep the pipeline full */
  		can_send = !!(pfd.revents & POLLOUT);
  		for (i = 0, t = tasks; i < opts->nr_tasks; i++, t++) {
[email protected]@ -1665,8 +1727,12 @@
[email protected]@ -1633,6 +1975,7 @@
+ 			if (t->drain_rdmas)
+ 				continue;
+ 			if (send_anything(fd, t, opts, ctl, can_send, do_work) < 0) {
++
+ 				pfd.events |= POLLOUT;
+ 
+ 				/* If the send queue is full, we will see EAGAIN.
[email protected]@ -1665,8 +2008,12 @@
  	uint32_t i;
  
  	len = opts->nr_tasks * sizeof(*ctl);
@@ -245,7 +1044,16 @@
  	if (ctl == MAP_FAILED)
  		die("mmap of %u child control structs failed", opts->nr_tasks);
  
[email protected]@ -1699,7 +1765,7 @@
[email protected]@ -1688,7 +2035,7 @@
+ 				control_fd = -1;
+ 			}
+ 			rdma_key_o_meter_set_self(i);
+-			run_child(parent, ctl + i, opts, i, active);
++			run_child(parent, ctl + i, ctl, opts, i, active);
+ 			exit(0);
+ 		}
+ 		ctl[i].pid = pid;
[email protected]@ -1699,7 +2046,7 @@
  			continue;
  		pid = waitpid(-1, NULL, WNOHANG);
  		if (pid)
@@ -254,7 +1062,7 @@
  		sleep(1);
  		i--; /* try this child again */
  	}
[email protected]@ -1823,6 +1889,7 @@
[email protected]@ -1823,6 +2170,7 @@
  
  	if (disable)
  		return;
@@ -262,7 +1070,7 @@
  	if ((fp = fopen("/proc/stat", "r")) == NULL) {
  		fprintf(stderr, "Cannot open /proc/stat (%s) - "
  				"not printing cpu stats\n",
[email protected]@ -1856,10 +1923,37 @@
[email protected]@ -1856,10 +2204,37 @@
  		}
  	}
  	fclose(fp);
@@ -300,7 +1108,7 @@
  	} else {
  		struct sys_stats sys;
  		unsigned long sum = 0;
[email protected]@ -1884,6 +1978,7 @@
[email protected]@ -1884,6 +2259,7 @@
  		 *  5	irq
  		 *  6	softirq
  		 */
@@ -308,7 +1116,7 @@
  		printf(",%f,%f,%f,%f,%Lu",
  			(sys.times[0] + sys.times[1]) * scale,
  			sys.times[2] * scale,
[email protected]@ -1890,6 +1985,14 @@
[email protected]@ -1890,6 +2266,14 @@
  			(sys.times[3] + sys.times[4]) * scale,
  			(sys.times[5] + sys.times[6]) * scale,
  			sys.intr);
@@ -323,7 +1131,7 @@
  	}
  	prev = current;
  }
[email protected]@ -1903,6 +2006,10 @@
[email protected]@ -1903,6 +2287,10 @@
  	static socklen_t buflen = 0;
  	static int sock_fd = -1;
  	int i, count, item_size;
@@ -334,7 +1142,7 @@
  
  	if (sock_fd < 0) {
  		sock_fd = socket(pf, SOCK_SEQPACKET, 0);
[email protected]@ -1912,6 +2019,7 @@
[email protected]@ -1912,6 +2300,7 @@
  
  	/* We should only loop once on the first call; after that the
  	 * buffer requirements for RDS counters should not change. */
@@ -342,7 +1150,7 @@
  	while ((item_size = getsockopt(sock_fd, sol, RDS_INFO_COUNTERS, curr, &buflen)) < 0) {
  		if (errno != ENOSPC)
  			die_errno("getsockopt(RDS_INFO_COUNTERS) failed");
[email protected]@ -1919,7 +2027,29 @@
[email protected]@ -1919,7 +2308,29 @@
  		if (!curr)
  			die_errno("Cannot allocate buffer for stats counters");
  	}
@@ -372,7 +1180,7 @@
  	if (item_size > sizeof(*ctr))
  		die("Bad counter item size in RDS_INFO_COUNTERS (got %d, max %zd)\n",
  				item_size, sizeof(*ctr));
[email protected]@ -1932,8 +2062,11 @@
[email protected]@ -1932,8 +2343,11 @@
  	}
  
  	for (i = 0; i < count; ++i)
@@ -385,7 +1193,7 @@
  	gettimeofday(&now, NULL);
  
  	if (initialize) {
[email protected]@ -1957,6 +2090,10 @@
[email protected]@ -1957,6 +2371,10 @@
  	memcpy(prev, ctr, count * sizeof(*ctr));
  	last_ts = now;
  
@@ -396,7 +1204,7 @@
  	get_stats(initialize);
  }
  
[email protected]@ -1967,7 +2104,7 @@
[email protected]@ -1967,7 +2385,7 @@
  
  	pid = waitpid(-1, &status, wflags);
  	if (pid < 0)
@@ -405,7 +1213,7 @@
  	if (pid == 0)
  		return 0;
  
[email protected]@ -1975,15 +2112,15 @@
[email protected]@ -1975,15 +2393,15 @@
  		if (WEXITSTATUS(status) == 0)
  			return 1;
  		die("child pid %u exited with status %d\n",
@@ -424,7 +1232,22 @@
  }
  
  static void release_children_and_wait(struct options *opts,
[email protected]@ -2139,7 +2276,12 @@
[email protected]@ -1995,9 +2413,13 @@
+ 	struct counter summary[NR_STATS];
+ 	struct timeval start, end, now, first_ts, last_ts;
+ 	double cpu_total = 0;
+-	uint16_t i, cpu_samples = 0;
++	uint16_t i, j, cpu_samples = 0;
+ 	uint16_t nr_running;
++        uint64_t latency_histogram[MAX_BUCKETS];
+ 
++	if (show_histogram) 
++	        memset(latency_histogram, 0, sizeof(latency_histogram));
++
+ 	gettimeofday(&start, NULL);
+ 	start.tv_sec += 2;
+ 	for (i = 0; i < opts->nr_tasks; i++)
[email protected]@ -2139,7 +2561,12 @@
  	control_fd = -1;
  
  	if (nr_running) {
@@ -437,7 +1260,131 @@
  			kill(ctl[i].pid, SIGTERM);
  		stop_soakers(soak_arr);
  	}
[email protected]@ -2517,7 +2659,11 @@
[email protected]@ -2167,6 +2594,19 @@
+ 			avg(&summary[S_SENDMSG_USECS]),
+ 			avg(&summary[S_RTT_USECS]),
+ 			soak_arr? scale * cpu_total : -1.0);
++
++		if (show_histogram) 
++		{
++			for (i = 0; i < opts->nr_tasks; i++)
++			  for (j=0;j < MAX_BUCKETS; j++)
++			    latency_histogram[j] += ctl[i].latency_histogram[j];
++			    
++			printf("\nRTT histogram\n");
++			printf("RTT (us)        \t\t    Count\n");
++			for (i=0;i < MAX_BUCKETS; i++)
++			  printf("[%6u - %6u] \t\t %8u\n", 1 << i, 1 << (i+1), 
++			         (unsigned int)latency_histogram[i]);
++		}
+ 	}
+ }
+ 
[email protected]@ -2220,6 +2660,21 @@
+ {
+ 	ssize_t ret;
+ 
++	if (size == sizeof(struct options)) {
++		memset(ptr, 0, size);
++		ret = read(fd, peer_version, VERSION_MAX_LEN);
++		if (ret != VERSION_MAX_LEN)
++			die_errno("Failed to read version");
++
++		if (strcmp(peer_version, RDS_VERSION)) {
++			ptr += ret;
++			memcpy(ptr, peer_version, VERSION_MAX_LEN);
++			size = sizeof(struct options_2_0_6) - ret;
++		} else
++			size -= ret;
++		ptr += ret;
++	}
++
+ 	while (size) {
+ 		ret = read(fd, ptr, size);
+ 		if (ret < 0)
[email protected]@ -2233,6 +2688,7 @@
+ 
+ static void encode_options(struct options *dst, const struct options *src)
+ {
++	memcpy(dst->version, src->version, VERSION_MAX_LEN);
+ 	dst->req_depth = htonl(src->req_depth);
+ 	dst->req_size = htonl(src->req_size);
+ 	dst->ack_size = htonl(src->ack_size);
[email protected]@ -2262,10 +2718,13 @@
+         dst->simplex = src->simplex;                    /* byte sized */
+         dst->rw_mode = src->rw_mode;                    /* byte sized */
+         dst->rdma_vector = htonl(src->rdma_vector);
++	dst->tos = src->tos;
++	dst->async = src->async;
+ }
+ 
+ static void decode_options(struct options *dst, const struct options *src)
+ {
++	memcpy(dst->version, src->version, VERSION_MAX_LEN);
+ 	dst->req_depth = ntohl(src->req_depth);
+ 	dst->req_size = ntohl(src->req_size);
+ 	dst->ack_size = ntohl(src->ack_size);
[email protected]@ -2295,6 +2754,8 @@
+         dst->simplex = src->simplex;                    /* byte sized */
+         dst->rw_mode = src->rw_mode;                    /* byte sized */
+ 	dst->rdma_vector = ntohl(src->rdma_vector);
++	dst->tos = src->tos;
++	dst->async = src->async;
+ }
+ 
+ static void verify_option_encdec(const struct options *opts)
[email protected]@ -2316,6 +2777,25 @@
+ 		die("encode/decode check of options struct failed");
+ }
+ 
++static void reset_conn(struct options *opts)
++{
++	struct rds_reset val;
++	int fd;
++	struct sockaddr_in sin;
++
++	sin.sin_family = AF_INET;
++	sin.sin_port = htons(opts->starting_port);
++	sin.sin_addr.s_addr = htonl(opts->receive_addr);
++
++	fd = bound_socket(pf, SOCK_SEQPACKET, 0, &sin);
++
++	val.tos = opts->tos;
++	val.src.s_addr = htonl(opts->receive_addr);
++	val.dst.s_addr = htonl(opts->send_addr);
++	if (setsockopt(fd, sol, RDS_CONN_RESET, &val, sizeof(val)))
++		die_errno("setsockopt RDS_CONN_RESET failed");
++}
++
+ static int active_parent(struct options *opts, struct soak_control *soak_arr)
+ {
+ 	struct options enc_options;
[email protected]@ -2324,6 +2804,11 @@
+ 	int fd;
+ 	uint8_t ok;
+ 
++	if (reset_connection) {
++		reset_conn(opts);
++		return 0;
++	}
++
+ 	if (opts->show_params) {
+ 		unsigned int k;
+ 
[email protected]@ -2387,7 +2872,11 @@
+ 	 * We just tell the peer what options to use.
+ 	 */
+ 	encode_options(&enc_options, opts);
+-	peer_send(fd, &enc_options, sizeof(struct options));
++	if (opts->tos || opts->async)
++		peer_send(fd, &enc_options, sizeof(struct options));
++	else
++		peer_send(fd, &enc_options.req_depth,
++				sizeof(struct options_2_0_6));
+ 
+ 	printf("negotiated options, tasks will start in 2 seconds\n");
+ 	ctl = start_children(opts, 1);
[email protected]@ -2517,7 +3006,11 @@
  	/* an extra terminating entry which will be all 0s */
  	len = (nr_soak + 1) * sizeof(struct soak_control);
  	soak_arr = mmap(NULL, len, PROT_READ|PROT_WRITE,
@@ -449,7 +1396,24 @@
  	if (soak_arr == MAP_FAILED)
  		die("mmap of %ld soak control structs failed", nr_soak);
  
[email protected]@ -2589,6 +2735,7 @@
[email protected]@ -2572,6 +3065,10 @@
+ 	OPT_CONNECT_RETRIES,
+ 	OPT_USE_CONG_MONITOR,
+ 	OPT_PERFDATA,
++        OPT_SHOW_OUTLIERS,
++        OPT_SHOW_HISTOGRAM,
++	OPT_RESET,
++	OPT_ASYNC,
+ };
+ 
+ static struct option long_options[] = {
[email protected]@ -2584,11 +3081,13 @@
+ { "send-addr",		required_argument,	NULL,	's'	},
+ { "port",		required_argument,	NULL,	'p'	},
+ { "time",		required_argument,	NULL,	'T'	},
++{ "tos",                required_argument,      NULL,   'Q'     },
+ { "report-cpu",		no_argument,		NULL,	'c'	},
+ { "report-summary",	no_argument,		NULL,	'z'	},
  { "rtprio",		no_argument,		NULL,	'R'	},
  { "verify",		no_argument,		NULL,	'v'	},
  { "trace",		no_argument,		NULL,	'V'	},
@@ -457,16 +1421,56 @@
  
  { "rdma-use-once",	required_argument,	NULL,	OPT_RDMA_USE_ONCE },
  { "rdma-use-get-mr",	required_argument,	NULL,	OPT_RDMA_USE_GET_MR },
[email protected]@ -2652,7 +2799,7 @@
[email protected]@ -2601,6 +3100,10 @@
+ { "show-perfdata",	no_argument,		NULL,	OPT_PERFDATA },
+ { "connect-retries",	required_argument,	NULL,	OPT_CONNECT_RETRIES },
+ { "use-cong-monitor",	required_argument,	NULL,	OPT_USE_CONG_MONITOR },
++{ "show-outliers",      required_argument,      NULL,   OPT_SHOW_OUTLIERS    },
++{ "show-histogram",     no_argument,            NULL,   OPT_SHOW_HISTOGRAM   },
++{ "reset",              no_argument,            NULL,   OPT_RESET },
++{ "async",              no_argument,            NULL,   OPT_ASYNC },
+ 
+ { NULL }
+ };
[email protected]@ -2640,6 +3143,8 @@
+ 	opts.use_cong_monitor = 1;
+ 	opts.rdma_use_fence = 1;
+ 	opts.rdma_cache_mrs = 0;
++	opts.rdma_use_once = 0;
++	opts.rdma_use_get_mr = 0;
+ 	opts.rdma_alignment = 0;
+ 	opts.rdma_key_o_meter = 0;
+ 	opts.show_params = 0;
[email protected]@ -2648,11 +3153,17 @@
+         opts.simplex = 0;
+         opts.rw_mode = 0;
+ 	opts.rdma_vector = 1;
++        rtt_threshold = ~0U;
++        show_histogram = 0;
++	opts.tos = 0;
++	reset_connection = 0;
++	opts.async = 0;
++	strcpy(opts.version, RDS_VERSION);
+ 
  	while(1) {
  		int c, index;
  
 -		c = getopt_long(argc, argv, "+a:cD:d:hI:M:op:q:Rr:s:t:T:vVz",
-+		c = getopt_long(argc, argv, "+a:cD:d:hI:M:op:q:Rr:s:t:T:vVg:z",
++		c = getopt_long(argc, argv, "+a:cD:d:hI:M:op:q:Rr:s:t:T:Q:vVg:z",
  				long_options, &index);
  		if (c == -1)
  			break;
[email protected]@ -2711,6 +2858,10 @@
[email protected]@ -2702,6 +3213,9 @@
+ 			case 'T':
+ 				opts.run_time = parse_ull(optarg, (uint32_t)~0);
+ 				break;
++			case 'Q':
++				opts.tos = parse_ull(optarg, (uint8_t)~0);
++				break;
+ 			case 'z':
+ 				opts.summary_only = 1;
+ 				break;
[email protected]@ -2711,9 +3225,25 @@
  			case 'V':
  				opts.tracing = 1;
  				break;
@@ -474,10 +1478,25 @@
 +				lgrp_id = (lgrp_id_t)parse_ull(optarg,
 +				    (uint32_t)~0);
 +				break;
++                        case OPT_SHOW_OUTLIERS:
++                                rtt_threshold = parse_ull(optarg, ~0U);
++                                break;
++                        case OPT_SHOW_HISTOGRAM:
++                                show_histogram = 1;
++                                break;
  			case OPT_USE_CONG_MONITOR:
  				opts.use_cong_monitor = parse_ull(optarg, 1);
  				break;
[email protected]@ -2786,6 +2937,7 @@
++			case OPT_RESET:
++				reset_connection = 1;
++				break;
++			case OPT_ASYNC:
++				opts.async = 1;
++				break;
+ 			case OPT_RDMA_USE_ONCE:
+ 				opts.rdma_use_once = parse_ull(optarg, 1);
+ 				break;
[email protected]@ -2786,6 +3316,7 @@
  	if (opts.rdma_size && 0)
  		opts.rdma_size = (opts.rdma_size + 4095) & ~4095;
  
@@ -544,7 +1563,71 @@
  /* Like inet_ntoa, but can be re-entered several times without clobbering
   * the previously returned string. */
  static const char *paddr(int af, const void *addrp)
[email protected]@ -234,8 +250,10 @@
[email protected]@ -134,18 +150,20 @@
+ {
+ 	struct rds_info_connection conn;
+ 
+-	printf("\nRDS Connections:\n%15s %15s %16s %16s %3s\n",
+-		"LocalAddr", "RemoteAddr", "NextTX", "NextRX", "Flg");
++	printf("\nRDS Connections:\n%15s %15s %4s %16s %16s %4s\n",
++		"LocalAddr", "RemoteAddr", "Tos", "NextTX", "NextRX", "Flgs");
+ 	
+ 	for_each(conn, data, each, len) {
+-		printf("%15s %15s %16"PRIu64" %16"PRIu64" %c%c%c\n",
++		printf("%15s %15s %4u %16"PRIu64" %16"PRIu64" %c%c%c%c\n",
+ 			ipv4addr(conn.laddr),
+ 			ipv4addr(conn.faddr),
++			conn.tos,
+ 			conn.next_tx_seq,
+ 			conn.next_rx_seq,
+ 			rds_conn_flag(conn, SENDING, 's'),
+ 			rds_conn_flag(conn, CONNECTING, 'c'),
+-			rds_conn_flag(conn, CONNECTED, 'C'));
++			rds_conn_flag(conn, CONNECTED, 'C'),
++			rds_conn_flag(conn, ERROR, 'E'));
+ 	}
+ }
+ 
[email protected]@ -153,16 +171,17 @@
+ {
+ 	struct rds_info_message msg;
+ 
+-	printf("\n%s Message Queue:\n%15s %5s %15s %5s %16s %10s\n",
++	printf("\n%s Message Queue:\n%15s %5s %15s %5s %4s %16s %10s\n",
+ 		(char *)extra,
+-		"LocalAddr", "LPort", "RemoteAddr", "RPort", "Seq", "Bytes");
++		"LocalAddr", "LPort", "RemoteAddr", "RPort", "Tos","Seq", "Bytes");
+ 	
+ 	for_each(msg, data, each, len) {
+-		printf("%15s %5u %15s %5u %16"PRIu64" %10u\n",
++		printf("%15s %5u %15s %5u %4u %16"PRIu64" %10u\n",
+ 			ipv4addr(msg.laddr),
+ 			ntohs(msg.lport),
+ 			ipv4addr(msg.faddr),
+ 			ntohs(msg.fport),
++			msg.tos,
+ 			msg.seq, msg.len);
+ 	}
+ }
[email protected]@ -191,13 +210,14 @@
+ {
+ 	struct rds_info_rdma_connection ic;
+ 
+-	printf("\nRDS IB Connections:\n%15s %15s %32s %32s\n",
+-		"LocalAddr", "RemoteAddr", "LocalDev", "RemoteDev");
++	printf("\nRDS IB Connections:\n%15s %15s %4s %3s %32s %32s\n",
++		"LocalAddr", "RemoteAddr", "Tos", "SL", "LocalDev", "RemoteDev");
+ 
+ 	for_each(ic, data, each, len) {
+-		printf("%15s %15s %32s %32s",
++		printf("%15s %15s %4u %3u %32s %32s",
+ 			ipv4addr(ic.src_addr),
+ 			ipv4addr(ic.dst_addr),
++			ic.tos,ic.sl,
+ 			ipv6addr(ic.src_gid),
+ 			ipv6addr(ic.dst_gid));
+ 
[email protected]@ -234,8 +254,10 @@
  		print_msgs, "Send", 0 },
  	['t'] = { RDS_INFO_RETRANS_MESSAGES, "retransmit queue messages",
  		  print_msgs, "Retransmit", 0 },
@@ -555,7 +1638,7 @@
  	['I'] = { RDS_INFO_IB_CONNECTIONS, "IB transport connections",
  		  print_ib_conns, NULL, 0 },
  };
[email protected]@ -266,6 +284,9 @@
[email protected]@ -266,6 +288,9 @@
  	char optstring[258] = "v+";
  	int given_options = 0;
  	socklen_t len = 0;
@@ -565,7 +1648,7 @@
  	void *data = NULL;
  	int fd;
  	int each;
[email protected]@ -322,6 +343,7 @@
[email protected]@ -322,6 +347,7 @@
  		    (given_options && !infos[i].option_given))
  			continue;
  
@@ -573,7 +1656,7 @@
  		/* read in the info until we get a full snapshot */
  		while ((each = getsockopt(fd, sol, infos[i].opt_val, data,
  				   &len)) < 0) {
[email protected]@ -345,15 +367,47 @@
[email protected]@ -345,15 +371,47 @@
  				return 1;
  			}
  		}
@@ -725,7 +1808,7 @@
 diff -r -u /tmp/rds-tools-2.0.4/rds-info.1 rds-tools-2.0.7/rds-info.1
 --- /tmp/rds-tools-2.0.4/rds-info.1	Wed Aug  4 15:25:11 2010
 +++ rds-tools-2.0.7/rds-info.1	Thu Feb 24 13:27:51 2011
[email protected]@ -1,162 +1,150 @@
[email protected]@ -1,162 +1,160 @@
 -.Dd October 30, 2006
 -.Dt RDS-INFO 1
 -.Os
@@ -828,6 +1911,8 @@
 +.IP	RemoteAddr
  The IP address of the remote end of the connection.  
 -.It NextTX
++.IP	Tos
++The type of service value for this connection.
 +.IP	NextTX
  The sequence number that will be given to the next message that is sent
  over the connection.
@@ -859,6 +1944,8 @@
 +.IP 		C
 +	The connection to the remote host is connected
 +	and active.
++.IP 		E
++	The connection to the remote host is in error.
 +
 +.TP
 +\fB\-r\fR, \fB\-s\fR, \fB\-t\fR
@@ -875,6 +1962,8 @@
  The remote IP address and port associated with the message. For sent messages
  this is the destination address, for receive messages it is the source address.
 -.It Seq
++.IP	Tos
++The type of service for this message.
 +.IP	Seq
  The sequence number of the message.
 -.It Bytes
@@ -901,6 +1990,10 @@
 +.IP	RemoteAddr
  The remote IP address of this connection.
 -.It LocalDev
++.IP	Tos
++The type of service value for this connection.
++.IP	SL
++The QoS Service Level for this connection.
 +.IP	LocalDev
  The local IB Global Identifier, printed in IPv6 address syntax.
 -.It RemoteDev
@@ -954,7 +2047,7 @@
 diff -r -u /tmp/rds-tools-2.0.4/rds-ping.1 rds-tools-2.0.7/rds-ping.1
 --- /tmp/rds-tools-2.0.4/rds-ping.1	Wed Aug  4 15:25:11 2010
 +++ rds-tools-2.0.7/rds-ping.1	Thu Feb 24 13:27:52 2011
[email protected]@ -1,69 +1,54 @@
[email protected]@ -1,69 +1,63 @@
 -.Dd Apr 22, 2008
 -.Dt RDS-PING 1
 -.Os
@@ -979,7 +2072,11 @@
 -Its interface is designed to operate pretty much the standard
 -.Xr ping 8
 +.SH SYNOPSIS
-+.B rds-ping [-c count] [-i interval] [-I local_addr] remote_addr
++.HP
++.nf
++rds-ping [-c count] [-Q tos] [-i interval] [-I local_addr]
++    remote_addr
++.fi
 +
 +.SH DESCRIPTION
 +.PP
@@ -1012,6 +2109,11 @@
 -.Nm rds-ping
 -will pick the local source address for the RDS socket based
 +.TP
++\fB\-Q tos
++By default, rds-ping sends the ping requests on base (tos = 0) RDS connection.
++With this option, the requests are sent on RDS connection with the specified tos
++value.  Valid values are 0-255.
++.TP
 +\fB\-I address
 +By default, rds-ping will pick the local source address for the RDS socket based
  on routing information for the destination address (i.e. if
@@ -1072,10 +2174,11 @@
 diff -r -u /tmp/rds-tools-2.0.4/rds-ping.c rds-tools-2.0.7/rds-ping.c
 --- /tmp/rds-tools-2.0.4/rds-ping.c	Wed Aug  4 15:25:10 2010
 +++ rds-tools-2.0.7/rds-ping.c	Thu Feb 24 13:27:52 2011
[email protected]@ -48,7 +48,11 @@
[email protected]@ -48,7 +48,12 @@
  #include <sys/poll.h>
  #include <fcntl.h>
  #include <getopt.h>
++#include <sys/ioctl.h>
 +#if defined(__SVR4) && defined(__sun)
 +#include <sys/rds.h>
 +#else
@@ -1084,7 +2187,58 @@
  
  #include "pfhack.h"
  
[email protected]@ -155,7 +159,12 @@
[email protected]@ -67,6 +72,7 @@
+ static unsigned long	opt_count;
+ static struct in_addr	opt_srcaddr;
+ static struct in_addr	opt_dstaddr;
++static uint8_t		opt_tos = 0;
+ 
+ /* For reasons of simplicity, RDS ping does not use a packet
+  * payload that is being echoed, the way ICMP does.
[email protected]@ -91,6 +97,7 @@
+ static int	parse_timeval(const char *, struct timeval *);
+ static int	parse_long(const char *ptr, unsigned long *);
+ static int	parse_addr(const char *ptr, struct in_addr *);
++static unsigned long long	parse_ull(char *ptr, unsigned long long max);
+ 
+ int
+ main(int argc, char **argv)
[email protected]@ -97,7 +104,7 @@
+ {
+ 	int c;
+ 
+-	while ((c = getopt(argc, argv, "c:i:I:")) != -1) {
++	while ((c = getopt(argc, argv, "c:i:I:Q:")) != -1) {
+ 		switch (c) {
+ 		case 'c':
+ 			if (!parse_long(optarg, &opt_count))
[email protected]@ -114,6 +121,9 @@
+ 				die("Bad wait time <%s>\n", optarg);
+ 			break;
+ 
++		case 'Q':
++			opt_tos = parse_ull(optarg, 255);
++			break;
+ 		default:
+ 			usage("Unknown option");
+ 		}
[email protected]@ -142,6 +152,9 @@
+ 	struct timeval	next_ts;
+ 	struct socket	socket[NSOCKETS];
+ 	struct pollfd	pfd[NSOCKETS];
++#if !(defined(__SVR4) && defined(__sun))
++	int             pending[NSOCKETS];
++#endif
+ 	int		i, next = 0;
+ 
+ 	for (i = 0; i < NSOCKETS; ++i) {
[email protected]@ -152,10 +165,18 @@
+ 		socket[i].fd = fd;
+ 		pfd[i].fd = fd;
+ 		pfd[i].events = POLLIN;
++#if !(defined(__SVR4) && defined(__sun))
++		pending[i] = 0;
++#endif
  	}
  
  	memset(&sin, 0, sizeof(sin));
@@ -1097,16 +2251,57 @@
  	sin.sin_addr = opt_dstaddr;
  
  	gettimeofday(&next_ts, NULL);
[email protected]@ -181,7 +190,7 @@
[email protected]@ -180,14 +201,32 @@
+ 			if (opt_count && sent >= opt_count)
  				break;
  
- 			timeradd(&next_ts, &opt_wait, &next_ts);
+-			timeradd(&next_ts, &opt_wait, &next_ts);
 -			if (sendto(sp->fd, NULL, 0, 0, (struct sockaddr *) &sin, sizeof(sin)))
-+			if (sendto(sp->fd, NULL, 0, 0, (struct sockaddr *) &sin, sizeof(sin)) < 0)
- 				err = errno;
- 			sp->sent_id = ++sent;
- 			sp->sent_ts = now;
[email protected]@ -258,7 +267,11 @@
+-				err = errno;
+-			sp->sent_id = ++sent;
+-			sp->sent_ts = now;
+-			sp->nreplies = 0;
+-			next = (next + 1) % NSOCKETS;
++			timeradd(&now, &opt_wait, &next_ts);
++#if !(defined(__SVR4) && defined(__sun))
++			if (!pending[next]) {
++#endif
++				memset(&sin, 0, sizeof(sin));
++#if defined(__SVR4) && defined(__sun)
++				sin.sin_family = AF_INET_OFFLOAD;
++#else
++				sin.sin_family = AF_INET;
++#endif
++				sin.sin_addr = opt_dstaddr;
+ 
++				if (sendto(sp->fd, NULL, 0, 0, (struct sockaddr *) &sin, sizeof(sin)) < 0)
++					err = errno;
++				sp->sent_id = ++sent;
++				sp->sent_ts = now;
++				sp->nreplies = 0;
++#if !(defined(__SVR4) && defined(__sun))
++				if (!err)
++					pending[next] = 1;
++#endif
++				next = (next + 1) % NSOCKETS;
++#if !(defined(__SVR4) && defined(__sun))
++			}
++#endif
++
+ 			if (err) {
+ 				static unsigned int nerrs = 0;
+ 
[email protected]@ -223,6 +262,9 @@
+ 					report_packet(sp, &now, NULL, errno);
+ 			} else {
+ 				report_packet(sp, &now, &from.sin_addr, 0);
++#if !(defined(__SVR4) && defined(__sun))
++				pending[i] = 0;
++#endif
+ 				recv++;
+ 			}
+ 		}
[email protected]@ -258,7 +300,11 @@
  	int pf;
  
  	memset(&sin, 0, sizeof(sin));
@@ -1118,7 +2313,7 @@
  
  #ifdef DYNAMIC_PF_RDS
          pf = discover_pf_rds();
[email protected]@ -278,6 +291,9 @@
[email protected]@ -278,6 +324,9 @@
  		if (ufd < 0)
  			die_errno("unable to create UDP socket");
  		sin.sin_addr = *dst;
@@ -1128,7 +2323,7 @@
  		sin.sin_port = htons(1);
  		if (connect(ufd, (struct sockaddr *) &sin, sizeof(sin)) < 0)
  			die_errno("unable to connect to %s",
[email protected]@ -289,6 +305,9 @@
[email protected]@ -289,6 +338,9 @@
  
  		*src = sin.sin_addr;
  		close(ufd);
@@ -1138,6 +2333,58 @@
  	}
  
  	sin.sin_addr = *src;
[email protected]@ -297,6 +349,9 @@
+ 	if (bind(fd, (struct sockaddr *) &sin, sizeof(sin)))
+ 		die_errno("bind() failed");
+ 
++	if (opt_tos && ioctl(fd, SIOCRDSSETTOS, &opt_tos)) 
++		die_errno("ERROR: failed to set TOS\n");
++
+ 	return fd;
+ }
+ 
[email protected]@ -309,7 +364,8 @@
+ 		"%s\nUsage: rds-ping [options] dst_addr\n"
+ 		"Options:\n"
+ 		" -c count      limit packet count\n"
+-		" -I interface  source IP address\n",
++		" -I interface  source IP address\n"
++		" -Q tos	type of service\n",
+ 		complaint);
+ 	exit(1);
+ }
[email protected]@ -384,3 +440,31 @@
+ 	return 0;
+ }
+ 
++static unsigned long long parse_ull(char *ptr, unsigned long long max)
++{
++	unsigned long long val;
++	char *endptr;
++
++	val = strtoull(ptr, &endptr, 0);
++	switch (*endptr) {
++	case 'k': case 'K':
++		val <<= 10;
++		endptr++;
++		break;
++
++	case 'm': case 'M':
++		val <<= 20;
++		endptr++;
++		break;
++
++	case 'g': case 'G':
++		val <<= 30;
++		endptr++;
++		break;
++	}
++
++	if (*ptr && !*endptr && val <= max)
++		return val;
++
++	die("invalid number '%s'\n", ptr);
++}
 diff -r -u /tmp/rds-tools-2.0.4/Makefile.in rds-tools-2.0.7/Makefile.in
 --- /tmp/rds-tools-2.0.4/Makefile.in	Wed Aug  4 15:25:11 2010
 +++ rds-tools-2.0.7/Makefile.in	Thu Feb 24 13:27:51 2011
@@ -1290,7 +2537,7 @@
 diff -r -u /tmp/rds-tools-2.0.4/rds-stress.1 rds-tools-2.0.7/rds-stress.1
 --- /tmp/rds-tools-2.0.4/rds-stress.1	Wed Aug  4 15:25:11 2010
 +++ rds-tools-2.0.7/rds-stress.1	Thu Feb 24 13:27:52 2011
[email protected]@ -1,99 +1,102 @@
[email protected]@ -1,99 +1,106 @@
 -.Dd May 15, 2007
 -.Dt RDS-STRESS 1
 -.Os
@@ -1321,8 +2568,8 @@
 +.HP
 +.nf
 +rds-stress [-p port_number] -r [receive_address] [-s send_address]
-+      [-a ack_bytes] [-q request_bytes] [-D rdma_bytes]
-+      [-d queue_depth] [-t nr_tasks] [-c] [-R] [-V] [-v]
++      [-Q tos] [-a ack_bytes] [-q request_bytes] [-D rdma_bytes]
++      [-d queue_depth] [-t nr_tasks] [-T time] [-c] [-R] [-V] [-v]
 +.fi
  
 -.Sh DESCRIPTION
@@ -1423,6 +2670,10 @@
  which it connects to the destination address.
 -.It Fl a Ar ack_bytes
 +.TP
++\fB\-Q tos
++Uses the RDS connection between IP addresses with the specified tos value. By 
++default, the base (tos = 0) RDS connection is used.  Valid values are 0-255.
++.TP
 +\fB\-a ack_bytes
  This specifies the size of the ack messages, in bytes. There is a minimum size
  which depends on the format of the ack messages, which may change over time.
@@ -1439,7 +2690,7 @@
  RDSv3 is capable of transmitting part of a message via RDMA directly from
  application buffer to application buffer. This option enables RDMA support
  in rds-stress: request packets include parameters for an RDMA READ or WRITE
[email protected]@ -100,20 +103,25 @@
[email protected]@ -100,20 +107,25 @@
  operation, which the receiving process executes at the time the ACK packet
  is sent.
  See section "Message Sizes" below.
@@ -1470,7 +2721,7 @@
  This causes rds-stress to create child tasks which just consume CPU cycles.
  One task is created for each CPU in the system.  First each child observes the
  maximum rate at which it can consume cycles.  This means that this option
[email protected]@ -121,50 +129,67 @@
[email protected]@ -121,50 +133,67 @@
  use of the system by observing the lesser rate at which the children consume
  cycles.  This option is *not* shared between the active and passive instances.
  It must be specified on each rds-stress command line.
@@ -1537,7 +2788,7 @@
  WRITEs for all children.
 -.It tx us/c
 +.TP
-+mbi K/s
++mbo K/s
 +The total number of bytes that are being transmited via RDMA READs and
 +WRITEs for all children.
 +.TP
@@ -1557,7 +2808,7 @@
  This is the percentage of available CPU resources on this machine that are being
  consumed since rds-stress started running.  It will show -1.00 if -c is not
  given.  It is calculated based on the amount of CPU resources that CPU soaking
[email protected]@ -171,4 +196,3 @@
[email protected]@ -171,4 +200,3 @@
  tasks are able to consume.  This lets it measure CPU use by the system, say in
  interrupt handlers, that task-based CPU accounting does not include.
  For this to work rds-stress must be started with -c on an idle system.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/open-fabrics/rds-tools/rds.h	Tue Jul 01 14:49:32 2014 -0700
@@ -0,0 +1,437 @@
+/*
+ * This file contains definitions used in OFED defined user/kernel
+ * interfaces. These are imported from the OFED header <linux/rds.h>. Oracle
+ * elects to have and use the contents of <linux/rds.h> under and governed
+ * by the OpenIB.org BSD license (see below for full license text). However,
+ * the following notice accompanied the original version of this file:
+ */
+/*
+ * Copyright (c) 2008, 2014, Oracle and/or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+/*
+ * Copyright (c) 2011, 2014, Oracle and/or its affiliates. All rights reserved.
+ */
+#ifndef _SYS_RDS_H
+#define	_SYS_RDS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if !(defined(__SVR4) && defined(__sun))
+#include <linux/types.h>
+#else
+#include <sys/types.h>
+#include <sys/socket.h>
+#endif
+
+/*
+ * These sparse annotated types shouldn't be in any user
+ * visible header file. We should clean this up rather
+ * than kludging around them.
+ */
+#if !(defined(__SVR4) && defined(__sun))
+#ifndef __KERNEL__
+#define	__be16	u_int16_t
+#define	__be32	u_int32_t
+#define	__be64	u_int64_t
+#endif
+#else
+#define	u_int8_t	uint8_t
+#define	u_int16_t	uint16_t
+#define	u_int32_t	uint32_t
+#define	u_int64_t	uint64_t
+#endif
+
+#define	RDS_IB_ABI_VERSION		0x301
+
+#define	AF_RDS	AF_INET_OFFLOAD
+#define	PF_RDS	AF_INET_OFFLOAD
+#define	SOL_RDS	272
+
+/*
+ * setsockopt/getsockopt for SOL_RDS
+ */
+#define	RDS_CANCEL_SENT_TO	1
+#define	RDS_GET_MR			2
+#define	RDS_FREE_MR			3
+/* deprecated: RDS_BARRIER 4 */
+#define	RDS_RECVERR			5
+#define	RDS_CONG_MONITOR		6
+#define	RDS_GET_MR_FOR_DEST		7
+#define	RDS_CONN_RESET			8
+
+/* New ioctls for qos */
+#define	SIOCRDSSETTOS	11000
+#define	SIOCRDSGETTOS	11001
+
+typedef		u_int8_t	rds_tos_t;
+
+/*
+ * Control message types for SOL_RDS.
+ *
+ * CMSG_RDMA_ARGS (sendmsg)
+ *	Request a RDMA transfer to/from the specified
+ *	memory ranges.
+ *	The cmsg_data is a struct rds_rdma_args.
+ * RDS_CMSG_RDMA_DEST (recvmsg, sendmsg)
+ *	Kernel informs application about intended
+ *	source/destination of a RDMA transfer
+ * RDS_CMSG_RDMA_MAP (sendmsg)
+ *	Application asks kernel to map the given
+ *	memory range into a IB MR, and send the
+ *	R_Key along in an RDS extension header.
+ *	The cmsg_data is a struct rds_get_mr_args,
+ *	the same as for the GET_MR setsockopt.
+ * RDS_CMSG_RDMA_STATUS (recvmsg)
+ *	Returns the status of a completed RDMA operation.
+ */
+#define	RDS_CMSG_RDMA_ARGS		1
+#define	RDS_CMSG_RDMA_DEST		2
+#define	RDS_CMSG_RDMA_MAP		3
+#define	RDS_CMSG_RDMA_STATUS		4
+#define	RDS_CMSG_CONG_UPDATE		5
+#define	RDS_CMSG_ATOMIC_FADD		6
+#define	RDS_CMSG_ATOMIC_CSWP		7
+#define	RDS_CMSG_MASKED_ATOMIC_FADD	8
+#define	RDS_CMSG_MASKED_ATOMIC_CSWP	9
+#define	RDS_CMSG_ASYNC_SEND		10
+
+
+/* private ioctl cmds */
+#define	RDS_INFO_FIRST			10000
+#define	RDS_INFO_COUNTERS		10000
+#define	RDS_INFO_CONNECTIONS		10001
+/* 10002 aka RDS_INFO_FLOWS is deprecated */
+#define	RDS_INFO_SEND_MESSAGES		10003
+#define	RDS_INFO_RETRANS_MESSAGES	10004
+#define	RDS_INFO_RECV_MESSAGES		10005
+#define	RDS_INFO_SOCKETS		10006
+#define	RDS_INFO_TCP_SOCKETS		10007
+#define	RDS_INFO_IB_CONNECTIONS		10008
+#define	RDS_INFO_CONNECTION_STATS	10009
+#define	RDS_INFO_IWARP_CONNECTIONS	10010
+#define	RDS_INFO_LAST			10010
+
+#if defined(__SVR4) && defined(__sun)
+struct rds_info_arg {
+	uint64_t	lenp;
+	uint64_t	datap;
+};
+#endif
+
+#ifndef __lock_lint
+#pragma pack(1)
+struct rds_info_counter {
+	u_int8_t	name[32];
+	u_int64_t	value;
+} __attribute__((packed));
+#pragma pack()
+#else
+struct rds_info_counter {
+	u_int8_t	name[32];
+	u_int64_t	value;
+};
+#endif
+
+#define	RDS_INFO_CONNECTION_FLAG_SENDING	0x01
+#define	RDS_INFO_CONNECTION_FLAG_CONNECTING	0x02
+#define	RDS_INFO_CONNECTION_FLAG_CONNECTED	0x04
+#define	RDS_INFO_CONNECTION_FLAG_ERROR		0x08
+
+#define	TRANSNAMSIZ	16
+
+#ifndef __lock_lint
+#pragma pack(1)
+struct rds_info_connection {
+	u_int64_t	next_tx_seq;
+	u_int64_t	next_rx_seq;
+	u_int32_t	laddr;
+	u_int32_t	faddr;
+	u_int8_t	transport[TRANSNAMSIZ];		/* null term ascii */
+	u_int8_t	flags;
+	u_int8_t	tos;
+} __attribute__((packed));
+#pragma pack()
+#else
+struct rds_info_connection {
+	u_int64_t	next_tx_seq;
+	u_int64_t	next_rx_seq;
+	u_int32_t	laddr;
+	u_int32_t	faddr;
+	u_int8_t	transport[TRANSNAMSIZ];		/* null term ascii */
+	u_int8_t	flags;
+	u_int8_t	tos;
+};
+#endif
+
+#ifndef __lock_lint
+#pragma pack(1)
+struct rds_info_flow {
+	u_int32_t	laddr;
+	u_int32_t	faddr;
+	u_int32_t	bytes;
+	u_int16_t	lport;
+	u_int16_t	fport;
+} __attribute__((packed));
+#pragma pack()
+#else
+struct rds_info_flow {
+	u_int32_t	laddr;
+	u_int32_t	faddr;
+	u_int32_t	bytes;
+	u_int16_t	lport;
+	u_int16_t	fport;
+};
+#endif
+
+#define	RDS_INFO_MESSAGE_FLAG_ACK		0x01
+#define	RDS_INFO_MESSAGE_FLAG_FAST_ACK		0x02
+
+#ifndef __lock_lint
+#pragma pack(1)
+struct rds_info_message {
+	u_int64_t	seq;
+	u_int32_t	len;
+	u_int32_t	laddr;
+	u_int32_t	faddr;
+	u_int16_t	lport;
+	u_int16_t	fport;
+	u_int8_t	flags;
+	u_int8_t	tos;
+} __attribute__((packed));
+#pragma pack()
+#else
+struct rds_info_message {
+	u_int64_t	seq;
+	u_int32_t	len;
+	u_int32_t	laddr;
+	u_int32_t	faddr;
+	u_int16_t	lport;
+	u_int16_t	fport;
+	u_int8_t	flags;
+	u_int8_t	tos;
+};
+#endif
+
+#ifndef __lock_lint
+#pragma pack(1)
+struct rds_info_socket {
+	u_int32_t	sndbuf;
+	u_int32_t	bound_addr;
+	u_int32_t	connected_addr;
+	u_int16_t	bound_port;
+	u_int16_t	connected_port;
+	u_int32_t	rcvbuf;
+	u_int64_t	inum;
+} __attribute__((packed));
+#pragma pack()
+#else
+struct rds_info_socket {
+	u_int32_t	sndbuf;
+	u_int32_t	bound_addr;
+	u_int32_t	connected_addr;
+	u_int16_t	bound_port;
+	u_int16_t	connected_port;
+	u_int32_t	rcvbuf;
+	u_int64_t	inum;
+};
+#endif
+
+#ifndef __lock_lint
+#pragma pack(1)
+struct rds_info_tcp_socket {
+	u_int32_t	local_addr;
+	u_int16_t	local_port;
+	u_int32_t	peer_addr;
+	u_int16_t	peer_port;
+	u_int64_t	hdr_rem;
+	u_int64_t	data_rem;
+	u_int32_t	last_sent_nxt;
+	u_int32_t	last_expected_una;
+	u_int32_t	last_seen_una;
+} __attribute__((packed));
+#pragma pack()
+#else
+struct rds_info_tcp_socket {
+	u_int32_t	local_addr;
+	u_int16_t	local_port;
+	u_int32_t	peer_addr;
+	u_int16_t	peer_port;
+	u_int64_t	hdr_rem;
+	u_int64_t	data_rem;
+	u_int32_t	last_sent_nxt;
+	u_int32_t	last_expected_una;
+	u_int32_t	last_seen_una;
+} __attribute__((packed));
+#endif
+
+#define	RDS_IB_GID_LEN	16
+struct rds_info_rdma_connection {
+	u_int32_t	src_addr;
+	u_int32_t	dst_addr;
+	uint8_t		src_gid[RDS_IB_GID_LEN];
+	uint8_t		dst_gid[RDS_IB_GID_LEN];
+
+	uint32_t	max_send_wr;
+	uint32_t	max_recv_wr;
+	uint32_t	max_send_sge;
+	uint32_t	rdma_mr_max;
+	uint32_t	rdma_mr_size;
+	uint8_t		tos;
+	uint8_t		sl;
+};
+
+/*
+ * Congestion monitoring.
+ * Congestion control in RDS happens at the host connection
+ * level by exchanging a bitmap marking congested ports.
+ * By default, a process sleeping in poll() is always woken
+ * up when the congestion map is updated.
+ * With explicit monitoring, an application can have more
+ * fine-grained control.
+ * The application installs a 64bit mask value in the socket,
+ * where each bit corresponds to a group of ports.
+ * When a congestion update arrives, RDS checks the set of
+ * ports that are now uncongested against the list bit mask
+ * installed in the socket, and if they overlap, we queue a
+ * cong_notification on the socket.
+ *
+ * To install the congestion monitor bitmask, use RDS_CONG_MONITOR
+ * with the 64bit mask.
+ * Congestion updates are received via RDS_CMSG_CONG_UPDATE
+ * control messages.
+ *
+ * The correspondence between bits and ports is
+ *	1 << (portnum % 64)
+ */
+#define	RDS_CONG_MONITOR_SIZE	64
+#define	RDS_CONG_MONITOR_BIT(port)  \
+	(((unsigned int) port) % RDS_CONG_MONITOR_SIZE)
+#define	RDS_CONG_MONITOR_MASK(port) (1ULL << RDS_CONG_MONITOR_BIT(port))
+
+/*
+ * RDMA related types
+ */
+
+/*
+ * This encapsulates a remote memory location.
+ * In the current implementation, it contains the R_Key
+ * of the remote memory region, and the offset into it
+ * (so that the application does not have to worry about
+ * alignment).
+ */
+typedef u_int64_t	rds_rdma_cookie_t;
+
+struct rds_iovec {
+	u_int64_t	addr;
+	u_int64_t	bytes;
+};
+
+struct rds_get_mr_args {
+	struct rds_iovec vec;
+	u_int64_t	cookie_addr;
+	uint64_t	flags;
+};
+
+struct rds_get_mr_for_dest_args {
+	struct sockaddr_storage dest_addr;
+	struct rds_iovec	vec;
+	u_int64_t		cookie_addr;
+	uint64_t		flags;
+};
+
+
+struct rds_free_mr_args {
+	rds_rdma_cookie_t cookie;
+	u_int64_t	flags;
+};
+
+struct rds_rdma_args {
+	rds_rdma_cookie_t cookie;
+	struct rds_iovec remote_vec;
+	u_int64_t	local_vec_addr;
+	u_int64_t	nr_local;
+	u_int64_t	flags;
+	u_int64_t	user_token;
+};
+
+struct rds_rdma_notify {
+	u_int64_t	user_token;
+	int32_t		status;
+};
+
+struct rds_reset {
+	u_int8_t	tos;
+	struct in_addr	src;
+	struct in_addr	dst;
+};
+
+struct rds_asend_args {
+	u_int64_t	user_token;
+	u_int64_t	flags;
+};
+
+struct rds_rdma_send_notify {
+	u_int64_t	user_token;
+	int32_t		status;
+};
+
+#define	RDS_RDMA_SUCCESS		0
+#define	RDS_RDMA_REMOTE_ERROR		1
+#define	RDS_RDMA_SEND_CANCELED		2
+#define	RDS_RDMA_SEND_DROPPED		3
+#define	RDS_RDMA_SEND_OTHER_ERROR	4
+
+/* Retain these to build pre-NRM rds-tools */
+#define	RDS_RDMA_CANCELED		RDS_RDMA_SEND_CANCELED
+#define	RDS_RDMA_DROPPED		RDS_RDMA_SEND_DROPPED
+#define	RDS_RDMA_OTHER_ERROR		RDS_RDMA_SEND_OTHER_ERROR
+
+/*
+ * Common set of flags for all RDMA related structs
+ */
+#define	RDS_RDMA_READWRITE	0x0001
+#define	RDS_RDMA_FENCE		0x0002	/* use FENCE for immediate send */
+#define	RDS_RDMA_INVALIDATE	0x0004	/* invalidate R_Key after freeing MR */
+#define	RDS_RDMA_USE_ONCE	0x0008	/* free MR after use */
+#define	RDS_RDMA_DONTWAIT	0x0010	/* Don't wait in SET_BARRIER */
+#define	RDS_RDMA_NOTIFY_ME	0x0020	/* Notify when operation completes */
+#define	RDS_RDMA_SILENT		0x0040	/* Do not interrupt remote */
+#define	RDS_RDMA_REMOTE_COMPLETE 0x0080	/* Notify when data is available */
+#define	RDS_SEND_NOTIFY_ME	0x0100	/* Notify when operation completes */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_RDS_H */