components/open-fabrics/rds-tools/patches/base.patch
changeset 1455 74681f26bd4e
parent 715 eed3ed08f692
child 1513 594764ea1267
equal deleted inserted replaced
1454:eb73f5852132 1455:74681f26bd4e
    13  #include <syscall.h>
    13  #include <syscall.h>
    14 +#endif
    14 +#endif
    15  #include <sys/stat.h>
    15  #include <sys/stat.h>
    16  #include <sys/poll.h>
    16  #include <sys/poll.h>
    17  #include <ctype.h>
    17  #include <ctype.h>
    18 @@ -22,10 +28,16 @@
    18 @@ -22,12 +28,51 @@
    19  #include <fcntl.h>
    19  #include <fcntl.h>
    20  #include <sched.h>
    20  #include <sched.h>
    21  #include <getopt.h>
    21  #include <getopt.h>
       
    22 +#include <sys/ioctl.h>
    22 +#if !(defined(__SVR4) && defined(__sun))
    23 +#if !(defined(__SVR4) && defined(__sun))
    23  #include <byteswap.h>
    24  #include <byteswap.h>
    24  #include "rds.h"
    25  #include "rds.h"
    25 -
    26 -
    26 +#else
    27 +#else
    30 +#if defined(__SVR4) && defined(__sun)
    31 +#if defined(__SVR4) && defined(__sun)
    31 +#include <infiniband/ofa_solaris.h>
    32 +#include <infiniband/ofa_solaris.h>
    32 +#endif
    33 +#endif
    33  
    34  
    34  /*
    35  /*
       
    36 + * Define these here until these are defined in rds.h
       
    37 + * (rds_reset, rds_asend_args and rds_rdma_send_notify)
       
    38 + */
       
    39 +#define	SIOCRDSSETTOS		11000
       
    40 +#define	SIOCRDSGETTOS		11001
       
    41 +#define	RDS_SEND_NOTIFY_ME	0x0100
       
    42 +#define	RDS_CMSG_ASYNC_SEND	10
       
    43 +#define	RDS_CONN_RESET		8
       
    44 +
       
    45 +struct rds_reset {
       
    46 +	u_int8_t	tos;
       
    47 +#if defined(__SVR4) && defined(__sun)
       
    48 +	u_int32_t	src;
       
    49 +	u_int32_t	dst;
       
    50 +#else
       
    51 +	struct in_addr	src;
       
    52 +	struct in_addr	dst;
       
    53 +#endif
       
    54 +};
       
    55 +
       
    56 +struct rds_asend_args {
       
    57 +	u_int64_t	user_token;
       
    58 +	u_int64_t	flags;
       
    59 +};
       
    60 +
       
    61 +struct rds_rdma_send_notify {
       
    62 +	u_int64_t	user_token;
       
    63 +	int32_t		status;
       
    64 +};
       
    65 +
       
    66 +
       
    67 +/*
    35   *
    68   *
    36 @@ -102,6 +114,10 @@
    69   * TODO
       
    70   *  - checksum the data some day.
       
    71 @@ -74,11 +119,38 @@
       
    72  	uint32_t        rdma_vector;
       
    73  	uint32_t	rdma_alignment;
       
    74  	uint32_t	connect_retries;
       
    75 +	uint8_t         reset;
       
    76 +	uint8_t         tos;
       
    77 +	uint8_t         async;
       
    78  } __attribute__((packed));
       
    79  
       
    80 +#define MAX_BUCKETS 16
       
    81 +
       
    82  static struct options	opt;
       
    83  static int		control_fd;
       
    84 +static uint64_t         rtt_threshold;
       
    85 +static int              show_histogram;
       
    86  
       
    87 +static int get_bucket(uint64_t rtt_time)
       
    88 +{
       
    89 +  int i;
       
    90 +  uint64_t l_rtt_time = rtt_time;
       
    91 +
       
    92 +  if (!l_rtt_time)
       
    93 +    i = 0;
       
    94 +  else
       
    95 +  {
       
    96 +    i = -1;
       
    97 +    while (l_rtt_time)
       
    98 +    {
       
    99 +      i++;
       
   100 +      l_rtt_time = (l_rtt_time >> 1);
       
   101 +    }
       
   102 +  }
       
   103 +
       
   104 +  return i;
       
   105 +}
       
   106 +
       
   107  struct counter {
       
   108  	uint64_t	nr;
       
   109  	uint64_t	sum;
       
   110 @@ -102,6 +174,10 @@
    37  
   111  
    38  #define NR_STATS S__LAST
   112  #define NR_STATS S__LAST
    39  
   113  
    40 +#if defined(__SVR4) && defined(__sun)
   114 +#if defined(__SVR4) && defined(__sun)
    41 +int sol_ioctl(int, int, struct rds_info_arg *, socklen_t *, int *);
   115 +int sol_ioctl(int, int, struct rds_info_arg *, socklen_t *, int *);
    42 +#endif
   116 +#endif
    43 +
   117 +
    44  /*
   118  /*
    45   * Parents share a mapped array of these with their children.  Each child
   119   * Parents share a mapped array of these with their children.  Each child
    46   * gets one.  It's used to communicate between the child and the parent
   120   * gets one.  It's used to communicate between the child and the parent
    47 @@ -110,6 +126,7 @@
   121 @@ -110,9 +186,11 @@
    48  struct child_control {
   122  struct child_control {
    49  	pid_t pid;
   123  	pid_t pid;
    50  	int ready;
   124  	int ready;
    51 +	int stopping;
   125 +	int stopping;
    52  	struct timeval start;
   126  	struct timeval start;
    53  	struct counter cur[NR_STATS];
   127  	struct counter cur[NR_STATS];
    54  	struct counter last[NR_STATS];
   128  	struct counter last[NR_STATS];
    55 @@ -254,7 +271,20 @@
   129 +        uint64_t       latency_histogram[MAX_BUCKETS];
       
   130  } __attribute__((aligned (256))); /* arbitrary */
       
   131  
       
   132  struct soak_control {
       
   133 @@ -132,6 +210,7 @@
       
   134   */
       
   135  #define OP_REQ		1
       
   136  #define OP_ACK		2
       
   137 +#define OP_DUMP		3
       
   138  
       
   139  #define RDMA_OP_READ	1
       
   140  #define RDMA_OP_WRITE	2
       
   141 @@ -143,12 +222,12 @@
       
   142   */
       
   143  struct header {
       
   144  	uint32_t	seq;
       
   145 +	uint8_t         op;
       
   146  	uint32_t	from_addr;
       
   147  	uint32_t	to_addr;
       
   148  	uint16_t	from_port;
       
   149  	uint16_t	to_port;
       
   150  	uint16_t	index;
       
   151 -	uint8_t		op;
       
   152  
       
   153  	/* RDMA related.
       
   154  	 * rdma_op must be the first field, because we
       
   155 @@ -163,11 +242,18 @@
       
   156  	uint32_t        rdma_vector;
       
   157  
       
   158  	uint8_t		data[0];
       
   159 +	uint8_t        retry;
       
   160 +	uint8_t        rdma_remote_err;
       
   161 +	uint8_t         pending;
       
   162  } __attribute__((packed));
       
   163  
       
   164  #define MIN_MSG_BYTES		(sizeof(struct header))
       
   165  #define BASIC_HEADER_SIZE	(size_t)(&((struct header *) 0)->rdma_op)
       
   166  
       
   167 +#define print_outlier(...) do {         \
       
   168 +        fprintf(stderr, __VA_ARGS__);   \
       
   169 +} while (0)
       
   170 +
       
   171  #define die(fmt...) do {		\
       
   172  	fprintf(stderr, fmt);		\
       
   173  	exit(1);			\
       
   174 @@ -254,7 +340,20 @@
    56  
   175  
    57  	die("invalid host name or dotted quad '%s'\n", ptr);
   176  	die("invalid host name or dotted quad '%s'\n", ptr);
    58  }
   177  }
    59 +#if defined(__SVR4) && defined(__sun)
   178 +#if defined(__SVR4) && defined(__sun)
    60 +static lgrp_id_t lgrp_id = -1;
   179 +static lgrp_id_t lgrp_id = -1;
    71 +#endif
   190 +#endif
    72 +
   191 +
    73  static void usage(void)
   192  static void usage(void)
    74  {
   193  {
    75          fprintf(stderr, "rds-stress version %s\n", RDS_VERSION);
   194          fprintf(stderr, "rds-stress version %s\n", RDS_VERSION);
    76 @@ -281,6 +311,9 @@
   195 @@ -273,6 +372,7 @@
       
   196  	" -d [depth, 1]     request pipeline depth, nr outstanding\n"
       
   197  	" -t [nr, 1]        number of child tasks\n"
       
   198  	" -T [seconds, 0]   runtime of test, 0 means infinite\n"
       
   199 +	" -Q [tos, 0]       Type of Service\n"
       
   200  	" -D [bytes]        RDMA: size\n"
       
   201  	" -I [iovecs, 1]    RDMA: number of user buffers to target (max 512)\n"
       
   202          " -M [nr, 0]        RDMA: mode (0=readwrite,1=readonly,2=writeonly)\n"
       
   203 @@ -281,6 +381,9 @@
    77  	" -c                measure cpu use with per-cpu soak processes\n"
   204  	" -c                measure cpu use with per-cpu soak processes\n"
    78  	" -V                trace execution\n"
   205  	" -V                trace execution\n"
    79  	" -z                print a summary at end of test only\n"
   206  	" -z                print a summary at end of test only\n"
    80 +#if defined(__SVR4) && defined(__sun)
   207 +#if defined(__SVR4) && defined(__sun)
    81 +	" -g [lgrpid]       bind the process to the specified lgrp\n"
   208 +	" -g [lgrpid]       bind the process to the specified lgrp\n"
    82 +#endif
   209 +#endif
    83  	"\n"
   210  	"\n"
    84  	"Example:\n"
   211  	"Example:\n"
    85  	"  recv$ rds-stress\n"
   212  	"  recv$ rds-stress\n"
    86 @@ -310,7 +343,7 @@
   213 @@ -310,7 +413,7 @@
    87  static void check_parent(pid_t pid)
   214  static void check_parent(pid_t pid)
    88  {
   215  {
    89  	if (pid != getppid())
   216  	if (pid != getppid())
    90 -		die("parent %u exited\n", pid);
   217 -		die("parent %u exited\n", pid);
    91 +		die("parent %u exited\n", (int)pid);
   218 +		die("parent %u exited\n", (int)pid);
    92  }
   219  }
    93  
   220  
    94  /*
   221  /*
    95 @@ -334,6 +367,7 @@
   222 @@ -334,6 +437,7 @@
    96  		msg_pattern[i] = k;
   223  		msg_pattern[i] = k;
    97  }
   224  }
    98  
   225  
    99 +#if !(defined(__SVR4) && defined(__sun))
   226 +#if !(defined(__SVR4) && defined(__sun))
   100  #if __BYTE_ORDER == __LITTLE_ENDIAN
   227  #if __BYTE_ORDER == __LITTLE_ENDIAN
   101  #define htonll(x)	bswap_64(x)
   228  #define htonll(x)	bswap_64(x)
   102  #define ntohll(x)	bswap_64(x)
   229  #define ntohll(x)	bswap_64(x)
   103 @@ -341,6 +375,7 @@
   230 @@ -341,6 +445,7 @@
   104  #define htonll(x)	(x)
   231  #define htonll(x)	(x)
   105  #define ntohll(x)	(x)
   232  #define ntohll(x)	(x)
   106  #endif
   233  #endif
   107 +#endif /* Not sun */
   234 +#endif /* Not sun */
   108  
   235  
   109  static void encode_hdr(struct header *dst, const struct header *hdr)
   236  static void encode_hdr(struct header *dst, const struct header *hdr)
   110  {
   237  {
   111 @@ -584,7 +619,11 @@
   238 @@ -361,6 +466,7 @@
       
   239  	dst->rdma_key = htonll(hdr->rdma_key);
       
   240  	dst->rdma_size = htonl(hdr->rdma_size);
       
   241  	dst->rdma_vector = htonl(hdr->rdma_vector);
       
   242 +	dst->retry = hdr->retry;
       
   243  }
       
   244  
       
   245  static void decode_hdr(struct header *dst, const struct header *hdr)
       
   246 @@ -382,6 +488,7 @@
       
   247  	dst->rdma_key = ntohll(hdr->rdma_key);
       
   248  	dst->rdma_size = ntohl(hdr->rdma_size);
       
   249  	dst->rdma_vector = ntohl(hdr->rdma_vector);
       
   250 +	dst->retry = hdr->retry;
       
   251  }
       
   252  
       
   253  static void fill_hdr(void *message, uint32_t bytes, struct header *hdr)
       
   254 @@ -412,11 +519,19 @@
       
   255   * Compare incoming message header with expected header. All header fields
       
   256   * are in host byte order except for address and port fields.
       
   257   */
       
   258 -static int check_hdr(void *message, uint32_t bytes, const struct header *hdr)
       
   259 +static int check_hdr(void *message, uint32_t bytes, struct header *hdr, struct options *opts)
       
   260  {
       
   261  	struct header msghdr;
       
   262 +	uint32_t	inc_seq;
       
   263 +	uint32_t	my_seq;
       
   264  
       
   265  	decode_hdr(&msghdr, message);
       
   266 +	inc_seq = msghdr.seq;
       
   267 +	my_seq = hdr->seq;
       
   268 +
       
   269 +	if (msghdr.retry && (inc_seq < my_seq))
       
   270 +		return -1;
       
   271 +
       
   272  	if (memcmp(&msghdr, hdr, BASIC_HEADER_SIZE)) {
       
   273  #define bleh(var, disp)					\
       
   274  		disp(hdr->var),				\
       
   275 @@ -428,7 +543,7 @@
       
   276  		 * with stdout() and we don't get things stomping on each
       
   277  		 * other
       
   278  		 */
       
   279 -		printf( "An incoming message had a header which\n"
       
   280 +		printf( "An incoming message had a %s header which\n"
       
   281  			"didn't contain the fields we expected:\n"
       
   282  			"    member        expected eq             got\n"
       
   283  			"       seq %15u %s %15u\n"
       
   284 @@ -438,6 +553,7 @@
       
   285  			"   to_port %15u %s %15u\n"
       
   286  			"     index %15u %s %15u\n"
       
   287  			"        op %15u %s %15u\n",
       
   288 +			(msghdr.retry) ? "RETRY" : "",
       
   289  			bleh(seq, /**/),
       
   290  			bleh(from_addr, inet_ntoa_32),
       
   291  			bleh(from_port, ntohs),
       
   292 @@ -569,6 +685,9 @@
       
   293  
       
   294  	fcntl(fd, F_SETFL, O_NONBLOCK);
       
   295  
       
   296 +	if (opts->tos && ioctl(fd, SIOCRDSSETTOS, &opts->tos)) 
       
   297 +		die_errno("ERROR: failed to set TOS\n");
       
   298 +
       
   299  	return fd;
       
   300  }
       
   301  
       
   302 @@ -584,7 +703,11 @@
   112  	if (opts->receive_addr == 0)
   303  	if (opts->receive_addr == 0)
   113  		return 1;
   304  		return 1;
   114  
   305  
   115 +#if defined(__SVR4) && defined(__sun)
   306 +#if defined(__SVR4) && defined(__sun)
   116 +	sin.sin_family = AF_INET_OFFLOAD;
   307 +	sin.sin_family = AF_INET_OFFLOAD;
   118  	sin.sin_family = AF_INET;
   309  	sin.sin_family = AF_INET;
   119 +#endif
   310 +#endif
   120  	sin.sin_port = htons(opts->starting_port);
   311  	sin.sin_port = htons(opts->starting_port);
   121  	sin.sin_addr.s_addr = htonl(opts->receive_addr);
   312  	sin.sin_addr.s_addr = htonl(opts->receive_addr);
   122  
   313  
   123 @@ -677,7 +716,11 @@
   314 @@ -639,7 +762,7 @@
       
   315  	mr_args.flags = RDS_FREE_MR_ARGS_INVALIDATE;
       
   316  #endif
       
   317  	if (setsockopt(fd, sol, RDS_FREE_MR, &mr_args, sizeof(mr_args)))
       
   318 -		die_errno("setsockopt(RDS_FREE_MR) failed");
       
   319 +		return;
       
   320  	mrs_allocated--;
       
   321  }
       
   322  
       
   323 @@ -677,7 +800,11 @@
   124  	size = sizeof(struct rdma_key_o_meter)
   324  	size = sizeof(struct rdma_key_o_meter)
   125  			+ 2 * nr_tasks * sizeof(*kt)
   325  			+ 2 * nr_tasks * sizeof(*kt)
   126  			+ 2 * RDMA_MAX_TRACKED_KEYS * sizeof(*ks);
   326  			+ 2 * RDMA_MAX_TRACKED_KEYS * sizeof(*ks);
   127 +#if defined(__SVR4) && defined(__sun)
   327 +#if defined(__SVR4) && defined(__sun)
   128 +	base = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, -1, 0);
   328 +	base = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, -1, 0);
   130  	base = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, 0, 0);
   330  	base = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, 0, 0);
   131 +#endif
   331 +#endif
   132  	if (base == MAP_FAILED)
   332  	if (base == MAP_FAILED)
   133  		die_errno("alloc_rdma_buffers: mmap failed");
   333  		die_errno("alloc_rdma_buffers: mmap failed");
   134  
   334  
   135 @@ -828,7 +871,7 @@
   335 @@ -828,13 +955,20 @@
   136  	}
   336  	}
   137  
   337  
   138  	if (!failed)
   338  	if (!failed)
   139 -		trace("compare pass pattern %Lx addr %p\n",
   339 -		trace("compare pass pattern %Lx addr %p\n",
   140 +		trace("compare pass pattern 0x%Lx addr %p\n",
   340 +		trace("compare pass pattern 0x%Lx addr %p\n",
   141  			(unsigned long long) pattern, addr);
   341  			(unsigned long long) pattern, addr);
   142  }
   342  }
   143  
   343  
   144 @@ -865,7 +908,11 @@
   344 +struct retry_entry {
       
   345 +	uint32_t	retries;
       
   346 +	uint32_t	seq;
       
   347 +	int		status;
       
   348 +};
       
   349 +
       
   350  struct task {
       
   351  	unsigned int		nr;
       
   352  	unsigned int		pending;
       
   353 +	int			trace;
       
   354  	unsigned int		unacked;
       
   355  	struct sockaddr_in	src_addr;	/* same for all tasks */
       
   356  	struct sockaddr_in	dst_addr;
       
   357 @@ -846,7 +980,14 @@
       
   358  	uint16_t		recv_index;
       
   359  	struct timeval *	send_time;
       
   360  	struct header *		ack_header;
       
   361 +	struct header *         ack2_header;
       
   362 +	struct header *         req_header;
       
   363 +	uint64_t *		retry_token;
       
   364 +	uint32_t		retries;
       
   365 +	uint32_t            	last_retry_seq;
       
   366 +	uint32_t		retry_index;
       
   367  
       
   368 +
       
   369  	/* RDMA related stuff */
       
   370  	uint64_t **		local_buf;
       
   371  	uint64_t **		rdma_buf;
       
   372 @@ -865,7 +1006,11 @@
   145  	/* We use mmap here rather than malloc, because it is always
   373  	/* We use mmap here rather than malloc, because it is always
   146  	 * page aligned. */
   374  	 * page aligned. */
   147  	len = 2 * opts->nr_tasks * opts->req_depth * (opts->rdma_vector * opts->rdma_size) + sys_page_size;
   375  	len = 2 * opts->nr_tasks * opts->req_depth * (opts->rdma_vector * opts->rdma_size) + sys_page_size;
   148 +#if defined(__SVR4) && defined(__sun)
   376 +#if defined(__SVR4) && defined(__sun)
   149 +	base = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
   377 +	base = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
   151  	base = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
   379  	base = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
   152 +#endif
   380 +#endif
   153  	if (base == MAP_FAILED)
   381  	if (base == MAP_FAILED)
   154  		die_errno("alloc_rdma_buffers: mmap failed");
   382  		die_errno("alloc_rdma_buffers: mmap failed");
   155  	memset(base, 0x2f, len);
   383  	memset(base, 0x2f, len);
   156 @@ -915,17 +962,16 @@
   384 @@ -915,17 +1060,16 @@
   157  	if (RDMA_OP_READ == hdr->rdma_op) {
   385  	if (RDMA_OP_READ == hdr->rdma_op) {
   158  		if (opt.verify)
   386  		if (opt.verify)
   159  			rds_fill_buffer(rdma_addr, rdma_size, hdr->rdma_pattern);
   387  			rds_fill_buffer(rdma_addr, rdma_size, hdr->rdma_pattern);
   160 -		trace("Requesting RDMA read for pattern %Lx "
   388 -		trace("Requesting RDMA read for pattern %Lx "
   161 -				"local addr to rdma read %p\n",
   389 -				"local addr to rdma read %p\n",
   175 +		trace("Requesting RDMA write for pattern 0x%Lx",
   403 +		trace("Requesting RDMA write for pattern 0x%Lx",
   176 +				hdr->rdma_pattern);
   404 +				hdr->rdma_pattern);
   177  	}
   405  	}
   178  }
   406  }
   179  
   407  
   180 @@ -947,7 +993,7 @@
   408 @@ -947,7 +1091,7 @@
   181  		die("Unexpected RDMA op %u in request\n", in_hdr->rdma_op);
   409  		die("Unexpected RDMA op %u in request\n", in_hdr->rdma_op);
   182  
   410  
   183  
   411  
   184 -	trace("RDS received request to issue rdma %s len %lu rva %Lx key %Lx pattern %Lx\n",
   412 -	trace("RDS received request to issue rdma %s len %lu rva %Lx key %Lx pattern %Lx\n",
   185 +	trace("RDS received request to issue rdma %s len %lu rva 0x%Lx key 0x%Lx pattern 0x%Lx\n",
   413 +	trace("RDS received request to issue rdma %s len %lu rva 0x%Lx key 0x%Lx pattern 0x%Lx\n",
   186  		in_hdr->rdma_op == RDMA_OP_WRITE? "write to" : "read from",
   414  		in_hdr->rdma_op == RDMA_OP_WRITE? "write to" : "read from",
   187  		rdma_size,
   415  		rdma_size,
   188  		(unsigned long long) in_hdr->rdma_addr,
   416  		(unsigned long long) in_hdr->rdma_addr,
   189 @@ -1007,6 +1053,9 @@
   417 @@ -966,21 +1110,33 @@
       
   418  	hdr->rdma_vector = in_hdr->rdma_vector;
       
   419  }
       
   420  
       
   421 -static inline unsigned int rdma_user_token(struct task *t, unsigned int qindex)
       
   422 +static inline uint64_t rdma_user_token(struct task *t, unsigned int qindex,  unsigned int type, uint32_t seq)
       
   423  {
       
   424 -	return t->nr * opt.req_depth + qindex;
       
   425 +	uint64_t tmp = seq;
       
   426 +	return (tmp << 32) | ((t->nr * opt.req_depth + qindex) << 2 | type);
       
   427  }
       
   428  
       
   429 -static void rdma_mark_completed(struct task *tasks, unsigned int token, int status)
       
   430 +static void rdma_mark_completed(struct task *tasks, uint64_t token, int status, struct options *opts)
       
   431  {
       
   432  	struct task *t;
       
   433  	unsigned int i;
       
   434 +	struct header *hdr = NULL;
       
   435 +	uint32_t seq = token >> 32;
       
   436 +	unsigned int type = token & 0x03;
       
   437 +	unsigned int index = (token & 0xFFFFFFFF) >> 2;
       
   438  
       
   439 -	trace("RDS rdma completion for token %x\n", token);
       
   440 +	trace("RDS rdma completion for token 0x%lx\n", token);
       
   441  
       
   442 -	t = &tasks[token / opt.req_depth];
       
   443 -	i = token % opt.req_depth;
       
   444 +	t = &tasks[index / opt.req_depth];
       
   445 +	i = index % opt.req_depth;
       
   446  
       
   447 +	if (opts->async) {
       
   448 +		if (type == OP_REQ)
       
   449 +			hdr = &t->req_header[i];
       
   450 +		else
       
   451 +			hdr = &t->ack2_header[i];
       
   452 +	}
       
   453 +
       
   454  	if (status) {
       
   455  		const char *errmsg;
       
   456  
       
   457 @@ -987,10 +1143,10 @@
       
   458  		switch (status) {
       
   459  		case RDS_RDMA_REMOTE_ERROR:
       
   460  			errmsg = "remote error"; break;
       
   461 -		case RDS_RDMA_CANCELED:
       
   462 -			errmsg = "operation was cancelled"; break;
       
   463  		case RDS_RDMA_DROPPED:
       
   464  			errmsg = "operation was dropped"; break;
       
   465 +		case RDS_RDMA_CANCELED:
       
   466 +			errmsg = "operation was cancelled"; break;
       
   467  		case RDS_RDMA_OTHER_ERROR:
       
   468  			errmsg = "other error"; break;
       
   469  		default:
       
   470 @@ -997,10 +1153,38 @@
       
   471  			errmsg = "unknown error"; break;
       
   472  		}
       
   473  
       
   474 -		printf("%s:%u: RDMA op %u failed: %s\n",
       
   475 +		trace("%s:%u: %s failed: %s\n",
       
   476  				inet_ntoa(t->dst_addr.sin_addr),
       
   477  				ntohs(t->dst_addr.sin_port),
       
   478 -				i, errmsg);
       
   479 +				type ? "SEND" : "RDMA",
       
   480 +				errmsg);
       
   481 +
       
   482 +		if (hdr &&
       
   483 +			(status == RDS_RDMA_DROPPED ||
       
   484 +			 status == RDS_RDMA_REMOTE_ERROR)) {
       
   485 +
       
   486 +			if (hdr->seq == seq) {
       
   487 +				hdr->retry = 1;
       
   488 +				if (hdr->seq > t->last_retry_seq) {
       
   489 +					if (status == RDS_RDMA_REMOTE_ERROR)
       
   490 +						hdr->rdma_remote_err = 1;
       
   491 +					t->retry_token[t->retry_index] = token;
       
   492 +					t->retry_index = (t->retry_index + 1) %
       
   493 +						(2 * opts->req_depth);
       
   494 +					t->retries += 1;
       
   495 +					t->last_retry_seq = hdr->seq;
       
   496 +					if (t->retries > 2 * opts->req_depth)
       
   497 +						die("Exceeded MAX retry entries..\n");
       
   498 +				}
       
   499 +			} else
       
   500 +				die("SEQ Out-Of-Sync: %u/%u\n", hdr->seq, seq);
       
   501 +		} else if (hdr) {
       
   502 +			hdr->pending = 0;
       
   503 +			hdr->retry = 0;
       
   504 +		}
       
   505 +	} else if (hdr) {
       
   506 +		hdr->pending = 0;
       
   507 +		hdr->retry = 0;
       
   508  	}
       
   509  
       
   510  	t->rdma_inflight[i] = 0;
       
   511 @@ -1007,6 +1191,9 @@
   190  	t->drain_rdmas = 0;
   512  	t->drain_rdmas = 0;
   191  }
   513  }
   192  
   514  
   193 +#if defined(__SVR4) && defined(__sun)
   515 +#if defined(__SVR4) && defined(__sun)
   194 +#undef MSG_MAXIOVLEN
   516 +#undef MSG_MAXIOVLEN
   195 +#endif
   517 +#endif
   196  #define MSG_MAXIOVLEN 2
   518  #define MSG_MAXIOVLEN 2
   197  
   519  
   198  /*
   520  /*
   199 @@ -1560,7 +1609,12 @@
   521 @@ -1018,11 +1205,14 @@
       
   522  	static char ctlbuf[1024];
       
   523  	struct cmsghdr *cmsg;
       
   524  
       
   525 -	msg->msg_control = ctlbuf;
       
   526 -	msg->msg_controllen = CMSG_SPACE(size);
       
   527 -
       
   528 -	cmsg = CMSG_FIRSTHDR(msg);
       
   529 -	cmsg->cmsg_level = sol;
       
   530 +	if (!msg->msg_control) {
       
   531 +		msg->msg_control = ctlbuf;
       
   532 +		msg->msg_controllen = CMSG_SPACE(size);
       
   533 +		cmsg = CMSG_FIRSTHDR(msg);
       
   534 +	} else {
       
   535 +		cmsg = (struct cmsghdr *)((char *)msg->msg_control + msg->msg_controllen);
       
   536 +		msg->msg_controllen += CMSG_SPACE(size);
       
   537 +	}cmsg->cmsg_level = sol;
       
   538  	cmsg->cmsg_type = type;
       
   539  	cmsg->cmsg_len = CMSG_LEN(size);
       
   540  	memcpy(CMSG_DATA(cmsg), ptr, size);
       
   541 @@ -1034,7 +1224,7 @@
       
   542   * the ACK packet.
       
   543   */
       
   544  static void rdma_build_cmsg_xfer(struct msghdr *msg, const struct header *hdr,
       
   545 -		unsigned int user_token, void *local_buf)
       
   546 +		uint64_t user_token, void *local_buf)
       
   547  {
       
   548  
       
   549  #define RDS_MAX_IOV 512 /* FIX_ME - put this into rds.h or use socket max ?*/
       
   550 @@ -1048,7 +1238,7 @@
       
   551  	rdma_size = hdr->rdma_size;
       
   552  	rdma_vector = hdr->rdma_vector;
       
   553  
       
   554 -	trace("RDS issuing rdma for token %x key %Lx len %u local_buf %p vector %u\n",
       
   555 +	trace("RDS issuing rdma for token 0x%lx key 0x%llx len %d local_buf %p vector %d\n",
       
   556  			user_token,
       
   557  			(unsigned long long) hdr->rdma_key,
       
   558  			rdma_size, local_buf,
       
   559 @@ -1102,6 +1292,15 @@
       
   560  	rdma_put_cmsg(msg, RDS_CMSG_RDMA_ARGS, &args, sizeof(args));
       
   561  }
       
   562  
       
   563 +static void build_cmsg_async_send(struct msghdr *msg, uint64_t user_token)
       
   564 +{
       
   565 +	struct rds_asend_args  args;
       
   566 +
       
   567 +	args.flags |= RDS_SEND_NOTIFY_ME;
       
   568 +	args.user_token = user_token;
       
   569 +	rdma_put_cmsg(msg, RDS_CMSG_ASYNC_SEND, &args, sizeof(args));
       
   570 +}
       
   571 +
       
   572  static void rdma_build_cmsg_dest(struct msghdr *msg, rds_rdma_cookie_t rdma_dest)
       
   573  {
       
   574  	rdma_put_cmsg(msg, RDS_CMSG_RDMA_DEST, &rdma_dest, sizeof(rdma_dest));
       
   575 @@ -1174,19 +1373,17 @@
       
   576  	hdr->index = qindex;
       
   577  }
       
   578  
       
   579 -static int send_packet(int fd, struct task *t,
       
   580 -		struct header *hdr, unsigned int size)
       
   581 +static int send_msg(int fd, struct task *t, struct header *hdr,
       
   582 +		    unsigned int size, struct options *opts, 
       
   583 +		    struct child_control *ctl)
       
   584  {
       
   585 -	unsigned char buf[size], *rdma_flight_recorder = NULL;
       
   586 +	unsigned char buf[size];
       
   587 +	uint8_t *rdma_flight_recorder = NULL;
       
   588  	rds_rdma_cookie_t cookie = 0;
       
   589  	struct msghdr msg;
       
   590  	struct iovec iov;
       
   591  	ssize_t ret;
       
   592  
       
   593 -	/* Make sure we always have the current sequence number.
       
   594 -	 * When we send ACK packets, the seq that gets filled in is
       
   595 -	 * stale. */
       
   596 -	hdr->seq = t->send_seq;
       
   597  	fill_hdr(buf, size, hdr);
       
   598  
       
   599  	memset(&msg, 0, sizeof(msg));
       
   600 @@ -1198,27 +1395,10 @@
       
   601  	iov.iov_base = buf;
       
   602  	iov.iov_len = size;
       
   603  
       
   604 -	/* If this is a REQ packet in which we pass the MR to the
       
   605 -	 * peer, extract the RDMA cookie and pass it on in the control
       
   606 -	 * message for now. */
       
   607 -	if (hdr->op == OP_REQ && hdr->rdma_op != 0) {
       
   608 -		if (hdr->rdma_key != 0) {
       
   609 -			/* We used GET_MR to obtain a key */
       
   610 -			rdma_build_cmsg_dest(&msg, hdr->rdma_key);
       
   611 -			cookie = hdr->rdma_key;
       
   612 -			hdr->rdma_key = 0;
       
   613 -		} else {
       
   614 -			/* Use the RDMA_MAP cmsg to have sendmsg do the
       
   615 -			 * mapping on the fly. */
       
   616 -			rdma_build_cmsg_map(&msg, hdr->rdma_addr,
       
   617 -					    hdr->rdma_size * hdr->rdma_vector,
       
   618 -					    &cookie);
       
   619 -		}
       
   620 -	}
       
   621  
       
   622  	/* If this is an ACK packet with RDMA, build the cmsg
       
   623 -	 * header that goes with it. */
       
   624 -	if (hdr->op == OP_ACK && hdr->rdma_op != 0) {
       
   625 +	   * header that goes with it. */
       
   626 +	if (hdr->op == OP_ACK && hdr->rdma_op != 0 && !hdr->rdma_remote_err) {
       
   627  		unsigned int qindex = hdr->index;
       
   628  
       
   629  		if (t->rdma_inflight[qindex] != 0) {
       
   630 @@ -1230,16 +1410,35 @@
       
   631  			 *
       
   632  			 * We return one of the more obscure error messages,
       
   633  			 * which we recognize and handle in the top loop. */
       
   634 -			trace("Drain RDMA 0x%x\n", rdma_user_token(t, qindex));
       
   635 +			trace("Drain RDMA 0x%lx\n", rdma_user_token(t, qindex, 0, hdr->seq));
       
   636  			errno = EBADSLT;
       
   637  			return -1;
       
   638  		}
       
   639  		rdma_build_cmsg_xfer(&msg, hdr,
       
   640 -				rdma_user_token(t, qindex),
       
   641 +				rdma_user_token(t, qindex, 0, hdr->seq),
       
   642  				t->local_buf[qindex]);
       
   643  		rdma_flight_recorder = &t->rdma_inflight[qindex];
       
   644 +	} else if (opts->async) {
       
   645 +		if (hdr->op == OP_REQ)
       
   646 +			build_cmsg_async_send(&msg,
       
   647 +				rdma_user_token(t, hdr->index, OP_REQ, hdr->seq));
       
   648 +		else
       
   649 +			build_cmsg_async_send(&msg,
       
   650 +				rdma_user_token(t, hdr->index, OP_ACK, hdr->seq));
       
   651  	}
       
   652  
       
   653 +	if (hdr->op == OP_REQ && hdr->rdma_op != 0) {
       
   654 +		if (hdr->rdma_key != 0) {
       
   655 +			rdma_build_cmsg_dest(&msg, hdr->rdma_key);
       
   656 +			cookie = hdr->rdma_key;
       
   657 +			hdr->rdma_key = 0;
       
   658 +		} else {
       
   659 +			rdma_build_cmsg_map(&msg, hdr->rdma_addr,
       
   660 +					hdr->rdma_size * hdr->rdma_vector,
       
   661 +					&cookie);
       
   662 +		}
       
   663 +	}
       
   664 +
       
   665  	ret = sendmsg(fd, &msg, 0);
       
   666  	if (ret < 0) {
       
   667  		if (errno != EAGAIN && errno != ENOBUFS)
       
   668 @@ -1256,10 +1455,41 @@
       
   669  		 * lower 32bit of the cookie */
       
   670  		rdma_key_o_meter_add(cookie);
       
   671  	}
       
   672 +
       
   673 +	hdr->pending = 1;
       
   674 +
       
   675 +	return ret;
       
   676 +}
       
   677 +
       
   678 +static int send_packet(int fd, struct task *t,
       
   679 +		struct header *hdr, unsigned int size,
       
   680 +		struct options *opts, struct child_control *ctl)
       
   681 +{
       
   682 +	ssize_t ret;
       
   683 +
       
   684 +	/* Make sure we always have the current sequence number.
       
   685 +	 * When we send ACK packets, the seq that gets filled in is
       
   686 +	 * stale. */
       
   687 +	hdr->seq = t->send_seq;
       
   688 +
       
   689 +	ret = send_msg(fd, t, hdr, size, opts, ctl);
       
   690 +	if (ret < 0) return ret;
       
   691 +
       
   692  	t->send_seq++;
       
   693  	return ret;
       
   694  }
       
   695  
       
   696 +static int resend_packet(int fd, struct task *t,
       
   697 +		struct header *hdr, unsigned int size,
       
   698 +		struct options *opts, struct child_control *ctl)
       
   699 +{
       
   700 +	ssize_t ret;
       
   701 +
       
   702 +	ret = send_msg(fd, t, hdr, size, opts, ctl);
       
   703 +
       
   704 +	return ret;
       
   705 +}
       
   706 +
       
   707  static int send_one(int fd, struct task *t,
       
   708  		struct options *opts,
       
   709  		struct child_control *ctl)
       
   710 @@ -1266,12 +1496,16 @@
       
   711  {
       
   712  	struct timeval start;
       
   713  	struct timeval stop;
       
   714 -	struct header hdr;
       
   715 +	struct header *hdr = &t->req_header[t->send_index]; 
       
   716  	int ret;
       
   717  
       
   718 -	build_header(t, &hdr, OP_REQ, t->send_index);
       
   719 +	if (opts->async && hdr->pending) {
       
   720 +		return -1;
       
   721 +	}
       
   722 +
       
   723 +	build_header(t, hdr, OP_REQ, t->send_index);
       
   724  	if (opts->rdma_size && t->send_seq > 10)
       
   725 -		rdma_build_req(fd, &hdr, t,
       
   726 +		rdma_build_req(fd, hdr, t,
       
   727  				opts->rdma_size,
       
   728  				opts->req_depth,
       
   729  				opts->rw_mode,
       
   730 @@ -1279,7 +1513,7 @@
       
   731  
       
   732  
       
   733  	gettimeofday(&start, NULL);
       
   734 -	ret = send_packet(fd, t, &hdr, opts->req_size);
       
   735 +	ret = send_packet(fd, t, hdr, opts->req_size, opts, ctl);
       
   736  	gettimeofday(&stop, NULL);
       
   737  
       
   738  	if (ret < 0)
       
   739 @@ -1302,10 +1536,15 @@
       
   740  		struct child_control *ctl)
       
   741  {
       
   742  	struct header *hdr = &t->ack_header[qindex];
       
   743 +	struct header *hdr2 = &t->ack2_header[qindex];
       
   744  	ssize_t ret;
       
   745  
       
   746 +	if (opts->async && hdr2->pending) {
       
   747 +		return -1;
       
   748 +	}
       
   749 +
       
   750  	/* send an ack in response to the req we just got */
       
   751 -	ret = send_packet(fd, t, hdr, opts->ack_size);
       
   752 +	ret = send_packet(fd, t, hdr, opts->ack_size, opts, ctl);
       
   753  	if (ret < 0)
       
   754  		return ret;
       
   755  	if (ret != opts->ack_size)
       
   756 @@ -1324,6 +1563,8 @@
       
   757  		break;
       
   758  	}
       
   759  
       
   760 +	memcpy(hdr2, hdr, sizeof(struct header));
       
   761 +
       
   762  	return ret;
       
   763  }
       
   764  
       
   765 @@ -1354,8 +1595,49 @@
       
   766  			struct child_control *ctl,
       
   767  			int can_send, int do_work)
       
   768  {
       
   769 +	struct header *hdr;
       
   770 +	unsigned int index;
       
   771 +	int req_size;
       
   772 +	int num_retries = t->retries;
       
   773 +	uint64_t token;
       
   774 +	unsigned int type;
       
   775 +	unsigned int index2;
       
   776 +	unsigned int i;
       
   777 +
       
   778 +	while (opts->async && num_retries > 0) {
       
   779 +		index = (t->retry_index - num_retries +
       
   780 +			(2 * opts->req_depth)) % (2 * opts->req_depth);
       
   781 +
       
   782 +		token = t->retry_token[index];
       
   783 +		type = token & 0x03;
       
   784 +		index2 = (token & 0xFFFFFFFF) >> 2;
       
   785 +		i = index2 % opts->req_depth;
       
   786 +
       
   787 +		if (type == OP_REQ)
       
   788 +			hdr = &t->req_header[i];
       
   789 +		else
       
   790 +			hdr = &t->ack2_header[i];
       
   791 +
       
   792 +		if (!hdr->retry)
       
   793 +			goto next;
       
   794 +
       
   795 +		if (hdr->op == OP_REQ)
       
   796 +			req_size = opts->req_size;
       
   797 +		else
       
   798 +			req_size = opts->ack_size;
       
   799 +
       
   800 +		if (resend_packet(fd, t, hdr, req_size, opts, ctl) < 0) {
       
   801 +			return -1;
       
   802 +		}
       
   803 +		hdr->retry = 0;
       
   804 +next:
       
   805 +		num_retries--;
       
   806 +	}
       
   807 +	t->last_retry_seq = t->retries = 0;
       
   808 +
       
   809  	if (ack_anything(fd, t, opts, ctl, can_send) < 0)
       
   810  		return -1;
       
   811 +
       
   812  	while (do_work && t->pending < opts->req_depth) {
       
   813  		if (!can_send)
       
   814  			goto eagain;
       
   815 @@ -1375,7 +1657,8 @@
       
   816  		rds_rdma_cookie_t *cookie,
       
   817  		struct sockaddr_in *sin,
       
   818  		struct timeval *tstamp,
       
   819 -		struct task *tasks)
       
   820 +		struct task *tasks,
       
   821 +		struct options *opts)
       
   822  {
       
   823  	struct cmsghdr *cmsg;
       
   824  	char cmsgbuf[256];
       
   825 @@ -1400,13 +1683,13 @@
       
   826  		return ret;
       
   827  	if (ret && ret < sizeof(struct header))
       
   828  		die("recvmsg() returned short data: %zd", ret);
       
   829 -	if (msg.msg_namelen < sizeof(struct sockaddr_in))
       
   830 +	if (ret && msg.msg_namelen < sizeof(struct sockaddr_in))
       
   831  		die("socklen = %d < sizeof(sin) (%zu)\n",
       
   832  		    msg.msg_namelen, sizeof(struct sockaddr_in));
       
   833  
       
   834  	/* See if the message comes with a RDMA destination */
       
   835  	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
       
   836 -		struct rds_rdma_notify notify;
       
   837 +		struct rds_rdma_send_notify notify;
       
   838  
       
   839  		if (cmsg->cmsg_level != sol)
       
   840  			continue;
       
   841 @@ -1436,7 +1719,7 @@
       
   842  			if (cmsg->cmsg_len < CMSG_LEN(sizeof(notify)))
       
   843  				die("RDS_CMSG_RDMA_DEST data too small");
       
   844  			memcpy(&notify, CMSG_DATA(cmsg), sizeof(notify));
       
   845 -			rdma_mark_completed(tasks, notify.user_token, notify.status);
       
   846 +			rdma_mark_completed(tasks, notify.user_token, notify.status, opts);
       
   847  			break;
       
   848  		}
       
   849  	}
       
   850 @@ -1445,7 +1728,8 @@
       
   851  
       
   852  static int recv_one(int fd, struct task *tasks,
       
   853  			struct options *opts,
       
   854 -		struct child_control *ctl)
       
   855 +		struct child_control *ctl,
       
   856 +		struct child_control *all_ctl)
       
   857  {
       
   858  	char buf[max(opts->req_size, opts->ack_size)];
       
   859  	rds_rdma_cookie_t rdma_dest = 0;
       
   860 @@ -1456,15 +1740,18 @@
       
   861  	uint16_t expect_index;
       
   862  	int task_index;
       
   863  	ssize_t ret;
       
   864 +	int	check_status;
       
   865  
       
   866 -	ret = recv_message(fd, buf, sizeof(buf), &rdma_dest, &sin, &tstamp, tasks);
       
   867 +
       
   868 +	ret = recv_message(fd, buf, sizeof(buf), &rdma_dest, &sin, &tstamp, tasks, opts);
       
   869  	if (ret < 0)
       
   870  		return ret;
       
   871  
       
   872  	/* If we received only RDMA completions or cong updates,
       
   873  	 * ret will be 0 */
       
   874 -	if (ret == 0)
       
   875 +	if (ret == 0) {
       
   876  		return 0;
       
   877 +	}
       
   878  
       
   879  	/* check the incoming sequence number */
       
   880  	task_index = ntohs(sin.sin_port) - opts->starting_port - 1;
       
   881 @@ -1508,16 +1795,32 @@
       
   882  	hdr.to_port = t->src_addr.sin_port;
       
   883  	hdr.index = expect_index;
       
   884  
       
   885 -	if (check_hdr(buf, ret, &hdr))
       
   886 -		die("header from %s:%u to id %u bogus\n",
       
   887 -		    inet_ntoa(sin.sin_addr), htons(sin.sin_port),
       
   888 -		    ntohs(t->src_addr.sin_port));
       
   889 +	check_status = check_hdr(buf, ret, &hdr, opts);
       
   890 +	if (check_status) {
       
   891 +		if (check_status > 0) {
       
   892 +			die("header from %s:%u to id %u bogus\n",
       
   893 +		    	inet_ntoa(sin.sin_addr), htons(sin.sin_port),
       
   894 +		    	ntohs(t->src_addr.sin_port));
       
   895 +		} else
       
   896 +			return 0;
       
   897 +	}
       
   898  
       
   899  	if (hdr.op == OP_ACK) {
       
   900 -		stat_inc(&ctl->cur[S_RTT_USECS],
       
   901 -			 usec_sub(&tstamp, &t->send_time[expect_index]));
       
   902 -		t->pending -= 1;
       
   903 +                uint64_t rtt_time = 
       
   904 +                  usec_sub(&tstamp, &t->send_time[expect_index]);
       
   905  
       
   906 +		stat_inc(&ctl->cur[S_RTT_USECS], rtt_time);
       
   907 +                if (rtt_time > rtt_threshold)
       
   908 +			print_outlier("Found RTT = 0x%lx\n", rtt_time);
       
   909 +
       
   910 +                if (show_histogram)
       
   911 +                {
       
   912 +                  ctl->latency_histogram[get_bucket(rtt_time)]++;
       
   913 +                }
       
   914 +
       
   915 +		if (t->pending > 0)
       
   916 +			t->pending -= 1;
       
   917 +
       
   918  		if (in_hdr.rdma_key)
       
   919  			rdma_process_ack(fd, &in_hdr, ctl);
       
   920  	} else {
       
   921 @@ -1549,6 +1852,7 @@
       
   922  }
       
   923  
       
   924  static void run_child(pid_t parent_pid, struct child_control *ctl,
       
   925 +			struct child_control *all_ctl,
       
   926  		      struct options *opts, uint16_t id, int active)
       
   927  {
       
   928  	struct sockaddr_in sin;
       
   929 @@ -1559,8 +1863,15 @@
       
   930  	struct task tasks[opts->nr_tasks];
   200  	struct timeval start;
   931  	struct timeval start;
   201          int do_work = opts->simplex ? active : 1;
   932          int do_work = opts->simplex ? active : 1;
   202  
   933 +	int j;
       
   934  
       
   935 +
   203 +#if defined(__SVR4) && defined(__sun)
   936 +#if defined(__SVR4) && defined(__sun)
   204 +	set_my_lgrp();
   937 +	set_my_lgrp();
   205 +	sin.sin_family = AF_INET_OFFLOAD;
   938 +	sin.sin_family = AF_INET_OFFLOAD;
   206 +#else
   939 +#else
   207  	sin.sin_family = AF_INET;
   940  	sin.sin_family = AF_INET;
   208 +#endif
   941 +#endif
   209  	sin.sin_port = htons(opts->starting_port + 1 + id);
   942  	sin.sin_port = htons(opts->starting_port + 1 + id);
   210  	sin.sin_addr.s_addr = htonl(opts->receive_addr);
   943  	sin.sin_addr.s_addr = htonl(opts->receive_addr);
   211  
   944  
   212 @@ -1572,7 +1626,11 @@
   945 @@ -1572,7 +1883,11 @@
   213  	for (i = 0; i < opts->nr_tasks; i++) {
   946  	for (i = 0; i < opts->nr_tasks; i++) {
   214  		tasks[i].nr = i;
   947  		tasks[i].nr = i;
   215  		tasks[i].src_addr = sin;
   948  		tasks[i].src_addr = sin;
   216 +#if defined(__SVR4) && defined(__sun)
   949 +#if defined(__SVR4) && defined(__sun)
   217 +		tasks[i].dst_addr.sin_family = AF_INET_OFFLOAD;
   950 +		tasks[i].dst_addr.sin_family = AF_INET_OFFLOAD;
   219  		tasks[i].dst_addr.sin_family = AF_INET;
   952  		tasks[i].dst_addr.sin_family = AF_INET;
   220 +#endif
   953 +#endif
   221  		tasks[i].dst_addr.sin_addr.s_addr = htonl(opts->send_addr);
   954  		tasks[i].dst_addr.sin_addr.s_addr = htonl(opts->send_addr);
   222  		tasks[i].dst_addr.sin_port = htons(opts->starting_port + 1 + i);
   955  		tasks[i].dst_addr.sin_port = htons(opts->starting_port + 1 + i);
   223  		tasks[i].send_time = alloca(opts->req_depth * sizeof(struct timeval));
   956  		tasks[i].send_time = alloca(opts->req_depth * sizeof(struct timeval));
   224 @@ -1625,6 +1683,10 @@
   957 @@ -1581,6 +1896,15 @@
       
   958  		tasks[i].rdma_buf = alloca(opts->req_depth * sizeof(uint64_t *));
       
   959  		tasks[i].local_buf = alloca(opts->req_depth * sizeof(uint64_t *));
       
   960  		tasks[i].ack_header = alloca(opts->req_depth * sizeof(struct header));
       
   961 +		tasks[i].ack2_header = alloca(opts->req_depth * sizeof(struct header));
       
   962 +		for (j=0;j<opts->req_depth;j++)
       
   963 +			tasks[i].ack2_header[j].pending = 0;
       
   964 +
       
   965 +		tasks[i].req_header = alloca(opts->req_depth * sizeof(struct header));
       
   966 +		for (j=0;j<opts->req_depth;j++)
       
   967 +			tasks[i].req_header[j].pending = 0;
       
   968 +
       
   969 +		tasks[i].retry_token = alloca(2 * opts->req_depth * sizeof(uint64_t));
       
   970  		tasks[i].rdma_next_op = (i & 1)? RDMA_OP_READ : RDMA_OP_WRITE;
       
   971  	}
       
   972  
       
   973 @@ -1611,7 +1935,7 @@
       
   974  
       
   975  		check_parent(parent_pid);
       
   976  
       
   977 -		ret = poll(&pfd, 1, -1);
       
   978 +		ret = poll(&pfd, 1, 1000);
       
   979  		if (ret < 0) {
       
   980  			if (errno == EINTR)
       
   981  				continue;
       
   982 @@ -1621,10 +1945,14 @@
       
   983  		pfd.events = POLLIN;
       
   984  
       
   985  		if (pfd.revents & POLLIN) {
       
   986 -			while (recv_one(fd, tasks, opts, ctl) >= 0)
       
   987 +			while (recv_one(fd, tasks, opts, ctl, all_ctl) >= 0)
   225  				;
   988  				;
   226  		}
   989  		}
   227  
   990  
   228 +		/* stop sending if in shutdown phase */
   991 +		/* stop sending if in shutdown phase */
   229 +		if (ctl->stopping)
   992 +		if (ctl->stopping)
   230 +			continue;
   993 +			continue;
   231 +
   994 +
   232  		/* keep the pipeline full */
   995  		/* keep the pipeline full */
   233  		can_send = !!(pfd.revents & POLLOUT);
   996  		can_send = !!(pfd.revents & POLLOUT);
   234  		for (i = 0, t = tasks; i < opts->nr_tasks; i++, t++) {
   997  		for (i = 0, t = tasks; i < opts->nr_tasks; i++, t++) {
   235 @@ -1665,8 +1727,12 @@
   998 @@ -1633,6 +1961,7 @@
       
   999  			if (t->drain_rdmas)
       
  1000  				continue;
       
  1001  			if (send_anything(fd, t, opts, ctl, can_send, do_work) < 0) {
       
  1002 +
       
  1003  				pfd.events |= POLLOUT;
       
  1004  
       
  1005  				/* If the send queue is full, we will see EAGAIN.
       
  1006 @@ -1665,8 +1994,12 @@
   236  	uint32_t i;
  1007  	uint32_t i;
   237  
  1008  
   238  	len = opts->nr_tasks * sizeof(*ctl);
  1009  	len = opts->nr_tasks * sizeof(*ctl);
   239 +#if defined(__SVR4) && defined(__sun)
  1010 +#if defined(__SVR4) && defined(__sun)
   240 +	ctl = (struct child_control *)mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, -1, 0);
  1011 +	ctl = (struct child_control *)mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, -1, 0);
   243  		   0, 0);
  1014  		   0, 0);
   244 +#endif
  1015 +#endif
   245  	if (ctl == MAP_FAILED)
  1016  	if (ctl == MAP_FAILED)
   246  		die("mmap of %u child control structs failed", opts->nr_tasks);
  1017  		die("mmap of %u child control structs failed", opts->nr_tasks);
   247  
  1018  
   248 @@ -1699,7 +1765,7 @@
  1019 @@ -1688,7 +2021,7 @@
       
  1020  				control_fd = -1;
       
  1021  			}
       
  1022  			rdma_key_o_meter_set_self(i);
       
  1023 -			run_child(parent, ctl + i, opts, i, active);
       
  1024 +			run_child(parent, ctl + i, ctl, opts, i, active);
       
  1025  			exit(0);
       
  1026  		}
       
  1027  		ctl[i].pid = pid;
       
  1028 @@ -1699,7 +2032,7 @@
   249  			continue;
  1029  			continue;
   250  		pid = waitpid(-1, NULL, WNOHANG);
  1030  		pid = waitpid(-1, NULL, WNOHANG);
   251  		if (pid)
  1031  		if (pid)
   252 -			die("child %u (pid %u) exited\n", i, pid);
  1032 -			die("child %u (pid %u) exited\n", i, pid);
   253 +			die("child %u (pid %u) exited\n", i, (int)pid);
  1033 +			die("child %u (pid %u) exited\n", i, (int)pid);
   254  		sleep(1);
  1034  		sleep(1);
   255  		i--; /* try this child again */
  1035  		i--; /* try this child again */
   256  	}
  1036  	}
   257 @@ -1823,6 +1889,7 @@
  1037 @@ -1823,6 +2156,7 @@
   258  
  1038  
   259  	if (disable)
  1039  	if (disable)
   260  		return;
  1040  		return;
   261 +#if !(defined(__SVR4) && defined(__sun))
  1041 +#if !(defined(__SVR4) && defined(__sun))
   262  	if ((fp = fopen("/proc/stat", "r")) == NULL) {
  1042  	if ((fp = fopen("/proc/stat", "r")) == NULL) {
   263  		fprintf(stderr, "Cannot open /proc/stat (%s) - "
  1043  		fprintf(stderr, "Cannot open /proc/stat (%s) - "
   264  				"not printing cpu stats\n",
  1044  				"not printing cpu stats\n",
   265 @@ -1856,10 +1923,37 @@
  1045 @@ -1856,10 +2190,37 @@
   266  		}
  1046  		}
   267  	}
  1047  	}
   268  	fclose(fp);
  1048  	fclose(fp);
   269 +#else
  1049 +#else
   270 +#define NSEC_TO_TICK(v)		(v * sysconf(_SC_CLK_TCK)/1000000000)
  1050 +#define NSEC_TO_TICK(v)		(v * sysconf(_SC_CLK_TCK)/1000000000)
   298 +		       ",intr:count");
  1078 +		       ",intr:count");
   299 +#endif
  1079 +#endif
   300  	} else {
  1080  	} else {
   301  		struct sys_stats sys;
  1081  		struct sys_stats sys;
   302  		unsigned long sum = 0;
  1082  		unsigned long sum = 0;
   303 @@ -1884,6 +1978,7 @@
  1083 @@ -1884,6 +2245,7 @@
   304  		 *  5	irq
  1084  		 *  5	irq
   305  		 *  6	softirq
  1085  		 *  6	softirq
   306  		 */
  1086  		 */
   307 +#if !(defined(__SVR4) && defined(__sun))
  1087 +#if !(defined(__SVR4) && defined(__sun))
   308  		printf(",%f,%f,%f,%f,%Lu",
  1088  		printf(",%f,%f,%f,%f,%Lu",
   309  			(sys.times[0] + sys.times[1]) * scale,
  1089  			(sys.times[0] + sys.times[1]) * scale,
   310  			sys.times[2] * scale,
  1090  			sys.times[2] * scale,
   311 @@ -1890,6 +1985,14 @@
  1091 @@ -1890,6 +2252,14 @@
   312  			(sys.times[3] + sys.times[4]) * scale,
  1092  			(sys.times[3] + sys.times[4]) * scale,
   313  			(sys.times[5] + sys.times[6]) * scale,
  1093  			(sys.times[5] + sys.times[6]) * scale,
   314  			sys.intr);
  1094  			sys.intr);
   315 +#else
  1095 +#else
   316 +		/* Solaris kstat doesn't provide irq/softirq info. */
  1096 +		/* Solaris kstat doesn't provide irq/softirq info. */
   321 +			sys.intr);
  1101 +			sys.intr);
   322 +#endif
  1102 +#endif
   323  	}
  1103  	}
   324  	prev = current;
  1104  	prev = current;
   325  }
  1105  }
   326 @@ -1903,6 +2006,10 @@
  1106 @@ -1903,6 +2273,10 @@
   327  	static socklen_t buflen = 0;
  1107  	static socklen_t buflen = 0;
   328  	static int sock_fd = -1;
  1108  	static int sock_fd = -1;
   329  	int i, count, item_size;
  1109  	int i, count, item_size;
   330 +#if defined(__SVR4) && defined(__sun)
  1110 +#if defined(__SVR4) && defined(__sun)
   331 +	socklen_t len;
  1111 +	socklen_t len;
   332 +	struct rds_info_arg arg;
  1112 +	struct rds_info_arg arg;
   333 +#endif
  1113 +#endif
   334  
  1114  
   335  	if (sock_fd < 0) {
  1115  	if (sock_fd < 0) {
   336  		sock_fd = socket(pf, SOCK_SEQPACKET, 0);
  1116  		sock_fd = socket(pf, SOCK_SEQPACKET, 0);
   337 @@ -1912,6 +2019,7 @@
  1117 @@ -1912,6 +2286,7 @@
   338  
  1118  
   339  	/* We should only loop once on the first call; after that the
  1119  	/* We should only loop once on the first call; after that the
   340  	 * buffer requirements for RDS counters should not change. */
  1120  	 * buffer requirements for RDS counters should not change. */
   341 +#if !(defined(__SVR4) && defined(__sun))
  1121 +#if !(defined(__SVR4) && defined(__sun))
   342  	while ((item_size = getsockopt(sock_fd, sol, RDS_INFO_COUNTERS, curr, &buflen)) < 0) {
  1122  	while ((item_size = getsockopt(sock_fd, sol, RDS_INFO_COUNTERS, curr, &buflen)) < 0) {
   343  		if (errno != ENOSPC)
  1123  		if (errno != ENOSPC)
   344  			die_errno("getsockopt(RDS_INFO_COUNTERS) failed");
  1124  			die_errno("getsockopt(RDS_INFO_COUNTERS) failed");
   345 @@ -1919,7 +2027,29 @@
  1125 @@ -1919,7 +2294,29 @@
   346  		if (!curr)
  1126  		if (!curr)
   347  			die_errno("Cannot allocate buffer for stats counters");
  1127  			die_errno("Cannot allocate buffer for stats counters");
   348  	}
  1128  	}
   349 +#else
  1129 +#else
   350 +	int retcode;
  1130 +	int retcode;
   370 +#endif
  1150 +#endif
   371 +
  1151 +
   372  	if (item_size > sizeof(*ctr))
  1152  	if (item_size > sizeof(*ctr))
   373  		die("Bad counter item size in RDS_INFO_COUNTERS (got %d, max %zd)\n",
  1153  		die("Bad counter item size in RDS_INFO_COUNTERS (got %d, max %zd)\n",
   374  				item_size, sizeof(*ctr));
  1154  				item_size, sizeof(*ctr));
   375 @@ -1932,8 +2062,11 @@
  1155 @@ -1932,8 +2329,11 @@
   376  	}
  1156  	}
   377  
  1157  
   378  	for (i = 0; i < count; ++i)
  1158  	for (i = 0; i < count; ++i)
   379 +#if !(defined(__SVR4) && defined(__sun))
  1159 +#if !(defined(__SVR4) && defined(__sun))
   380  		memcpy(ctr + i, curr + i * item_size, item_size);
  1160  		memcpy(ctr + i, curr + i * item_size, item_size);
   383 +		memcpy(ctr + i, ((void *)(uintptr_t)arg.datap) + i * item_size, item_size);
  1163 +		memcpy(ctr + i, ((void *)(uintptr_t)arg.datap) + i * item_size, item_size);
   384 +#endif
  1164 +#endif
   385  	gettimeofday(&now, NULL);
  1165  	gettimeofday(&now, NULL);
   386  
  1166  
   387  	if (initialize) {
  1167  	if (initialize) {
   388 @@ -1957,6 +2090,10 @@
  1168 @@ -1957,6 +2357,10 @@
   389  	memcpy(prev, ctr, count * sizeof(*ctr));
  1169  	memcpy(prev, ctr, count * sizeof(*ctr));
   390  	last_ts = now;
  1170  	last_ts = now;
   391  
  1171  
   392 +#if defined(__SVR4) && defined(__sun)
  1172 +#if defined(__SVR4) && defined(__sun)
   393 +	free((void *)(uintptr_t)arg.datap);
  1173 +	free((void *)(uintptr_t)arg.datap);
   394 +#endif
  1174 +#endif
   395 +
  1175 +
   396  	get_stats(initialize);
  1176  	get_stats(initialize);
   397  }
  1177  }
   398  
  1178  
   399 @@ -1967,7 +2104,7 @@
  1179 @@ -1967,7 +2371,7 @@
   400  
  1180  
   401  	pid = waitpid(-1, &status, wflags);
  1181  	pid = waitpid(-1, &status, wflags);
   402  	if (pid < 0)
  1182  	if (pid < 0)
   403 -		die("waitpid returned %u", pid);
  1183 -		die("waitpid returned %u", pid);
   404 +		die("waitpid returned %u", (int)pid);
  1184 +		die("waitpid returned %u", (int)pid);
   405  	if (pid == 0)
  1185  	if (pid == 0)
   406  		return 0;
  1186  		return 0;
   407  
  1187  
   408 @@ -1975,15 +2112,15 @@
  1188 @@ -1975,15 +2379,15 @@
   409  		if (WEXITSTATUS(status) == 0)
  1189  		if (WEXITSTATUS(status) == 0)
   410  			return 1;
  1190  			return 1;
   411  		die("child pid %u exited with status %d\n",
  1191  		die("child pid %u exited with status %d\n",
   412 -				pid, WEXITSTATUS(status));
  1192 -				pid, WEXITSTATUS(status));
   413 +				(int)pid, WEXITSTATUS(status));
  1193 +				(int)pid, WEXITSTATUS(status));
   422 -	die("child pid %u wait status %d\n", pid, status);
  1202 -	die("child pid %u wait status %d\n", pid, status);
   423 +	die("child pid %u wait status %d\n", (int)pid, status);
  1203 +	die("child pid %u wait status %d\n", (int)pid, status);
   424  }
  1204  }
   425  
  1205  
   426  static void release_children_and_wait(struct options *opts,
  1206  static void release_children_and_wait(struct options *opts,
   427 @@ -2139,7 +2276,12 @@
  1207 @@ -1995,9 +2399,13 @@
       
  1208  	struct counter summary[NR_STATS];
       
  1209  	struct timeval start, end, now, first_ts, last_ts;
       
  1210  	double cpu_total = 0;
       
  1211 -	uint16_t i, cpu_samples = 0;
       
  1212 +	uint16_t i, j, cpu_samples = 0;
       
  1213  	uint16_t nr_running;
       
  1214 +        uint64_t latency_histogram[MAX_BUCKETS];
       
  1215  
       
  1216 +	if (show_histogram) 
       
  1217 +	        memset(latency_histogram, 0, sizeof(latency_histogram));
       
  1218 +
       
  1219  	gettimeofday(&start, NULL);
       
  1220  	start.tv_sec += 2;
       
  1221  	for (i = 0; i < opts->nr_tasks; i++)
       
  1222 @@ -2139,7 +2547,12 @@
   428  	control_fd = -1;
  1223  	control_fd = -1;
   429  
  1224  
   430  	if (nr_running) {
  1225  	if (nr_running) {
   431 +		/* let everything gracefully stop before we kill the chillins */
  1226 +		/* let everything gracefully stop before we kill the chillins */
   432  		for (i = 0; i < opts->nr_tasks; i++)
  1227  		for (i = 0; i < opts->nr_tasks; i++)
   435 +
  1230 +
   436 +		for (i = 0; i < opts->nr_tasks; i++)
  1231 +		for (i = 0; i < opts->nr_tasks; i++)
   437  			kill(ctl[i].pid, SIGTERM);
  1232  			kill(ctl[i].pid, SIGTERM);
   438  		stop_soakers(soak_arr);
  1233  		stop_soakers(soak_arr);
   439  	}
  1234  	}
   440 @@ -2517,7 +2659,11 @@
  1235 @@ -2167,6 +2580,19 @@
       
  1236  			avg(&summary[S_SENDMSG_USECS]),
       
  1237  			avg(&summary[S_RTT_USECS]),
       
  1238  			soak_arr? scale * cpu_total : -1.0);
       
  1239 +
       
  1240 +		if (show_histogram) 
       
  1241 +		{
       
  1242 +			for (i = 0; i < opts->nr_tasks; i++)
       
  1243 +			  for (j=0;j < MAX_BUCKETS; j++)
       
  1244 +			    latency_histogram[j] += ctl[i].latency_histogram[j];
       
  1245 +			    
       
  1246 +			printf("\nRTT histogram\n");
       
  1247 +			printf("RTT (us)        \t\t    Count\n");
       
  1248 +			for (i=0;i < MAX_BUCKETS; i++)
       
  1249 +			  printf("[%6u - %6u] \t\t %8u\n", 1 << i, 1 << (i+1), 
       
  1250 +			         (unsigned int)latency_histogram[i]);
       
  1251 +		}
       
  1252  	}
       
  1253  }
       
  1254  
       
  1255 @@ -2262,6 +2688,9 @@
       
  1256          dst->simplex = src->simplex;                    /* byte sized */
       
  1257          dst->rw_mode = src->rw_mode;                    /* byte sized */
       
  1258          dst->rdma_vector = htonl(src->rdma_vector);
       
  1259 +	dst->tos = src->tos;
       
  1260 +	dst->reset = src->reset;
       
  1261 +	dst->async = src->async;
       
  1262  }
       
  1263  
       
  1264  static void decode_options(struct options *dst, const struct options *src)
       
  1265 @@ -2295,6 +2724,9 @@
       
  1266          dst->simplex = src->simplex;                    /* byte sized */
       
  1267          dst->rw_mode = src->rw_mode;                    /* byte sized */
       
  1268  	dst->rdma_vector = ntohl(src->rdma_vector);
       
  1269 +	dst->tos = src->tos;
       
  1270 +	dst->reset = src->reset;
       
  1271 +	dst->async = src->async;
       
  1272  }
       
  1273  
       
  1274  static void verify_option_encdec(const struct options *opts)
       
  1275 @@ -2316,6 +2748,30 @@
       
  1276  		die("encode/decode check of options struct failed");
       
  1277  }
       
  1278  
       
  1279 +static void reset_conn(struct options *opts)
       
  1280 +{
       
  1281 +	struct rds_reset val;
       
  1282 +	int fd;
       
  1283 +	struct sockaddr_in sin;
       
  1284 +
       
  1285 +	sin.sin_family = AF_INET;
       
  1286 +	sin.sin_port = htons(opts->starting_port);
       
  1287 +	sin.sin_addr.s_addr = htonl(opts->receive_addr);
       
  1288 +
       
  1289 +	fd = bound_socket(pf, SOCK_SEQPACKET, 0, &sin);
       
  1290 +
       
  1291 +	val.tos = opts->tos;
       
  1292 +#if defined(__SVR4) && defined(__sun)
       
  1293 +	val.src = htonl(opts->receive_addr);
       
  1294 +	val.dst = htonl(opts->send_addr);
       
  1295 +#else
       
  1296 +	val.src.s_addr = htonl(opts->receive_addr);
       
  1297 +	val.dst.s_addr = htonl(opts->send_addr);
       
  1298 +#endif
       
  1299 +	if (setsockopt(fd, sol, RDS_CONN_RESET, &val, sizeof(val)))
       
  1300 +		die_errno("setsockopt RDS_CONN_RESET failed");
       
  1301 +}
       
  1302 +
       
  1303  static int active_parent(struct options *opts, struct soak_control *soak_arr)
       
  1304  {
       
  1305  	struct options enc_options;
       
  1306 @@ -2324,6 +2780,11 @@
       
  1307  	int fd;
       
  1308  	uint8_t ok;
       
  1309  
       
  1310 +	if (opts->reset) {
       
  1311 +		reset_conn(opts);
       
  1312 +		return 0;
       
  1313 +	}
       
  1314 +
       
  1315  	if (opts->show_params) {
       
  1316  		unsigned int k;
       
  1317  
       
  1318 @@ -2517,7 +2978,11 @@
   441  	/* an extra terminating entry which will be all 0s */
  1319  	/* an extra terminating entry which will be all 0s */
   442  	len = (nr_soak + 1) * sizeof(struct soak_control);
  1320  	len = (nr_soak + 1) * sizeof(struct soak_control);
   443  	soak_arr = mmap(NULL, len, PROT_READ|PROT_WRITE,
  1321  	soak_arr = mmap(NULL, len, PROT_READ|PROT_WRITE,
   444 +#if defined(__SVR4) && defined(__sun)
  1322 +#if defined(__SVR4) && defined(__sun)
   445 +			MAP_ANONYMOUS|MAP_SHARED, -1, 0);
  1323 +			MAP_ANONYMOUS|MAP_SHARED, -1, 0);
   447  			MAP_ANONYMOUS|MAP_SHARED, 0, 0);
  1325  			MAP_ANONYMOUS|MAP_SHARED, 0, 0);
   448 +#endif
  1326 +#endif
   449  	if (soak_arr == MAP_FAILED)
  1327  	if (soak_arr == MAP_FAILED)
   450  		die("mmap of %ld soak control structs failed", nr_soak);
  1328  		die("mmap of %ld soak control structs failed", nr_soak);
   451  
  1329  
   452 @@ -2589,6 +2735,7 @@
  1330 @@ -2572,6 +3037,10 @@
       
  1331  	OPT_CONNECT_RETRIES,
       
  1332  	OPT_USE_CONG_MONITOR,
       
  1333  	OPT_PERFDATA,
       
  1334 +        OPT_SHOW_OUTLIERS,
       
  1335 +        OPT_SHOW_HISTOGRAM,
       
  1336 +	OPT_RESET,
       
  1337 +	OPT_ASYNC,
       
  1338  };
       
  1339  
       
  1340  static struct option long_options[] = {
       
  1341 @@ -2584,11 +3053,13 @@
       
  1342  { "send-addr",		required_argument,	NULL,	's'	},
       
  1343  { "port",		required_argument,	NULL,	'p'	},
       
  1344  { "time",		required_argument,	NULL,	'T'	},
       
  1345 +{ "tos",                required_argument,      NULL,   'Q'     },
       
  1346  { "report-cpu",		no_argument,		NULL,	'c'	},
       
  1347  { "report-summary",	no_argument,		NULL,	'z'	},
   453  { "rtprio",		no_argument,		NULL,	'R'	},
  1348  { "rtprio",		no_argument,		NULL,	'R'	},
   454  { "verify",		no_argument,		NULL,	'v'	},
  1349  { "verify",		no_argument,		NULL,	'v'	},
   455  { "trace",		no_argument,		NULL,	'V'	},
  1350  { "trace",		no_argument,		NULL,	'V'	},
   456 +{ "lgrpid",		required_argument,	NULL,	'g'	},
  1351 +{ "lgrpid",		required_argument,	NULL,	'g'	},
   457  
  1352  
   458  { "rdma-use-once",	required_argument,	NULL,	OPT_RDMA_USE_ONCE },
  1353  { "rdma-use-once",	required_argument,	NULL,	OPT_RDMA_USE_ONCE },
   459  { "rdma-use-get-mr",	required_argument,	NULL,	OPT_RDMA_USE_GET_MR },
  1354  { "rdma-use-get-mr",	required_argument,	NULL,	OPT_RDMA_USE_GET_MR },
   460 @@ -2652,7 +2799,7 @@
  1355 @@ -2601,6 +3072,10 @@
       
  1356  { "show-perfdata",	no_argument,		NULL,	OPT_PERFDATA },
       
  1357  { "connect-retries",	required_argument,	NULL,	OPT_CONNECT_RETRIES },
       
  1358  { "use-cong-monitor",	required_argument,	NULL,	OPT_USE_CONG_MONITOR },
       
  1359 +{ "show-outliers",      required_argument,      NULL,   OPT_SHOW_OUTLIERS    },
       
  1360 +{ "show-histogram",     no_argument,            NULL,   OPT_SHOW_HISTOGRAM   },
       
  1361 +{ "reset",              no_argument,            NULL,   OPT_RESET },
       
  1362 +{ "async",              no_argument,            NULL,   OPT_ASYNC },
       
  1363  
       
  1364  { NULL }
       
  1365  };
       
  1366 @@ -2640,6 +3115,8 @@
       
  1367  	opts.use_cong_monitor = 1;
       
  1368  	opts.rdma_use_fence = 1;
       
  1369  	opts.rdma_cache_mrs = 0;
       
  1370 +	opts.rdma_use_once = 0;
       
  1371 +	opts.rdma_use_get_mr = 0;
       
  1372  	opts.rdma_alignment = 0;
       
  1373  	opts.rdma_key_o_meter = 0;
       
  1374  	opts.show_params = 0;
       
  1375 @@ -2648,11 +3125,16 @@
       
  1376          opts.simplex = 0;
       
  1377          opts.rw_mode = 0;
       
  1378  	opts.rdma_vector = 1;
       
  1379 +        rtt_threshold = ~0U;
       
  1380 +        show_histogram = 0;
       
  1381 +	opts.tos = 0;
       
  1382 +	opts.reset = 0;
       
  1383 +	opts.async = 0;
       
  1384  
   461  	while(1) {
  1385  	while(1) {
   462  		int c, index;
  1386  		int c, index;
   463  
  1387  
   464 -		c = getopt_long(argc, argv, "+a:cD:d:hI:M:op:q:Rr:s:t:T:vVz",
  1388 -		c = getopt_long(argc, argv, "+a:cD:d:hI:M:op:q:Rr:s:t:T:vVz",
   465 +		c = getopt_long(argc, argv, "+a:cD:d:hI:M:op:q:Rr:s:t:T:vVg:z",
  1389 +		c = getopt_long(argc, argv, "+a:cD:d:hI:M:op:q:Rr:s:t:T:Q:vVg:z",
   466  				long_options, &index);
  1390  				long_options, &index);
   467  		if (c == -1)
  1391  		if (c == -1)
   468  			break;
  1392  			break;
   469 @@ -2711,6 +2858,10 @@
  1393 @@ -2702,6 +3184,9 @@
       
  1394  			case 'T':
       
  1395  				opts.run_time = parse_ull(optarg, (uint32_t)~0);
       
  1396  				break;
       
  1397 +			case 'Q':
       
  1398 +				opts.tos = parse_ull(optarg, (uint8_t)~0);
       
  1399 +				break;
       
  1400  			case 'z':
       
  1401  				opts.summary_only = 1;
       
  1402  				break;
       
  1403 @@ -2711,9 +3196,25 @@
   470  			case 'V':
  1404  			case 'V':
   471  				opts.tracing = 1;
  1405  				opts.tracing = 1;
   472  				break;
  1406  				break;
   473 +			case 'g':
  1407 +			case 'g':
   474 +				lgrp_id = (lgrp_id_t)parse_ull(optarg,
  1408 +				lgrp_id = (lgrp_id_t)parse_ull(optarg,
   475 +				    (uint32_t)~0);
  1409 +				    (uint32_t)~0);
   476 +				break;
  1410 +				break;
       
  1411 +                        case OPT_SHOW_OUTLIERS:
       
  1412 +                                rtt_threshold = parse_ull(optarg, ~0U);
       
  1413 +                                break;
       
  1414 +                        case OPT_SHOW_HISTOGRAM:
       
  1415 +                                show_histogram = 1;
       
  1416 +                                break;
   477  			case OPT_USE_CONG_MONITOR:
  1417  			case OPT_USE_CONG_MONITOR:
   478  				opts.use_cong_monitor = parse_ull(optarg, 1);
  1418  				opts.use_cong_monitor = parse_ull(optarg, 1);
   479  				break;
  1419  				break;
   480 @@ -2786,6 +2937,7 @@
  1420 +			case OPT_RESET:
       
  1421 +				opts.reset = 1;
       
  1422 +				break;
       
  1423 +			case OPT_ASYNC:
       
  1424 +				opts.async = 1;
       
  1425 +				break;
       
  1426  			case OPT_RDMA_USE_ONCE:
       
  1427  				opts.rdma_use_once = parse_ull(optarg, 1);
       
  1428  				break;
       
  1429 @@ -2786,6 +3287,7 @@
   481  	if (opts.rdma_size && 0)
  1430  	if (opts.rdma_size && 0)
   482  		opts.rdma_size = (opts.rdma_size + 4095) & ~4095;
  1431  		opts.rdma_size = (opts.rdma_size + 4095) & ~4095;
   483  
  1432  
   484 +	set_my_lgrp();
  1433 +	set_my_lgrp();
   485  	opt = opts;
  1434  	opt = opts;
  1288  
  2237  
  1289  
  2238  
  1290 diff -r -u /tmp/rds-tools-2.0.4/rds-stress.1 rds-tools-2.0.7/rds-stress.1
  2239 diff -r -u /tmp/rds-tools-2.0.4/rds-stress.1 rds-tools-2.0.7/rds-stress.1
  1291 --- /tmp/rds-tools-2.0.4/rds-stress.1	Wed Aug  4 15:25:11 2010
  2240 --- /tmp/rds-tools-2.0.4/rds-stress.1	Wed Aug  4 15:25:11 2010
  1292 +++ rds-tools-2.0.7/rds-stress.1	Thu Feb 24 13:27:52 2011
  2241 +++ rds-tools-2.0.7/rds-stress.1	Thu Feb 24 13:27:52 2011
  1293 @@ -1,99 +1,102 @@
  2242 @@ -1,99 +1,106 @@
  1294 -.Dd May 15, 2007
  2243 -.Dd May 15, 2007
  1295 -.Dt RDS-STRESS 1
  2244 -.Dt RDS-STRESS 1
  1296 -.Os
  2245 -.Os
  1297 -.Sh NAME
  2246 -.Sh NAME
  1298 -.Nm rds-stress
  2247 -.Nm rds-stress
  1319 +.PP
  2268 +.PP
  1320 +.SH SYNOPSIS
  2269 +.SH SYNOPSIS
  1321 +.HP
  2270 +.HP
  1322 +.nf
  2271 +.nf
  1323 +rds-stress [-p port_number] -r [receive_address] [-s send_address]
  2272 +rds-stress [-p port_number] -r [receive_address] [-s send_address]
  1324 +      [-a ack_bytes] [-q request_bytes] [-D rdma_bytes]
  2273 +      [-Q tos] [-a ack_bytes] [-q request_bytes] [-D rdma_bytes]
  1325 +      [-d queue_depth] [-t nr_tasks] [-c] [-R] [-V] [-v]
  2274 +      [-d queue_depth] [-t nr_tasks] [-c] [-R] [-V] [-v]
  1326 +.fi
  2275 +.fi
  1327  
  2276  
  1328 -.Sh DESCRIPTION
  2277 -.Sh DESCRIPTION
  1329 -.Nm rds-stress
  2278 -.Nm rds-stress
  1421  obtain the address once the control connection is established.
  2370  obtain the address once the control connection is established.
  1422  The active process will choose a local address based on the interface through
  2371  The active process will choose a local address based on the interface through
  1423  which it connects to the destination address.
  2372  which it connects to the destination address.
  1424 -.It Fl a Ar ack_bytes
  2373 -.It Fl a Ar ack_bytes
  1425 +.TP
  2374 +.TP
       
  2375 +\fB\-Q tos
       
  2376 +Uses the RDS connection between IP addresses with the specified tos value. By 
       
  2377 +default, the base (tos = 0) RDS connection is used.
       
  2378 +.TP
  1426 +\fB\-a ack_bytes
  2379 +\fB\-a ack_bytes
  1427  This specifies the size of the ack messages, in bytes. There is a minimum size
  2380  This specifies the size of the ack messages, in bytes. There is a minimum size
  1428  which depends on the format of the ack messages, which may change over time.
  2381  which depends on the format of the ack messages, which may change over time.
  1429  See section "Message Sizes" below.
  2382  See section "Message Sizes" below.
  1430 -.It Fl q Ar request_bytes
  2383 -.It Fl q Ar request_bytes
  1437 +.TP
  2390 +.TP
  1438 +\fB\-D rdma_bytes
  2391 +\fB\-D rdma_bytes
  1439  RDSv3 is capable of transmitting part of a message via RDMA directly from
  2392  RDSv3 is capable of transmitting part of a message via RDMA directly from
  1440  application buffer to application buffer. This option enables RDMA support
  2393  application buffer to application buffer. This option enables RDMA support
  1441  in rds-stress: request packets include parameters for an RDMA READ or WRITE
  2394  in rds-stress: request packets include parameters for an RDMA READ or WRITE
  1442 @@ -100,20 +103,25 @@
  2395 @@ -100,20 +107,25 @@
  1443  operation, which the receiving process executes at the time the ACK packet
  2396  operation, which the receiving process executes at the time the ACK packet
  1444  is sent.
  2397  is sent.
  1445  See section "Message Sizes" below.
  2398  See section "Message Sizes" below.
  1446 -.It Fl d Ar queue_depth
  2399 -.It Fl d Ar queue_depth
  1447 +.TP
  2400 +.TP
  1468 +.TP
  2421 +.TP
  1469 +\fB\-c
  2422 +\fB\-c
  1470  This causes rds-stress to create child tasks which just consume CPU cycles.
  2423  This causes rds-stress to create child tasks which just consume CPU cycles.
  1471  One task is created for each CPU in the system.  First each child observes the
  2424  One task is created for each CPU in the system.  First each child observes the
  1472  maximum rate at which it can consume cycles.  This means that this option
  2425  maximum rate at which it can consume cycles.  This means that this option
  1473 @@ -121,50 +129,67 @@
  2426 @@ -121,50 +133,67 @@
  1474  use of the system by observing the lesser rate at which the children consume
  2427  use of the system by observing the lesser rate at which the children consume
  1475  cycles.  This option is *not* shared between the active and passive instances.
  2428  cycles.  This option is *not* shared between the active and passive instances.
  1476  It must be specified on each rds-stress command line.
  2429  It must be specified on each rds-stress command line.
  1477 -.It Fl R
  2430 -.It Fl R
  1478 +.TP
  2431 +.TP
  1535 +mbi K/s
  2488 +mbi K/s
  1536 +The total number of bytes that are being received via RDMA READs and
  2489 +The total number of bytes that are being received via RDMA READs and
  1537  WRITEs for all children.
  2490  WRITEs for all children.
  1538 -.It tx us/c
  2491 -.It tx us/c
  1539 +.TP
  2492 +.TP
  1540 +mbi K/s
  2493 +mbo K/s
  1541 +The total number of bytes that are being transmitted via RDMA READs and
  2494 +The total number of bytes that are being transmitted via RDMA READs and
  1542 +WRITEs for all children.
  2495 +WRITEs for all children.
  1543 +.TP
  2496 +.TP
  1544 +tx us/c
  2497 +tx us/c
  1545  The average number of microseconds spent in sendmsg() calls.
  2498  The average number of microseconds spent in sendmsg() calls.
  1555 +.TP
  2508 +.TP
  1556 +cpu %
  2509 +cpu %
  1557  This is the percentage of available CPU resources on this machine that are being
  2510  This is the percentage of available CPU resources on this machine that are being
  1558  consumed since rds-stress started running.  It will show -1.00 if -c is not
  2511  consumed since rds-stress started running.  It will show -1.00 if -c is not
  1559  given.  It is calculated based on the amount of CPU resources that CPU soaking
  2512  given.  It is calculated based on the amount of CPU resources that CPU soaking
  1560 @@ -171,4 +196,3 @@
  2513 @@ -171,4 +200,3 @@
  1561  tasks are able to consume.  This lets it measure CPU use by the system, say in
  2514  tasks are able to consume.  This lets it measure CPU use by the system, say in
  1562  interrupt handlers, that task-based CPU accounting does not include.
  2515  interrupt handlers, that task-based CPU accounting does not include.
  1563  For this to work rds-stress must be started with -c on an idle system.
  2516  For this to work rds-stress must be started with -c on an idle system.
  1564 -.El
  2517 -.El
  1565 diff -r -u /tmp/rds-tools-2.0.4/include/rds.h rds-tools-2.0.7/include/rds.h
  2518 diff -r -u /tmp/rds-tools-2.0.4/include/rds.h rds-tools-2.0.7/include/rds.h