components/open-fabrics/rds-tools/patches/base.patch
branchs11-update
changeset 3195 cf6a5a756b74
parent 715 eed3ed08f692
equal deleted inserted replaced
3193:e45380d8d511 3195:cf6a5a756b74
    13  #include <syscall.h>
    13  #include <syscall.h>
    14 +#endif
    14 +#endif
    15  #include <sys/stat.h>
    15  #include <sys/stat.h>
    16  #include <sys/poll.h>
    16  #include <sys/poll.h>
    17  #include <ctype.h>
    17  #include <ctype.h>
    18 @@ -22,10 +28,16 @@
    18 @@ -22,12 +28,27 @@
    19  #include <fcntl.h>
    19  #include <fcntl.h>
    20  #include <sched.h>
    20  #include <sched.h>
    21  #include <getopt.h>
    21  #include <getopt.h>
       
    22 +#include <sys/ioctl.h>
    22 +#if !(defined(__SVR4) && defined(__sun))
    23 +#if !(defined(__SVR4) && defined(__sun))
    23  #include <byteswap.h>
    24  #include <byteswap.h>
    24  #include "rds.h"
    25  #include "rds.h"
    25 -
    26 -
    26 +#else
    27 +#else
    29  #include "pfhack.h"
    30  #include "pfhack.h"
    30 +#if defined(__SVR4) && defined(__sun)
    31 +#if defined(__SVR4) && defined(__sun)
    31 +#include <infiniband/ofa_solaris.h>
    32 +#include <infiniband/ofa_solaris.h>
    32 +#endif
    33 +#endif
    33  
    34  
       
    35 +#if defined(__SVR4) && defined(__sun)
    34  /*
    36  /*
       
    37 + * This definition is forward looking and is not yet present
       
    38 + * in Solaris rds.h file
       
    39 + */
       
    40 +#define RDS_CMSG_RDMA_SEND_STATUS RDS_CMSG_RDMA_STATUS
       
    41 +#endif
       
    42 +
       
    43 +/*
    35   *
    44   *
    36 @@ -102,6 +114,10 @@
    45   * TODO
       
    46   *  - checksum the data some day.
       
    47 @@ -45,8 +66,9 @@
       
    48          M_RDMA_READ_ONLY,
       
    49          M_RDMA_WRITE_ONLY
       
    50  };
       
    51 +#define VERSION_MAX_LEN 16 
       
    52  
       
    53 -struct options {
       
    54 +struct options_2_0_6 {
       
    55  	uint32_t	req_depth;
       
    56  	uint32_t	req_size;
       
    57  	uint32_t	ack_size;
       
    58 @@ -76,9 +98,68 @@
       
    59  	uint32_t	connect_retries;
       
    60  } __attribute__((packed));
       
    61  
       
    62 +struct options {
       
    63 +	char		version[VERSION_MAX_LEN];
       
    64 +        uint32_t        req_depth;
       
    65 +        uint32_t        req_size;
       
    66 +        uint32_t        ack_size;
       
    67 +        uint32_t        rdma_size;
       
    68 +        uint32_t        send_addr;
       
    69 +        uint32_t        receive_addr;
       
    70 +        uint16_t        starting_port;
       
    71 +        uint16_t        nr_tasks;
       
    72 +        uint32_t        run_time;
       
    73 +        uint8_t         summary_only;
       
    74 +        uint8_t         rtprio;
       
    75 +        uint8_t         tracing;
       
    76 +        uint8_t         verify;
       
    77 +        uint8_t         show_params;
       
    78 +        uint8_t         show_perfdata;
       
    79 +        uint8_t         use_cong_monitor;
       
    80 +        uint8_t         rdma_use_once;
       
    81 +        uint8_t         rdma_use_get_mr;
       
    82 +        uint8_t         rdma_use_fence;
       
    83 +        uint8_t         rdma_cache_mrs;
       
    84 +        uint8_t         rdma_key_o_meter;
       
    85 +        uint8_t         suppress_warnings;
       
    86 +        uint8_t         simplex;
       
    87 +        uint8_t         rw_mode;
       
    88 +        uint32_t        rdma_vector;
       
    89 +        uint32_t        rdma_alignment;
       
    90 +        uint32_t        connect_retries;
       
    91 +        uint8_t         tos;
       
    92 +        uint8_t         async;
       
    93 +} __attribute__((packed));
       
    94 +
       
    95 +#define MAX_BUCKETS 16
       
    96 +
       
    97  static struct options	opt;
       
    98  static int		control_fd;
       
    99 +static uint64_t         rtt_threshold;
       
   100 +static int              show_histogram;
       
   101 +static int		reset_connection;
       
   102 +static char		peer_version[VERSION_MAX_LEN];
       
   103  
       
   104 +static int get_bucket(uint64_t rtt_time)
       
   105 +{
       
   106 +  int i;
       
   107 +  uint64_t l_rtt_time = rtt_time;
       
   108 +
       
   109 +  if (!l_rtt_time)
       
   110 +    i = 0;
       
   111 +  else
       
   112 +  {
       
   113 +    i = -1;
       
   114 +    while (l_rtt_time)
       
   115 +    {
       
   116 +      i++;
       
   117 +      l_rtt_time = (l_rtt_time >> 1);
       
   118 +    }
       
   119 +  }
       
   120 +
       
   121 +  return i;
       
   122 +}
       
   123 +
       
   124  struct counter {
       
   125  	uint64_t	nr;
       
   126  	uint64_t	sum;
       
   127 @@ -102,6 +183,10 @@
    37  
   128  
    38  #define NR_STATS S__LAST
   129  #define NR_STATS S__LAST
    39  
   130  
    40 +#if defined(__SVR4) && defined(__sun)
   131 +#if defined(__SVR4) && defined(__sun)
    41 +int sol_ioctl(int, int, struct rds_info_arg *, socklen_t *, int *);
   132 +int sol_ioctl(int, int, struct rds_info_arg *, socklen_t *, int *);
    42 +#endif
   133 +#endif
    43 +
   134 +
    44  /*
   135  /*
    45   * Parents share a mapped array of these with their children.  Each child
   136   * Parents share a mapped array of these with their children.  Each child
    46   * gets one.  It's used to communicate between the child and the parent
   137   * gets one.  It's used to communicate between the child and the parent
    47 @@ -110,6 +126,7 @@
   138 @@ -110,9 +195,11 @@
    48  struct child_control {
   139  struct child_control {
    49  	pid_t pid;
   140  	pid_t pid;
    50  	int ready;
   141  	int ready;
    51 +	int stopping;
   142 +	int stopping;
    52  	struct timeval start;
   143  	struct timeval start;
    53  	struct counter cur[NR_STATS];
   144  	struct counter cur[NR_STATS];
    54  	struct counter last[NR_STATS];
   145  	struct counter last[NR_STATS];
    55 @@ -254,7 +271,20 @@
   146 +        uint64_t       latency_histogram[MAX_BUCKETS];
       
   147  } __attribute__((aligned (256))); /* arbitrary */
       
   148  
       
   149  struct soak_control {
       
   150 @@ -132,6 +219,7 @@
       
   151   */
       
   152  #define OP_REQ		1
       
   153  #define OP_ACK		2
       
   154 +#define OP_DUMP		3
       
   155  
       
   156  #define RDMA_OP_READ	1
       
   157  #define RDMA_OP_WRITE	2
       
   158 @@ -148,7 +236,7 @@
       
   159  	uint16_t	from_port;
       
   160  	uint16_t	to_port;
       
   161  	uint16_t	index;
       
   162 -	uint8_t		op;
       
   163 +	uint8_t         op;
       
   164  
       
   165  	/* RDMA related.
       
   166  	 * rdma_op must be the first field, because we
       
   167 @@ -162,12 +250,21 @@
       
   168  	uint32_t	rdma_size;
       
   169  	uint32_t        rdma_vector;
       
   170  
       
   171 -	uint8_t		data[0];
       
   172 +	/* Async send related. */
       
   173 +	uint8_t         retry;
       
   174 +	uint8_t         rdma_remote_err;
       
   175 +	uint8_t         pending;
       
   176 +
       
   177 +	uint8_t         data[0];
       
   178  } __attribute__((packed));
       
   179  
       
   180  #define MIN_MSG_BYTES		(sizeof(struct header))
       
   181  #define BASIC_HEADER_SIZE	(size_t)(&((struct header *) 0)->rdma_op)
       
   182  
       
   183 +#define print_outlier(...) do {         \
       
   184 +        fprintf(stderr, __VA_ARGS__);   \
       
   185 +} while (0)
       
   186 +
       
   187  #define die(fmt...) do {		\
       
   188  	fprintf(stderr, fmt);		\
       
   189  	exit(1);			\
       
   190 @@ -254,7 +351,20 @@
    56  
   191  
    57  	die("invalid host name or dotted quad '%s'\n", ptr);
   192  	die("invalid host name or dotted quad '%s'\n", ptr);
    58  }
   193  }
    59 +#if defined(__SVR4) && defined(__sun)
   194 +#if defined(__SVR4) && defined(__sun)
    60 +static lgrp_id_t lgrp_id = -1;
   195 +static lgrp_id_t lgrp_id = -1;
    71 +#endif
   206 +#endif
    72 +
   207 +
    73  static void usage(void)
   208  static void usage(void)
    74  {
   209  {
    75          fprintf(stderr, "rds-stress version %s\n", RDS_VERSION);
   210          fprintf(stderr, "rds-stress version %s\n", RDS_VERSION);
    76 @@ -281,6 +311,9 @@
   211 @@ -273,6 +383,7 @@
       
   212  	" -d [depth, 1]     request pipeline depth, nr outstanding\n"
       
   213  	" -t [nr, 1]        number of child tasks\n"
       
   214  	" -T [seconds, 0]   runtime of test, 0 means infinite\n"
       
   215 +	" -Q [tos, 0]       Type of Service\n"
       
   216  	" -D [bytes]        RDMA: size\n"
       
   217  	" -I [iovecs, 1]    RDMA: number of user buffers to target (max 512)\n"
       
   218          " -M [nr, 0]        RDMA: mode (0=readwrite,1=readonly,2=writeonly)\n"
       
   219 @@ -281,6 +392,9 @@
    77  	" -c                measure cpu use with per-cpu soak processes\n"
   220  	" -c                measure cpu use with per-cpu soak processes\n"
    78  	" -V                trace execution\n"
   221  	" -V                trace execution\n"
    79  	" -z                print a summary at end of test only\n"
   222  	" -z                print a summary at end of test only\n"
    80 +#if defined(__SVR4) && defined(__sun)
   223 +#if defined(__SVR4) && defined(__sun)
    81 +	" -g [lgrpid]       bind the process to the specified lgrp\n"
   224 +	" -g [lgrpid]       bind the process to the specified lgrp\n"
    82 +#endif
   225 +#endif
    83  	"\n"
   226  	"\n"
    84  	"Example:\n"
   227  	"Example:\n"
    85  	"  recv$ rds-stress\n"
   228  	"  recv$ rds-stress\n"
    86 @@ -310,7 +343,7 @@
   229 @@ -310,7 +424,7 @@
    87  static void check_parent(pid_t pid)
   230  static void check_parent(pid_t pid)
    88  {
   231  {
    89  	if (pid != getppid())
   232  	if (pid != getppid())
    90 -		die("parent %u exited\n", pid);
   233 -		die("parent %u exited\n", pid);
    91 +		die("parent %u exited\n", (int)pid);
   234 +		die("parent %u exited\n", (int)pid);
    92  }
   235  }
    93  
   236  
    94  /*
   237  /*
    95 @@ -334,6 +367,7 @@
   238 @@ -334,6 +448,7 @@
    96  		msg_pattern[i] = k;
   239  		msg_pattern[i] = k;
    97  }
   240  }
    98  
   241  
    99 +#if !(defined(__SVR4) && defined(__sun))
   242 +#if !(defined(__SVR4) && defined(__sun))
   100  #if __BYTE_ORDER == __LITTLE_ENDIAN
   243  #if __BYTE_ORDER == __LITTLE_ENDIAN
   101  #define htonll(x)	bswap_64(x)
   244  #define htonll(x)	bswap_64(x)
   102  #define ntohll(x)	bswap_64(x)
   245  #define ntohll(x)	bswap_64(x)
   103 @@ -341,6 +375,7 @@
   246 @@ -341,6 +456,7 @@
   104  #define htonll(x)	(x)
   247  #define htonll(x)	(x)
   105  #define ntohll(x)	(x)
   248  #define ntohll(x)	(x)
   106  #endif
   249  #endif
   107 +#endif /* Not sun */
   250 +#endif /* Not sun */
   108  
   251  
   109  static void encode_hdr(struct header *dst, const struct header *hdr)
   252  static void encode_hdr(struct header *dst, const struct header *hdr)
   110  {
   253  {
   111 @@ -584,7 +619,11 @@
   254 @@ -361,6 +477,7 @@
       
   255  	dst->rdma_key = htonll(hdr->rdma_key);
       
   256  	dst->rdma_size = htonl(hdr->rdma_size);
       
   257  	dst->rdma_vector = htonl(hdr->rdma_vector);
       
   258 +	dst->retry = hdr->retry;
       
   259  }
       
   260  
       
   261  static void decode_hdr(struct header *dst, const struct header *hdr)
       
   262 @@ -382,6 +499,7 @@
       
   263  	dst->rdma_key = ntohll(hdr->rdma_key);
       
   264  	dst->rdma_size = ntohl(hdr->rdma_size);
       
   265  	dst->rdma_vector = ntohl(hdr->rdma_vector);
       
   266 +	dst->retry = hdr->retry;
       
   267  }
       
   268  
       
   269  static void fill_hdr(void *message, uint32_t bytes, struct header *hdr)
       
   270 @@ -412,11 +530,19 @@
       
   271   * Compare incoming message header with expected header. All header fields
       
   272   * are in host byte order except for address and port fields.
       
   273   */
       
   274 -static int check_hdr(void *message, uint32_t bytes, const struct header *hdr)
       
   275 +static int check_hdr(void *message, uint32_t bytes, struct header *hdr, struct options *opts)
       
   276  {
       
   277  	struct header msghdr;
       
   278 +	uint32_t	inc_seq;
       
   279 +	uint32_t	my_seq;
       
   280  
       
   281  	decode_hdr(&msghdr, message);
       
   282 +	inc_seq = msghdr.seq;
       
   283 +	my_seq = hdr->seq;
       
   284 +
       
   285 +	if (msghdr.retry && (inc_seq < my_seq))
       
   286 +		return -1;
       
   287 +
       
   288  	if (memcmp(&msghdr, hdr, BASIC_HEADER_SIZE)) {
       
   289  #define bleh(var, disp)					\
       
   290  		disp(hdr->var),				\
       
   291 @@ -428,7 +554,7 @@
       
   292  		 * with stdout() and we don't get things stomping on each
       
   293  		 * other
       
   294  		 */
       
   295 -		printf( "An incoming message had a header which\n"
       
   296 +		printf( "An incoming message had a %s header which\n"
       
   297  			"didn't contain the fields we expected:\n"
       
   298  			"    member        expected eq             got\n"
       
   299  			"       seq %15u %s %15u\n"
       
   300 @@ -438,6 +564,7 @@
       
   301  			"   to_port %15u %s %15u\n"
       
   302  			"     index %15u %s %15u\n"
       
   303  			"        op %15u %s %15u\n",
       
   304 +			(msghdr.retry) ? "RETRY" : "",
       
   305  			bleh(seq, /**/),
       
   306  			bleh(from_addr, inet_ntoa_32),
       
   307  			bleh(from_port, ntohs),
       
   308 @@ -569,6 +696,9 @@
       
   309  
       
   310  	fcntl(fd, F_SETFL, O_NONBLOCK);
       
   311  
       
   312 +	if (opts->tos && ioctl(fd, SIOCRDSSETTOS, &opts->tos)) 
       
   313 +		die_errno("ERROR: failed to set TOS\n");
       
   314 +
       
   315  	return fd;
       
   316  }
       
   317  
       
   318 @@ -584,7 +714,11 @@
   112  	if (opts->receive_addr == 0)
   319  	if (opts->receive_addr == 0)
   113  		return 1;
   320  		return 1;
   114  
   321  
   115 +#if defined(__SVR4) && defined(__sun)
   322 +#if defined(__SVR4) && defined(__sun)
   116 +	sin.sin_family = AF_INET_OFFLOAD;
   323 +	sin.sin_family = AF_INET_OFFLOAD;
   118  	sin.sin_family = AF_INET;
   325  	sin.sin_family = AF_INET;
   119 +#endif
   326 +#endif
   120  	sin.sin_port = htons(opts->starting_port);
   327  	sin.sin_port = htons(opts->starting_port);
   121  	sin.sin_addr.s_addr = htonl(opts->receive_addr);
   328  	sin.sin_addr.s_addr = htonl(opts->receive_addr);
   122  
   329  
   123 @@ -677,7 +716,11 @@
   330 @@ -639,7 +773,7 @@
       
   331  	mr_args.flags = RDS_FREE_MR_ARGS_INVALIDATE;
       
   332  #endif
       
   333  	if (setsockopt(fd, sol, RDS_FREE_MR, &mr_args, sizeof(mr_args)))
       
   334 -		die_errno("setsockopt(RDS_FREE_MR) failed");
       
   335 +		return;
       
   336  	mrs_allocated--;
       
   337  }
       
   338  
       
   339 @@ -677,7 +811,11 @@
   124  	size = sizeof(struct rdma_key_o_meter)
   340  	size = sizeof(struct rdma_key_o_meter)
   125  			+ 2 * nr_tasks * sizeof(*kt)
   341  			+ 2 * nr_tasks * sizeof(*kt)
   126  			+ 2 * RDMA_MAX_TRACKED_KEYS * sizeof(*ks);
   342  			+ 2 * RDMA_MAX_TRACKED_KEYS * sizeof(*ks);
   127 +#if defined(__SVR4) && defined(__sun)
   343 +#if defined(__SVR4) && defined(__sun)
   128 +	base = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, -1, 0);
   344 +	base = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, -1, 0);
   130  	base = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, 0, 0);
   346  	base = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, 0, 0);
   131 +#endif
   347 +#endif
   132  	if (base == MAP_FAILED)
   348  	if (base == MAP_FAILED)
   133  		die_errno("alloc_rdma_buffers: mmap failed");
   349  		die_errno("alloc_rdma_buffers: mmap failed");
   134  
   350  
   135 @@ -828,7 +871,7 @@
   351 @@ -828,13 +966,20 @@
   136  	}
   352  	}
   137  
   353  
   138  	if (!failed)
   354  	if (!failed)
   139 -		trace("compare pass pattern %Lx addr %p\n",
   355 -		trace("compare pass pattern %Lx addr %p\n",
   140 +		trace("compare pass pattern 0x%Lx addr %p\n",
   356 +		trace("compare pass pattern 0x%Lx addr %p\n",
   141  			(unsigned long long) pattern, addr);
   357  			(unsigned long long) pattern, addr);
   142  }
   358  }
   143  
   359  
   144 @@ -865,7 +908,11 @@
   360 +struct retry_entry {
       
   361 +	uint32_t	retries;
       
   362 +	uint32_t	seq;
       
   363 +	int		status;
       
   364 +};
       
   365 +
       
   366  struct task {
       
   367  	unsigned int		nr;
       
   368  	unsigned int		pending;
       
   369 +	int			trace;
       
   370  	unsigned int		unacked;
       
   371  	struct sockaddr_in	src_addr;	/* same for all tasks */
       
   372  	struct sockaddr_in	dst_addr;
       
   373 @@ -846,7 +991,14 @@
       
   374  	uint16_t		recv_index;
       
   375  	struct timeval *	send_time;
       
   376  	struct header *		ack_header;
       
   377 +	struct header *         ack2_header;
       
   378 +	struct header *         req_header;
       
   379 +	uint64_t *		retry_token;
       
   380 +	uint32_t		retries;
       
   381 +	uint32_t            	last_retry_seq;
       
   382 +	uint32_t		retry_index;
       
   383  
       
   384 +
       
   385  	/* RDMA related stuff */
       
   386  	uint64_t **		local_buf;
       
   387  	uint64_t **		rdma_buf;
       
   388 @@ -865,7 +1017,11 @@
   145  	/* We use mmap here rather than malloc, because it is always
   389  	/* We use mmap here rather than malloc, because it is always
   146  	 * page aligned. */
   390  	 * page aligned. */
   147  	len = 2 * opts->nr_tasks * opts->req_depth * (opts->rdma_vector * opts->rdma_size) + sys_page_size;
   391  	len = 2 * opts->nr_tasks * opts->req_depth * (opts->rdma_vector * opts->rdma_size) + sys_page_size;
   148 +#if defined(__SVR4) && defined(__sun)
   392 +#if defined(__SVR4) && defined(__sun)
   149 +	base = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
   393 +	base = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
   151  	base = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
   395  	base = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
   152 +#endif
   396 +#endif
   153  	if (base == MAP_FAILED)
   397  	if (base == MAP_FAILED)
   154  		die_errno("alloc_rdma_buffers: mmap failed");
   398  		die_errno("alloc_rdma_buffers: mmap failed");
   155  	memset(base, 0x2f, len);
   399  	memset(base, 0x2f, len);
   156 @@ -915,17 +962,16 @@
   400 @@ -915,17 +1071,16 @@
   157  	if (RDMA_OP_READ == hdr->rdma_op) {
   401  	if (RDMA_OP_READ == hdr->rdma_op) {
   158  		if (opt.verify)
   402  		if (opt.verify)
   159  			rds_fill_buffer(rdma_addr, rdma_size, hdr->rdma_pattern);
   403  			rds_fill_buffer(rdma_addr, rdma_size, hdr->rdma_pattern);
   160 -		trace("Requesting RDMA read for pattern %Lx "
   404 -		trace("Requesting RDMA read for pattern %Lx "
   161 -				"local addr to rdma read %p\n",
   405 -				"local addr to rdma read %p\n",
   175 +		trace("Requesting RDMA write for pattern 0x%Lx",
   419 +		trace("Requesting RDMA write for pattern 0x%Lx",
   176 +				hdr->rdma_pattern);
   420 +				hdr->rdma_pattern);
   177  	}
   421  	}
   178  }
   422  }
   179  
   423  
   180 @@ -947,7 +993,7 @@
   424 @@ -947,7 +1102,7 @@
   181  		die("Unexpected RDMA op %u in request\n", in_hdr->rdma_op);
   425  		die("Unexpected RDMA op %u in request\n", in_hdr->rdma_op);
   182  
   426  
   183  
   427  
   184 -	trace("RDS received request to issue rdma %s len %lu rva %Lx key %Lx pattern %Lx\n",
   428 -	trace("RDS received request to issue rdma %s len %lu rva %Lx key %Lx pattern %Lx\n",
   185 +	trace("RDS received request to issue rdma %s len %lu rva 0x%Lx key 0x%Lx pattern 0x%Lx\n",
   429 +	trace("RDS received request to issue rdma %s len %lu rva 0x%Lx key 0x%Lx pattern 0x%Lx\n",
   186  		in_hdr->rdma_op == RDMA_OP_WRITE? "write to" : "read from",
   430  		in_hdr->rdma_op == RDMA_OP_WRITE? "write to" : "read from",
   187  		rdma_size,
   431  		rdma_size,
   188  		(unsigned long long) in_hdr->rdma_addr,
   432  		(unsigned long long) in_hdr->rdma_addr,
   189 @@ -1007,6 +1053,9 @@
   433 @@ -966,21 +1121,33 @@
       
   434  	hdr->rdma_vector = in_hdr->rdma_vector;
       
   435  }
       
   436  
       
   437 -static inline unsigned int rdma_user_token(struct task *t, unsigned int qindex)
       
   438 +static inline uint64_t rdma_user_token(struct task *t, unsigned int qindex,  unsigned int type, uint32_t seq)
       
   439  {
       
   440 -	return t->nr * opt.req_depth + qindex;
       
   441 +	uint64_t tmp = seq;
       
   442 +	return (tmp << 32) | ((t->nr * opt.req_depth + qindex) << 2 | type);
       
   443  }
       
   444  
       
   445 -static void rdma_mark_completed(struct task *tasks, unsigned int token, int status)
       
   446 +static void rdma_mark_completed(struct task *tasks, uint64_t token, int status, struct options *opts)
       
   447  {
       
   448  	struct task *t;
       
   449  	unsigned int i;
       
   450 +	struct header *hdr = NULL;
       
   451 +	uint32_t seq = token >> 32;
       
   452 +	unsigned int type = token & 0x03;
       
   453 +	unsigned int index = (token & 0xFFFFFFFF) >> 2;
       
   454  
       
   455 -	trace("RDS rdma completion for token %x\n", token);
       
   456 +	trace("RDS rdma completion for token 0x%lx\n", token);
       
   457  
       
   458 -	t = &tasks[token / opt.req_depth];
       
   459 -	i = token % opt.req_depth;
       
   460 +	t = &tasks[index / opt.req_depth];
       
   461 +	i = index % opt.req_depth;
       
   462  
       
   463 +	if (opts->async) {
       
   464 +		if (type == OP_REQ)
       
   465 +			hdr = &t->req_header[i];
       
   466 +		else
       
   467 +			hdr = &t->ack2_header[i];
       
   468 +	}
       
   469 +
       
   470  	if (status) {
       
   471  		const char *errmsg;
       
   472  
       
   473 @@ -987,20 +1154,50 @@
       
   474  		switch (status) {
       
   475  		case RDS_RDMA_REMOTE_ERROR:
       
   476  			errmsg = "remote error"; break;
       
   477 -		case RDS_RDMA_CANCELED:
       
   478 -			errmsg = "operation was cancelled"; break;
       
   479 -		case RDS_RDMA_DROPPED:
       
   480 +		case RDS_RDMA_SEND_DROPPED:
       
   481  			errmsg = "operation was dropped"; break;
       
   482 -		case RDS_RDMA_OTHER_ERROR:
       
   483 +		case RDS_RDMA_SEND_CANCELED:
       
   484 +			errmsg = "operation was cancelled"; break;
       
   485 +		case RDS_RDMA_SEND_OTHER_ERROR:
       
   486  			errmsg = "other error"; break;
       
   487  		default:
       
   488  			errmsg = "unknown error"; break;
       
   489  		}
       
   490  
       
   491 -		printf("%s:%u: RDMA op %u failed: %s\n",
       
   492 +		trace("%s:%u: %s failed: %s\n",
       
   493  				inet_ntoa(t->dst_addr.sin_addr),
       
   494  				ntohs(t->dst_addr.sin_port),
       
   495 -				i, errmsg);
       
   496 +				type ? "SEND" : "RDMA",
       
   497 +				errmsg);
       
   498 +
       
   499 +		if (hdr &&
       
   500 +			(status == RDS_RDMA_SEND_DROPPED ||
       
   501 +			 status == RDS_RDMA_REMOTE_ERROR)) {
       
   502 +
       
   503 +			if (hdr->seq == seq) {
       
   504 +				hdr->retry = 1;
       
   505 +				if (hdr->seq > t->last_retry_seq) {
       
   506 +					if (status == RDS_RDMA_REMOTE_ERROR)
       
   507 +						hdr->rdma_remote_err = 1;
       
   508 +					t->retry_token[t->retry_index] = token;
       
   509 +					t->retry_index = (t->retry_index + 1) %
       
   510 +						(2 * opts->req_depth);
       
   511 +					t->retries += 1;
       
   512 +					t->last_retry_seq = hdr->seq;
       
   513 +					if (t->retries > 2 * opts->req_depth)
       
   514 +						die("Exceeded MAX retry entries..\n");
       
   515 +				}
       
   516 +			} else
       
   517 +				die("SEQ Out-Of-Sync: %u/%u\n", hdr->seq, seq);
       
   518 +		} else if (hdr) {
       
   519 +			hdr->pending = 0;
       
   520 +			hdr->retry = 0;
       
   521 +			hdr->rdma_remote_err = 0;
       
   522 +		}
       
   523 +	} else if (hdr) {
       
   524 +		hdr->pending = 0;
       
   525 +		hdr->retry = 0;
       
   526 +		hdr->rdma_remote_err = 0;
       
   527  	}
       
   528  
       
   529  	t->rdma_inflight[i] = 0;
       
   530 @@ -1007,6 +1204,9 @@
   190  	t->drain_rdmas = 0;
   531  	t->drain_rdmas = 0;
   191  }
   532  }
   192  
   533  
   193 +#if defined(__SVR4) && defined(__sun)
   534 +#if defined(__SVR4) && defined(__sun)
   194 +#undef MSG_MAXIOVLEN
   535 +#undef MSG_MAXIOVLEN
   195 +#endif
   536 +#endif
   196  #define MSG_MAXIOVLEN 2
   537  #define MSG_MAXIOVLEN 2
   197  
   538  
   198  /*
   539  /*
   199 @@ -1560,7 +1609,12 @@
   540 @@ -1018,11 +1218,14 @@
       
   541  	static char ctlbuf[1024];
       
   542  	struct cmsghdr *cmsg;
       
   543  
       
   544 -	msg->msg_control = ctlbuf;
       
   545 -	msg->msg_controllen = CMSG_SPACE(size);
       
   546 -
       
   547 -	cmsg = CMSG_FIRSTHDR(msg);
       
   548 -	cmsg->cmsg_level = sol;
       
   549 +	if (!msg->msg_control) {
       
   550 +		msg->msg_control = ctlbuf;
       
   551 +		msg->msg_controllen = CMSG_SPACE(size);
       
   552 +		cmsg = CMSG_FIRSTHDR(msg);
       
   553 +	} else {
       
   554 +		cmsg = (struct cmsghdr *)((char *)msg->msg_control + msg->msg_controllen);
       
   555 +		msg->msg_controllen += CMSG_SPACE(size);
       
   556 +	}cmsg->cmsg_level = sol;
       
   557  	cmsg->cmsg_type = type;
       
   558  	cmsg->cmsg_len = CMSG_LEN(size);
       
   559  	memcpy(CMSG_DATA(cmsg), ptr, size);
       
   560 @@ -1034,7 +1237,7 @@
       
   561   * the ACK packet.
       
   562   */
       
   563  static void rdma_build_cmsg_xfer(struct msghdr *msg, const struct header *hdr,
       
   564 -		unsigned int user_token, void *local_buf)
       
   565 +		uint64_t user_token, void *local_buf)
       
   566  {
       
   567  
       
   568  #define RDS_MAX_IOV 512 /* FIX_ME - put this into rds.h or use socket max ?*/
       
   569 @@ -1048,7 +1251,7 @@
       
   570  	rdma_size = hdr->rdma_size;
       
   571  	rdma_vector = hdr->rdma_vector;
       
   572  
       
   573 -	trace("RDS issuing rdma for token %x key %Lx len %u local_buf %p vector %u\n",
       
   574 +	trace("RDS issuing rdma for token 0x%lx key 0x%llx len %d local_buf %p vector %d\n",
       
   575  			user_token,
       
   576  			(unsigned long long) hdr->rdma_key,
       
   577  			rdma_size, local_buf,
       
   578 @@ -1102,6 +1305,15 @@
       
   579  	rdma_put_cmsg(msg, RDS_CMSG_RDMA_ARGS, &args, sizeof(args));
       
   580  }
       
   581  
       
   582 +static void build_cmsg_async_send(struct msghdr *msg, uint64_t user_token)
       
   583 +{
       
   584 +	struct rds_asend_args  args;
       
   585 +
       
   586 +	args.flags |= RDS_SEND_NOTIFY_ME;
       
   587 +	args.user_token = user_token;
       
   588 +	rdma_put_cmsg(msg, RDS_CMSG_ASYNC_SEND, &args, sizeof(args));
       
   589 +}
       
   590 +
       
   591  static void rdma_build_cmsg_dest(struct msghdr *msg, rds_rdma_cookie_t rdma_dest)
       
   592  {
       
   593  	rdma_put_cmsg(msg, RDS_CMSG_RDMA_DEST, &rdma_dest, sizeof(rdma_dest));
       
   594 @@ -1174,19 +1386,17 @@
       
   595  	hdr->index = qindex;
       
   596  }
       
   597  
       
   598 -static int send_packet(int fd, struct task *t,
       
   599 -		struct header *hdr, unsigned int size)
       
   600 +static int send_msg(int fd, struct task *t, struct header *hdr,
       
   601 +		    unsigned int size, struct options *opts, 
       
   602 +		    struct child_control *ctl)
       
   603  {
       
   604 -	unsigned char buf[size], *rdma_flight_recorder = NULL;
       
   605 +	unsigned char buf[size];
       
   606 +	uint8_t *rdma_flight_recorder = NULL;
       
   607  	rds_rdma_cookie_t cookie = 0;
       
   608  	struct msghdr msg;
       
   609  	struct iovec iov;
       
   610  	ssize_t ret;
       
   611  
       
   612 -	/* Make sure we always have the current sequence number.
       
   613 -	 * When we send ACK packets, the seq that gets filled in is
       
   614 -	 * stale. */
       
   615 -	hdr->seq = t->send_seq;
       
   616  	fill_hdr(buf, size, hdr);
       
   617  
       
   618  	memset(&msg, 0, sizeof(msg));
       
   619 @@ -1198,27 +1408,10 @@
       
   620  	iov.iov_base = buf;
       
   621  	iov.iov_len = size;
       
   622  
       
   623 -	/* If this is a REQ packet in which we pass the MR to the
       
   624 -	 * peer, extract the RDMA cookie and pass it on in the control
       
   625 -	 * message for now. */
       
   626 -	if (hdr->op == OP_REQ && hdr->rdma_op != 0) {
       
   627 -		if (hdr->rdma_key != 0) {
       
   628 -			/* We used GET_MR to obtain a key */
       
   629 -			rdma_build_cmsg_dest(&msg, hdr->rdma_key);
       
   630 -			cookie = hdr->rdma_key;
       
   631 -			hdr->rdma_key = 0;
       
   632 -		} else {
       
   633 -			/* Use the RDMA_MAP cmsg to have sendmsg do the
       
   634 -			 * mapping on the fly. */
       
   635 -			rdma_build_cmsg_map(&msg, hdr->rdma_addr,
       
   636 -					    hdr->rdma_size * hdr->rdma_vector,
       
   637 -					    &cookie);
       
   638 -		}
       
   639 -	}
       
   640  
       
   641  	/* If this is an ACK packet with RDMA, build the cmsg
       
   642 -	 * header that goes with it. */
       
   643 -	if (hdr->op == OP_ACK && hdr->rdma_op != 0) {
       
   644 +	   * header that goes with it. */
       
   645 +	if (hdr->op == OP_ACK && hdr->rdma_op != 0 && !hdr->rdma_remote_err) {
       
   646  		unsigned int qindex = hdr->index;
       
   647  
       
   648  		if (t->rdma_inflight[qindex] != 0) {
       
   649 @@ -1230,16 +1423,35 @@
       
   650  			 *
       
   651  			 * We return one of the more obscure error messages,
       
   652  			 * which we recognize and handle in the top loop. */
       
   653 -			trace("Drain RDMA 0x%x\n", rdma_user_token(t, qindex));
       
   654 +			trace("Drain RDMA 0x%lx\n", rdma_user_token(t, qindex, 0, hdr->seq));
       
   655  			errno = EBADSLT;
       
   656  			return -1;
       
   657  		}
       
   658  		rdma_build_cmsg_xfer(&msg, hdr,
       
   659 -				rdma_user_token(t, qindex),
       
   660 +				rdma_user_token(t, qindex, 0, hdr->seq),
       
   661  				t->local_buf[qindex]);
       
   662  		rdma_flight_recorder = &t->rdma_inflight[qindex];
       
   663 +	} else if (opts->async) {
       
   664 +		if (hdr->op == OP_REQ)
       
   665 +			build_cmsg_async_send(&msg,
       
   666 +				rdma_user_token(t, hdr->index, OP_REQ, hdr->seq));
       
   667 +		else
       
   668 +			build_cmsg_async_send(&msg,
       
   669 +				rdma_user_token(t, hdr->index, OP_ACK, hdr->seq));
       
   670  	}
       
   671  
       
   672 +	if (hdr->op == OP_REQ && hdr->rdma_op != 0) {
       
   673 +		if (hdr->rdma_key != 0) {
       
   674 +			rdma_build_cmsg_dest(&msg, hdr->rdma_key);
       
   675 +			cookie = hdr->rdma_key;
       
   676 +			hdr->rdma_key = 0;
       
   677 +		} else {
       
   678 +			rdma_build_cmsg_map(&msg, hdr->rdma_addr,
       
   679 +					hdr->rdma_size * hdr->rdma_vector,
       
   680 +					&cookie);
       
   681 +		}
       
   682 +	}
       
   683 +
       
   684  	ret = sendmsg(fd, &msg, 0);
       
   685  	if (ret < 0) {
       
   686  		if (errno != EAGAIN && errno != ENOBUFS)
       
   687 @@ -1256,10 +1468,41 @@
       
   688  		 * lower 32bit of the cookie */
       
   689  		rdma_key_o_meter_add(cookie);
       
   690  	}
       
   691 +
       
   692 +	hdr->pending = 1;
       
   693 +
       
   694 +	return ret;
       
   695 +}
       
   696 +
       
   697 +static int send_packet(int fd, struct task *t,
       
   698 +		struct header *hdr, unsigned int size,
       
   699 +		struct options *opts, struct child_control *ctl)
       
   700 +{
       
   701 +	ssize_t ret;
       
   702 +
       
   703 +	/* Make sure we always have the current sequence number.
       
   704 +	 * When we send ACK packets, the seq that gets filled in is
       
   705 +	 * stale. */
       
   706 +	hdr->seq = t->send_seq;
       
   707 +
       
   708 +	ret = send_msg(fd, t, hdr, size, opts, ctl);
       
   709 +	if (ret < 0) return ret;
       
   710 +
       
   711  	t->send_seq++;
       
   712  	return ret;
       
   713  }
       
   714  
       
   715 +static int resend_packet(int fd, struct task *t,
       
   716 +		struct header *hdr, unsigned int size,
       
   717 +		struct options *opts, struct child_control *ctl)
       
   718 +{
       
   719 +	ssize_t ret;
       
   720 +
       
   721 +	ret = send_msg(fd, t, hdr, size, opts, ctl);
       
   722 +
       
   723 +	return ret;
       
   724 +}
       
   725 +
       
   726  static int send_one(int fd, struct task *t,
       
   727  		struct options *opts,
       
   728  		struct child_control *ctl)
       
   729 @@ -1266,12 +1509,16 @@
       
   730  {
       
   731  	struct timeval start;
       
   732  	struct timeval stop;
       
   733 -	struct header hdr;
       
   734 +	struct header *hdr = &t->req_header[t->send_index]; 
       
   735  	int ret;
       
   736  
       
   737 -	build_header(t, &hdr, OP_REQ, t->send_index);
       
   738 +	if (opts->async && hdr->pending) {
       
   739 +		return -1;
       
   740 +	}
       
   741 +
       
   742 +	build_header(t, hdr, OP_REQ, t->send_index);
       
   743  	if (opts->rdma_size && t->send_seq > 10)
       
   744 -		rdma_build_req(fd, &hdr, t,
       
   745 +		rdma_build_req(fd, hdr, t,
       
   746  				opts->rdma_size,
       
   747  				opts->req_depth,
       
   748  				opts->rw_mode,
       
   749 @@ -1279,7 +1526,7 @@
       
   750  
       
   751  
       
   752  	gettimeofday(&start, NULL);
       
   753 -	ret = send_packet(fd, t, &hdr, opts->req_size);
       
   754 +	ret = send_packet(fd, t, hdr, opts->req_size, opts, ctl);
       
   755  	gettimeofday(&stop, NULL);
       
   756  
       
   757  	if (ret < 0)
       
   758 @@ -1302,10 +1549,15 @@
       
   759  		struct child_control *ctl)
       
   760  {
       
   761  	struct header *hdr = &t->ack_header[qindex];
       
   762 +	struct header *hdr2 = &t->ack2_header[qindex];
       
   763  	ssize_t ret;
       
   764  
       
   765 +	if (opts->async && hdr2->pending) {
       
   766 +		return -1;
       
   767 +	}
       
   768 +
       
   769  	/* send an ack in response to the req we just got */
       
   770 -	ret = send_packet(fd, t, hdr, opts->ack_size);
       
   771 +	ret = send_packet(fd, t, hdr, opts->ack_size, opts, ctl);
       
   772  	if (ret < 0)
       
   773  		return ret;
       
   774  	if (ret != opts->ack_size)
       
   775 @@ -1324,6 +1576,8 @@
       
   776  		break;
       
   777  	}
       
   778  
       
   779 +	memcpy(hdr2, hdr, sizeof(struct header));
       
   780 +
       
   781  	return ret;
       
   782  }
       
   783  
       
   784 @@ -1354,8 +1608,49 @@
       
   785  			struct child_control *ctl,
       
   786  			int can_send, int do_work)
       
   787  {
       
   788 +	struct header *hdr;
       
   789 +	unsigned int index;
       
   790 +	int req_size;
       
   791 +	int num_retries = t->retries;
       
   792 +	uint64_t token;
       
   793 +	unsigned int type;
       
   794 +	unsigned int index2;
       
   795 +	unsigned int i;
       
   796 +
       
   797 +	while (opts->async && num_retries > 0) {
       
   798 +		index = (t->retry_index - num_retries +
       
   799 +			(2 * opts->req_depth)) % (2 * opts->req_depth);
       
   800 +
       
   801 +		token = t->retry_token[index];
       
   802 +		type = token & 0x03;
       
   803 +		index2 = (token & 0xFFFFFFFF) >> 2;
       
   804 +		i = index2 % opts->req_depth;
       
   805 +
       
   806 +		if (type == OP_REQ)
       
   807 +			hdr = &t->req_header[i];
       
   808 +		else
       
   809 +			hdr = &t->ack2_header[i];
       
   810 +
       
   811 +		if (!hdr->retry)
       
   812 +			goto next;
       
   813 +
       
   814 +		if (hdr->op == OP_REQ)
       
   815 +			req_size = opts->req_size;
       
   816 +		else
       
   817 +			req_size = opts->ack_size;
       
   818 +
       
   819 +		if (resend_packet(fd, t, hdr, req_size, opts, ctl) < 0) {
       
   820 +			return -1;
       
   821 +		}
       
   822 +		hdr->retry = 0;
       
   823 +next:
       
   824 +		num_retries--;
       
   825 +	}
       
   826 +	t->last_retry_seq = t->retries = 0;
       
   827 +
       
   828  	if (ack_anything(fd, t, opts, ctl, can_send) < 0)
       
   829  		return -1;
       
   830 +
       
   831  	while (do_work && t->pending < opts->req_depth) {
       
   832  		if (!can_send)
       
   833  			goto eagain;
       
   834 @@ -1375,7 +1670,8 @@
       
   835  		rds_rdma_cookie_t *cookie,
       
   836  		struct sockaddr_in *sin,
       
   837  		struct timeval *tstamp,
       
   838 -		struct task *tasks)
       
   839 +		struct task *tasks,
       
   840 +		struct options *opts)
       
   841  {
       
   842  	struct cmsghdr *cmsg;
       
   843  	char cmsgbuf[256];
       
   844 @@ -1398,15 +1694,16 @@
       
   845  
       
   846  	if (ret < 0)
       
   847  		return ret;
       
   848 -	if (ret && ret < sizeof(struct header))
       
   849 +	if (ret && !strcmp(RDS_VERSION, peer_version) &&
       
   850 +		ret < sizeof(struct header))
       
   851  		die("recvmsg() returned short data: %zd", ret);
       
   852 -	if (msg.msg_namelen < sizeof(struct sockaddr_in))
       
   853 +	if (ret && msg.msg_namelen < sizeof(struct sockaddr_in))
       
   854  		die("socklen = %d < sizeof(sin) (%zu)\n",
       
   855  		    msg.msg_namelen, sizeof(struct sockaddr_in));
       
   856  
       
   857  	/* See if the message comes with a RDMA destination */
       
   858  	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
       
   859 -		struct rds_rdma_notify notify;
       
   860 +		struct rds_rdma_send_notify notify;
       
   861  
       
   862  		if (cmsg->cmsg_level != sol)
       
   863  			continue;
       
   864 @@ -1432,11 +1729,11 @@
       
   865  			memcpy(cookie, CMSG_DATA(cmsg), sizeof(*cookie));
       
   866  			break;
       
   867  
       
   868 -		case RDS_CMSG_RDMA_STATUS:
       
   869 +		case RDS_CMSG_RDMA_SEND_STATUS:
       
   870  			if (cmsg->cmsg_len < CMSG_LEN(sizeof(notify)))
       
   871  				die("RDS_CMSG_RDMA_DEST data too small");
       
   872  			memcpy(&notify, CMSG_DATA(cmsg), sizeof(notify));
       
   873 -			rdma_mark_completed(tasks, notify.user_token, notify.status);
       
   874 +			rdma_mark_completed(tasks, notify.user_token, notify.status, opts);
       
   875  			break;
       
   876  		}
       
   877  	}
       
   878 @@ -1445,7 +1742,8 @@
       
   879  
       
   880  static int recv_one(int fd, struct task *tasks,
       
   881  			struct options *opts,
       
   882 -		struct child_control *ctl)
       
   883 +		struct child_control *ctl,
       
   884 +		struct child_control *all_ctl)
       
   885  {
       
   886  	char buf[max(opts->req_size, opts->ack_size)];
       
   887  	rds_rdma_cookie_t rdma_dest = 0;
       
   888 @@ -1456,15 +1754,18 @@
       
   889  	uint16_t expect_index;
       
   890  	int task_index;
       
   891  	ssize_t ret;
       
   892 +	int	check_status;
       
   893  
       
   894 -	ret = recv_message(fd, buf, sizeof(buf), &rdma_dest, &sin, &tstamp, tasks);
       
   895 +
       
   896 +	ret = recv_message(fd, buf, sizeof(buf), &rdma_dest, &sin, &tstamp, tasks, opts);
       
   897  	if (ret < 0)
       
   898  		return ret;
       
   899  
       
   900  	/* If we received only RDMA completions or cong updates,
       
   901  	 * ret will be 0 */
       
   902 -	if (ret == 0)
       
   903 +	if (ret == 0) {
       
   904  		return 0;
       
   905 +	}
       
   906  
       
   907  	/* check the incoming sequence number */
       
   908  	task_index = ntohs(sin.sin_port) - opts->starting_port - 1;
       
   909 @@ -1508,16 +1809,32 @@
       
   910  	hdr.to_port = t->src_addr.sin_port;
       
   911  	hdr.index = expect_index;
       
   912  
       
   913 -	if (check_hdr(buf, ret, &hdr))
       
   914 -		die("header from %s:%u to id %u bogus\n",
       
   915 -		    inet_ntoa(sin.sin_addr), htons(sin.sin_port),
       
   916 -		    ntohs(t->src_addr.sin_port));
       
   917 +	check_status = check_hdr(buf, ret, &hdr, opts);
       
   918 +	if (check_status) {
       
   919 +		if (check_status > 0) {
       
   920 +			die("header from %s:%u to id %u bogus\n",
       
   921 +		    	inet_ntoa(sin.sin_addr), htons(sin.sin_port),
       
   922 +		    	ntohs(t->src_addr.sin_port));
       
   923 +		} else
       
   924 +			return 0;
       
   925 +	}
       
   926  
       
   927  	if (hdr.op == OP_ACK) {
       
   928 -		stat_inc(&ctl->cur[S_RTT_USECS],
       
   929 -			 usec_sub(&tstamp, &t->send_time[expect_index]));
       
   930 -		t->pending -= 1;
       
   931 +                uint64_t rtt_time = 
       
   932 +                  usec_sub(&tstamp, &t->send_time[expect_index]);
       
   933  
       
   934 +		stat_inc(&ctl->cur[S_RTT_USECS], rtt_time);
       
   935 +                if (rtt_time > rtt_threshold)
       
   936 +			print_outlier("Found RTT = 0x%lx\n", rtt_time);
       
   937 +
       
   938 +                if (show_histogram)
       
   939 +                {
       
   940 +                  ctl->latency_histogram[get_bucket(rtt_time)]++;
       
   941 +                }
       
   942 +
       
   943 +		if (t->pending > 0)
       
   944 +			t->pending -= 1;
       
   945 +
       
   946  		if (in_hdr.rdma_key)
       
   947  			rdma_process_ack(fd, &in_hdr, ctl);
       
   948  	} else {
       
   949 @@ -1549,6 +1866,7 @@
       
   950  }
       
   951  
       
   952  static void run_child(pid_t parent_pid, struct child_control *ctl,
       
   953 +			struct child_control *all_ctl,
       
   954  		      struct options *opts, uint16_t id, int active)
       
   955  {
       
   956  	struct sockaddr_in sin;
       
   957 @@ -1559,8 +1877,15 @@
       
   958  	struct task tasks[opts->nr_tasks];
   200  	struct timeval start;
   959  	struct timeval start;
   201          int do_work = opts->simplex ? active : 1;
   960          int do_work = opts->simplex ? active : 1;
   202  
   961 +	int j;
       
   962  
       
   963 +
   203 +#if defined(__SVR4) && defined(__sun)
   964 +#if defined(__SVR4) && defined(__sun)
   204 +	set_my_lgrp();
   965 +	set_my_lgrp();
   205 +	sin.sin_family = AF_INET_OFFLOAD;
   966 +	sin.sin_family = AF_INET_OFFLOAD;
   206 +#else
   967 +#else
   207  	sin.sin_family = AF_INET;
   968  	sin.sin_family = AF_INET;
   208 +#endif
   969 +#endif
   209  	sin.sin_port = htons(opts->starting_port + 1 + id);
   970  	sin.sin_port = htons(opts->starting_port + 1 + id);
   210  	sin.sin_addr.s_addr = htonl(opts->receive_addr);
   971  	sin.sin_addr.s_addr = htonl(opts->receive_addr);
   211  
   972  
   212 @@ -1572,7 +1626,11 @@
   973 @@ -1572,7 +1897,11 @@
   213  	for (i = 0; i < opts->nr_tasks; i++) {
   974  	for (i = 0; i < opts->nr_tasks; i++) {
   214  		tasks[i].nr = i;
   975  		tasks[i].nr = i;
   215  		tasks[i].src_addr = sin;
   976  		tasks[i].src_addr = sin;
   216 +#if defined(__SVR4) && defined(__sun)
   977 +#if defined(__SVR4) && defined(__sun)
   217 +		tasks[i].dst_addr.sin_family = AF_INET_OFFLOAD;
   978 +		tasks[i].dst_addr.sin_family = AF_INET_OFFLOAD;
   219  		tasks[i].dst_addr.sin_family = AF_INET;
   980  		tasks[i].dst_addr.sin_family = AF_INET;
   220 +#endif
   981 +#endif
   221  		tasks[i].dst_addr.sin_addr.s_addr = htonl(opts->send_addr);
   982  		tasks[i].dst_addr.sin_addr.s_addr = htonl(opts->send_addr);
   222  		tasks[i].dst_addr.sin_port = htons(opts->starting_port + 1 + i);
   983  		tasks[i].dst_addr.sin_port = htons(opts->starting_port + 1 + i);
   223  		tasks[i].send_time = alloca(opts->req_depth * sizeof(struct timeval));
   984  		tasks[i].send_time = alloca(opts->req_depth * sizeof(struct timeval));
   224 @@ -1625,6 +1683,10 @@
   985 @@ -1581,6 +1910,15 @@
       
   986  		tasks[i].rdma_buf = alloca(opts->req_depth * sizeof(uint64_t *));
       
   987  		tasks[i].local_buf = alloca(opts->req_depth * sizeof(uint64_t *));
       
   988  		tasks[i].ack_header = alloca(opts->req_depth * sizeof(struct header));
       
   989 +		tasks[i].ack2_header = alloca(opts->req_depth * sizeof(struct header));
       
   990 +		for (j=0;j<opts->req_depth;j++)
       
   991 +			tasks[i].ack2_header[j].pending = 0;
       
   992 +
       
   993 +		tasks[i].req_header = alloca(opts->req_depth * sizeof(struct header));
       
   994 +		for (j=0;j<opts->req_depth;j++)
       
   995 +			tasks[i].req_header[j].pending = 0;
       
   996 +
       
   997 +		tasks[i].retry_token = alloca(2 * opts->req_depth * sizeof(uint64_t));
       
   998  		tasks[i].rdma_next_op = (i & 1)? RDMA_OP_READ : RDMA_OP_WRITE;
       
   999  	}
       
  1000  
       
  1001 @@ -1611,7 +1949,7 @@
       
  1002  
       
  1003  		check_parent(parent_pid);
       
  1004  
       
  1005 -		ret = poll(&pfd, 1, -1);
       
  1006 +		ret = poll(&pfd, 1, 1000);
       
  1007  		if (ret < 0) {
       
  1008  			if (errno == EINTR)
       
  1009  				continue;
       
  1010 @@ -1621,10 +1959,14 @@
       
  1011  		pfd.events = POLLIN;
       
  1012  
       
  1013  		if (pfd.revents & POLLIN) {
       
  1014 -			while (recv_one(fd, tasks, opts, ctl) >= 0)
       
  1015 +			while (recv_one(fd, tasks, opts, ctl, all_ctl) >= 0)
   225  				;
  1016  				;
   226  		}
  1017  		}
   227  
  1018  
   228 +		/* stop sending if in shutdown phase */
  1019 +		/* stop sending if in shutdown phase */
   229 +		if (ctl->stopping)
  1020 +		if (ctl->stopping)
   230 +			continue;
  1021 +			continue;
   231 +
  1022 +
   232  		/* keep the pipeline full */
  1023  		/* keep the pipeline full */
   233  		can_send = !!(pfd.revents & POLLOUT);
  1024  		can_send = !!(pfd.revents & POLLOUT);
   234  		for (i = 0, t = tasks; i < opts->nr_tasks; i++, t++) {
  1025  		for (i = 0, t = tasks; i < opts->nr_tasks; i++, t++) {
   235 @@ -1665,8 +1727,12 @@
  1026 @@ -1633,6 +1975,7 @@
       
  1027  			if (t->drain_rdmas)
       
  1028  				continue;
       
  1029  			if (send_anything(fd, t, opts, ctl, can_send, do_work) < 0) {
       
  1030 +
       
  1031  				pfd.events |= POLLOUT;
       
  1032  
       
  1033  				/* If the send queue is full, we will see EAGAIN.
       
  1034 @@ -1665,8 +2008,12 @@
   236  	uint32_t i;
  1035  	uint32_t i;
   237  
  1036  
   238  	len = opts->nr_tasks * sizeof(*ctl);
  1037  	len = opts->nr_tasks * sizeof(*ctl);
   239 +#if defined(__SVR4) && defined(__sun)
  1038 +#if defined(__SVR4) && defined(__sun)
   240 +	ctl = (struct child_control *)mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, -1, 0);
  1039 +	ctl = (struct child_control *)mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, -1, 0);
   243  		   0, 0);
  1042  		   0, 0);
   244 +#endif
  1043 +#endif
   245  	if (ctl == MAP_FAILED)
  1044  	if (ctl == MAP_FAILED)
   246  		die("mmap of %u child control structs failed", opts->nr_tasks);
  1045  		die("mmap of %u child control structs failed", opts->nr_tasks);
   247  
  1046  
   248 @@ -1699,7 +1765,7 @@
  1047 @@ -1688,7 +2035,7 @@
       
  1048  				control_fd = -1;
       
  1049  			}
       
  1050  			rdma_key_o_meter_set_self(i);
       
  1051 -			run_child(parent, ctl + i, opts, i, active);
       
  1052 +			run_child(parent, ctl + i, ctl, opts, i, active);
       
  1053  			exit(0);
       
  1054  		}
       
  1055  		ctl[i].pid = pid;
       
  1056 @@ -1699,7 +2046,7 @@
   249  			continue;
  1057  			continue;
   250  		pid = waitpid(-1, NULL, WNOHANG);
  1058  		pid = waitpid(-1, NULL, WNOHANG);
   251  		if (pid)
  1059  		if (pid)
   252 -			die("child %u (pid %u) exited\n", i, pid);
  1060 -			die("child %u (pid %u) exited\n", i, pid);
   253 +			die("child %u (pid %u) exited\n", i, (int)pid);
  1061 +			die("child %u (pid %u) exited\n", i, (int)pid);
   254  		sleep(1);
  1062  		sleep(1);
   255  		i--; /* try this child again */
  1063  		i--; /* try this child again */
   256  	}
  1064  	}
   257 @@ -1823,6 +1889,7 @@
  1065 @@ -1823,6 +2170,7 @@
   258  
  1066  
   259  	if (disable)
  1067  	if (disable)
   260  		return;
  1068  		return;
   261 +#if !(defined(__SVR4) && defined(__sun))
  1069 +#if !(defined(__SVR4) && defined(__sun))
   262  	if ((fp = fopen("/proc/stat", "r")) == NULL) {
  1070  	if ((fp = fopen("/proc/stat", "r")) == NULL) {
   263  		fprintf(stderr, "Cannot open /proc/stat (%s) - "
  1071  		fprintf(stderr, "Cannot open /proc/stat (%s) - "
   264  				"not printing cpu stats\n",
  1072  				"not printing cpu stats\n",
   265 @@ -1856,10 +1923,37 @@
  1073 @@ -1856,10 +2204,37 @@
   266  		}
  1074  		}
   267  	}
  1075  	}
   268  	fclose(fp);
  1076  	fclose(fp);
   269 +#else
  1077 +#else
   270 +#define NSEC_TO_TICK(v)		(v * sysconf(_SC_CLK_TCK)/1000000000)
  1078 +#define NSEC_TO_TICK(v)		(v * sysconf(_SC_CLK_TCK)/1000000000)
   298 +		       ",intr:count");
  1106 +		       ",intr:count");
   299 +#endif
  1107 +#endif
   300  	} else {
  1108  	} else {
   301  		struct sys_stats sys;
  1109  		struct sys_stats sys;
   302  		unsigned long sum = 0;
  1110  		unsigned long sum = 0;
   303 @@ -1884,6 +1978,7 @@
  1111 @@ -1884,6 +2259,7 @@
   304  		 *  5	irq
  1112  		 *  5	irq
   305  		 *  6	softirq
  1113  		 *  6	softirq
   306  		 */
  1114  		 */
   307 +#if !(defined(__SVR4) && defined(__sun))
  1115 +#if !(defined(__SVR4) && defined(__sun))
   308  		printf(",%f,%f,%f,%f,%Lu",
  1116  		printf(",%f,%f,%f,%f,%Lu",
   309  			(sys.times[0] + sys.times[1]) * scale,
  1117  			(sys.times[0] + sys.times[1]) * scale,
   310  			sys.times[2] * scale,
  1118  			sys.times[2] * scale,
   311 @@ -1890,6 +1985,14 @@
  1119 @@ -1890,6 +2266,14 @@
   312  			(sys.times[3] + sys.times[4]) * scale,
  1120  			(sys.times[3] + sys.times[4]) * scale,
   313  			(sys.times[5] + sys.times[6]) * scale,
  1121  			(sys.times[5] + sys.times[6]) * scale,
   314  			sys.intr);
  1122  			sys.intr);
   315 +#else
  1123 +#else
   316 +		/* Solaris kstat doesn't provide irq/softirq info. */
  1124 +		/* Solaris kstat doesn't provide irq/softirq info. */
   321 +			sys.intr);
  1129 +			sys.intr);
   322 +#endif
  1130 +#endif
   323  	}
  1131  	}
   324  	prev = current;
  1132  	prev = current;
   325  }
  1133  }
   326 @@ -1903,6 +2006,10 @@
  1134 @@ -1903,6 +2287,10 @@
   327  	static socklen_t buflen = 0;
  1135  	static socklen_t buflen = 0;
   328  	static int sock_fd = -1;
  1136  	static int sock_fd = -1;
   329  	int i, count, item_size;
  1137  	int i, count, item_size;
   330 +#if defined(__SVR4) && defined(__sun)
  1138 +#if defined(__SVR4) && defined(__sun)
   331 +	socklen_t len;
  1139 +	socklen_t len;
   332 +	struct rds_info_arg arg;
  1140 +	struct rds_info_arg arg;
   333 +#endif
  1141 +#endif
   334  
  1142  
   335  	if (sock_fd < 0) {
  1143  	if (sock_fd < 0) {
   336  		sock_fd = socket(pf, SOCK_SEQPACKET, 0);
  1144  		sock_fd = socket(pf, SOCK_SEQPACKET, 0);
   337 @@ -1912,6 +2019,7 @@
  1145 @@ -1912,6 +2300,7 @@
   338  
  1146  
   339  	/* We should only loop once on the first call; after that the
  1147  	/* We should only loop once on the first call; after that the
   340  	 * buffer requirements for RDS counters should not change. */
  1148  	 * buffer requirements for RDS counters should not change. */
   341 +#if !(defined(__SVR4) && defined(__sun))
  1149 +#if !(defined(__SVR4) && defined(__sun))
   342  	while ((item_size = getsockopt(sock_fd, sol, RDS_INFO_COUNTERS, curr, &buflen)) < 0) {
  1150  	while ((item_size = getsockopt(sock_fd, sol, RDS_INFO_COUNTERS, curr, &buflen)) < 0) {
   343  		if (errno != ENOSPC)
  1151  		if (errno != ENOSPC)
   344  			die_errno("getsockopt(RDS_INFO_COUNTERS) failed");
  1152  			die_errno("getsockopt(RDS_INFO_COUNTERS) failed");
   345 @@ -1919,7 +2027,29 @@
  1153 @@ -1919,7 +2308,29 @@
   346  		if (!curr)
  1154  		if (!curr)
   347  			die_errno("Cannot allocate buffer for stats counters");
  1155  			die_errno("Cannot allocate buffer for stats counters");
   348  	}
  1156  	}
   349 +#else
  1157 +#else
   350 +	int retcode;
  1158 +	int retcode;
   370 +#endif
  1178 +#endif
   371 +
  1179 +
   372  	if (item_size > sizeof(*ctr))
  1180  	if (item_size > sizeof(*ctr))
   373  		die("Bad counter item size in RDS_INFO_COUNTERS (got %d, max %zd)\n",
  1181  		die("Bad counter item size in RDS_INFO_COUNTERS (got %d, max %zd)\n",
   374  				item_size, sizeof(*ctr));
  1182  				item_size, sizeof(*ctr));
   375 @@ -1932,8 +2062,11 @@
  1183 @@ -1932,8 +2343,11 @@
   376  	}
  1184  	}
   377  
  1185  
   378  	for (i = 0; i < count; ++i)
  1186  	for (i = 0; i < count; ++i)
   379 +#if !(defined(__SVR4) && defined(__sun))
  1187 +#if !(defined(__SVR4) && defined(__sun))
   380  		memcpy(ctr + i, curr + i * item_size, item_size);
  1188  		memcpy(ctr + i, curr + i * item_size, item_size);
   383 +		memcpy(ctr + i, ((void *)(uintptr_t)arg.datap) + i * item_size, item_size);
  1191 +		memcpy(ctr + i, ((void *)(uintptr_t)arg.datap) + i * item_size, item_size);
   384 +#endif
  1192 +#endif
   385  	gettimeofday(&now, NULL);
  1193  	gettimeofday(&now, NULL);
   386  
  1194  
   387  	if (initialize) {
  1195  	if (initialize) {
   388 @@ -1957,6 +2090,10 @@
  1196 @@ -1957,6 +2371,10 @@
   389  	memcpy(prev, ctr, count * sizeof(*ctr));
  1197  	memcpy(prev, ctr, count * sizeof(*ctr));
   390  	last_ts = now;
  1198  	last_ts = now;
   391  
  1199  
   392 +#if defined(__SVR4) && defined(__sun)
  1200 +#if defined(__SVR4) && defined(__sun)
   393 +	free((void *)(uintptr_t)arg.datap);
  1201 +	free((void *)(uintptr_t)arg.datap);
   394 +#endif
  1202 +#endif
   395 +
  1203 +
   396  	get_stats(initialize);
  1204  	get_stats(initialize);
   397  }
  1205  }
   398  
  1206  
   399 @@ -1967,7 +2104,7 @@
  1207 @@ -1967,7 +2385,7 @@
   400  
  1208  
   401  	pid = waitpid(-1, &status, wflags);
  1209  	pid = waitpid(-1, &status, wflags);
   402  	if (pid < 0)
  1210  	if (pid < 0)
   403 -		die("waitpid returned %u", pid);
  1211 -		die("waitpid returned %u", pid);
   404 +		die("waitpid returned %u", (int)pid);
  1212 +		die("waitpid returned %u", (int)pid);
   405  	if (pid == 0)
  1213  	if (pid == 0)
   406  		return 0;
  1214  		return 0;
   407  
  1215  
   408 @@ -1975,15 +2112,15 @@
  1216 @@ -1975,15 +2393,15 @@
   409  		if (WEXITSTATUS(status) == 0)
  1217  		if (WEXITSTATUS(status) == 0)
   410  			return 1;
  1218  			return 1;
   411  		die("child pid %u exited with status %d\n",
  1219  		die("child pid %u exited with status %d\n",
   412 -				pid, WEXITSTATUS(status));
  1220 -				pid, WEXITSTATUS(status));
   413 +				(int)pid, WEXITSTATUS(status));
  1221 +				(int)pid, WEXITSTATUS(status));
   422 -	die("child pid %u wait status %d\n", pid, status);
  1230 -	die("child pid %u wait status %d\n", pid, status);
   423 +	die("child pid %u wait status %d\n", (int)pid, status);
  1231 +	die("child pid %u wait status %d\n", (int)pid, status);
   424  }
  1232  }
   425  
  1233  
   426  static void release_children_and_wait(struct options *opts,
  1234  static void release_children_and_wait(struct options *opts,
   427 @@ -2139,7 +2276,12 @@
  1235 @@ -1995,9 +2413,13 @@
       
  1236  	struct counter summary[NR_STATS];
       
  1237  	struct timeval start, end, now, first_ts, last_ts;
       
  1238  	double cpu_total = 0;
       
  1239 -	uint16_t i, cpu_samples = 0;
       
  1240 +	uint16_t i, j, cpu_samples = 0;
       
  1241  	uint16_t nr_running;
       
  1242 +        uint64_t latency_histogram[MAX_BUCKETS];
       
  1243  
       
  1244 +	if (show_histogram) 
       
  1245 +	        memset(latency_histogram, 0, sizeof(latency_histogram));
       
  1246 +
       
  1247  	gettimeofday(&start, NULL);
       
  1248  	start.tv_sec += 2;
       
  1249  	for (i = 0; i < opts->nr_tasks; i++)
       
  1250 @@ -2139,7 +2561,12 @@
   428  	control_fd = -1;
  1251  	control_fd = -1;
   429  
  1252  
   430  	if (nr_running) {
  1253  	if (nr_running) {
   431 +		/* let everything gracefully stop before we kill the chillins */
  1254 +		/* let everything gracefully stop before we kill the chillins */
   432  		for (i = 0; i < opts->nr_tasks; i++)
  1255  		for (i = 0; i < opts->nr_tasks; i++)
   435 +
  1258 +
   436 +		for (i = 0; i < opts->nr_tasks; i++)
  1259 +		for (i = 0; i < opts->nr_tasks; i++)
   437  			kill(ctl[i].pid, SIGTERM);
  1260  			kill(ctl[i].pid, SIGTERM);
   438  		stop_soakers(soak_arr);
  1261  		stop_soakers(soak_arr);
   439  	}
  1262  	}
   440 @@ -2517,7 +2659,11 @@
  1263 @@ -2167,6 +2594,19 @@
       
  1264  			avg(&summary[S_SENDMSG_USECS]),
       
  1265  			avg(&summary[S_RTT_USECS]),
       
  1266  			soak_arr? scale * cpu_total : -1.0);
       
  1267 +
       
  1268 +		if (show_histogram) 
       
  1269 +		{
       
  1270 +			for (i = 0; i < opts->nr_tasks; i++)
       
  1271 +			  for (j=0;j < MAX_BUCKETS; j++)
       
  1272 +			    latency_histogram[j] += ctl[i].latency_histogram[j];
       
  1273 +			    
       
  1274 +			printf("\nRTT histogram\n");
       
  1275 +			printf("RTT (us)        \t\t    Count\n");
       
  1276 +			for (i=0;i < MAX_BUCKETS; i++)
       
  1277 +			  printf("[%6u - %6u] \t\t %8u\n", 1 << i, 1 << (i+1), 
       
  1278 +			         (unsigned int)latency_histogram[i]);
       
  1279 +		}
       
  1280  	}
       
  1281  }
       
  1282  
       
  1283 @@ -2220,6 +2660,21 @@
       
  1284  {
       
  1285  	ssize_t ret;
       
  1286  
       
  1287 +	if (size == sizeof(struct options)) {
       
  1288 +		memset(ptr, 0, size);
       
  1289 +		ret = read(fd, peer_version, VERSION_MAX_LEN);
       
  1290 +		if (ret != VERSION_MAX_LEN)
       
  1291 +			die_errno("Failed to read version");
       
  1292 +
       
  1293 +		if (strcmp(peer_version, RDS_VERSION)) {
       
  1294 +			ptr += ret;
       
  1295 +			memcpy(ptr, peer_version, VERSION_MAX_LEN);
       
  1296 +			size = sizeof(struct options_2_0_6) - ret;
       
  1297 +		} else
       
  1298 +			size -= ret;
       
  1299 +		ptr += ret;
       
  1300 +	}
       
  1301 +
       
  1302  	while (size) {
       
  1303  		ret = read(fd, ptr, size);
       
  1304  		if (ret < 0)
       
  1305 @@ -2233,6 +2688,7 @@
       
  1306  
       
  1307  static void encode_options(struct options *dst, const struct options *src)
       
  1308  {
       
  1309 +	memcpy(dst->version, src->version, VERSION_MAX_LEN);
       
  1310  	dst->req_depth = htonl(src->req_depth);
       
  1311  	dst->req_size = htonl(src->req_size);
       
  1312  	dst->ack_size = htonl(src->ack_size);
       
  1313 @@ -2262,10 +2718,13 @@
       
  1314          dst->simplex = src->simplex;                    /* byte sized */
       
  1315          dst->rw_mode = src->rw_mode;                    /* byte sized */
       
  1316          dst->rdma_vector = htonl(src->rdma_vector);
       
  1317 +	dst->tos = src->tos;
       
  1318 +	dst->async = src->async;
       
  1319  }
       
  1320  
       
  1321  static void decode_options(struct options *dst, const struct options *src)
       
  1322  {
       
  1323 +	memcpy(dst->version, src->version, VERSION_MAX_LEN);
       
  1324  	dst->req_depth = ntohl(src->req_depth);
       
  1325  	dst->req_size = ntohl(src->req_size);
       
  1326  	dst->ack_size = ntohl(src->ack_size);
       
  1327 @@ -2295,6 +2754,8 @@
       
  1328          dst->simplex = src->simplex;                    /* byte sized */
       
  1329          dst->rw_mode = src->rw_mode;                    /* byte sized */
       
  1330  	dst->rdma_vector = ntohl(src->rdma_vector);
       
  1331 +	dst->tos = src->tos;
       
  1332 +	dst->async = src->async;
       
  1333  }
       
  1334  
       
  1335  static void verify_option_encdec(const struct options *opts)
       
  1336 @@ -2316,6 +2777,25 @@
       
  1337  		die("encode/decode check of options struct failed");
       
  1338  }
       
  1339  
       
  1340 +static void reset_conn(struct options *opts)
       
  1341 +{
       
  1342 +	struct rds_reset val;
       
  1343 +	int fd;
       
  1344 +	struct sockaddr_in sin;
       
  1345 +
       
  1346 +	sin.sin_family = AF_INET;
       
  1347 +	sin.sin_port = htons(opts->starting_port);
       
  1348 +	sin.sin_addr.s_addr = htonl(opts->receive_addr);
       
  1349 +
       
  1350 +	fd = bound_socket(pf, SOCK_SEQPACKET, 0, &sin);
       
  1351 +
       
  1352 +	val.tos = opts->tos;
       
  1353 +	val.src.s_addr = htonl(opts->receive_addr);
       
  1354 +	val.dst.s_addr = htonl(opts->send_addr);
       
  1355 +	if (setsockopt(fd, sol, RDS_CONN_RESET, &val, sizeof(val)))
       
  1356 +		die_errno("setsockopt RDS_CONN_RESET failed");
       
  1357 +}
       
  1358 +
       
  1359  static int active_parent(struct options *opts, struct soak_control *soak_arr)
       
  1360  {
       
  1361  	struct options enc_options;
       
  1362 @@ -2324,6 +2804,11 @@
       
  1363  	int fd;
       
  1364  	uint8_t ok;
       
  1365  
       
  1366 +	if (reset_connection) {
       
  1367 +		reset_conn(opts);
       
  1368 +		return 0;
       
  1369 +	}
       
  1370 +
       
  1371  	if (opts->show_params) {
       
  1372  		unsigned int k;
       
  1373  
       
  1374 @@ -2387,7 +2872,11 @@
       
  1375  	 * We just tell the peer what options to use.
       
  1376  	 */
       
  1377  	encode_options(&enc_options, opts);
       
  1378 -	peer_send(fd, &enc_options, sizeof(struct options));
       
  1379 +	if (opts->tos || opts->async)
       
  1380 +		peer_send(fd, &enc_options, sizeof(struct options));
       
  1381 +	else
       
  1382 +		peer_send(fd, &enc_options.req_depth,
       
  1383 +				sizeof(struct options_2_0_6));
       
  1384  
       
  1385  	printf("negotiated options, tasks will start in 2 seconds\n");
       
  1386  	ctl = start_children(opts, 1);
       
  1387 @@ -2517,7 +3006,11 @@
   441  	/* an extra terminating entry which will be all 0s */
  1388  	/* an extra terminating entry which will be all 0s */
   442  	len = (nr_soak + 1) * sizeof(struct soak_control);
  1389  	len = (nr_soak + 1) * sizeof(struct soak_control);
   443  	soak_arr = mmap(NULL, len, PROT_READ|PROT_WRITE,
  1390  	soak_arr = mmap(NULL, len, PROT_READ|PROT_WRITE,
   444 +#if defined(__SVR4) && defined(__sun)
  1391 +#if defined(__SVR4) && defined(__sun)
   445 +			MAP_ANONYMOUS|MAP_SHARED, -1, 0);
  1392 +			MAP_ANONYMOUS|MAP_SHARED, -1, 0);
   447  			MAP_ANONYMOUS|MAP_SHARED, 0, 0);
  1394  			MAP_ANONYMOUS|MAP_SHARED, 0, 0);
   448 +#endif
  1395 +#endif
   449  	if (soak_arr == MAP_FAILED)
  1396  	if (soak_arr == MAP_FAILED)
   450  		die("mmap of %ld soak control structs failed", nr_soak);
  1397  		die("mmap of %ld soak control structs failed", nr_soak);
   451  
  1398  
   452 @@ -2589,6 +2735,7 @@
  1399 @@ -2572,6 +3065,10 @@
       
  1400  	OPT_CONNECT_RETRIES,
       
  1401  	OPT_USE_CONG_MONITOR,
       
  1402  	OPT_PERFDATA,
       
  1403 +        OPT_SHOW_OUTLIERS,
       
  1404 +        OPT_SHOW_HISTOGRAM,
       
  1405 +	OPT_RESET,
       
  1406 +	OPT_ASYNC,
       
  1407  };
       
  1408  
       
  1409  static struct option long_options[] = {
       
  1410 @@ -2584,11 +3081,13 @@
       
  1411  { "send-addr",		required_argument,	NULL,	's'	},
       
  1412  { "port",		required_argument,	NULL,	'p'	},
       
  1413  { "time",		required_argument,	NULL,	'T'	},
       
  1414 +{ "tos",                required_argument,      NULL,   'Q'     },
       
  1415  { "report-cpu",		no_argument,		NULL,	'c'	},
       
  1416  { "report-summary",	no_argument,		NULL,	'z'	},
   453  { "rtprio",		no_argument,		NULL,	'R'	},
  1417  { "rtprio",		no_argument,		NULL,	'R'	},
   454  { "verify",		no_argument,		NULL,	'v'	},
  1418  { "verify",		no_argument,		NULL,	'v'	},
   455  { "trace",		no_argument,		NULL,	'V'	},
  1419  { "trace",		no_argument,		NULL,	'V'	},
   456 +{ "lgrpid",		required_argument,	NULL,	'g'	},
  1420 +{ "lgrpid",		required_argument,	NULL,	'g'	},
   457  
  1421  
   458  { "rdma-use-once",	required_argument,	NULL,	OPT_RDMA_USE_ONCE },
  1422  { "rdma-use-once",	required_argument,	NULL,	OPT_RDMA_USE_ONCE },
   459  { "rdma-use-get-mr",	required_argument,	NULL,	OPT_RDMA_USE_GET_MR },
  1423  { "rdma-use-get-mr",	required_argument,	NULL,	OPT_RDMA_USE_GET_MR },
   460 @@ -2652,7 +2799,7 @@
  1424 @@ -2601,6 +3100,10 @@
       
  1425  { "show-perfdata",	no_argument,		NULL,	OPT_PERFDATA },
       
  1426  { "connect-retries",	required_argument,	NULL,	OPT_CONNECT_RETRIES },
       
  1427  { "use-cong-monitor",	required_argument,	NULL,	OPT_USE_CONG_MONITOR },
       
  1428 +{ "show-outliers",      required_argument,      NULL,   OPT_SHOW_OUTLIERS    },
       
  1429 +{ "show-histogram",     no_argument,            NULL,   OPT_SHOW_HISTOGRAM   },
       
  1430 +{ "reset",              no_argument,            NULL,   OPT_RESET },
       
  1431 +{ "async",              no_argument,            NULL,   OPT_ASYNC },
       
  1432  
       
  1433  { NULL }
       
  1434  };
       
  1435 @@ -2640,6 +3143,8 @@
       
  1436  	opts.use_cong_monitor = 1;
       
  1437  	opts.rdma_use_fence = 1;
       
  1438  	opts.rdma_cache_mrs = 0;
       
  1439 +	opts.rdma_use_once = 0;
       
  1440 +	opts.rdma_use_get_mr = 0;
       
  1441  	opts.rdma_alignment = 0;
       
  1442  	opts.rdma_key_o_meter = 0;
       
  1443  	opts.show_params = 0;
       
  1444 @@ -2648,11 +3153,17 @@
       
  1445          opts.simplex = 0;
       
  1446          opts.rw_mode = 0;
       
  1447  	opts.rdma_vector = 1;
       
  1448 +        rtt_threshold = ~0U;
       
  1449 +        show_histogram = 0;
       
  1450 +	opts.tos = 0;
       
  1451 +	reset_connection = 0;
       
  1452 +	opts.async = 0;
       
  1453 +	strcpy(opts.version, RDS_VERSION);
       
  1454  
   461  	while(1) {
  1455  	while(1) {
   462  		int c, index;
  1456  		int c, index;
   463  
  1457  
   464 -		c = getopt_long(argc, argv, "+a:cD:d:hI:M:op:q:Rr:s:t:T:vVz",
  1458 -		c = getopt_long(argc, argv, "+a:cD:d:hI:M:op:q:Rr:s:t:T:vVz",
   465 +		c = getopt_long(argc, argv, "+a:cD:d:hI:M:op:q:Rr:s:t:T:vVg:z",
  1459 +		c = getopt_long(argc, argv, "+a:cD:d:hI:M:op:q:Rr:s:t:T:Q:vVg:z",
   466  				long_options, &index);
  1460  				long_options, &index);
   467  		if (c == -1)
  1461  		if (c == -1)
   468  			break;
  1462  			break;
   469 @@ -2711,6 +2858,10 @@
  1463 @@ -2702,6 +3213,9 @@
       
  1464  			case 'T':
       
  1465  				opts.run_time = parse_ull(optarg, (uint32_t)~0);
       
  1466  				break;
       
  1467 +			case 'Q':
       
  1468 +				opts.tos = parse_ull(optarg, (uint8_t)~0);
       
  1469 +				break;
       
  1470  			case 'z':
       
  1471  				opts.summary_only = 1;
       
  1472  				break;
       
  1473 @@ -2711,9 +3225,25 @@
   470  			case 'V':
  1474  			case 'V':
   471  				opts.tracing = 1;
  1475  				opts.tracing = 1;
   472  				break;
  1476  				break;
   473 +			case 'g':
  1477 +			case 'g':
   474 +				lgrp_id = (lgrp_id_t)parse_ull(optarg,
  1478 +				lgrp_id = (lgrp_id_t)parse_ull(optarg,
   475 +				    (uint32_t)~0);
  1479 +				    (uint32_t)~0);
   476 +				break;
  1480 +				break;
       
  1481 +                        case OPT_SHOW_OUTLIERS:
       
  1482 +                                rtt_threshold = parse_ull(optarg, ~0U);
       
  1483 +                                break;
       
  1484 +                        case OPT_SHOW_HISTOGRAM:
       
  1485 +                                show_histogram = 1;
       
  1486 +                                break;
   477  			case OPT_USE_CONG_MONITOR:
  1487  			case OPT_USE_CONG_MONITOR:
   478  				opts.use_cong_monitor = parse_ull(optarg, 1);
  1488  				opts.use_cong_monitor = parse_ull(optarg, 1);
   479  				break;
  1489  				break;
   480 @@ -2786,6 +2937,7 @@
  1490 +			case OPT_RESET:
       
  1491 +				reset_connection = 1;
       
  1492 +				break;
       
  1493 +			case OPT_ASYNC:
       
  1494 +				opts.async = 1;
       
  1495 +				break;
       
  1496  			case OPT_RDMA_USE_ONCE:
       
  1497  				opts.rdma_use_once = parse_ull(optarg, 1);
       
  1498  				break;
       
  1499 @@ -2786,6 +3316,7 @@
   481  	if (opts.rdma_size && 0)
  1500  	if (opts.rdma_size && 0)
   482  		opts.rdma_size = (opts.rdma_size + 4095) & ~4095;
  1501  		opts.rdma_size = (opts.rdma_size + 4095) & ~4095;
   483  
  1502  
   484 +	set_my_lgrp();
  1503 +	set_my_lgrp();
   485  	opt = opts;
  1504  	opt = opts;
   542 +#endif
  1561 +#endif
   543 +
  1562 +
   544  /* Like inet_ntoa, but can be re-entered several times without clobbering
  1563  /* Like inet_ntoa, but can be re-entered several times without clobbering
   545   * the previously returned string. */
  1564   * the previously returned string. */
   546  static const char *paddr(int af, const void *addrp)
  1565  static const char *paddr(int af, const void *addrp)
   547 @@ -234,8 +250,10 @@
  1566 @@ -134,18 +150,20 @@
       
  1567  {
       
  1568  	struct rds_info_connection conn;
       
  1569  
       
  1570 -	printf("\nRDS Connections:\n%15s %15s %16s %16s %3s\n",
       
  1571 -		"LocalAddr", "RemoteAddr", "NextTX", "NextRX", "Flg");
       
  1572 +	printf("\nRDS Connections:\n%15s %15s %4s %16s %16s %4s\n",
       
  1573 +		"LocalAddr", "RemoteAddr", "Tos", "NextTX", "NextRX", "Flgs");
       
  1574  	
       
  1575  	for_each(conn, data, each, len) {
       
  1576 -		printf("%15s %15s %16"PRIu64" %16"PRIu64" %c%c%c\n",
       
  1577 +		printf("%15s %15s %4u %16"PRIu64" %16"PRIu64" %c%c%c%c\n",
       
  1578  			ipv4addr(conn.laddr),
       
  1579  			ipv4addr(conn.faddr),
       
  1580 +			conn.tos,
       
  1581  			conn.next_tx_seq,
       
  1582  			conn.next_rx_seq,
       
  1583  			rds_conn_flag(conn, SENDING, 's'),
       
  1584  			rds_conn_flag(conn, CONNECTING, 'c'),
       
  1585 -			rds_conn_flag(conn, CONNECTED, 'C'));
       
  1586 +			rds_conn_flag(conn, CONNECTED, 'C'),
       
  1587 +			rds_conn_flag(conn, ERROR, 'E'));
       
  1588  	}
       
  1589  }
       
  1590  
       
  1591 @@ -153,16 +171,17 @@
       
  1592  {
       
  1593  	struct rds_info_message msg;
       
  1594  
       
  1595 -	printf("\n%s Message Queue:\n%15s %5s %15s %5s %16s %10s\n",
       
  1596 +	printf("\n%s Message Queue:\n%15s %5s %15s %5s %4s %16s %10s\n",
       
  1597  		(char *)extra,
       
  1598 -		"LocalAddr", "LPort", "RemoteAddr", "RPort", "Seq", "Bytes");
       
  1599 +		"LocalAddr", "LPort", "RemoteAddr", "RPort", "Tos","Seq", "Bytes");
       
  1600  	
       
  1601  	for_each(msg, data, each, len) {
       
  1602 -		printf("%15s %5u %15s %5u %16"PRIu64" %10u\n",
       
  1603 +		printf("%15s %5u %15s %5u %4u %16"PRIu64" %10u\n",
       
  1604  			ipv4addr(msg.laddr),
       
  1605  			ntohs(msg.lport),
       
  1606  			ipv4addr(msg.faddr),
       
  1607  			ntohs(msg.fport),
       
  1608 +			msg.tos,
       
  1609  			msg.seq, msg.len);
       
  1610  	}
       
  1611  }
       
  1612 @@ -191,13 +210,14 @@
       
  1613  {
       
  1614  	struct rds_info_rdma_connection ic;
       
  1615  
       
  1616 -	printf("\nRDS IB Connections:\n%15s %15s %32s %32s\n",
       
  1617 -		"LocalAddr", "RemoteAddr", "LocalDev", "RemoteDev");
       
  1618 +	printf("\nRDS IB Connections:\n%15s %15s %4s %3s %32s %32s\n",
       
  1619 +		"LocalAddr", "RemoteAddr", "Tos", "SL", "LocalDev", "RemoteDev");
       
  1620  
       
  1621  	for_each(ic, data, each, len) {
       
  1622 -		printf("%15s %15s %32s %32s",
       
  1623 +		printf("%15s %15s %4u %3u %32s %32s",
       
  1624  			ipv4addr(ic.src_addr),
       
  1625  			ipv4addr(ic.dst_addr),
       
  1626 +			ic.tos,ic.sl,
       
  1627  			ipv6addr(ic.src_gid),
       
  1628  			ipv6addr(ic.dst_gid));
       
  1629  
       
  1630 @@ -234,8 +254,10 @@
   548  		print_msgs, "Send", 0 },
  1631  		print_msgs, "Send", 0 },
   549  	['t'] = { RDS_INFO_RETRANS_MESSAGES, "retransmit queue messages",
  1632  	['t'] = { RDS_INFO_RETRANS_MESSAGES, "retransmit queue messages",
   550  		  print_msgs, "Retransmit", 0 },
  1633  		  print_msgs, "Retransmit", 0 },
   551 +#if !(defined(__SVR4) && defined(__sun))
  1634 +#if !(defined(__SVR4) && defined(__sun))
   552  	['T'] = { RDS_INFO_TCP_SOCKETS, "TCP transport sockets",
  1635  	['T'] = { RDS_INFO_TCP_SOCKETS, "TCP transport sockets",
   553  		  print_tcp_socks, NULL, 0 },
  1636  		  print_tcp_socks, NULL, 0 },
   554 +#endif
  1637 +#endif
   555  	['I'] = { RDS_INFO_IB_CONNECTIONS, "IB transport connections",
  1638  	['I'] = { RDS_INFO_IB_CONNECTIONS, "IB transport connections",
   556  		  print_ib_conns, NULL, 0 },
  1639  		  print_ib_conns, NULL, 0 },
   557  };
  1640  };
   558 @@ -266,6 +284,9 @@
  1641 @@ -266,6 +288,9 @@
   559  	char optstring[258] = "v+";
  1642  	char optstring[258] = "v+";
   560  	int given_options = 0;
  1643  	int given_options = 0;
   561  	socklen_t len = 0;
  1644  	socklen_t len = 0;
   562 +#if defined(__SVR4) && defined(__sun)
  1645 +#if defined(__SVR4) && defined(__sun)
   563 +	struct rds_info_arg arg;
  1646 +	struct rds_info_arg arg;
   564 +#endif
  1647 +#endif
   565  	void *data = NULL;
  1648  	void *data = NULL;
   566  	int fd;
  1649  	int fd;
   567  	int each;
  1650  	int each;
   568 @@ -322,6 +343,7 @@
  1651 @@ -322,6 +347,7 @@
   569  		    (given_options && !infos[i].option_given))
  1652  		    (given_options && !infos[i].option_given))
   570  			continue;
  1653  			continue;
   571  
  1654  
   572 +#if !(defined(__SVR4) && defined(__sun))
  1655 +#if !(defined(__SVR4) && defined(__sun))
   573  		/* read in the info until we get a full snapshot */
  1656  		/* read in the info until we get a full snapshot */
   574  		while ((each = getsockopt(fd, sol, infos[i].opt_val, data,
  1657  		while ((each = getsockopt(fd, sol, infos[i].opt_val, data,
   575  				   &len)) < 0) {
  1658  				   &len)) < 0) {
   576 @@ -345,15 +367,47 @@
  1659 @@ -345,15 +371,47 @@
   577  				return 1;
  1660  				return 1;
   578  			}
  1661  			}
   579  		}
  1662  		}
   580 +#else
  1663 +#else
   581 +		int retcode;
  1664 +		int retcode;
   723  .BR rds-rdma (7),
  1806  .BR rds-rdma (7),
   724  .BR socket (2),
  1807  .BR socket (2),
   725 diff -r -u /tmp/rds-tools-2.0.4/rds-info.1 rds-tools-2.0.7/rds-info.1
  1808 diff -r -u /tmp/rds-tools-2.0.4/rds-info.1 rds-tools-2.0.7/rds-info.1
   726 --- /tmp/rds-tools-2.0.4/rds-info.1	Wed Aug  4 15:25:11 2010
  1809 --- /tmp/rds-tools-2.0.4/rds-info.1	Wed Aug  4 15:25:11 2010
   727 +++ rds-tools-2.0.7/rds-info.1	Thu Feb 24 13:27:51 2011
  1810 +++ rds-tools-2.0.7/rds-info.1	Thu Feb 24 13:27:51 2011
   728 @@ -1,162 +1,150 @@
  1811 @@ -1,162 +1,160 @@
   729 -.Dd October 30, 2006
  1812 -.Dd October 30, 2006
   730 -.Dt RDS-INFO 1
  1813 -.Dt RDS-INFO 1
   731 -.Os
  1814 -.Os
   732 -.Sh NAME
  1815 -.Sh NAME
   733 -.Nm rds-info
  1816 -.Nm rds-info
   826  connection establishment.
  1909  connection establishment.
   827 -.It RemoteAddr
  1910 -.It RemoteAddr
   828 +.IP	RemoteAddr
  1911 +.IP	RemoteAddr
   829  The IP address of the remote end of the connection.  
  1912  The IP address of the remote end of the connection.  
   830 -.It NextTX
  1913 -.It NextTX
       
  1914 +.IP	Tos
       
  1915 +The type of service value for this connection.
   831 +.IP	NextTX
  1916 +.IP	NextTX
   832  The sequence number that will be given to the next message that is sent
  1917  The sequence number that will be given to the next message that is sent
   833  over the connection.
  1918  over the connection.
   834 -.It NextRX
  1919 -.It NextRX
   835 +.IP	NextRX
  1920 +.IP	NextRX
   857 +	The transport is attempting to connect to the
  1942 +	The transport is attempting to connect to the
   858 +	remote address.
  1943 +	remote address.
   859 +.IP 		C
  1944 +.IP 		C
   860 +	The connection to the remote host is connected
  1945 +	The connection to the remote host is connected
   861 +	and active.
  1946 +	and active.
       
  1947 +.IP 		E
       
  1948 +	The connection to the remote host is in error.
   862 +
  1949 +
   863 +.TP
  1950 +.TP
   864 +\fB\-r\fR, \fB\-s\fR, \fB\-t\fR
  1951 +\fB\-r\fR, \fB\-s\fR, \fB\-t\fR
   865  Display the messages in the receive, send, or retransmit queues respectively.
  1952  Display the messages in the receive, send, or retransmit queues respectively.
   866 -.Bl -tag -width 4
  1953 -.Bl -tag -width 4
   873 -.It RemoteAddr, RPort
  1960 -.It RemoteAddr, RPort
   874 +.IP	RemoteAddr, RPort
  1961 +.IP	RemoteAddr, RPort
   875  The remote IP address and port associated with the message. For sent messages
  1962  The remote IP address and port associated with the message. For sent messages
   876  this is the destination address, for receive messages it is the source address.
  1963  this is the destination address, for receive messages it is the source address.
   877 -.It Seq
  1964 -.It Seq
       
  1965 +.IP	Tos
       
  1966 +The type of service for this message.
   878 +.IP	Seq
  1967 +.IP	Seq
   879  The sequence number of the message.
  1968  The sequence number of the message.
   880 -.It Bytes
  1969 -.It Bytes
   881 +.IP	Bytes
  1970 +.IP	Bytes
   882  The number of bytes in the message payload.
  1971  The number of bytes in the message payload.
   899  The local IP address of this connection.
  1988  The local IP address of this connection.
   900 -.It RemoteAddr
  1989 -.It RemoteAddr
   901 +.IP	RemoteAddr
  1990 +.IP	RemoteAddr
   902  The remote IP address of this connection.
  1991  The remote IP address of this connection.
   903 -.It LocalDev
  1992 -.It LocalDev
       
  1993 +.IP	Tos
       
  1994 +The type of service value for this connection.
       
  1995 +.IP	SL
       
  1996 +The QoS Service Level for this connection.
   904 +.IP	LocalDev
  1997 +.IP	LocalDev
   905  The local IB Global Identifier, printed in IPv6 address syntax.
  1998  The local IB Global Identifier, printed in IPv6 address syntax.
   906 -.It RemoteDev
  1999 -.It RemoteDev
   907 +.IP	RemoteDev
  2000 +.IP	RemoteDev
   908  The remote IB Global Identifier, printed in IPv6 address syntax.
  2001  The remote IB Global Identifier, printed in IPv6 address syntax.
   952 -.El
  2045 -.El
   953 -.Pp
  2046 -.Pp
   954 diff -r -u /tmp/rds-tools-2.0.4/rds-ping.1 rds-tools-2.0.7/rds-ping.1
  2047 diff -r -u /tmp/rds-tools-2.0.4/rds-ping.1 rds-tools-2.0.7/rds-ping.1
   955 --- /tmp/rds-tools-2.0.4/rds-ping.1	Wed Aug  4 15:25:11 2010
  2048 --- /tmp/rds-tools-2.0.4/rds-ping.1	Wed Aug  4 15:25:11 2010
   956 +++ rds-tools-2.0.7/rds-ping.1	Thu Feb 24 13:27:52 2011
  2049 +++ rds-tools-2.0.7/rds-ping.1	Thu Feb 24 13:27:52 2011
   957 @@ -1,69 +1,54 @@
  2050 @@ -1,69 +1,63 @@
   958 -.Dd Apr 22, 2008
  2051 -.Dd Apr 22, 2008
   959 -.Dt RDS-PING 1
  2052 -.Dt RDS-PING 1
   960 -.Os
  2053 -.Os
   961 -.Sh NAME
  2054 -.Sh NAME
   962 -.Nm rds-ping
  2055 -.Nm rds-ping
   977 -.Nm rds-ping
  2070 -.Nm rds-ping
   978 -is used to test whether a remote node is reachable over RDS.
  2071 -is used to test whether a remote node is reachable over RDS.
   979 -Its interface is designed to operate pretty much the standard
  2072 -Its interface is designed to operate pretty much the standard
   980 -.Xr ping 8
  2073 -.Xr ping 8
   981 +.SH SYNOPSIS
  2074 +.SH SYNOPSIS
   982 +.B rds-ping [-c count] [-i interval] [-I local_addr] remote_addr
  2075 +.HP
       
  2076 +.nf
       
  2077 +rds-ping [-c count] [-Q tos] [-i interval] [-I local_addr]
       
  2078 +    remote_addr
       
  2079 +.fi
   983 +
  2080 +
   984 +.SH DESCRIPTION
  2081 +.SH DESCRIPTION
   985 +.PP
  2082 +.PP
   986 +rds-ping is used to test whether a remote node is reachable over RDS.
  2083 +rds-ping is used to test whether a remote node is reachable over RDS.
   987 +Its interface is designed to operate pretty much like the standard ping(1M) 
  2084 +Its interface is designed to operate pretty much like the standard ping(1M) 
  1009  packets.
  2106  packets.
  1010 -.It Fl I Ar address
  2107 -.It Fl I Ar address
  1011 -By default,
  2108 -By default,
  1012 -.Nm rds-ping
  2109 -.Nm rds-ping
  1013 -will pick the local source address for the RDS socket based
  2110 -will pick the local source address for the RDS socket based
       
  2111 +.TP
       
  2112 +\fB\-Q tos
       
  2113 +By default, rds-ping sends the ping requests on the base (tos = 0) RDS connection.
       
  2114 +With this option, the requests are sent on the RDS connection with the specified tos
       
  2115 +value.  Valid values are 0-255.
  1014 +.TP
  2116 +.TP
  1015 +\fB\-I address
  2117 +\fB\-I address
  1016 +By default, rds-ping will pick the local source address for the RDS socket based
  2118 +By default, rds-ping will pick the local source address for the RDS socket based
  1017  on routing information for the destination address (i.e. if
  2119  on routing information for the destination address (i.e. if
  1018  packets to the given destination would be routed through interface
  2120  packets to the given destination would be routed through interface
  1070  
  2172  
  1071  AC_SUBST(VERSION)
  2173  AC_SUBST(VERSION)
  1072 diff -r -u /tmp/rds-tools-2.0.4/rds-ping.c rds-tools-2.0.7/rds-ping.c
  2174 diff -r -u /tmp/rds-tools-2.0.4/rds-ping.c rds-tools-2.0.7/rds-ping.c
  1073 --- /tmp/rds-tools-2.0.4/rds-ping.c	Wed Aug  4 15:25:10 2010
  2175 --- /tmp/rds-tools-2.0.4/rds-ping.c	Wed Aug  4 15:25:10 2010
  1074 +++ rds-tools-2.0.7/rds-ping.c	Thu Feb 24 13:27:52 2011
  2176 +++ rds-tools-2.0.7/rds-ping.c	Thu Feb 24 13:27:52 2011
  1075 @@ -48,7 +48,11 @@
  2177 @@ -48,7 +48,12 @@
  1076  #include <sys/poll.h>
  2178  #include <sys/poll.h>
  1077  #include <fcntl.h>
  2179  #include <fcntl.h>
  1078  #include <getopt.h>
  2180  #include <getopt.h>
       
  2181 +#include <sys/ioctl.h>
  1079 +#if defined(__SVR4) && defined(__sun)
  2182 +#if defined(__SVR4) && defined(__sun)
  1080 +#include <sys/rds.h>
  2183 +#include <sys/rds.h>
  1081 +#else
  2184 +#else
  1082  #include "rds.h"
  2185  #include "rds.h"
  1083 +#endif
  2186 +#endif
  1084  
  2187  
  1085  #include "pfhack.h"
  2188  #include "pfhack.h"
  1086  
  2189  
  1087 @@ -155,7 +159,12 @@
  2190 @@ -67,6 +72,7 @@
       
  2191  static unsigned long	opt_count;
       
  2192  static struct in_addr	opt_srcaddr;
       
  2193  static struct in_addr	opt_dstaddr;
       
  2194 +static uint8_t		opt_tos = 0;
       
  2195  
       
  2196  /* For reasons of simplicity, RDS ping does not use a packet
       
  2197   * payload that is being echoed, the way ICMP does.
       
  2198 @@ -91,6 +97,7 @@
       
  2199  static int	parse_timeval(const char *, struct timeval *);
       
  2200  static int	parse_long(const char *ptr, unsigned long *);
       
  2201  static int	parse_addr(const char *ptr, struct in_addr *);
       
  2202 +static unsigned long long	parse_ull(char *ptr, unsigned long long max);
       
  2203  
       
  2204  int
       
  2205  main(int argc, char **argv)
       
  2206 @@ -97,7 +104,7 @@
       
  2207  {
       
  2208  	int c;
       
  2209  
       
  2210 -	while ((c = getopt(argc, argv, "c:i:I:")) != -1) {
       
  2211 +	while ((c = getopt(argc, argv, "c:i:I:Q:")) != -1) {
       
  2212  		switch (c) {
       
  2213  		case 'c':
       
  2214  			if (!parse_long(optarg, &opt_count))
       
  2215 @@ -114,6 +121,9 @@
       
  2216  				die("Bad wait time <%s>\n", optarg);
       
  2217  			break;
       
  2218  
       
  2219 +		case 'Q':
       
  2220 +			opt_tos = parse_ull(optarg, 255);
       
  2221 +			break;
       
  2222  		default:
       
  2223  			usage("Unknown option");
       
  2224  		}
       
  2225 @@ -142,6 +152,9 @@
       
  2226  	struct timeval	next_ts;
       
  2227  	struct socket	socket[NSOCKETS];
       
  2228  	struct pollfd	pfd[NSOCKETS];
       
  2229 +#if !(defined(__SVR4) && defined(__sun))
       
  2230 +	int             pending[NSOCKETS];
       
  2231 +#endif
       
  2232  	int		i, next = 0;
       
  2233  
       
  2234  	for (i = 0; i < NSOCKETS; ++i) {
       
  2235 @@ -152,10 +165,18 @@
       
  2236  		socket[i].fd = fd;
       
  2237  		pfd[i].fd = fd;
       
  2238  		pfd[i].events = POLLIN;
       
  2239 +#if !(defined(__SVR4) && defined(__sun))
       
  2240 +		pending[i] = 0;
       
  2241 +#endif
  1088  	}
  2242  	}
  1089  
  2243  
  1090  	memset(&sin, 0, sizeof(sin));
  2244  	memset(&sin, 0, sizeof(sin));
  1091 +#if defined(__SVR4) && defined(__sun)
  2245 +#if defined(__SVR4) && defined(__sun)
  1092 +	sin.sin_family = AF_INET_OFFLOAD;
  2246 +	sin.sin_family = AF_INET_OFFLOAD;
  1095 +#endif
  2249 +#endif
  1096 +
  2250 +
  1097  	sin.sin_addr = opt_dstaddr;
  2251  	sin.sin_addr = opt_dstaddr;
  1098  
  2252  
  1099  	gettimeofday(&next_ts, NULL);
  2253  	gettimeofday(&next_ts, NULL);
  1100 @@ -181,7 +190,7 @@
  2254 @@ -180,14 +201,32 @@
       
  2255  			if (opt_count && sent >= opt_count)
  1101  				break;
  2256  				break;
  1102  
  2257  
  1103  			timeradd(&next_ts, &opt_wait, &next_ts);
  2258 -			timeradd(&next_ts, &opt_wait, &next_ts);
  1104 -			if (sendto(sp->fd, NULL, 0, 0, (struct sockaddr *) &sin, sizeof(sin)))
  2259 -			if (sendto(sp->fd, NULL, 0, 0, (struct sockaddr *) &sin, sizeof(sin)))
  1105 +			if (sendto(sp->fd, NULL, 0, 0, (struct sockaddr *) &sin, sizeof(sin)) < 0)
  2260 -				err = errno;
  1106  				err = errno;
  2261 -			sp->sent_id = ++sent;
  1107  			sp->sent_id = ++sent;
  2262 -			sp->sent_ts = now;
  1108  			sp->sent_ts = now;
  2263 -			sp->nreplies = 0;
  1109 @@ -258,7 +267,11 @@
  2264 -			next = (next + 1) % NSOCKETS;
       
  2265 +			timeradd(&now, &opt_wait, &next_ts);
       
  2266 +#if !(defined(__SVR4) && defined(__sun))
       
  2267 +			if (!pending[next]) {
       
  2268 +#endif
       
  2269 +				memset(&sin, 0, sizeof(sin));
       
  2270 +#if defined(__SVR4) && defined(__sun)
       
  2271 +				sin.sin_family = AF_INET_OFFLOAD;
       
  2272 +#else
       
  2273 +				sin.sin_family = AF_INET;
       
  2274 +#endif
       
  2275 +				sin.sin_addr = opt_dstaddr;
       
  2276  
       
  2277 +				if (sendto(sp->fd, NULL, 0, 0, (struct sockaddr *) &sin, sizeof(sin)) < 0)
       
  2278 +					err = errno;
       
  2279 +				sp->sent_id = ++sent;
       
  2280 +				sp->sent_ts = now;
       
  2281 +				sp->nreplies = 0;
       
  2282 +#if !(defined(__SVR4) && defined(__sun))
       
  2283 +				if (!err)
       
  2284 +					pending[next] = 1;
       
  2285 +#endif
       
  2286 +				next = (next + 1) % NSOCKETS;
       
  2287 +#if !(defined(__SVR4) && defined(__sun))
       
  2288 +			}
       
  2289 +#endif
       
  2290 +
       
  2291  			if (err) {
       
  2292  				static unsigned int nerrs = 0;
       
  2293  
       
  2294 @@ -223,6 +262,9 @@
       
  2295  					report_packet(sp, &now, NULL, errno);
       
  2296  			} else {
       
  2297  				report_packet(sp, &now, &from.sin_addr, 0);
       
  2298 +#if !(defined(__SVR4) && defined(__sun))
       
  2299 +				pending[i] = 0;
       
  2300 +#endif
       
  2301  				recv++;
       
  2302  			}
       
  2303  		}
       
  2304 @@ -258,7 +300,11 @@
  1110  	int pf;
  2305  	int pf;
  1111  
  2306  
  1112  	memset(&sin, 0, sizeof(sin));
  2307  	memset(&sin, 0, sizeof(sin));
  1113 +#if defined(__SVR4) && defined(__sun)
  2308 +#if defined(__SVR4) && defined(__sun)
  1114 +	sin.sin_family = AF_INET_OFFLOAD;
  2309 +	sin.sin_family = AF_INET_OFFLOAD;
  1116  	sin.sin_family = AF_INET;
  2311  	sin.sin_family = AF_INET;
  1117 +#endif
  2312 +#endif
  1118  
  2313  
  1119  #ifdef DYNAMIC_PF_RDS
  2314  #ifdef DYNAMIC_PF_RDS
  1120          pf = discover_pf_rds();
  2315          pf = discover_pf_rds();
  1121 @@ -278,6 +291,9 @@
  2316 @@ -278,6 +324,9 @@
  1122  		if (ufd < 0)
  2317  		if (ufd < 0)
  1123  			die_errno("unable to create UDP socket");
  2318  			die_errno("unable to create UDP socket");
  1124  		sin.sin_addr = *dst;
  2319  		sin.sin_addr = *dst;
  1125 +#if defined(__SVR4) && defined(__sun)
  2320 +#if defined(__SVR4) && defined(__sun)
  1126 +		sin.sin_family = AF_INET;
  2321 +		sin.sin_family = AF_INET;
  1127 +#endif
  2322 +#endif
  1128  		sin.sin_port = htons(1);
  2323  		sin.sin_port = htons(1);
  1129  		if (connect(ufd, (struct sockaddr *) &sin, sizeof(sin)) < 0)
  2324  		if (connect(ufd, (struct sockaddr *) &sin, sizeof(sin)) < 0)
  1130  			die_errno("unable to connect to %s",
  2325  			die_errno("unable to connect to %s",
  1131 @@ -289,6 +305,9 @@
  2326 @@ -289,6 +338,9 @@
  1132  
  2327  
  1133  		*src = sin.sin_addr;
  2328  		*src = sin.sin_addr;
  1134  		close(ufd);
  2329  		close(ufd);
  1135 +#if defined(__SVR4) && defined(__sun)
  2330 +#if defined(__SVR4) && defined(__sun)
  1136 +		sin.sin_family = AF_INET_OFFLOAD;
  2331 +		sin.sin_family = AF_INET_OFFLOAD;
  1137 +#endif
  2332 +#endif
  1138  	}
  2333  	}
  1139  
  2334  
  1140  	sin.sin_addr = *src;
  2335  	sin.sin_addr = *src;
       
  2336 @@ -297,6 +349,9 @@
       
  2337  	if (bind(fd, (struct sockaddr *) &sin, sizeof(sin)))
       
  2338  		die_errno("bind() failed");
       
  2339  
       
  2340 +	if (opt_tos && ioctl(fd, SIOCRDSSETTOS, &opt_tos)) 
       
  2341 +		die_errno("ERROR: failed to set TOS\n");
       
  2342 +
       
  2343  	return fd;
       
  2344  }
       
  2345  
       
  2346 @@ -309,7 +364,8 @@
       
  2347  		"%s\nUsage: rds-ping [options] dst_addr\n"
       
  2348  		"Options:\n"
       
  2349  		" -c count      limit packet count\n"
       
  2350 -		" -I interface  source IP address\n",
       
  2351 +		" -I interface  source IP address\n"
       
  2352 +		" -Q tos	type of service\n",
       
  2353  		complaint);
       
  2354  	exit(1);
       
  2355  }
       
  2356 @@ -384,3 +440,31 @@
       
  2357  	return 0;
       
  2358  }
       
  2359  
       
  2360 +static unsigned long long parse_ull(char *ptr, unsigned long long max)
       
  2361 +{
       
  2362 +	unsigned long long val;
       
  2363 +	char *endptr;
       
  2364 +
       
  2365 +	val = strtoull(ptr, &endptr, 0);
       
  2366 +	switch (*endptr) {
       
  2367 +	case 'k': case 'K':
       
  2368 +		val <<= 10;
       
  2369 +		endptr++;
       
  2370 +		break;
       
  2371 +
       
  2372 +	case 'm': case 'M':
       
  2373 +		val <<= 20;
       
  2374 +		endptr++;
       
  2375 +		break;
       
  2376 +
       
  2377 +	case 'g': case 'G':
       
  2378 +		val <<= 30;
       
  2379 +		endptr++;
       
  2380 +		break;
       
  2381 +	}
       
  2382 +
       
  2383 +	if (*ptr && !*endptr && val <= max)
       
  2384 +		return val;
       
  2385 +
       
  2386 +	die("invalid number '%s'\n", ptr);
       
  2387 +}
  1141 diff -r -u /tmp/rds-tools-2.0.4/Makefile.in rds-tools-2.0.7/Makefile.in
  2388 diff -r -u /tmp/rds-tools-2.0.4/Makefile.in rds-tools-2.0.7/Makefile.in
  1142 --- /tmp/rds-tools-2.0.4/Makefile.in	Wed Aug  4 15:25:11 2010
  2389 --- /tmp/rds-tools-2.0.4/Makefile.in	Wed Aug  4 15:25:11 2010
  1143 +++ rds-tools-2.0.7/Makefile.in	Thu Feb 24 13:27:51 2011
  2390 +++ rds-tools-2.0.7/Makefile.in	Thu Feb 24 13:27:51 2011
  1144 @@ -4,18 +4,22 @@
  2391 @@ -4,18 +4,22 @@
  1145  mandir		= $(DESTDIR)@mandir@
  2392  mandir		= $(DESTDIR)@mandir@
  1288  
  2535  
  1289  
  2536  
  1290 diff -r -u /tmp/rds-tools-2.0.4/rds-stress.1 rds-tools-2.0.7/rds-stress.1
  2537 diff -r -u /tmp/rds-tools-2.0.4/rds-stress.1 rds-tools-2.0.7/rds-stress.1
  1291 --- /tmp/rds-tools-2.0.4/rds-stress.1	Wed Aug  4 15:25:11 2010
  2538 --- /tmp/rds-tools-2.0.4/rds-stress.1	Wed Aug  4 15:25:11 2010
  1292 +++ rds-tools-2.0.7/rds-stress.1	Thu Feb 24 13:27:52 2011
  2539 +++ rds-tools-2.0.7/rds-stress.1	Thu Feb 24 13:27:52 2011
  1293 @@ -1,99 +1,102 @@
  2540 @@ -1,99 +1,106 @@
  1294 -.Dd May 15, 2007
  2541 -.Dd May 15, 2007
  1295 -.Dt RDS-STRESS 1
  2542 -.Dt RDS-STRESS 1
  1296 -.Os
  2543 -.Os
  1297 -.Sh NAME
  2544 -.Sh NAME
  1298 -.Nm rds-stress
  2545 -.Nm rds-stress
  1319 +.PP
  2566 +.PP
  1320 +.SH SYNOPSIS
  2567 +.SH SYNOPSIS
  1321 +.HP
  2568 +.HP
  1322 +.nf
  2569 +.nf
  1323 +rds-stress [-p port_number] -r [receive_address] [-s send_address]
  2570 +rds-stress [-p port_number] -r [receive_address] [-s send_address]
  1324 +      [-a ack_bytes] [-q request_bytes] [-D rdma_bytes]
  2571 +      [-Q tos] [-a ack_bytes] [-q request_bytes] [-D rdma_bytes]
  1325 +      [-d queue_depth] [-t nr_tasks] [-c] [-R] [-V] [-v]
  2572 +      [-d queue_depth] [-t nr_tasks] [-T time] [-c] [-R] [-V] [-v]
  1326 +.fi
  2573 +.fi
  1327  
  2574  
  1328 -.Sh DESCRIPTION
  2575 -.Sh DESCRIPTION
  1329 -.Nm rds-stress
  2576 -.Nm rds-stress
  1330 +
  2577 +
  1421  obtain the address once the control connection is established.
  2668  obtain the address once the control connection is established.
  1422  The active process will choose a local address based on the interface through
  2669  The active process will choose a local address based on the interface through
  1423  which it connects to the destination address.
  2670  which it connects to the destination address.
  1424 -.It Fl a Ar ack_bytes
  2671 -.It Fl a Ar ack_bytes
  1425 +.TP
  2672 +.TP
       
  2673 +\fB\-Q tos
       
  2674 +Uses the RDS connection between IP addresses with the specified tos value. By 
       
  2675 +default, the base (tos = 0) RDS connection is used.  Valid values are 0-255.
       
  2676 +.TP
  1426 +\fB\-a ack_bytes
  2677 +\fB\-a ack_bytes
  1427  This specifies the size of the ack messages, in bytes. There is a minimum size
  2678  This specifies the size of the ack messages, in bytes. There is a minimum size
  1428  which depends on the format of the ack messages, which may change over time.
  2679  which depends on the format of the ack messages, which may change over time.
  1429  See section "Message Sizes" below.
  2680  See section "Message Sizes" below.
  1430 -.It Fl q Ar request_bytes
  2681 -.It Fl q Ar request_bytes
  1437 +.TP
  2688 +.TP
  1438 +\fB\-D rdma_bytes
  2689 +\fB\-D rdma_bytes
  1439  RDSv3 is capable of transmitting part of a message via RDMA directly from
  2690  RDSv3 is capable of transmitting part of a message via RDMA directly from
  1440  application buffer to application buffer. This option enables RDMA support
  2691  application buffer to application buffer. This option enables RDMA support
  1441  in rds-stress: request packets include parameters for an RDMA READ or WRITE
  2692  in rds-stress: request packets include parameters for an RDMA READ or WRITE
  1442 @@ -100,20 +103,25 @@
  2693 @@ -100,20 +107,25 @@
  1443  operation, which the receiving process executes at the time the ACK packet
  2694  operation, which the receiving process executes at the time the ACK packet
  1444  is sent.
  2695  is sent.
  1445  See section "Message Sizes" below.
  2696  See section "Message Sizes" below.
  1446 -.It Fl d Ar queue_depth
  2697 -.It Fl d Ar queue_depth
  1447 +.TP
  2698 +.TP
  1468 +.TP
  2719 +.TP
  1469 +\fB\-c
  2720 +\fB\-c
  1470  This causes rds-stress to create child tasks which just consume CPU cycles.
  2721  This causes rds-stress to create child tasks which just consume CPU cycles.
  1471  One task is created for each CPU in the system.  First each child observes the
  2722  One task is created for each CPU in the system.  First each child observes the
  1472  maximum rate at which it can consume cycles.  This means that this option
  2723  maximum rate at which it can consume cycles.  This means that this option
  1473 @@ -121,50 +129,67 @@
  2724 @@ -121,50 +133,67 @@
  1474  use of the system by observing the lesser rate at which the children consume
  2725  use of the system by observing the lesser rate at which the children consume
  1475  cycles.  This option is *not* shared between the active and passive instances.
  2726  cycles.  This option is *not* shared between the active and passive instances.
  1476  It must be specified on each rds-stress command line.
  2727  It must be specified on each rds-stress command line.
  1477 -.It Fl R
  2728 -.It Fl R
  1478 +.TP
  2729 +.TP
  1535 +mbi K/s
  2786 +mbi K/s
  1536 +The total number of bytes that are being received via RDMA READs and
  2787 +The total number of bytes that are being received via RDMA READs and
  1537  WRITEs for all children.
  2788  WRITEs for all children.
  1538 -.It tx us/c
  2789 -.It tx us/c
  1539 +.TP
  2790 +.TP
  1540 +mbi K/s
  2791 +mbo K/s
  1541 +The total number of bytes that are being transmitted via RDMA READs and
  2792 +The total number of bytes that are being transmitted via RDMA READs and
  1542 +WRITEs for all children.
  2793 +WRITEs for all children.
  1543 +.TP
  2794 +.TP
  1544 +tx us/c
  2795 +tx us/c
  1545  The average number of microseconds spent in sendmsg() calls.
  2796  The average number of microseconds spent in sendmsg() calls.
  1555 +.TP
  2806 +.TP
  1556 +cpu %
  2807 +cpu %
  1557  This is the percentage of available CPU resources on this machine that are being
  2808  This is the percentage of available CPU resources on this machine that are being
  1558  consumed since rds-stress started running.  It will show -1.00 if -c is not
  2809  consumed since rds-stress started running.  It will show -1.00 if -c is not
  1559  given.  It is calculated based on the amount of CPU resources that CPU soaking
  2810  given.  It is calculated based on the amount of CPU resources that CPU soaking
  1560 @@ -171,4 +196,3 @@
  2811 @@ -171,4 +200,3 @@
  1561  tasks are able to consume.  This lets it measure CPU use by the system, say in
  2812  tasks are able to consume.  This lets it measure CPU use by the system, say in
  1562  interrupt handlers, that task-based CPU accounting does not include.
  2813  interrupt handlers, that task-based CPU accounting does not include.
  1563  For this to work rds-stress must be started with -c on an idle system.
  2814  For this to work rds-stress must be started with -c on an idle system.
  1564 -.El
  2815 -.El
  1565 diff -r -u /tmp/rds-tools-2.0.4/include/rds.h rds-tools-2.0.7/include/rds.h
  2816 diff -r -u /tmp/rds-tools-2.0.4/include/rds.h rds-tools-2.0.7/include/rds.h