--- a/components/open-fabrics/rds-tools/patches/base.patch Thu Jun 26 11:51:15 2014 -0700
+++ b/components/open-fabrics/rds-tools/patches/base.patch Tue Jul 01 14:49:32 2014 -0700
@@ -15,10 +15,11 @@
#include <sys/stat.h>
#include <sys/poll.h>
#include <ctype.h>
-@@ -22,10 +28,16 @@
+@@ -22,12 +28,27 @@
#include <fcntl.h>
#include <sched.h>
#include <getopt.h>
++#include <sys/ioctl.h>
+#if !(defined(__SVR4) && defined(__sun))
#include <byteswap.h>
#include "rds.h"
@@ -31,9 +32,99 @@
+#include <infiniband/ofa_solaris.h>
+#endif
++#if defined(__SVR4) && defined(__sun)
/*
++ * This definition is forward looking and is not yet present
++ * in Solaris rds.h file
++ */
++#define RDS_CMSG_RDMA_SEND_STATUS RDS_CMSG_RDMA_STATUS
++#endif
++
++/*
*
-@@ -102,6 +114,10 @@
+ * TODO
+ * - checksum the data some day.
+@@ -45,8 +66,9 @@
+ M_RDMA_READ_ONLY,
+ M_RDMA_WRITE_ONLY
+ };
++#define VERSION_MAX_LEN 16
+
+-struct options {
++struct options_2_0_6 {
+ uint32_t req_depth;
+ uint32_t req_size;
+ uint32_t ack_size;
+@@ -76,9 +98,68 @@
+ uint32_t connect_retries;
+ } __attribute__((packed));
+
++struct options {
++ char version[VERSION_MAX_LEN];
++ uint32_t req_depth;
++ uint32_t req_size;
++ uint32_t ack_size;
++ uint32_t rdma_size;
++ uint32_t send_addr;
++ uint32_t receive_addr;
++ uint16_t starting_port;
++ uint16_t nr_tasks;
++ uint32_t run_time;
++ uint8_t summary_only;
++ uint8_t rtprio;
++ uint8_t tracing;
++ uint8_t verify;
++ uint8_t show_params;
++ uint8_t show_perfdata;
++ uint8_t use_cong_monitor;
++ uint8_t rdma_use_once;
++ uint8_t rdma_use_get_mr;
++ uint8_t rdma_use_fence;
++ uint8_t rdma_cache_mrs;
++ uint8_t rdma_key_o_meter;
++ uint8_t suppress_warnings;
++ uint8_t simplex;
++ uint8_t rw_mode;
++ uint32_t rdma_vector;
++ uint32_t rdma_alignment;
++ uint32_t connect_retries;
++ uint8_t tos;
++ uint8_t async;
++} __attribute__((packed));
++
++#define MAX_BUCKETS 16
++
+ static struct options opt;
+ static int control_fd;
++static uint64_t rtt_threshold;
++static int show_histogram;
++static int reset_connection;
++static char peer_version[VERSION_MAX_LEN];
+
++static int get_bucket(uint64_t rtt_time)
++{
++ int i;
++ uint64_t l_rtt_time = rtt_time;
++
++ if (!l_rtt_time)
++ i = 0;
++ else
++ {
++ i = -1;
++ while (l_rtt_time)
++ {
++ i++;
++ l_rtt_time = (l_rtt_time >> 1);
++ }
++ }
++
++ return i;
++}
++
+ struct counter {
+ uint64_t nr;
+ uint64_t sum;
+@@ -102,6 +183,10 @@
#define NR_STATS S__LAST
@@ -44,7 +135,7 @@
/*
* Parents share a mapped array of these with their children. Each child
* gets one. It's used to communicate between the child and the parent
-@@ -110,6 +126,7 @@
+@@ -110,9 +195,11 @@
struct child_control {
pid_t pid;
int ready;
@@ -52,7 +143,51 @@
struct timeval start;
struct counter cur[NR_STATS];
struct counter last[NR_STATS];
-@@ -254,7 +271,20 @@
++ uint64_t latency_histogram[MAX_BUCKETS];
+ } __attribute__((aligned (256))); /* arbitrary */
+
+ struct soak_control {
+@@ -132,6 +219,7 @@
+ */
+ #define OP_REQ 1
+ #define OP_ACK 2
++#define OP_DUMP 3
+
+ #define RDMA_OP_READ 1
+ #define RDMA_OP_WRITE 2
+@@ -148,7 +236,7 @@
+ uint16_t from_port;
+ uint16_t to_port;
+ uint16_t index;
+- uint8_t op;
++ uint8_t op;
+
+ /* RDMA related.
+ * rdma_op must be the first field, because we
+@@ -162,12 +250,21 @@
+ uint32_t rdma_size;
+ uint32_t rdma_vector;
+
+- uint8_t data[0];
++ /* Async send related. */
++ uint8_t retry;
++ uint8_t rdma_remote_err;
++ uint8_t pending;
++
++ uint8_t data[0];
+ } __attribute__((packed));
+
+ #define MIN_MSG_BYTES (sizeof(struct header))
+ #define BASIC_HEADER_SIZE (size_t)(&((struct header *) 0)->rdma_op)
+
++#define print_outlier(...) do { \
++ fprintf(stderr, __VA_ARGS__); \
++} while (0)
++
+ #define die(fmt...) do { \
+ fprintf(stderr, fmt); \
+ exit(1); \
+@@ -254,7 +351,20 @@
die("invalid host name or dotted quad '%s'\n", ptr);
}
@@ -73,7 +208,15 @@
static void usage(void)
{
fprintf(stderr, "rds-stress version %s\n", RDS_VERSION);
-@@ -281,6 +311,9 @@
+@@ -273,6 +383,7 @@
+ " -d [depth, 1] request pipeline depth, nr outstanding\n"
+ " -t [nr, 1] number of child tasks\n"
+ " -T [seconds, 0] runtime of test, 0 means infinite\n"
++ " -Q [tos, 0] Type of Service\n"
+ " -D [bytes] RDMA: size\n"
+ " -I [iovecs, 1] RDMA: number of user buffers to target (max 512)\n"
+ " -M [nr, 0] RDMA: mode (0=readwrite,1=readonly,2=writeonly)\n"
+@@ -281,6 +392,9 @@
" -c measure cpu use with per-cpu soak processes\n"
" -V trace execution\n"
" -z print a summary at end of test only\n"
@@ -83,7 +226,7 @@
"\n"
"Example:\n"
" recv$ rds-stress\n"
-@@ -310,7 +343,7 @@
+@@ -310,7 +424,7 @@
static void check_parent(pid_t pid)
{
if (pid != getppid())
@@ -92,7 +235,7 @@
}
/*
-@@ -334,6 +367,7 @@
+@@ -334,6 +448,7 @@
msg_pattern[i] = k;
}
@@ -100,7 +243,7 @@
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define htonll(x) bswap_64(x)
#define ntohll(x) bswap_64(x)
-@@ -341,6 +375,7 @@
+@@ -341,6 +456,7 @@
#define htonll(x) (x)
#define ntohll(x) (x)
#endif
@@ -108,7 +251,71 @@
static void encode_hdr(struct header *dst, const struct header *hdr)
{
-@@ -584,7 +619,11 @@
+@@ -361,6 +477,7 @@
+ dst->rdma_key = htonll(hdr->rdma_key);
+ dst->rdma_size = htonl(hdr->rdma_size);
+ dst->rdma_vector = htonl(hdr->rdma_vector);
++ dst->retry = hdr->retry;
+ }
+
+ static void decode_hdr(struct header *dst, const struct header *hdr)
+@@ -382,6 +499,7 @@
+ dst->rdma_key = ntohll(hdr->rdma_key);
+ dst->rdma_size = ntohl(hdr->rdma_size);
+ dst->rdma_vector = ntohl(hdr->rdma_vector);
++ dst->retry = hdr->retry;
+ }
+
+ static void fill_hdr(void *message, uint32_t bytes, struct header *hdr)
+@@ -412,11 +530,19 @@
+ * Compare incoming message header with expected header. All header fields
+ * are in host byte order except for address and port fields.
+ */
+-static int check_hdr(void *message, uint32_t bytes, const struct header *hdr)
++static int check_hdr(void *message, uint32_t bytes, struct header *hdr, struct options *opts)
+ {
+ struct header msghdr;
++ uint32_t inc_seq;
++ uint32_t my_seq;
+
+ decode_hdr(&msghdr, message);
++ inc_seq = msghdr.seq;
++ my_seq = hdr->seq;
++
++ if (msghdr.retry && (inc_seq < my_seq))
++ return -1;
++
+ if (memcmp(&msghdr, hdr, BASIC_HEADER_SIZE)) {
+ #define bleh(var, disp) \
+ disp(hdr->var), \
+@@ -428,7 +554,7 @@
+ * with stdout() and we don't get things stomping on each
+ * other
+ */
+- printf( "An incoming message had a header which\n"
++ printf( "An incoming message had a %s header which\n"
+ "didn't contain the fields we expected:\n"
+ " member expected eq got\n"
+ " seq %15u %s %15u\n"
+@@ -438,6 +564,7 @@
+ " to_port %15u %s %15u\n"
+ " index %15u %s %15u\n"
+ " op %15u %s %15u\n",
++ (msghdr.retry) ? "RETRY" : "",
+ bleh(seq, /**/),
+ bleh(from_addr, inet_ntoa_32),
+ bleh(from_port, ntohs),
+@@ -569,6 +696,9 @@
+
+ fcntl(fd, F_SETFL, O_NONBLOCK);
+
++ if (opts->tos && ioctl(fd, SIOCRDSSETTOS, &opts->tos))
++ die_errno("ERROR: failed to set TOS\n");
++
+ return fd;
+ }
+
+@@ -584,7 +714,11 @@
if (opts->receive_addr == 0)
return 1;
@@ -120,7 +327,16 @@
sin.sin_port = htons(opts->starting_port);
sin.sin_addr.s_addr = htonl(opts->receive_addr);
-@@ -677,7 +716,11 @@
+@@ -639,7 +773,7 @@
+ mr_args.flags = RDS_FREE_MR_ARGS_INVALIDATE;
+ #endif
+ if (setsockopt(fd, sol, RDS_FREE_MR, &mr_args, sizeof(mr_args)))
+- die_errno("setsockopt(RDS_FREE_MR) failed");
++ return;
+ mrs_allocated--;
+ }
+
+@@ -677,7 +811,11 @@
size = sizeof(struct rdma_key_o_meter)
+ 2 * nr_tasks * sizeof(*kt)
+ 2 * RDMA_MAX_TRACKED_KEYS * sizeof(*ks);
@@ -132,7 +348,7 @@
if (base == MAP_FAILED)
die_errno("alloc_rdma_buffers: mmap failed");
-@@ -828,7 +871,7 @@
+@@ -828,13 +966,20 @@
}
if (!failed)
@@ -141,7 +357,35 @@
(unsigned long long) pattern, addr);
}
-@@ -865,7 +908,11 @@
++struct retry_entry {
++ uint32_t retries;
++ uint32_t seq;
++ int status;
++};
++
+ struct task {
+ unsigned int nr;
+ unsigned int pending;
++ int trace;
+ unsigned int unacked;
+ struct sockaddr_in src_addr; /* same for all tasks */
+ struct sockaddr_in dst_addr;
+@@ -846,7 +991,14 @@
+ uint16_t recv_index;
+ struct timeval * send_time;
+ struct header * ack_header;
++ struct header * ack2_header;
++ struct header * req_header;
++ uint64_t * retry_token;
++ uint32_t retries;
++ uint32_t last_retry_seq;
++ uint32_t retry_index;
+
++
+ /* RDMA related stuff */
+ uint64_t ** local_buf;
+ uint64_t ** rdma_buf;
+@@ -865,7 +1017,11 @@
/* We use mmap here rather than malloc, because it is always
* page aligned. */
len = 2 * opts->nr_tasks * opts->req_depth * (opts->rdma_vector * opts->rdma_size) + sys_page_size;
@@ -153,7 +397,7 @@
if (base == MAP_FAILED)
die_errno("alloc_rdma_buffers: mmap failed");
memset(base, 0x2f, len);
-@@ -915,17 +962,16 @@
+@@ -915,17 +1071,16 @@
if (RDMA_OP_READ == hdr->rdma_op) {
if (opt.verify)
rds_fill_buffer(rdma_addr, rdma_size, hdr->rdma_pattern);
@@ -177,7 +421,7 @@
}
}
-@@ -947,7 +993,7 @@
+@@ -947,7 +1102,7 @@
die("Unexpected RDMA op %u in request\n", in_hdr->rdma_op);
@@ -186,7 +430,104 @@
in_hdr->rdma_op == RDMA_OP_WRITE? "write to" : "read from",
rdma_size,
(unsigned long long) in_hdr->rdma_addr,
-@@ -1007,6 +1053,9 @@
+@@ -966,21 +1121,33 @@
+ hdr->rdma_vector = in_hdr->rdma_vector;
+ }
+
+-static inline unsigned int rdma_user_token(struct task *t, unsigned int qindex)
++static inline uint64_t rdma_user_token(struct task *t, unsigned int qindex, unsigned int type, uint32_t seq)
+ {
+- return t->nr * opt.req_depth + qindex;
++ uint64_t tmp = seq;
++ return (tmp << 32) | ((t->nr * opt.req_depth + qindex) << 2 | type);
+ }
+
+-static void rdma_mark_completed(struct task *tasks, unsigned int token, int status)
++static void rdma_mark_completed(struct task *tasks, uint64_t token, int status, struct options *opts)
+ {
+ struct task *t;
+ unsigned int i;
++ struct header *hdr = NULL;
++ uint32_t seq = token >> 32;
++ unsigned int type = token & 0x03;
++ unsigned int index = (token & 0xFFFFFFFF) >> 2;
+
+- trace("RDS rdma completion for token %x\n", token);
++ trace("RDS rdma completion for token 0x%lx\n", token);
+
+- t = &tasks[token / opt.req_depth];
+- i = token % opt.req_depth;
++ t = &tasks[index / opt.req_depth];
++ i = index % opt.req_depth;
+
++ if (opts->async) {
++ if (type == OP_REQ)
++ hdr = &t->req_header[i];
++ else
++ hdr = &t->ack2_header[i];
++ }
++
+ if (status) {
+ const char *errmsg;
+
+@@ -987,20 +1154,50 @@
+ switch (status) {
+ case RDS_RDMA_REMOTE_ERROR:
+ errmsg = "remote error"; break;
+- case RDS_RDMA_CANCELED:
+- errmsg = "operation was cancelled"; break;
+- case RDS_RDMA_DROPPED:
++ case RDS_RDMA_SEND_DROPPED:
+ errmsg = "operation was dropped"; break;
+- case RDS_RDMA_OTHER_ERROR:
++ case RDS_RDMA_SEND_CANCELED:
++ errmsg = "operation was cancelled"; break;
++ case RDS_RDMA_SEND_OTHER_ERROR:
+ errmsg = "other error"; break;
+ default:
+ errmsg = "unknown error"; break;
+ }
+
+- printf("%s:%u: RDMA op %u failed: %s\n",
++ trace("%s:%u: %s failed: %s\n",
+ inet_ntoa(t->dst_addr.sin_addr),
+ ntohs(t->dst_addr.sin_port),
+- i, errmsg);
++ type ? "SEND" : "RDMA",
++ errmsg);
++
++ if (hdr &&
++ (status == RDS_RDMA_SEND_DROPPED ||
++ status == RDS_RDMA_REMOTE_ERROR)) {
++
++ if (hdr->seq == seq) {
++ hdr->retry = 1;
++ if (hdr->seq > t->last_retry_seq) {
++ if (status == RDS_RDMA_REMOTE_ERROR)
++ hdr->rdma_remote_err = 1;
++ t->retry_token[t->retry_index] = token;
++ t->retry_index = (t->retry_index + 1) %
++ (2 * opts->req_depth);
++ t->retries += 1;
++ t->last_retry_seq = hdr->seq;
++ if (t->retries > 2 * opts->req_depth)
++ die("Exceeded MAX retry entries..\n");
++ }
++ } else
++ die("SEQ Out-Of-Sync: %u/%u\n", hdr->seq, seq);
++ } else if (hdr) {
++ hdr->pending = 0;
++ hdr->retry = 0;
++ hdr->rdma_remote_err = 0;
++ }
++ } else if (hdr) {
++ hdr->pending = 0;
++ hdr->retry = 0;
++ hdr->rdma_remote_err = 0;
+ }
+
+ t->rdma_inflight[i] = 0;
+@@ -1007,6 +1204,9 @@
t->drain_rdmas = 0;
}
@@ -196,10 +537,430 @@
#define MSG_MAXIOVLEN 2
/*
-@@ -1560,7 +1609,12 @@
+@@ -1018,11 +1218,14 @@
+ static char ctlbuf[1024];
+ struct cmsghdr *cmsg;
+
+- msg->msg_control = ctlbuf;
+- msg->msg_controllen = CMSG_SPACE(size);
+-
+- cmsg = CMSG_FIRSTHDR(msg);
+- cmsg->cmsg_level = sol;
++ if (!msg->msg_control) {
++ msg->msg_control = ctlbuf;
++ msg->msg_controllen = CMSG_SPACE(size);
++ cmsg = CMSG_FIRSTHDR(msg);
++ } else {
++ cmsg = (struct cmsghdr *)((char *)msg->msg_control + msg->msg_controllen);
++ msg->msg_controllen += CMSG_SPACE(size);
++ }cmsg->cmsg_level = sol;
+ cmsg->cmsg_type = type;
+ cmsg->cmsg_len = CMSG_LEN(size);
+ memcpy(CMSG_DATA(cmsg), ptr, size);
+@@ -1034,7 +1237,7 @@
+ * the ACK packet.
+ */
+ static void rdma_build_cmsg_xfer(struct msghdr *msg, const struct header *hdr,
+- unsigned int user_token, void *local_buf)
++ uint64_t user_token, void *local_buf)
+ {
+
+ #define RDS_MAX_IOV 512 /* FIX_ME - put this into rds.h or use socket max ?*/
+@@ -1048,7 +1251,7 @@
+ rdma_size = hdr->rdma_size;
+ rdma_vector = hdr->rdma_vector;
+
+- trace("RDS issuing rdma for token %x key %Lx len %u local_buf %p vector %u\n",
++ trace("RDS issuing rdma for token 0x%lx key 0x%llx len %d local_buf %p vector %d\n",
+ user_token,
+ (unsigned long long) hdr->rdma_key,
+ rdma_size, local_buf,
+@@ -1102,6 +1305,15 @@
+ rdma_put_cmsg(msg, RDS_CMSG_RDMA_ARGS, &args, sizeof(args));
+ }
+
++static void build_cmsg_async_send(struct msghdr *msg, uint64_t user_token)
++{
++ struct rds_asend_args args;
++
++ args.flags |= RDS_SEND_NOTIFY_ME;
++ args.user_token = user_token;
++ rdma_put_cmsg(msg, RDS_CMSG_ASYNC_SEND, &args, sizeof(args));
++}
++
+ static void rdma_build_cmsg_dest(struct msghdr *msg, rds_rdma_cookie_t rdma_dest)
+ {
+ rdma_put_cmsg(msg, RDS_CMSG_RDMA_DEST, &rdma_dest, sizeof(rdma_dest));
+@@ -1174,19 +1386,17 @@
+ hdr->index = qindex;
+ }
+
+-static int send_packet(int fd, struct task *t,
+- struct header *hdr, unsigned int size)
++static int send_msg(int fd, struct task *t, struct header *hdr,
++ unsigned int size, struct options *opts,
++ struct child_control *ctl)
+ {
+- unsigned char buf[size], *rdma_flight_recorder = NULL;
++ unsigned char buf[size];
++ uint8_t *rdma_flight_recorder = NULL;
+ rds_rdma_cookie_t cookie = 0;
+ struct msghdr msg;
+ struct iovec iov;
+ ssize_t ret;
+
+- /* Make sure we always have the current sequence number.
+- * When we send ACK packets, the seq that gets filled in is
+- * stale. */
+- hdr->seq = t->send_seq;
+ fill_hdr(buf, size, hdr);
+
+ memset(&msg, 0, sizeof(msg));
+@@ -1198,27 +1408,10 @@
+ iov.iov_base = buf;
+ iov.iov_len = size;
+
+- /* If this is a REQ packet in which we pass the MR to the
+- * peer, extract the RDMA cookie and pass it on in the control
+- * message for now. */
+- if (hdr->op == OP_REQ && hdr->rdma_op != 0) {
+- if (hdr->rdma_key != 0) {
+- /* We used GET_MR to obtain a key */
+- rdma_build_cmsg_dest(&msg, hdr->rdma_key);
+- cookie = hdr->rdma_key;
+- hdr->rdma_key = 0;
+- } else {
+- /* Use the RDMA_MAP cmsg to have sendmsg do the
+- * mapping on the fly. */
+- rdma_build_cmsg_map(&msg, hdr->rdma_addr,
+- hdr->rdma_size * hdr->rdma_vector,
+- &cookie);
+- }
+- }
+
+ /* If this is an ACK packet with RDMA, build the cmsg
+- * header that goes with it. */
+- if (hdr->op == OP_ACK && hdr->rdma_op != 0) {
++ * header that goes with it. */
++ if (hdr->op == OP_ACK && hdr->rdma_op != 0 && !hdr->rdma_remote_err) {
+ unsigned int qindex = hdr->index;
+
+ if (t->rdma_inflight[qindex] != 0) {
+@@ -1230,16 +1423,35 @@
+ *
+ * We return one of the more obscure error messages,
+ * which we recognize and handle in the top loop. */
+- trace("Drain RDMA 0x%x\n", rdma_user_token(t, qindex));
++ trace("Drain RDMA 0x%lx\n", rdma_user_token(t, qindex, 0, hdr->seq));
+ errno = EBADSLT;
+ return -1;
+ }
+ rdma_build_cmsg_xfer(&msg, hdr,
+- rdma_user_token(t, qindex),
++ rdma_user_token(t, qindex, 0, hdr->seq),
+ t->local_buf[qindex]);
+ rdma_flight_recorder = &t->rdma_inflight[qindex];
++ } else if (opts->async) {
++ if (hdr->op == OP_REQ)
++ build_cmsg_async_send(&msg,
++ rdma_user_token(t, hdr->index, OP_REQ, hdr->seq));
++ else
++ build_cmsg_async_send(&msg,
++ rdma_user_token(t, hdr->index, OP_ACK, hdr->seq));
+ }
+
++ if (hdr->op == OP_REQ && hdr->rdma_op != 0) {
++ if (hdr->rdma_key != 0) {
++ rdma_build_cmsg_dest(&msg, hdr->rdma_key);
++ cookie = hdr->rdma_key;
++ hdr->rdma_key = 0;
++ } else {
++ rdma_build_cmsg_map(&msg, hdr->rdma_addr,
++ hdr->rdma_size * hdr->rdma_vector,
++ &cookie);
++ }
++ }
++
+ ret = sendmsg(fd, &msg, 0);
+ if (ret < 0) {
+ if (errno != EAGAIN && errno != ENOBUFS)
+@@ -1256,10 +1468,41 @@
+ * lower 32bit of the cookie */
+ rdma_key_o_meter_add(cookie);
+ }
++
++ hdr->pending = 1;
++
++ return ret;
++}
++
++static int send_packet(int fd, struct task *t,
++ struct header *hdr, unsigned int size,
++ struct options *opts, struct child_control *ctl)
++{
++ ssize_t ret;
++
++ /* Make sure we always have the current sequence number.
++ * When we send ACK packets, the seq that gets filled in is
++ * stale. */
++ hdr->seq = t->send_seq;
++
++ ret = send_msg(fd, t, hdr, size, opts, ctl);
++ if (ret < 0) return ret;
++
+ t->send_seq++;
+ return ret;
+ }
+
++static int resend_packet(int fd, struct task *t,
++ struct header *hdr, unsigned int size,
++ struct options *opts, struct child_control *ctl)
++{
++ ssize_t ret;
++
++ ret = send_msg(fd, t, hdr, size, opts, ctl);
++
++ return ret;
++}
++
+ static int send_one(int fd, struct task *t,
+ struct options *opts,
+ struct child_control *ctl)
+@@ -1266,12 +1509,16 @@
+ {
+ struct timeval start;
+ struct timeval stop;
+- struct header hdr;
++ struct header *hdr = &t->req_header[t->send_index];
+ int ret;
+
+- build_header(t, &hdr, OP_REQ, t->send_index);
++ if (opts->async && hdr->pending) {
++ return -1;
++ }
++
++ build_header(t, hdr, OP_REQ, t->send_index);
+ if (opts->rdma_size && t->send_seq > 10)
+- rdma_build_req(fd, &hdr, t,
++ rdma_build_req(fd, hdr, t,
+ opts->rdma_size,
+ opts->req_depth,
+ opts->rw_mode,
+@@ -1279,7 +1526,7 @@
+
+
+ gettimeofday(&start, NULL);
+- ret = send_packet(fd, t, &hdr, opts->req_size);
++ ret = send_packet(fd, t, hdr, opts->req_size, opts, ctl);
+ gettimeofday(&stop, NULL);
+
+ if (ret < 0)
+@@ -1302,10 +1549,15 @@
+ struct child_control *ctl)
+ {
+ struct header *hdr = &t->ack_header[qindex];
++ struct header *hdr2 = &t->ack2_header[qindex];
+ ssize_t ret;
+
++ if (opts->async && hdr2->pending) {
++ return -1;
++ }
++
+ /* send an ack in response to the req we just got */
+- ret = send_packet(fd, t, hdr, opts->ack_size);
++ ret = send_packet(fd, t, hdr, opts->ack_size, opts, ctl);
+ if (ret < 0)
+ return ret;
+ if (ret != opts->ack_size)
+@@ -1324,6 +1576,8 @@
+ break;
+ }
+
++ memcpy(hdr2, hdr, sizeof(struct header));
++
+ return ret;
+ }
+
+@@ -1354,8 +1608,49 @@
+ struct child_control *ctl,
+ int can_send, int do_work)
+ {
++ struct header *hdr;
++ unsigned int index;
++ int req_size;
++ int num_retries = t->retries;
++ uint64_t token;
++ unsigned int type;
++ unsigned int index2;
++ unsigned int i;
++
++ while (opts->async && num_retries > 0) {
++ index = (t->retry_index - num_retries +
++ (2 * opts->req_depth)) % (2 * opts->req_depth);
++
++ token = t->retry_token[index];
++ type = token & 0x03;
++ index2 = (token & 0xFFFFFFFF) >> 2;
++ i = index2 % opts->req_depth;
++
++ if (type == OP_REQ)
++ hdr = &t->req_header[i];
++ else
++ hdr = &t->ack2_header[i];
++
++ if (!hdr->retry)
++ goto next;
++
++ if (hdr->op == OP_REQ)
++ req_size = opts->req_size;
++ else
++ req_size = opts->ack_size;
++
++ if (resend_packet(fd, t, hdr, req_size, opts, ctl) < 0) {
++ return -1;
++ }
++ hdr->retry = 0;
++next:
++ num_retries--;
++ }
++ t->last_retry_seq = t->retries = 0;
++
+ if (ack_anything(fd, t, opts, ctl, can_send) < 0)
+ return -1;
++
+ while (do_work && t->pending < opts->req_depth) {
+ if (!can_send)
+ goto eagain;
+@@ -1375,7 +1670,8 @@
+ rds_rdma_cookie_t *cookie,
+ struct sockaddr_in *sin,
+ struct timeval *tstamp,
+- struct task *tasks)
++ struct task *tasks,
++ struct options *opts)
+ {
+ struct cmsghdr *cmsg;
+ char cmsgbuf[256];
+@@ -1398,15 +1694,16 @@
+
+ if (ret < 0)
+ return ret;
+- if (ret && ret < sizeof(struct header))
++ if (ret && !strcmp(RDS_VERSION, peer_version) &&
++ ret < sizeof(struct header))
+ die("recvmsg() returned short data: %zd", ret);
+- if (msg.msg_namelen < sizeof(struct sockaddr_in))
++ if (ret && msg.msg_namelen < sizeof(struct sockaddr_in))
+ die("socklen = %d < sizeof(sin) (%zu)\n",
+ msg.msg_namelen, sizeof(struct sockaddr_in));
+
+ /* See if the message comes with a RDMA destination */
+ for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+- struct rds_rdma_notify notify;
++ struct rds_rdma_send_notify notify;
+
+ if (cmsg->cmsg_level != sol)
+ continue;
+@@ -1432,11 +1729,11 @@
+ memcpy(cookie, CMSG_DATA(cmsg), sizeof(*cookie));
+ break;
+
+- case RDS_CMSG_RDMA_STATUS:
++ case RDS_CMSG_RDMA_SEND_STATUS:
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(notify)))
+ die("RDS_CMSG_RDMA_DEST data too small");
+ memcpy(¬ify, CMSG_DATA(cmsg), sizeof(notify));
+- rdma_mark_completed(tasks, notify.user_token, notify.status);
++ rdma_mark_completed(tasks, notify.user_token, notify.status, opts);
+ break;
+ }
+ }
+@@ -1445,7 +1742,8 @@
+
+ static int recv_one(int fd, struct task *tasks,
+ struct options *opts,
+- struct child_control *ctl)
++ struct child_control *ctl,
++ struct child_control *all_ctl)
+ {
+ char buf[max(opts->req_size, opts->ack_size)];
+ rds_rdma_cookie_t rdma_dest = 0;
+@@ -1456,15 +1754,18 @@
+ uint16_t expect_index;
+ int task_index;
+ ssize_t ret;
++ int check_status;
+
+- ret = recv_message(fd, buf, sizeof(buf), &rdma_dest, &sin, &tstamp, tasks);
++
++ ret = recv_message(fd, buf, sizeof(buf), &rdma_dest, &sin, &tstamp, tasks, opts);
+ if (ret < 0)
+ return ret;
+
+ /* If we received only RDMA completions or cong updates,
+ * ret will be 0 */
+- if (ret == 0)
++ if (ret == 0) {
+ return 0;
++ }
+
+ /* check the incoming sequence number */
+ task_index = ntohs(sin.sin_port) - opts->starting_port - 1;
+@@ -1508,16 +1809,32 @@
+ hdr.to_port = t->src_addr.sin_port;
+ hdr.index = expect_index;
+
+- if (check_hdr(buf, ret, &hdr))
+- die("header from %s:%u to id %u bogus\n",
+- inet_ntoa(sin.sin_addr), htons(sin.sin_port),
+- ntohs(t->src_addr.sin_port));
++ check_status = check_hdr(buf, ret, &hdr, opts);
++ if (check_status) {
++ if (check_status > 0) {
++ die("header from %s:%u to id %u bogus\n",
++ inet_ntoa(sin.sin_addr), htons(sin.sin_port),
++ ntohs(t->src_addr.sin_port));
++ } else
++ return 0;
++ }
+
+ if (hdr.op == OP_ACK) {
+- stat_inc(&ctl->cur[S_RTT_USECS],
+- usec_sub(&tstamp, &t->send_time[expect_index]));
+- t->pending -= 1;
++ uint64_t rtt_time =
++ usec_sub(&tstamp, &t->send_time[expect_index]);
+
++ stat_inc(&ctl->cur[S_RTT_USECS], rtt_time);
++ if (rtt_time > rtt_threshold)
++ print_outlier("Found RTT = 0x%lx\n", rtt_time);
++
++ if (show_histogram)
++ {
++ ctl->latency_histogram[get_bucket(rtt_time)]++;
++ }
++
++ if (t->pending > 0)
++ t->pending -= 1;
++
+ if (in_hdr.rdma_key)
+ rdma_process_ack(fd, &in_hdr, ctl);
+ } else {
+@@ -1549,6 +1866,7 @@
+ }
+
+ static void run_child(pid_t parent_pid, struct child_control *ctl,
++ struct child_control *all_ctl,
+ struct options *opts, uint16_t id, int active)
+ {
+ struct sockaddr_in sin;
+@@ -1559,8 +1877,15 @@
+ struct task tasks[opts->nr_tasks];
struct timeval start;
int do_work = opts->simplex ? active : 1;
++ int j;
++
+#if defined(__SVR4) && defined(__sun)
+ set_my_lgrp();
+ sin.sin_family = AF_INET_OFFLOAD;
@@ -209,7 +970,7 @@
sin.sin_port = htons(opts->starting_port + 1 + id);
sin.sin_addr.s_addr = htonl(opts->receive_addr);
-@@ -1572,7 +1626,11 @@
+@@ -1572,7 +1897,11 @@
for (i = 0; i < opts->nr_tasks; i++) {
tasks[i].nr = i;
tasks[i].src_addr = sin;
@@ -221,7 +982,37 @@
tasks[i].dst_addr.sin_addr.s_addr = htonl(opts->send_addr);
tasks[i].dst_addr.sin_port = htons(opts->starting_port + 1 + i);
tasks[i].send_time = alloca(opts->req_depth * sizeof(struct timeval));
-@@ -1625,6 +1683,10 @@
+@@ -1581,6 +1910,15 @@
+ tasks[i].rdma_buf = alloca(opts->req_depth * sizeof(uint64_t *));
+ tasks[i].local_buf = alloca(opts->req_depth * sizeof(uint64_t *));
+ tasks[i].ack_header = alloca(opts->req_depth * sizeof(struct header));
++ tasks[i].ack2_header = alloca(opts->req_depth * sizeof(struct header));
++ for (j=0;j<opts->req_depth;j++)
++ tasks[i].ack2_header[j].pending = 0;
++
++ tasks[i].req_header = alloca(opts->req_depth * sizeof(struct header));
++ for (j=0;j<opts->req_depth;j++)
++ tasks[i].req_header[j].pending = 0;
++
++ tasks[i].retry_token = alloca(2 * opts->req_depth * sizeof(uint64_t));
+ tasks[i].rdma_next_op = (i & 1)? RDMA_OP_READ : RDMA_OP_WRITE;
+ }
+
+@@ -1611,7 +1949,7 @@
+
+ check_parent(parent_pid);
+
+- ret = poll(&pfd, 1, -1);
++ ret = poll(&pfd, 1, 1000);
+ if (ret < 0) {
+ if (errno == EINTR)
+ continue;
+@@ -1621,10 +1959,14 @@
+ pfd.events = POLLIN;
+
+ if (pfd.revents & POLLIN) {
+- while (recv_one(fd, tasks, opts, ctl) >= 0)
++ while (recv_one(fd, tasks, opts, ctl, all_ctl) >= 0)
;
}
@@ -232,7 +1023,15 @@
/* keep the pipeline full */
can_send = !!(pfd.revents & POLLOUT);
for (i = 0, t = tasks; i < opts->nr_tasks; i++, t++) {
-@@ -1665,8 +1727,12 @@
+@@ -1633,6 +1975,7 @@
+ if (t->drain_rdmas)
+ continue;
+ if (send_anything(fd, t, opts, ctl, can_send, do_work) < 0) {
++
+ pfd.events |= POLLOUT;
+
+ /* If the send queue is full, we will see EAGAIN.
+@@ -1665,8 +2008,12 @@
uint32_t i;
len = opts->nr_tasks * sizeof(*ctl);
@@ -245,7 +1044,16 @@
if (ctl == MAP_FAILED)
die("mmap of %u child control structs failed", opts->nr_tasks);
-@@ -1699,7 +1765,7 @@
+@@ -1688,7 +2035,7 @@
+ control_fd = -1;
+ }
+ rdma_key_o_meter_set_self(i);
+- run_child(parent, ctl + i, opts, i, active);
++ run_child(parent, ctl + i, ctl, opts, i, active);
+ exit(0);
+ }
+ ctl[i].pid = pid;
+@@ -1699,7 +2046,7 @@
continue;
pid = waitpid(-1, NULL, WNOHANG);
if (pid)
@@ -254,7 +1062,7 @@
sleep(1);
i--; /* try this child again */
}
-@@ -1823,6 +1889,7 @@
+@@ -1823,6 +2170,7 @@
if (disable)
return;
@@ -262,7 +1070,7 @@
if ((fp = fopen("/proc/stat", "r")) == NULL) {
fprintf(stderr, "Cannot open /proc/stat (%s) - "
"not printing cpu stats\n",
-@@ -1856,10 +1923,37 @@
+@@ -1856,10 +2204,37 @@
}
}
fclose(fp);
@@ -300,7 +1108,7 @@
} else {
struct sys_stats sys;
unsigned long sum = 0;
-@@ -1884,6 +1978,7 @@
+@@ -1884,6 +2259,7 @@
* 5 irq
* 6 softirq
*/
@@ -308,7 +1116,7 @@
printf(",%f,%f,%f,%f,%Lu",
(sys.times[0] + sys.times[1]) * scale,
sys.times[2] * scale,
-@@ -1890,6 +1985,14 @@
+@@ -1890,6 +2266,14 @@
(sys.times[3] + sys.times[4]) * scale,
(sys.times[5] + sys.times[6]) * scale,
sys.intr);
@@ -323,7 +1131,7 @@
}
prev = current;
}
-@@ -1903,6 +2006,10 @@
+@@ -1903,6 +2287,10 @@
static socklen_t buflen = 0;
static int sock_fd = -1;
int i, count, item_size;
@@ -334,7 +1142,7 @@
if (sock_fd < 0) {
sock_fd = socket(pf, SOCK_SEQPACKET, 0);
-@@ -1912,6 +2019,7 @@
+@@ -1912,6 +2300,7 @@
/* We should only loop once on the first call; after that the
* buffer requirements for RDS counters should not change. */
@@ -342,7 +1150,7 @@
while ((item_size = getsockopt(sock_fd, sol, RDS_INFO_COUNTERS, curr, &buflen)) < 0) {
if (errno != ENOSPC)
die_errno("getsockopt(RDS_INFO_COUNTERS) failed");
-@@ -1919,7 +2027,29 @@
+@@ -1919,7 +2308,29 @@
if (!curr)
die_errno("Cannot allocate buffer for stats counters");
}
@@ -372,7 +1180,7 @@
if (item_size > sizeof(*ctr))
die("Bad counter item size in RDS_INFO_COUNTERS (got %d, max %zd)\n",
item_size, sizeof(*ctr));
-@@ -1932,8 +2062,11 @@
+@@ -1932,8 +2343,11 @@
}
for (i = 0; i < count; ++i)
@@ -385,7 +1193,7 @@
gettimeofday(&now, NULL);
if (initialize) {
-@@ -1957,6 +2090,10 @@
+@@ -1957,6 +2371,10 @@
memcpy(prev, ctr, count * sizeof(*ctr));
last_ts = now;
@@ -396,7 +1204,7 @@
get_stats(initialize);
}
-@@ -1967,7 +2104,7 @@
+@@ -1967,7 +2385,7 @@
pid = waitpid(-1, &status, wflags);
if (pid < 0)
@@ -405,7 +1213,7 @@
if (pid == 0)
return 0;
-@@ -1975,15 +2112,15 @@
+@@ -1975,15 +2393,15 @@
if (WEXITSTATUS(status) == 0)
return 1;
die("child pid %u exited with status %d\n",
@@ -424,7 +1232,22 @@
}
static void release_children_and_wait(struct options *opts,
-@@ -2139,7 +2276,12 @@
+@@ -1995,9 +2413,13 @@
+ struct counter summary[NR_STATS];
+ struct timeval start, end, now, first_ts, last_ts;
+ double cpu_total = 0;
+- uint16_t i, cpu_samples = 0;
++ uint16_t i, j, cpu_samples = 0;
+ uint16_t nr_running;
++ uint64_t latency_histogram[MAX_BUCKETS];
+
++ if (show_histogram)
++ memset(latency_histogram, 0, sizeof(latency_histogram));
++
+ gettimeofday(&start, NULL);
+ start.tv_sec += 2;
+ for (i = 0; i < opts->nr_tasks; i++)
+@@ -2139,7 +2561,12 @@
control_fd = -1;
if (nr_running) {
@@ -437,7 +1260,131 @@
kill(ctl[i].pid, SIGTERM);
stop_soakers(soak_arr);
}
-@@ -2517,7 +2659,11 @@
+@@ -2167,6 +2594,19 @@
+ avg(&summary[S_SENDMSG_USECS]),
+ avg(&summary[S_RTT_USECS]),
+ soak_arr? scale * cpu_total : -1.0);
++
++ if (show_histogram)
++ {
++ for (i = 0; i < opts->nr_tasks; i++)
++ for (j=0;j < MAX_BUCKETS; j++)
++ latency_histogram[j] += ctl[i].latency_histogram[j];
++
++ printf("\nRTT histogram\n");
++ printf("RTT (us) \t\t Count\n");
++ for (i=0;i < MAX_BUCKETS; i++)
++ printf("[%6u - %6u] \t\t %8u\n", 1 << i, 1 << (i+1),
++ (unsigned int)latency_histogram[i]);
++ }
+ }
+ }
+
+@@ -2220,6 +2660,21 @@
+ {
+ ssize_t ret;
+
++ if (size == sizeof(struct options)) {
++ memset(ptr, 0, size);
++ ret = read(fd, peer_version, VERSION_MAX_LEN);
++ if (ret != VERSION_MAX_LEN)
++ die_errno("Failed to read version");
++
++ if (strcmp(peer_version, RDS_VERSION)) {
++ ptr += ret;
++ memcpy(ptr, peer_version, VERSION_MAX_LEN);
++ size = sizeof(struct options_2_0_6) - ret;
++ } else
++ size -= ret;
++ ptr += ret;
++ }
++
+ while (size) {
+ ret = read(fd, ptr, size);
+ if (ret < 0)
+@@ -2233,6 +2688,7 @@
+
+ static void encode_options(struct options *dst, const struct options *src)
+ {
++ memcpy(dst->version, src->version, VERSION_MAX_LEN);
+ dst->req_depth = htonl(src->req_depth);
+ dst->req_size = htonl(src->req_size);
+ dst->ack_size = htonl(src->ack_size);
+@@ -2262,10 +2718,13 @@
+ dst->simplex = src->simplex; /* byte sized */
+ dst->rw_mode = src->rw_mode; /* byte sized */
+ dst->rdma_vector = htonl(src->rdma_vector);
++ dst->tos = src->tos;
++ dst->async = src->async;
+ }
+
+ static void decode_options(struct options *dst, const struct options *src)
+ {
++ memcpy(dst->version, src->version, VERSION_MAX_LEN);
+ dst->req_depth = ntohl(src->req_depth);
+ dst->req_size = ntohl(src->req_size);
+ dst->ack_size = ntohl(src->ack_size);
+@@ -2295,6 +2754,8 @@
+ dst->simplex = src->simplex; /* byte sized */
+ dst->rw_mode = src->rw_mode; /* byte sized */
+ dst->rdma_vector = ntohl(src->rdma_vector);
++ dst->tos = src->tos;
++ dst->async = src->async;
+ }
+
+ static void verify_option_encdec(const struct options *opts)
+@@ -2316,6 +2777,25 @@
+ die("encode/decode check of options struct failed");
+ }
+
++static void reset_conn(struct options *opts)
++{
++ struct rds_reset val;
++ int fd;
++ struct sockaddr_in sin;
++
++ sin.sin_family = AF_INET;
++ sin.sin_port = htons(opts->starting_port);
++ sin.sin_addr.s_addr = htonl(opts->receive_addr);
++
++ fd = bound_socket(pf, SOCK_SEQPACKET, 0, &sin);
++
++ val.tos = opts->tos;
++ val.src.s_addr = htonl(opts->receive_addr);
++ val.dst.s_addr = htonl(opts->send_addr);
++ if (setsockopt(fd, sol, RDS_CONN_RESET, &val, sizeof(val)))
++ die_errno("setsockopt RDS_CONN_RESET failed");
++}
++
+ static int active_parent(struct options *opts, struct soak_control *soak_arr)
+ {
+ struct options enc_options;
+@@ -2324,6 +2804,11 @@
+ int fd;
+ uint8_t ok;
+
++ if (reset_connection) {
++ reset_conn(opts);
++ return 0;
++ }
++
+ if (opts->show_params) {
+ unsigned int k;
+
+@@ -2387,7 +2872,11 @@
+ * We just tell the peer what options to use.
+ */
+ encode_options(&enc_options, opts);
+- peer_send(fd, &enc_options, sizeof(struct options));
++ if (opts->tos || opts->async)
++ peer_send(fd, &enc_options, sizeof(struct options));
++ else
++ peer_send(fd, &enc_options.req_depth,
++ sizeof(struct options_2_0_6));
+
+ printf("negotiated options, tasks will start in 2 seconds\n");
+ ctl = start_children(opts, 1);
+@@ -2517,7 +3006,11 @@
/* an extra terminating entry which will be all 0s */
len = (nr_soak + 1) * sizeof(struct soak_control);
soak_arr = mmap(NULL, len, PROT_READ|PROT_WRITE,
@@ -449,7 +1396,24 @@
if (soak_arr == MAP_FAILED)
die("mmap of %ld soak control structs failed", nr_soak);
-@@ -2589,6 +2735,7 @@
+@@ -2572,6 +3065,10 @@
+ OPT_CONNECT_RETRIES,
+ OPT_USE_CONG_MONITOR,
+ OPT_PERFDATA,
++ OPT_SHOW_OUTLIERS,
++ OPT_SHOW_HISTOGRAM,
++ OPT_RESET,
++ OPT_ASYNC,
+ };
+
+ static struct option long_options[] = {
+@@ -2584,11 +3081,13 @@
+ { "send-addr", required_argument, NULL, 's' },
+ { "port", required_argument, NULL, 'p' },
+ { "time", required_argument, NULL, 'T' },
++{ "tos", required_argument, NULL, 'Q' },
+ { "report-cpu", no_argument, NULL, 'c' },
+ { "report-summary", no_argument, NULL, 'z' },
{ "rtprio", no_argument, NULL, 'R' },
{ "verify", no_argument, NULL, 'v' },
{ "trace", no_argument, NULL, 'V' },
@@ -457,16 +1421,56 @@
{ "rdma-use-once", required_argument, NULL, OPT_RDMA_USE_ONCE },
{ "rdma-use-get-mr", required_argument, NULL, OPT_RDMA_USE_GET_MR },
-@@ -2652,7 +2799,7 @@
+@@ -2601,6 +3100,10 @@
+ { "show-perfdata", no_argument, NULL, OPT_PERFDATA },
+ { "connect-retries", required_argument, NULL, OPT_CONNECT_RETRIES },
+ { "use-cong-monitor", required_argument, NULL, OPT_USE_CONG_MONITOR },
++{ "show-outliers", required_argument, NULL, OPT_SHOW_OUTLIERS },
++{ "show-histogram", no_argument, NULL, OPT_SHOW_HISTOGRAM },
++{ "reset", no_argument, NULL, OPT_RESET },
++{ "async", no_argument, NULL, OPT_ASYNC },
+
+ { NULL }
+ };
+@@ -2640,6 +3143,8 @@
+ opts.use_cong_monitor = 1;
+ opts.rdma_use_fence = 1;
+ opts.rdma_cache_mrs = 0;
++ opts.rdma_use_once = 0;
++ opts.rdma_use_get_mr = 0;
+ opts.rdma_alignment = 0;
+ opts.rdma_key_o_meter = 0;
+ opts.show_params = 0;
+@@ -2648,11 +3153,17 @@
+ opts.simplex = 0;
+ opts.rw_mode = 0;
+ opts.rdma_vector = 1;
++ rtt_threshold = ~0U;
++ show_histogram = 0;
++ opts.tos = 0;
++ reset_connection = 0;
++ opts.async = 0;
++ strcpy(opts.version, RDS_VERSION);
+
while(1) {
int c, index;
- c = getopt_long(argc, argv, "+a:cD:d:hI:M:op:q:Rr:s:t:T:vVz",
-+ c = getopt_long(argc, argv, "+a:cD:d:hI:M:op:q:Rr:s:t:T:vVg:z",
++ c = getopt_long(argc, argv, "+a:cD:d:hI:M:op:q:Rr:s:t:T:Q:vVg:z",
long_options, &index);
if (c == -1)
break;
-@@ -2711,6 +2858,10 @@
+@@ -2702,6 +3213,9 @@
+ case 'T':
+ opts.run_time = parse_ull(optarg, (uint32_t)~0);
+ break;
++ case 'Q':
++ opts.tos = parse_ull(optarg, (uint8_t)~0);
++ break;
+ case 'z':
+ opts.summary_only = 1;
+ break;
+@@ -2711,9 +3225,25 @@
case 'V':
opts.tracing = 1;
break;
@@ -474,10 +1478,25 @@
+ lgrp_id = (lgrp_id_t)parse_ull(optarg,
+ (uint32_t)~0);
+ break;
++ case OPT_SHOW_OUTLIERS:
++ rtt_threshold = parse_ull(optarg, ~0U);
++ break;
++ case OPT_SHOW_HISTOGRAM:
++ show_histogram = 1;
++ break;
case OPT_USE_CONG_MONITOR:
opts.use_cong_monitor = parse_ull(optarg, 1);
break;
-@@ -2786,6 +2937,7 @@
++ case OPT_RESET:
++ reset_connection = 1;
++ break;
++ case OPT_ASYNC:
++ opts.async = 1;
++ break;
+ case OPT_RDMA_USE_ONCE:
+ opts.rdma_use_once = parse_ull(optarg, 1);
+ break;
+@@ -2786,6 +3316,7 @@
if (opts.rdma_size && 0)
opts.rdma_size = (opts.rdma_size + 4095) & ~4095;
@@ -544,7 +1563,71 @@
/* Like inet_ntoa, but can be re-entered several times without clobbering
* the previously returned string. */
static const char *paddr(int af, const void *addrp)
-@@ -234,8 +250,10 @@
+@@ -134,18 +150,20 @@
+ {
+ struct rds_info_connection conn;
+
+- printf("\nRDS Connections:\n%15s %15s %16s %16s %3s\n",
+- "LocalAddr", "RemoteAddr", "NextTX", "NextRX", "Flg");
++ printf("\nRDS Connections:\n%15s %15s %4s %16s %16s %4s\n",
++ "LocalAddr", "RemoteAddr", "Tos", "NextTX", "NextRX", "Flgs");
+
+ for_each(conn, data, each, len) {
+- printf("%15s %15s %16"PRIu64" %16"PRIu64" %c%c%c\n",
++ printf("%15s %15s %4u %16"PRIu64" %16"PRIu64" %c%c%c%c\n",
+ ipv4addr(conn.laddr),
+ ipv4addr(conn.faddr),
++ conn.tos,
+ conn.next_tx_seq,
+ conn.next_rx_seq,
+ rds_conn_flag(conn, SENDING, 's'),
+ rds_conn_flag(conn, CONNECTING, 'c'),
+- rds_conn_flag(conn, CONNECTED, 'C'));
++ rds_conn_flag(conn, CONNECTED, 'C'),
++ rds_conn_flag(conn, ERROR, 'E'));
+ }
+ }
+
+@@ -153,16 +171,17 @@
+ {
+ struct rds_info_message msg;
+
+- printf("\n%s Message Queue:\n%15s %5s %15s %5s %16s %10s\n",
++ printf("\n%s Message Queue:\n%15s %5s %15s %5s %4s %16s %10s\n",
+ (char *)extra,
+- "LocalAddr", "LPort", "RemoteAddr", "RPort", "Seq", "Bytes");
++ "LocalAddr", "LPort", "RemoteAddr", "RPort", "Tos","Seq", "Bytes");
+
+ for_each(msg, data, each, len) {
+- printf("%15s %5u %15s %5u %16"PRIu64" %10u\n",
++ printf("%15s %5u %15s %5u %4u %16"PRIu64" %10u\n",
+ ipv4addr(msg.laddr),
+ ntohs(msg.lport),
+ ipv4addr(msg.faddr),
+ ntohs(msg.fport),
++ msg.tos,
+ msg.seq, msg.len);
+ }
+ }
+@@ -191,13 +210,14 @@
+ {
+ struct rds_info_rdma_connection ic;
+
+- printf("\nRDS IB Connections:\n%15s %15s %32s %32s\n",
+- "LocalAddr", "RemoteAddr", "LocalDev", "RemoteDev");
++ printf("\nRDS IB Connections:\n%15s %15s %4s %3s %32s %32s\n",
++ "LocalAddr", "RemoteAddr", "Tos", "SL", "LocalDev", "RemoteDev");
+
+ for_each(ic, data, each, len) {
+- printf("%15s %15s %32s %32s",
++ printf("%15s %15s %4u %3u %32s %32s",
+ ipv4addr(ic.src_addr),
+ ipv4addr(ic.dst_addr),
++ ic.tos,ic.sl,
+ ipv6addr(ic.src_gid),
+ ipv6addr(ic.dst_gid));
+
+@@ -234,8 +254,10 @@
print_msgs, "Send", 0 },
['t'] = { RDS_INFO_RETRANS_MESSAGES, "retransmit queue messages",
print_msgs, "Retransmit", 0 },
@@ -555,7 +1638,7 @@
['I'] = { RDS_INFO_IB_CONNECTIONS, "IB transport connections",
print_ib_conns, NULL, 0 },
};
-@@ -266,6 +284,9 @@
+@@ -266,6 +288,9 @@
char optstring[258] = "v+";
int given_options = 0;
socklen_t len = 0;
@@ -565,7 +1648,7 @@
void *data = NULL;
int fd;
int each;
-@@ -322,6 +343,7 @@
+@@ -322,6 +347,7 @@
(given_options && !infos[i].option_given))
continue;
@@ -573,7 +1656,7 @@
/* read in the info until we get a full snapshot */
while ((each = getsockopt(fd, sol, infos[i].opt_val, data,
&len)) < 0) {
-@@ -345,15 +367,47 @@
+@@ -345,15 +371,47 @@
return 1;
}
}
@@ -725,7 +1808,7 @@
diff -r -u /tmp/rds-tools-2.0.4/rds-info.1 rds-tools-2.0.7/rds-info.1
--- /tmp/rds-tools-2.0.4/rds-info.1 Wed Aug 4 15:25:11 2010
+++ rds-tools-2.0.7/rds-info.1 Thu Feb 24 13:27:51 2011
-@@ -1,162 +1,150 @@
+@@ -1,162 +1,160 @@
-.Dd October 30, 2006
-.Dt RDS-INFO 1
-.Os
@@ -828,6 +1911,8 @@
+.IP RemoteAddr
The IP address of the remote end of the connection.
-.It NextTX
++.IP Tos
++The type of service value for this connection.
+.IP NextTX
The sequence number that will be given to the next message that is sent
over the connection.
@@ -859,6 +1944,8 @@
+.IP C
+ The connection to the remote host is connected
+ and active.
++.IP E
++ The connection to the remote host is in error.
+
+.TP
+\fB\-r\fR, \fB\-s\fR, \fB\-t\fR
@@ -875,6 +1962,8 @@
The remote IP address and port associated with the message. For sent messages
this is the destination address, for receive messages it is the source address.
-.It Seq
++.IP Tos
++The type of service for this message.
+.IP Seq
The sequence number of the message.
-.It Bytes
@@ -901,6 +1990,10 @@
+.IP RemoteAddr
The remote IP address of this connection.
-.It LocalDev
++.IP Tos
++The type of service value for this connection.
++.IP SL
++The QoS Service Level for this connection.
+.IP LocalDev
The local IB Global Identifier, printed in IPv6 address syntax.
-.It RemoteDev
@@ -954,7 +2047,7 @@
diff -r -u /tmp/rds-tools-2.0.4/rds-ping.1 rds-tools-2.0.7/rds-ping.1
--- /tmp/rds-tools-2.0.4/rds-ping.1 Wed Aug 4 15:25:11 2010
+++ rds-tools-2.0.7/rds-ping.1 Thu Feb 24 13:27:52 2011
-@@ -1,69 +1,54 @@
+@@ -1,69 +1,63 @@
-.Dd Apr 22, 2008
-.Dt RDS-PING 1
-.Os
@@ -979,7 +2072,11 @@
-Its interface is designed to operate pretty much the standard
-.Xr ping 8
+.SH SYNOPSIS
-+.B rds-ping [-c count] [-i interval] [-I local_addr] remote_addr
++.HP
++.nf
++rds-ping [-c count] [-Q tos] [-i interval] [-I local_addr]
++ remote_addr
++.fi
+
+.SH DESCRIPTION
+.PP
@@ -1012,6 +2109,11 @@
-.Nm rds-ping
-will pick the local source address for the RDS socket based
+.TP
++\fB\-Q tos
++By default, rds-ping sends the ping requests on base (tos = 0) RDS connection.
++With this option, the requests are sent on RDS connection with the specified tos
++value. Valid values are 0-255.
++.TP
+\fB\-I address
+By default, rds-ping will pick the local source address for the RDS socket based
on routing information for the destination address (i.e. if
@@ -1072,10 +2174,11 @@
diff -r -u /tmp/rds-tools-2.0.4/rds-ping.c rds-tools-2.0.7/rds-ping.c
--- /tmp/rds-tools-2.0.4/rds-ping.c Wed Aug 4 15:25:10 2010
+++ rds-tools-2.0.7/rds-ping.c Thu Feb 24 13:27:52 2011
-@@ -48,7 +48,11 @@
+@@ -48,7 +48,12 @@
#include <sys/poll.h>
#include <fcntl.h>
#include <getopt.h>
++#include <sys/ioctl.h>
+#if defined(__SVR4) && defined(__sun)
+#include <sys/rds.h>
+#else
@@ -1084,7 +2187,58 @@
#include "pfhack.h"
-@@ -155,7 +159,12 @@
+@@ -67,6 +72,7 @@
+ static unsigned long opt_count;
+ static struct in_addr opt_srcaddr;
+ static struct in_addr opt_dstaddr;
++static uint8_t opt_tos = 0;
+
+ /* For reasons of simplicity, RDS ping does not use a packet
+ * payload that is being echoed, the way ICMP does.
+@@ -91,6 +97,7 @@
+ static int parse_timeval(const char *, struct timeval *);
+ static int parse_long(const char *ptr, unsigned long *);
+ static int parse_addr(const char *ptr, struct in_addr *);
++static unsigned long long parse_ull(char *ptr, unsigned long long max);
+
+ int
+ main(int argc, char **argv)
+@@ -97,7 +104,7 @@
+ {
+ int c;
+
+- while ((c = getopt(argc, argv, "c:i:I:")) != -1) {
++ while ((c = getopt(argc, argv, "c:i:I:Q:")) != -1) {
+ switch (c) {
+ case 'c':
+ if (!parse_long(optarg, &opt_count))
+@@ -114,6 +121,9 @@
+ die("Bad wait time <%s>\n", optarg);
+ break;
+
++ case 'Q':
++ opt_tos = parse_ull(optarg, 255);
++ break;
+ default:
+ usage("Unknown option");
+ }
+@@ -142,6 +152,9 @@
+ struct timeval next_ts;
+ struct socket socket[NSOCKETS];
+ struct pollfd pfd[NSOCKETS];
++#if !(defined(__SVR4) && defined(__sun))
++ int pending[NSOCKETS];
++#endif
+ int i, next = 0;
+
+ for (i = 0; i < NSOCKETS; ++i) {
+@@ -152,10 +165,18 @@
+ socket[i].fd = fd;
+ pfd[i].fd = fd;
+ pfd[i].events = POLLIN;
++#if !(defined(__SVR4) && defined(__sun))
++ pending[i] = 0;
++#endif
}
memset(&sin, 0, sizeof(sin));
@@ -1097,16 +2251,57 @@
sin.sin_addr = opt_dstaddr;
gettimeofday(&next_ts, NULL);
-@@ -181,7 +190,7 @@
+@@ -180,14 +201,32 @@
+ if (opt_count && sent >= opt_count)
break;
- timeradd(&next_ts, &opt_wait, &next_ts);
+- timeradd(&next_ts, &opt_wait, &next_ts);
- if (sendto(sp->fd, NULL, 0, 0, (struct sockaddr *) &sin, sizeof(sin)))
-+ if (sendto(sp->fd, NULL, 0, 0, (struct sockaddr *) &sin, sizeof(sin)) < 0)
- err = errno;
- sp->sent_id = ++sent;
- sp->sent_ts = now;
-@@ -258,7 +267,11 @@
+- err = errno;
+- sp->sent_id = ++sent;
+- sp->sent_ts = now;
+- sp->nreplies = 0;
+- next = (next + 1) % NSOCKETS;
++ timeradd(&now, &opt_wait, &next_ts);
++#if !(defined(__SVR4) && defined(__sun))
++ if (!pending[next]) {
++#endif
++ memset(&sin, 0, sizeof(sin));
++#if defined(__SVR4) && defined(__sun)
++ sin.sin_family = AF_INET_OFFLOAD;
++#else
++ sin.sin_family = AF_INET;
++#endif
++ sin.sin_addr = opt_dstaddr;
+
++ if (sendto(sp->fd, NULL, 0, 0, (struct sockaddr *) &sin, sizeof(sin)) < 0)
++ err = errno;
++ sp->sent_id = ++sent;
++ sp->sent_ts = now;
++ sp->nreplies = 0;
++#if !(defined(__SVR4) && defined(__sun))
++ if (!err)
++ pending[next] = 1;
++#endif
++ next = (next + 1) % NSOCKETS;
++#if !(defined(__SVR4) && defined(__sun))
++ }
++#endif
++
+ if (err) {
+ static unsigned int nerrs = 0;
+
+@@ -223,6 +262,9 @@
+ report_packet(sp, &now, NULL, errno);
+ } else {
+ report_packet(sp, &now, &from.sin_addr, 0);
++#if !(defined(__SVR4) && defined(__sun))
++ pending[i] = 0;
++#endif
+ recv++;
+ }
+ }
+@@ -258,7 +300,11 @@
int pf;
memset(&sin, 0, sizeof(sin));
@@ -1118,7 +2313,7 @@
#ifdef DYNAMIC_PF_RDS
pf = discover_pf_rds();
-@@ -278,6 +291,9 @@
+@@ -278,6 +324,9 @@
if (ufd < 0)
die_errno("unable to create UDP socket");
sin.sin_addr = *dst;
@@ -1128,7 +2323,7 @@
sin.sin_port = htons(1);
if (connect(ufd, (struct sockaddr *) &sin, sizeof(sin)) < 0)
die_errno("unable to connect to %s",
-@@ -289,6 +305,9 @@
+@@ -289,6 +338,9 @@
*src = sin.sin_addr;
close(ufd);
@@ -1138,6 +2333,58 @@
}
sin.sin_addr = *src;
+@@ -297,6 +349,9 @@
+ if (bind(fd, (struct sockaddr *) &sin, sizeof(sin)))
+ die_errno("bind() failed");
+
++ if (opt_tos && ioctl(fd, SIOCRDSSETTOS, &opt_tos))
++ die_errno("ERROR: failed to set TOS\n");
++
+ return fd;
+ }
+
+@@ -309,7 +364,8 @@
+ "%s\nUsage: rds-ping [options] dst_addr\n"
+ "Options:\n"
+ " -c count limit packet count\n"
+- " -I interface source IP address\n",
++ " -I interface source IP address\n"
++ " -Q tos type of service\n",
+ complaint);
+ exit(1);
+ }
+@@ -384,3 +440,31 @@
+ return 0;
+ }
+
++static unsigned long long parse_ull(char *ptr, unsigned long long max)
++{
++ unsigned long long val;
++ char *endptr;
++
++ val = strtoull(ptr, &endptr, 0);
++ switch (*endptr) {
++ case 'k': case 'K':
++ val <<= 10;
++ endptr++;
++ break;
++
++ case 'm': case 'M':
++ val <<= 20;
++ endptr++;
++ break;
++
++ case 'g': case 'G':
++ val <<= 30;
++ endptr++;
++ break;
++ }
++
++ if (*ptr && !*endptr && val <= max)
++ return val;
++
++ die("invalid number '%s'\n", ptr);
++}
diff -r -u /tmp/rds-tools-2.0.4/Makefile.in rds-tools-2.0.7/Makefile.in
--- /tmp/rds-tools-2.0.4/Makefile.in Wed Aug 4 15:25:11 2010
+++ rds-tools-2.0.7/Makefile.in Thu Feb 24 13:27:51 2011
@@ -1290,7 +2537,7 @@
diff -r -u /tmp/rds-tools-2.0.4/rds-stress.1 rds-tools-2.0.7/rds-stress.1
--- /tmp/rds-tools-2.0.4/rds-stress.1 Wed Aug 4 15:25:11 2010
+++ rds-tools-2.0.7/rds-stress.1 Thu Feb 24 13:27:52 2011
-@@ -1,99 +1,102 @@
+@@ -1,99 +1,106 @@
-.Dd May 15, 2007
-.Dt RDS-STRESS 1
-.Os
@@ -1321,8 +2568,8 @@
+.HP
+.nf
+rds-stress [-p port_number] -r [receive_address] [-s send_address]
-+ [-a ack_bytes] [-q request_bytes] [-D rdma_bytes]
-+ [-d queue_depth] [-t nr_tasks] [-c] [-R] [-V] [-v]
++ [-Q tos] [-a ack_bytes] [-q request_bytes] [-D rdma_bytes]
++ [-d queue_depth] [-t nr_tasks] [-T time] [-c] [-R] [-V] [-v]
+.fi
-.Sh DESCRIPTION
@@ -1423,6 +2670,10 @@
which it connects to the destination address.
-.It Fl a Ar ack_bytes
+.TP
++\fB\-Q tos
++Uses the RDS connection between IP addresses with the specified tos value. By
++default, the base (tos = 0) RDS connection is used. Valid values are 0-255.
++.TP
+\fB\-a ack_bytes
This specifies the size of the ack messages, in bytes. There is a minimum size
which depends on the format of the ack messages, which may change over time.
@@ -1439,7 +2690,7 @@
RDSv3 is capable of transmitting part of a message via RDMA directly from
application buffer to application buffer. This option enables RDMA support
in rds-stress: request packets include parameters for an RDMA READ or WRITE
-@@ -100,20 +103,25 @@
+@@ -100,20 +107,25 @@
operation, which the receiving process executes at the time the ACK packet
is sent.
See section "Message Sizes" below.
@@ -1470,7 +2721,7 @@
This causes rds-stress to create child tasks which just consume CPU cycles.
One task is created for each CPU in the system. First each child observes the
maximum rate at which it can consume cycles. This means that this option
-@@ -121,50 +129,67 @@
+@@ -121,50 +133,67 @@
use of the system by observing the lesser rate at which the children consume
cycles. This option is *not* shared between the active and passive instances.
It must be specified on each rds-stress command line.
@@ -1537,7 +2788,7 @@
WRITEs for all children.
-.It tx us/c
+.TP
-+mbi K/s
++mbo K/s
+The total number of bytes that are being transmited via RDMA READs and
+WRITEs for all children.
+.TP
@@ -1557,7 +2808,7 @@
This is the percentage of available CPU resources on this machine that are being
consumed since rds-stress started running. It will show -1.00 if -c is not
given. It is calculated based on the amount of CPU resources that CPU soaking
-@@ -171,4 +196,3 @@
+@@ -171,4 +200,3 @@
tasks are able to consume. This lets it measure CPU use by the system, say in
interrupt handlers, that task-based CPU accounting does not include.
For this to work rds-stress must be started with -c on an idle system.