PSARC 2007/347 NFS/RDMA - Transport Version Update
author Robert Gordon <Robert.Gordon@Sun.COM>
Thu, 21 Aug 2008 18:01:07 -0500
changeset 7387 0b3a92e31fd8
parent 7386 78eaaa8a7347
child 7388 f9a21a761e49
PSARC 2007/347 NFS/RDMA - Transport Version Update 6661313 mountd does not deal well with malformed authentication requests from NFS server kernel 6551906 Update the kernel RPC/RDMA transport protocol to latest definitions Portions contributed by Ranjit Noronha <[email protected]> Lei Chai <[email protected]> Weikuan Yu <[email protected]>
usr/src/cmd/fs.d/nfs/mountd/nfsauth.c
usr/src/uts/common/fs/nfs/nfs3_srv.c
usr/src/uts/common/fs/nfs/nfs3_vnops.c
usr/src/uts/common/fs/nfs/nfs3_xdr.c
usr/src/uts/common/fs/nfs/nfs4_srv.c
usr/src/uts/common/fs/nfs/nfs4_vnops.c
usr/src/uts/common/fs/nfs/nfs4_xdr.c
usr/src/uts/common/fs/nfs/nfs_srv.c
usr/src/uts/common/fs/nfs/nfs_vnops.c
usr/src/uts/common/fs/nfs/nfs_xdr.c
usr/src/uts/common/io/lvm/md/md_med.c
usr/src/uts/common/nfs/nfs.h
usr/src/uts/common/nfs/nfs4_kprot.h
usr/src/uts/common/rpc/clnt.h
usr/src/uts/common/rpc/clnt_rdma.c
usr/src/uts/common/rpc/ib.h
usr/src/uts/common/rpc/rdma_subr.c
usr/src/uts/common/rpc/rpc_prot.c
usr/src/uts/common/rpc/rpc_rdma.h
usr/src/uts/common/rpc/rpcib.c
usr/src/uts/common/rpc/rpcsec_gss.h
usr/src/uts/common/rpc/sec_gss/rpcsec_gss.c
usr/src/uts/common/rpc/svc.h
usr/src/uts/common/rpc/svc_rdma.c
usr/src/uts/common/rpc/xdr.c
usr/src/uts/common/rpc/xdr.h
usr/src/uts/common/rpc/xdr_array.c
usr/src/uts/common/rpc/xdr_mblk.c
usr/src/uts/common/rpc/xdr_rdma.c
usr/src/uts/common/rpc/xdrrdma_sizeof.c
usr/src/uts/intel/ia32/ml/modstubs.s
usr/src/uts/sparc/ml/modstubs.s
--- a/usr/src/cmd/fs.d/nfs/mountd/nfsauth.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/cmd/fs.d/nfs/mountd/nfsauth.c	Thu Aug 21 18:01:07 2008 -0500
@@ -20,12 +20,10 @@
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/types.h>
@@ -75,6 +73,9 @@
 	nbuf.len = argp->req_client.n_len;
 	nbuf.buf = argp->req_client.n_bytes;
 
+	if (nbuf.len == 0 || nbuf.buf == NULL)
+		return;
+
 	if (netdir_getbyaddr(nconf, &clnames, &nbuf)) {
 		host = &tmp[0];
 		if (strcmp(nconf->nc_protofmly, NC_INET) == 0) {
@@ -88,7 +89,7 @@
 			/* LINTED pointer */
 			sa = (struct sockaddr_in6 *)nbuf.buf;
 			(void) inet_ntop(AF_INET6, sa->sin6_addr.s6_addr,
-				    tmp, INET6_ADDRSTRLEN);
+			    tmp, INET6_ADDRSTRLEN);
 		}
 		clnames = anon_client(host);
 	}
@@ -108,7 +109,7 @@
 
 	if (result->auth_perm == NFSAUTH_DENIED) {
 		syslog(LOG_ERR, "%s denied access to %s",
-			clnames->h_hostservs[0].h_host, argp->req_path);
+		    clnames->h_hostservs[0].h_host, argp->req_path);
 	}
 
 done:
--- a/usr/src/uts/common/fs/nfs/nfs3_srv.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/fs/nfs/nfs3_srv.c	Thu Aug 21 18:01:07 2008 -0500
@@ -26,8 +26,6 @@
 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
 /* All Rights Reserved */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
@@ -52,6 +50,7 @@
 #include <rpc/types.h>
 #include <rpc/auth.h>
 #include <rpc/svc.h>
+#include <rpc/rpc_rdma.h>
 
 #include <nfs/nfs.h>
 #include <nfs/export.h>
@@ -83,6 +82,7 @@
 static int	vattr_to_wcc_attr(struct vattr *, wcc_attr *);
 static void	vattr_to_pre_op_attr(struct vattr *, pre_op_attr *);
 static void	vattr_to_wcc_data(struct vattr *, struct vattr *, wcc_data *);
+static int	rdma_setup_read_data3(READ3args *, READ3resok *);
 
 u_longlong_t nfs3_srv_caller_id;
 
@@ -906,6 +906,10 @@
 		kmem_free(resp->resok.data, MAXPATHLEN + 1);
 }
 
+/*
+ * Server routine to handle read
+ * May handle RDMA data as well as mblks
+ */
 /* ARGSUSED */
 void
 rfs3_read(READ3args *args, READ3res *resp, struct exportinfo *exi,
@@ -1030,6 +1034,9 @@
 		resp->resok.data.data_len = 0;
 		resp->resok.data.data_val = NULL;
 		resp->resok.data.mp = NULL;
+		/* RDMA */
+		resp->resok.wlist = args->wlist;
+		resp->resok.wlist_len = resp->resok.count;
 		goto done;
 	}
 
@@ -1044,6 +1051,9 @@
 		resp->resok.data.data_len = 0;
 		resp->resok.data.data_val = NULL;
 		resp->resok.data.mp = NULL;
+		/* RDMA */
+		resp->resok.wlist = args->wlist;
+		resp->resok.wlist_len = resp->resok.count;
 		goto done;
 	}
 
@@ -1055,18 +1065,30 @@
 		args->count = rfs3_tsize(req);
 
 	/*
-	 * mp will contain the data to be sent out in the read reply.
-	 * This will be freed after the reply has been sent out (by the
-	 * driver).
-	 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
-	 * that the call to xdrmblk_putmblk() never fails.
+	 * If returning data via RDMA Write, then grab the chunk list.
+	 * If we aren't returning READ data w/RDMA_WRITE, then grab
+	 * a mblk.
 	 */
-	mp = allocb_wait(RNDUP(args->count), BPRI_MED, STR_NOSIG, &alloc_err);
-	ASSERT(mp != NULL);
-	ASSERT(alloc_err == 0);
-
-	iov.iov_base = (caddr_t)mp->b_datap->db_base;
-	iov.iov_len = args->count;
+	if (args->wlist) {
+		mp = NULL;
+		(void) rdma_get_wchunk(req, &iov, args->wlist);
+	} else {
+		/*
+		 * mp will contain the data to be sent out in the read reply.
+		 * This will be freed after the reply has been sent out (by the
+		 * driver).
+		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
+		 * that the call to xdrmblk_putmblk() never fails.
+		 */
+		mp = allocb_wait(RNDUP(args->count), BPRI_MED, STR_NOSIG,
+		    &alloc_err);
+		ASSERT(mp != NULL);
+		ASSERT(alloc_err == 0);
+
+		iov.iov_base = (caddr_t)mp->b_datap->db_base;
+		iov.iov_len = args->count;
+	}
+
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_segflg = UIO_SYSSPACE;
@@ -1106,18 +1128,6 @@
 
 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 
-#if 0 /* notyet */
-	/*
-	 * Don't do this.  It causes local disk writes when just
-	 * reading the file and the overhead is deemed larger
-	 * than the benefit.
-	 */
-	/*
-	 * Force modified metadata out to stable storage.
-	 */
-	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
-#endif
-
 	if (in_crit)
 		nbl_end_crit(vp);
 
@@ -1129,12 +1139,19 @@
 	else
 		resp->resok.eof = FALSE;
 	resp->resok.data.data_len = resp->resok.count;
-	resp->resok.data.data_val = (char *)mp->b_datap->db_base;
-
 	resp->resok.data.mp = mp;
-
 	resp->resok.size = (uint_t)args->count;
 
+	if (args->wlist) {
+		resp->resok.data.data_val = (caddr_t)iov.iov_base;
+		if (!rdma_setup_read_data3(args, &(resp->resok))) {
+			resp->status = NFS3ERR_INVAL;
+		}
+	} else {
+		resp->resok.data.data_val = (caddr_t)mp->b_datap->db_base;
+		(resp->resok).wlist = NULL;
+	}
+
 done:
 	DTRACE_NFSV3_4(op__read__done, struct svc_req *, req,
 	    cred_t *, cr, vnode_t *, vp, READ3res *, resp);
@@ -1331,6 +1348,12 @@
 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
 		}
 		mblk_to_iov(args->mblk, iovcnt, iovp);
+
+	} else if (args->rlist != NULL) {
+		iovcnt = 1;
+		iovp = iov;
+		iovp->iov_base = (char *)((args->rlist)->u.c_daddr3);
+		iovp->iov_len = args->count;
 	} else {
 		iovcnt = 1;
 		iovp = iov;
@@ -4261,7 +4284,7 @@
 {
 
 	/* Return error if time or size overflow */
-	if (!  (NFS_TIME_T_OK(vap->va_mtime.tv_sec) &&
+	if (!(NFS_TIME_T_OK(vap->va_mtime.tv_sec) &&
 	    NFS_TIME_T_OK(vap->va_ctime.tv_sec) &&
 	    NFS3_SIZE_OK(vap->va_size))) {
 		return (EOVERFLOW);
@@ -4351,6 +4374,50 @@
 
 }
 
+static int
+rdma_setup_read_data3(READ3args *args, READ3resok *rok)
+{
+	struct clist	*wcl;
+	int		data_len, avail_len, num;
+	count3		count = rok->count;
+
+	data_len = num = avail_len = 0;
+
+	wcl = args->wlist;
+	while (wcl != NULL) {
+		if (wcl->c_dmemhandle.mrc_rmr == 0)
+			break;
+
+		avail_len += wcl->c_len;
+		if (wcl->c_len < count) {
+			data_len += wcl->c_len;
+		} else {
+			/* Can make the rest chunks all 0-len */
+			data_len += count;
+			wcl->c_len = count;
+		}
+		count -= wcl->c_len;
+		num ++;
+		wcl = wcl->c_next;
+	}
+
+	/*
+	 * MUST fail if there are still more data
+	 */
+	if (count > 0) {
+		DTRACE_PROBE2(nfss__e__read3_wlist_fail,
+		    int, data_len, int, count);
+		return (FALSE);
+	}
+
+	wcl = args->wlist;
+	rok->count = data_len;
+	rok->wlist_len = data_len;
+	rok->wlist = wcl;
+
+	return (TRUE);
+}
+
 void
 rfs3_srvrfini(void)
 {
--- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c	Thu Aug 21 18:01:07 2008 -0500
@@ -28,8 +28,6 @@
  *	All rights reserved.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
@@ -67,6 +65,7 @@
 #include <rpc/types.h>
 #include <rpc/auth.h>
 #include <rpc/clnt.h>
+#include <rpc/rpc_rdma.h>
 
 #include <nfs/nfs.h>
 #include <nfs/nfs_clnt.h>
@@ -476,6 +475,8 @@
 
 	res.uiop = uiop;
 
+	res.wlist = NULL;
+
 	offset = uiop->uio_loffset;
 	count = uiop->uio_resid;
 
@@ -491,6 +492,9 @@
 			args.offset = (offset3)offset;
 			args.count = (count3)tsize;
 			res.size = (uint_t)tsize;
+			args.res_uiop = uiop;
+			args.res_data_val_alt = NULL;
+
 			error = rfs3call(mi, NFSPROC3_READ,
 			    xdr_READ3args, (caddr_t)&args,
 			    xdr_READ3uiores, (caddr_t)&res, cr,
@@ -1111,6 +1115,7 @@
 	res.pov.fres.vp = vp;
 	res.pov.fres.vap = &va;
 
+	res.wlist = NULL;
 	*residp = count;
 	do {
 		if (mi->mi_io_kstats) {
@@ -1130,6 +1135,9 @@
 			res.data.data_len = tsize;
 			args.offset = (offset3)offset;
 			args.count = (count3)tsize;
+			args.res_uiop = NULL;
+			args.res_data_val_alt = base;
+
 			t = gethrtime();
 			error = rfs3call(mi, NFSPROC3_READ,
 			    xdr_READ3args, (caddr_t)&args,
@@ -1667,7 +1675,7 @@
 	t = gethrtime();
 
 	error = rfs3call(VTOMI(vp), NFSPROC3_READLINK,
-	    xdr_nfs_fh3, (caddr_t)&args,
+	    xdr_READLINK3args, (caddr_t)&args,
 	    xdr_READLINK3res, (caddr_t)&res, cr,
 	    &douprintf, &res.status, 0, &fi);
 
--- a/usr/src/uts/common/fs/nfs/nfs3_xdr.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/fs/nfs/nfs3_xdr.c	Thu Aug 21 18:01:07 2008 -0500
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
 /* All Rights Reserved */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
@@ -51,6 +49,7 @@
 
 #include <nfs/nfs.h>
 #include <nfs/rnode.h>
+#include <rpc/rpc_rdma.h>
 
 /*
  * These are the XDR routines used to serialize and deserialize
@@ -539,22 +538,22 @@
 		return (TRUE);
 	}
 	if (!(xdr_enum(xdrs, (enum_t *)&na->type) &&
-		xdr_u_int(xdrs, &na->mode) &&
-		xdr_u_int(xdrs, &na->nlink) &&
-		xdr_u_int(xdrs, &na->uid) &&
-		xdr_u_int(xdrs, &na->gid) &&
-		xdr_u_longlong_t(xdrs, &na->size) &&
-		xdr_u_longlong_t(xdrs, &na->used) &&
-		xdr_u_int(xdrs, &na->rdev.specdata1) &&
-		xdr_u_int(xdrs, &na->rdev.specdata2) &&
-		xdr_u_longlong_t(xdrs, &na->fsid) &&
-		xdr_u_longlong_t(xdrs, &na->fileid) &&
-		xdr_u_int(xdrs, &na->atime.seconds) &&
-		xdr_u_int(xdrs, &na->atime.nseconds) &&
-		xdr_u_int(xdrs, &na->mtime.seconds) &&
-		xdr_u_int(xdrs, &na->mtime.nseconds) &&
-		xdr_u_int(xdrs, &na->ctime.seconds) &&
-		xdr_u_int(xdrs, &na->ctime.nseconds)))
+	    xdr_u_int(xdrs, &na->mode) &&
+	    xdr_u_int(xdrs, &na->nlink) &&
+	    xdr_u_int(xdrs, &na->uid) &&
+	    xdr_u_int(xdrs, &na->gid) &&
+	    xdr_u_longlong_t(xdrs, &na->size) &&
+	    xdr_u_longlong_t(xdrs, &na->used) &&
+	    xdr_u_int(xdrs, &na->rdev.specdata1) &&
+	    xdr_u_int(xdrs, &na->rdev.specdata2) &&
+	    xdr_u_longlong_t(xdrs, &na->fsid) &&
+	    xdr_u_longlong_t(xdrs, &na->fileid) &&
+	    xdr_u_int(xdrs, &na->atime.seconds) &&
+	    xdr_u_int(xdrs, &na->atime.nseconds) &&
+	    xdr_u_int(xdrs, &na->mtime.seconds) &&
+	    xdr_u_int(xdrs, &na->mtime.nseconds) &&
+	    xdr_u_int(xdrs, &na->ctime.seconds) &&
+	    xdr_u_int(xdrs, &na->ctime.nseconds)))
 			return (FALSE);
 	return (TRUE);
 }
@@ -668,16 +667,16 @@
 		 * Slow path
 		 */
 		if (!(xdr_enum(xdrs, (enum_t *)&vap->va_type) &&
-			xdr_u_int(xdrs, &vap->va_mode) &&
-			xdr_u_int(xdrs, &vap->va_nlink) &&
-			xdr_u_int(xdrs, (uint_t *)&vap->va_uid) &&
-			xdr_u_int(xdrs, (uint_t *)&vap->va_gid) &&
-			xdr_u_longlong_t(xdrs, &vap->va_size) &&
-			xdr_u_longlong_t(xdrs, &used) &&
-			xdr_u_int(xdrs, &rdev.specdata1) &&
-			xdr_u_int(xdrs, &rdev.specdata2) &&
-			xdr_u_longlong_t(xdrs, &fsid) &&	/* ignored */
-			xdr_u_longlong_t(xdrs, &vap->va_nodeid)))
+		    xdr_u_int(xdrs, &vap->va_mode) &&
+		    xdr_u_int(xdrs, &vap->va_nlink) &&
+		    xdr_u_int(xdrs, (uint_t *)&vap->va_uid) &&
+		    xdr_u_int(xdrs, (uint_t *)&vap->va_gid) &&
+		    xdr_u_longlong_t(xdrs, &vap->va_size) &&
+		    xdr_u_longlong_t(xdrs, &used) &&
+		    xdr_u_int(xdrs, &rdev.specdata1) &&
+		    xdr_u_int(xdrs, &rdev.specdata2) &&
+		    xdr_u_longlong_t(xdrs, &fsid) &&	/* ignored */
+		    xdr_u_longlong_t(xdrs, &vap->va_nodeid)))
 				return (FALSE);
 
 		if (nfs_allow_preepoch_time) {
@@ -779,8 +778,8 @@
 	case VDIR:
 	case VLNK:
 		vap->va_nblocks = (u_longlong_t)
-			((used + (size3)DEV_BSIZE - (size3)1) /
-			(size3)DEV_BSIZE);
+		    ((used + (size3)DEV_BSIZE - (size3)1) /
+		    (size3)DEV_BSIZE);
 		break;
 	case VBLK:
 		vap->va_blksize = DEV_BSIZE;
@@ -1199,6 +1198,23 @@
 }
 
 bool_t
+xdr_READLINK3args(XDR *xdrs,  READLINK3args *objp)
+{
+	rdma_chunkinfo_t rci;
+	struct xdr_ops *xops = xdrrdma_xops();
+
+	if ((xdrs->x_ops == &xdrrdma_ops || xdrs->x_ops == xops) &&
+	    xdrs->x_op == XDR_ENCODE) {
+		rci.rci_type = RCI_REPLY_CHUNK;
+		rci.rci_len = MAXPATHLEN;
+		XDR_CONTROL(xdrs, XDR_RDMA_ADD_CHUNK, &rci);
+	}
+	if (!xdr_nfs_fh3(xdrs, (nfs_fh3 *)objp))
+		return (FALSE);
+	return (TRUE);
+}
+
+bool_t
 xdr_READLINK3res(XDR *xdrs, READLINK3res *objp)
 {
 
@@ -1208,7 +1224,7 @@
 		return (FALSE);
 	if (objp->status != NFS3_OK)
 		return (xdr_post_op_attr(xdrs,
-			&objp->resfail.symlink_attributes));
+		    &objp->resfail.symlink_attributes));
 
 	/* xdr_READLINK3resok */
 	resokp = &objp->resok;
@@ -1220,6 +1236,10 @@
 bool_t
 xdr_READ3args(XDR *xdrs, READ3args *objp)
 {
+	rdma_chunkinfo_t rci;
+	rdma_wlist_conn_info_t rwci;
+	struct xdr_ops *xops = xdrrdma_xops();
+
 	switch (xdrs->x_op) {
 	case XDR_FREE:
 	case XDR_ENCODE:
@@ -1233,7 +1253,46 @@
 	}
 	if (!xdr_u_longlong_t(xdrs, &objp->offset))
 		return (FALSE);
-	return (xdr_u_int(xdrs, &objp->count));
+	if (!xdr_u_int(xdrs, &objp->count))
+		return (FALSE);
+
+	DTRACE_PROBE1(xdr__i__read3_buf_len, int, objp->count);
+
+	objp->wlist = NULL;
+
+	/* if xdrrdma_sizeof in progress, then store the size */
+	if (xdrs->x_ops == xops && xdrs->x_op == XDR_ENCODE) {
+		rci.rci_type = RCI_WRITE_ADDR_CHUNK;
+		rci.rci_len = objp->count;
+		(void) XDR_CONTROL(xdrs, XDR_RDMA_ADD_CHUNK, &rci);
+	}
+
+	if (xdrs->x_ops != &xdrrdma_ops || xdrs->x_op == XDR_FREE)
+		return (TRUE);
+
+	if (xdrs->x_op == XDR_ENCODE) {
+
+		if (objp->res_uiop != NULL) {
+			rci.rci_type = RCI_WRITE_UIO_CHUNK;
+			rci.rci_a.rci_uiop = objp->res_uiop;
+			rci.rci_len = objp->count;
+			rci.rci_clpp = &objp->wlist;
+		} else {
+			rci.rci_type = RCI_WRITE_ADDR_CHUNK;
+			rci.rci_a.rci_addr = objp->res_data_val_alt;
+			rci.rci_len = objp->count;
+			rci.rci_clpp = &objp->wlist;
+		}
+
+		return (XDR_CONTROL(xdrs, XDR_RDMA_ADD_CHUNK, &rci));
+	}
+
+	/* XDR_DECODE case */
+	(void) XDR_CONTROL(xdrs, XDR_RDMA_GET_WCINFO, &rwci);
+	objp->wlist = rwci.rwci_wlist;
+	objp->conn = rwci.rwci_conn;
+
+	return (TRUE);
 }
 
 bool_t
@@ -1264,7 +1323,7 @@
 		if (mp != NULL && xdrs->x_ops == &xdrmblk_ops) {
 			mp->b_wptr += resokp->count;
 			rndup = BYTES_PER_XDR_UNIT -
-				(resokp->data.data_len % BYTES_PER_XDR_UNIT);
+			    (resokp->data.data_len % BYTES_PER_XDR_UNIT);
 			if (rndup != BYTES_PER_XDR_UNIT)
 				for (i = 0; i < rndup; i++)
 					*mp->b_wptr++ = '\0';
@@ -1272,6 +1331,26 @@
 				resokp->data.mp = NULL;
 				return (TRUE);
 			}
+		} else if (mp == NULL) {
+			if (xdr_u_int(xdrs, &resokp->count) == FALSE) {
+				return (FALSE);
+			}
+			/*
+			 * If read data sent by wlist (RDMA_WRITE), don't do
+			 * xdr_bytes() below.   RDMA_WRITE transfers the data.
+			 * Note: this is encode-only because the client code
+			 * uses xdr_READ3vres/xdr_READ3uiores to decode results.
+			 */
+			if (resokp->wlist) {
+				if (resokp->wlist->c_len != resokp->count) {
+					resokp->wlist->c_len = resokp->count;
+				}
+				if (resokp->count != 0) {
+					return (xdrrdma_send_read_data(
+					    xdrs, resokp->wlist));
+				}
+				return (TRUE);
+			}
 		}
 		/*
 		 * Fall thru for the xdr_bytes()
@@ -1281,6 +1360,8 @@
 		 */
 	}
 
+	/* no RDMA_WRITE transfer -- send data inline */
+
 	ret = xdr_bytes(xdrs, (char **)&resokp->data.data_val,
 	    &resokp->data.data_len, nfs3tsize());
 
@@ -1290,6 +1371,7 @@
 bool_t
 xdr_READ3vres(XDR *xdrs, READ3vres *objp)
 {
+	count3 ocount;
 	/*
 	 * DECODE or FREE only
 	 */
@@ -1314,6 +1396,31 @@
 	if (!xdr_bool(xdrs, &objp->eof))
 		return (FALSE);
 
+	/*
+	 * If read data received via RDMA_WRITE, don't do xdr_bytes().
+	 * RDMA_WRITE already moved the data so decode length of RDMA_WRITE.
+	 */
+	if (xdrs->x_ops == &xdrrdma_ops) {
+		struct clist *cl;
+
+		XDR_CONTROL(xdrs, XDR_RDMA_GET_WLIST, &cl);
+
+		if (cl) {
+			if (!xdr_u_int(xdrs, &ocount)) {
+				return (FALSE);
+			}
+			if (ocount != objp->count) {
+				DTRACE_PROBE2(xdr__e__read3vres_fail,
+				    int, ocount, int, objp->count);
+				return (FALSE);
+			}
+
+			objp->wlist_len = cl->c_len;
+			objp->data.data_len = objp->wlist_len;
+			return (TRUE);
+		}
+	}
+
 	return (xdr_bytes(xdrs, (char **)&objp->data.data_val,
 	    &objp->data.data_len, nfs3tsize()));
 }
@@ -1321,6 +1428,7 @@
 bool_t
 xdr_READ3uiores(XDR *xdrs, READ3uiores *objp)
 {
+	count3 ocount;
 	bool_t attributes;
 	mblk_t *mp;
 	size_t n;
@@ -1384,7 +1492,7 @@
 			if ((n = MIN(uiop->uio_resid, n)) != 0) {
 
 				error = uiomove((char *)mp->b_rptr, n, UIO_READ,
-						uiop);
+				    uiop);
 				if (error)
 					return (FALSE);
 				mp->b_rptr += n;
@@ -1398,9 +1506,46 @@
 		return (TRUE);
 	}
 
+	if (xdrs->x_ops == &xdrrdma_ops) {
+		struct clist *cl;
+
+		XDR_CONTROL(xdrs, XDR_RDMA_GET_WLIST, &cl);
+
+		objp->wlist = cl;
+
+		if (objp->wlist) {
+			if (!xdr_u_int(xdrs, &ocount)) {
+				objp->wlist = NULL;
+				return (FALSE);
+			}
+
+			if (ocount != objp->count) {
+				DTRACE_PROBE2(xdr__e__read3uiores_fail,
+				    int, ocount, int, objp->count);
+				objp->wlist = NULL;
+				return (FALSE);
+			}
+
+			objp->wlist_len = cl->c_len;
+
+			uiop->uio_resid -= objp->count;
+			uiop->uio_iov->iov_len -= objp->count;
+			uiop->uio_iov->iov_base += objp->count;
+			uiop->uio_loffset += objp->count;
+
+			/*
+			 * XXX: Assume 1 iov, needs to be changed.
+			 */
+			objp->size = objp->wlist_len;
+
+			return (TRUE);
+		}
+	}
+
 	/*
-	 * This isn't an xdrmblk stream.   Handle the likely
-	 * case that it can be inlined (ex. xdrmem).
+	 * This isn't an xdrmblk stream nor RDMA.
+	 * Handle the likely case that it can be
+	 * inlined (ex. xdrmem).
 	 */
 	if (!XDR_GETINT32(xdrs, (int32_t *)&objp->size))
 		return (FALSE);
@@ -1461,9 +1606,38 @@
 			}
 		}
 		objp->mblk = NULL;
+
+		if (xdrs->x_ops == &xdrrdmablk_ops) {
+			if (xdrrdma_getrdmablk(xdrs, &objp->rlist,
+			    &objp->data.data_len,
+			    &objp->conn, nfs3tsize()) == TRUE) {
+				objp->data.data_val = NULL;
+				if (xdrrdma_read_from_client(
+				    &objp->rlist,
+				    &objp->conn,
+				    objp->count) == FALSE) {
+					return (FALSE);
+				}
+				return (TRUE);
+			}
+		}
+		objp->rlist = NULL;
+
 		/* Else fall thru for the xdr_bytes(). */
 	}
 
+	if (xdrs->x_op == XDR_FREE) {
+		if (objp->rlist != NULL) {
+			(void) xdrrdma_free_clist(objp->conn, objp->rlist);
+			objp->rlist = NULL;
+			objp->data.data_val = NULL;
+			return (TRUE);
+		}
+	}
+
+	DTRACE_PROBE1(xdr__i__write3_buf_len,
+	    int, objp->data.data_len);
+
 	return (xdr_bytes(xdrs, (char **)&objp->data.data_val,
 	    &objp->data.data_len, nfs3tsize()));
 }
@@ -1764,6 +1938,9 @@
 bool_t
 xdr_READDIR3args(XDR *xdrs, READDIR3args *objp)
 {
+	rdma_chunkinfo_t rci;
+	struct xdr_ops *xops = xdrrdma_xops();
+
 	if (xdrs->x_op == XDR_FREE)
 		return (TRUE);
 
@@ -1778,6 +1955,13 @@
 			return (FALSE);
 		break;
 	}
+	if ((xdrs->x_ops == &xdrrdma_ops || xdrs->x_ops == xops) &&
+	    xdrs->x_op == XDR_ENCODE) {
+		rci.rci_type = RCI_REPLY_CHUNK;
+		rci.rci_len = objp->count;
+		XDR_CONTROL(xdrs, XDR_RDMA_ADD_CHUNK, &rci);
+	}
+
 	if (!xdr_u_longlong_t(xdrs, &objp->cookie))
 		return (FALSE);
 	/*
@@ -2023,6 +2207,9 @@
 bool_t
 xdr_READDIRPLUS3args(XDR *xdrs, READDIRPLUS3args *objp)
 {
+	rdma_chunkinfo_t rci;
+	struct xdr_ops *xops = xdrrdma_xops();
+
 	if (xdrs->x_op == XDR_FREE)
 		return (TRUE);
 
@@ -2037,6 +2224,13 @@
 			return (FALSE);
 		break;
 	}
+	if ((xdrs->x_ops == &xdrrdma_ops || xdrs->x_ops == xops) &&
+	    xdrs->x_op == XDR_ENCODE) {
+		rci.rci_type = RCI_REPLY_CHUNK;
+		rci.rci_len = objp->maxcount;
+		XDR_CONTROL(xdrs, XDR_RDMA_ADD_CHUNK, &rci);
+	}
+
 	if (!xdr_u_longlong_t(xdrs, &objp->cookie))
 		return (FALSE);
 	/*
@@ -2254,7 +2448,7 @@
 			return (FALSE);
 
 		if (pov.attributes == TRUE &&
-				pov.fres.status == NFS3_OK)
+		    pov.fres.status == NFS3_OK)
 			va_valid = TRUE;
 		else
 			va_valid = FALSE;
@@ -2286,14 +2480,14 @@
 		 * we cannot determine the type for.
 		 */
 		if (!(namlen == 1 && dp->d_name[0] == '.') &&
-			va_valid && fh_valid) {
+		    va_valid && fh_valid) {
 
 			/*
 			 * Do the DNLC caching
 			 */
 			nvp = makenfs3node_va(&fh, &va, dvp->v_vfsp,
-				objp->time, objp->credentials,
-				rp->r_path, dp->d_name);
+			    objp->time, objp->credentials,
+			    rp->r_path, dp->d_name);
 			dnlc_update(dvp, dp->d_name, nvp);
 			VN_RELE(nvp);
 		}
--- a/usr/src/uts/common/fs/nfs/nfs4_srv.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv.c	Thu Aug 21 18:01:07 2008 -0500
@@ -28,8 +28,6 @@
  *	All Rights Reserved
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
@@ -80,10 +78,12 @@
 #define	RFS4_MAXLOCK_TRIES 4	/* Try to get the lock this many times */
 static int rfs4_maxlock_tries = RFS4_MAXLOCK_TRIES;
 #define	RFS4_LOCK_DELAY 10	/* Milliseconds */
-static clock_t rfs4_lock_delay = RFS4_LOCK_DELAY;
-
+static clock_t  rfs4_lock_delay = RFS4_LOCK_DELAY;
+extern struct svc_ops rdma_svc_ops;
 /* End of Tunables */
 
+static int rdma_setup_read_data4(READ4args *, READ4res *);
+
 /*
  * Used to bump the stateid4.seqid value and show changes in the stateid
  */
@@ -144,13 +144,13 @@
 
 static sysid_t lockt_sysid;		/* dummy sysid for all LOCKT calls */
 
-u_longlong_t nfs4_srv_caller_id;
-uint_t nfs4_srv_vkey = 0;
+u_longlong_t	nfs4_srv_caller_id;
+uint_t		nfs4_srv_vkey = 0;
 
 verifier4	Write4verf;
 verifier4	Readdir4verf;
 
-void		rfs4_init_compound_state(struct compound_state *);
+void	rfs4_init_compound_state(struct compound_state *);
 
 static void	nullfree(caddr_t);
 static void	rfs4_op_inval(nfs_argop4 *, nfs_resop4 *, struct svc_req *,
@@ -264,11 +264,11 @@
 		    struct compound_state *cs, struct nfs4_svgetit_arg *sargp,
 		    struct nfs4_ntov_table *ntovp, nfs4_attr_cmd_t cmd);
 
-fem_t	*deleg_rdops;
-fem_t	*deleg_wrops;
-
-rfs4_servinst_t	*rfs4_cur_servinst = NULL;	/* current server instance */
-kmutex_t	rfs4_servinst_lock;		/* protects linked list */
+fem_t		*deleg_rdops;
+fem_t		*deleg_wrops;
+
+rfs4_servinst_t *rfs4_cur_servinst = NULL;	/* current server instance */
+kmutex_t	rfs4_servinst_lock;	/* protects linked list */
 int		rfs4_seen_first_compound;	/* set first time we see one */
 
 /*
@@ -332,10 +332,10 @@
 	{rfs4_op_locku, nullfree, 0},
 
 	/* OP_LOOKUP = 15 */
-	{rfs4_op_lookup, nullfree, (RPC_IDEMPOTENT|RPC_PUBLICFH_OK)},
+	{rfs4_op_lookup, nullfree, (RPC_IDEMPOTENT | RPC_PUBLICFH_OK)},
 
 	/* OP_LOOKUPP = 16 */
-	{rfs4_op_lookupp, nullfree, (RPC_IDEMPOTENT|RPC_PUBLICFH_OK)},
+	{rfs4_op_lookupp, nullfree, (RPC_IDEMPOTENT | RPC_PUBLICFH_OK)},
 
 	/* OP_NVERIFY = 17 */
 	{rfs4_op_nverify, nullfree, RPC_IDEMPOTENT},
@@ -413,12 +413,12 @@
 
 #ifdef DEBUG
 
-int rfs4_fillone_debug = 0;
-int rfs4_shrlock_debug = 0;
-int rfs4_no_stub_access = 1;
-int rfs4_rddir_debug = 0;
-
-static char *rfs4_op_string[] = {
+int		rfs4_fillone_debug = 0;
+int		rfs4_shrlock_debug = 0;
+int		rfs4_no_stub_access = 1;
+int		rfs4_rddir_debug = 0;
+
+static char    *rfs4_op_string[] = {
 	"rfs4_op_null",
 	"rfs4_op_1 unused",
 	"rfs4_op_2 unused",
@@ -463,9 +463,9 @@
 };
 #endif
 
-void rfs4_ss_chkclid(rfs4_client_t *);
-
-extern size_t strlcpy(char *dst, const char *src, size_t dstsize);
+void	rfs4_ss_chkclid(rfs4_client_t *);
+
+extern size_t   strlcpy(char *dst, const char *src, size_t dstsize);
 
 #ifdef	nextdp
 #undef nextdp
@@ -1321,8 +1321,8 @@
 		    (!is_system_labeled() || admin_low_client ||
 		    blequal(clabel, slabel)))
 			resp->access |=
-			    (args->access & (ACCESS4_MODIFY|ACCESS4_EXTEND));
-		resp->supported |= (ACCESS4_MODIFY|ACCESS4_EXTEND);
+			    (args->access & (ACCESS4_MODIFY | ACCESS4_EXTEND));
+		resp->supported |= (ACCESS4_MODIFY | ACCESS4_EXTEND);
 	}
 
 	if (checkwriteperm &&
@@ -2218,8 +2218,8 @@
 		amap = ntov.amap;
 		for (i = 0; i < ntov.attrcnt; i++, na++, amap++) {
 			if (!(*nfs4_ntov_map[*amap].xfunc)(&xdr, na)) {
-				cmn_err(CE_WARN, "do_rfs4_op_getattr: xdr "
-				    "encode of attribute %d failed\n", *amap);
+				DTRACE_PROBE1(nfss__e__getattr4_encfail,
+				    int, *amap);
 				status = NFS4ERR_SERVERFAULT;
 				break;
 			}
@@ -3120,6 +3120,9 @@
 		resp->data_len = 0;
 		resp->data_val = NULL;
 		resp->mblk = NULL;
+		/* RDMA */
+		resp->wlist = args->wlist;
+		resp->wlist_len = resp->data_len;
 		*cs->statusp = resp->status = NFS4_OK;
 		goto out;
 	}
@@ -3130,6 +3133,9 @@
 		resp->data_len = 0;
 		resp->data_val = NULL;
 		resp->mblk = NULL;
+		/* RDMA */
+		resp->wlist = args->wlist;
+		resp->wlist_len = resp->data_len;
 		goto out;
 	}
 
@@ -3141,26 +3147,36 @@
 		args->count = rfs4_tsize(req);
 
 	/*
-	 * mp will contain the data to be sent out in the read reply.
-	 * It will be freed after the reply has been sent.
-	 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple,
-	 * so that the call to xdrmblk_putmblk() never fails.
-	 * If the first alloc of the requested size fails, then
-	 * decrease the size to something more reasonable and wait
-	 * for the allocation to occur.
-	 */
-	mp = allocb(RNDUP(args->count), BPRI_MED);
-	if (mp == NULL) {
-		if (args->count > MAXBSIZE)
-			args->count = MAXBSIZE;
-		mp = allocb_wait(RNDUP(args->count), BPRI_MED,
-		    STR_NOSIG, &alloc_err);
-	}
-	ASSERT(mp != NULL);
-	ASSERT(alloc_err == 0);
-
-	iov.iov_base = (caddr_t)mp->b_datap->db_base;
-	iov.iov_len = args->count;
+	 * If returning data via RDMA Write, then grab the chunk list. If we
+	 * aren't returning READ data w/RDMA_WRITE, then grab a mblk.
+	 */
+	if (args->wlist) {
+		mp = NULL;
+		(void) rdma_get_wchunk(req, &iov, args->wlist);
+	} else {
+		/*
+		 * mp will contain the data to be sent out in the read reply.
+		 * It will be freed after the reply has been sent. Let's
+		 * roundup the data to a BYTES_PER_XDR_UNIT multiple, so that
+		 * the call to xdrmblk_putmblk() never fails. If the first
+		 * alloc of the requested size fails, then decrease the size to
+		 * something more reasonable and wait for the allocation to
+		 * occur.
+		 */
+		mp = allocb(RNDUP(args->count), BPRI_MED);
+		if (mp == NULL) {
+			if (args->count > MAXBSIZE)
+				args->count = MAXBSIZE;
+			mp = allocb_wait(RNDUP(args->count), BPRI_MED,
+			    STR_NOSIG, &alloc_err);
+		}
+		ASSERT(mp != NULL);
+		ASSERT(alloc_err == 0);
+
+		iov.iov_base = (caddr_t)mp->b_datap->db_base;
+		iov.iov_len = args->count;
+	}
+
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_segflg = UIO_SYSSPACE;
@@ -3183,7 +3199,11 @@
 
 	ASSERT(uio.uio_resid >= 0);
 	resp->data_len = args->count - uio.uio_resid;
-	resp->data_val = (char *)mp->b_datap->db_base;
+	if (mp) {
+		resp->data_val = (char *)mp->b_datap->db_base;
+	} else {
+		resp->data_val = (caddr_t)iov.iov_base;
+	}
 	resp->mblk = mp;
 
 	if (!verror && offset + resp->data_len == va.va_size)
@@ -3191,6 +3211,14 @@
 	else
 		resp->eof = FALSE;
 
+	if (args->wlist) {
+		if (!rdma_setup_read_data4(args, resp)) {
+			*cs->statusp = resp->status = NFS4ERR_INVAL;
+		}
+	} else {
+		resp->wlist = NULL;
+	}
+
 out:
 	if (in_crit)
 		nbl_end_crit(vp);
@@ -3202,7 +3230,7 @@
 static void
 rfs4_op_read_free(nfs_resop4 *resop)
 {
-	READ4res *resp = &resop->nfs_resop4_u.opread;
+	READ4res	*resp = &resop->nfs_resop4_u.opread;
 
 	if (resp->status == NFS4_OK && resp->mblk != NULL) {
 		freeb(resp->mblk);
@@ -3213,9 +3241,9 @@
 }
 
 static void
-rfs4_op_readdir_free(nfs_resop4 *resop)
+rfs4_op_readdir_free(nfs_resop4 * resop)
 {
-	READDIR4res *resp = &resop->nfs_resop4_u.opreaddir;
+	READDIR4res    *resp = &resop->nfs_resop4_u.opreaddir;
 
 	if (resp->status == NFS4_OK && resp->mblk != NULL) {
 		freeb(resp->mblk);
@@ -3228,13 +3256,13 @@
 /* ARGSUSED */
 static void
 rfs4_op_putpubfh(nfs_argop4 *args, nfs_resop4 *resop, struct svc_req *req,
-	struct compound_state *cs)
+    struct compound_state *cs)
 {
-	PUTPUBFH4res *resp = &resop->nfs_resop4_u.opputpubfh;
-	int error;
-	vnode_t *vp;
+	PUTPUBFH4res	*resp = &resop->nfs_resop4_u.opputpubfh;
+	int		error;
+	vnode_t		*vp;
 	struct exportinfo *exi, *sav_exi;
-	nfs_fh4_fmt_t *fh_fmtp;
+	nfs_fh4_fmt_t	*fh_fmtp;
 
 	DTRACE_NFSV4_1(op__putpubfh__start, struct compound_state *, cs);
 
@@ -5378,6 +5406,11 @@
 			iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
 		}
 		mblk_to_iov(args->mblk, iovcnt, iovp);
+	} else if (args->rlist != NULL) {
+		iovcnt = 1;
+		iovp = iov;
+		iovp->iov_base = (char *)((args->rlist)->u.c_daddr3);
+		iovp->iov_len = args->data_len;
 	} else {
 		iovcnt = 1;
 		iovp = iov;
@@ -9082,3 +9115,46 @@
 {
 	(void) rfs4_shrlock(sp, F_UNSHARE);
 }
+
+static int
+rdma_setup_read_data4(READ4args * args, READ4res * rok)
+{
+	struct clist	*wcl;
+	int		data_len, avail_len, num;
+	count4		count = rok->data_len;
+
+	data_len = num = avail_len = 0;
+
+	wcl = args->wlist;
+	while (wcl != NULL) {
+		if (wcl->c_dmemhandle.mrc_rmr == 0)
+			break;
+
+		avail_len += wcl->c_len;
+		if (wcl->c_len < count) {
+			data_len += wcl->c_len;
+		} else {
+			/* Can make the rest chunks all 0-len */
+			data_len += count;
+			wcl->c_len = count;
+		}
+		count -= wcl->c_len;
+		num++;
+		wcl = wcl->c_next;
+	}
+
+	/*
+	 * MUST fail if there are still more data
+	 */
+	if (count > 0) {
+		DTRACE_PROBE2(nfss__e__read4_wlist_fail,
+		    int, data_len, int, count);
+		return (FALSE);
+	}
+	wcl = args->wlist;
+	rok->data_len = data_len;
+	rok->wlist_len = data_len;
+	rok->wlist = wcl;
+
+	return (TRUE);
+}
--- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c	Thu Aug 21 18:01:07 2008 -0500
@@ -28,8 +28,6 @@
  *	All Rights Reserved
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
@@ -3408,12 +3406,15 @@
 			tsize = MIN(mi->mi_tsize, count);
 		else
 			tsize = MIN(mi->mi_curread, count);
+
 		rargs->offset = (offset4)offset;
 		rargs->count = (count4)tsize;
 		rargs->res_data_val_alt = NULL;
 		rargs->res_mblk = NULL;
 		rargs->res_uiop = NULL;
 		rargs->res_maxsize = 0;
+		rargs->wlist = NULL;
+
 		if (uiop)
 			rargs->res_uiop = uiop;
 		else
@@ -3501,7 +3502,6 @@
 
 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
 			    "nfs4read: initiating recovery\n"));
-
 			abort = nfs4_start_recovery(&e,
 			    mi, vp, NULL, &rargs->stateid,
 			    NULL, OP_READ, NULL);
--- a/usr/src/uts/common/fs/nfs/nfs4_xdr.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/fs/nfs/nfs4_xdr.c	Thu Aug 21 18:01:07 2008 -0500
@@ -19,11 +19,11 @@
  * CDDL HEADER END
  */
 /*
- *	Copyright 2007 Sun Microsystems, Inc.
- *	All rights reserved.  Use is subject to license terms.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
+#pragma ident	"@(#)nfs4_xdr.c	1.27	08/08/11 SMI"
 
 /*
  * A handcoded version based on the original rpcgen code.
@@ -67,13 +67,13 @@
 
 #if defined(_LITTLE_ENDIAN)
 		if (XDR_PUTINT32(xdrs, (int32_t *)((char *)objp +
-						BYTES_PER_XDR_UNIT)) == TRUE) {
+		    BYTES_PER_XDR_UNIT)) == TRUE) {
 			return (XDR_PUTINT32(xdrs, (int32_t *)objp));
 		}
 #elif defined(_BIG_ENDIAN)
 		if (XDR_PUTINT32(xdrs, (int32_t *)objp) == TRUE) {
 			return (XDR_PUTINT32(xdrs, (int32_t *)((char *)objp +
-						BYTES_PER_XDR_UNIT)));
+			    BYTES_PER_XDR_UNIT)));
 		}
 #endif
 		return (FALSE);
@@ -88,13 +88,13 @@
 	if (len == 2) {
 #if defined(_LITTLE_ENDIAN)
 		if (XDR_GETINT32(xdrs, (int32_t *)((char *)objp +
-					BYTES_PER_XDR_UNIT)) == TRUE) {
+		    BYTES_PER_XDR_UNIT)) == TRUE) {
 			return (XDR_GETINT32(xdrs, (int32_t *)objp));
 		}
 #elif defined(_BIG_ENDIAN)
 		if (XDR_GETINT32(xdrs, (int32_t *)objp) == TRUE) {
 			return (XDR_GETINT32(xdrs, (int32_t *)((char *)objp +
-					BYTES_PER_XDR_UNIT)));
+			    BYTES_PER_XDR_UNIT)));
 		}
 #endif
 		return (FALSE);
@@ -138,7 +138,7 @@
 {
 	if (xdrs->x_op != XDR_FREE)
 		return (xdr_bytes(xdrs, (char **)&objp->utf8string_val,
-			(uint_t *)&objp->utf8string_len, NFS4_MAX_UTF8STRING));
+		    (uint_t *)&objp->utf8string_len, NFS4_MAX_UTF8STRING));
 
 	if (objp->utf8string_val != NULL) {
 		kmem_free(objp->utf8string_val, objp->utf8string_len);
@@ -495,13 +495,13 @@
 xdr_fs_location4(XDR *xdrs, fs_location4 *objp)
 {
 	if (!xdr_array(xdrs, (char **)&objp->server_val,
-			(uint_t *)&objp->server_len, NFS4_FS_LOCATIONS_LIMIT,
-			sizeof (utf8string), (xdrproc_t)xdr_utf8string))
+	    (uint_t *)&objp->server_len, NFS4_FS_LOCATIONS_LIMIT,
+	    sizeof (utf8string), (xdrproc_t)xdr_utf8string))
 		return (FALSE);
 	return (xdr_array(xdrs, (char **)&objp->rootpath.pathname4_val,
-			(uint_t *)&objp->rootpath.pathname4_len,
-			NFS4_MAX_PATHNAME4,
-			sizeof (utf8string), (xdrproc_t)xdr_utf8string));
+	    (uint_t *)&objp->rootpath.pathname4_len,
+	    NFS4_MAX_PATHNAME4,
+	    sizeof (utf8string), (xdrproc_t)xdr_utf8string));
 }
 
 /* Called by xdr_array */
@@ -522,8 +522,8 @@
 		}
 
 		return (xdr_bytes(xdrs, (char **)&objp->who.utf8string_val,
-			(uint_t *)&objp->who.utf8string_len,
-			NFS4_MAX_UTF8STRING));
+		    (uint_t *)&objp->who.utf8string_len,
+		    NFS4_MAX_UTF8STRING));
 	}
 
 	/*
@@ -555,21 +555,21 @@
 xdr_fattr4_acl(XDR *xdrs, fattr4_acl *objp)
 {
 	return (xdr_array(xdrs, (char **)&objp->fattr4_acl_val,
-			(uint_t *)&objp->fattr4_acl_len, NFS4_ACL_LIMIT,
-			sizeof (nfsace4), (xdrproc_t)xdr_nfsace4));
+	    (uint_t *)&objp->fattr4_acl_len, NFS4_ACL_LIMIT,
+	    sizeof (nfsace4), (xdrproc_t)xdr_nfsace4));
 }
 
 bool_t
 xdr_fattr4_fs_locations(XDR *xdrs, fattr4_fs_locations *objp)
 {
 	if (!xdr_array(xdrs, (char **)&objp->fs_root.pathname4_val,
-			(uint_t *)&objp->fs_root.pathname4_len,
-			NFS4_MAX_PATHNAME4,
-			sizeof (utf8string), (xdrproc_t)xdr_utf8string))
+	    (uint_t *)&objp->fs_root.pathname4_len,
+	    NFS4_MAX_PATHNAME4,
+	    sizeof (utf8string), (xdrproc_t)xdr_utf8string))
 		return (FALSE);
 	return (xdr_array(xdrs, (char **)&objp->locations_val,
-			(uint_t *)&objp->locations_len, NFS4_FS_LOCATIONS_LIMIT,
-			sizeof (fs_location4), (xdrproc_t)xdr_fs_location4));
+	    (uint_t *)&objp->locations_len, NFS4_FS_LOCATIONS_LIMIT,
+	    sizeof (fs_location4), (xdrproc_t)xdr_fs_location4));
 }
 
 bool_t
@@ -766,13 +766,13 @@
 		}
 		if (resbmap & FATTR4_CHANGE_MASK) {
 			if (!xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&garp->n4g_change))
+			    (u_longlong_t *)&garp->n4g_change))
 				return (FALSE);
 			garp->n4g_change_valid = 1;
 		}
 		if (resbmap & FATTR4_SIZE_MASK) {
 			if (!xdr_u_longlong_t(xdrs,
-					(u_longlong_t *)&vap->va_size))
+			    (u_longlong_t *)&vap->va_size))
 				return (FALSE);
 			if (!NFS4_SIZE_OK(vap->va_size)) {
 				garp->n4g_attrerr = EFBIG;
@@ -785,20 +785,20 @@
 			if (!XDR_GETINT32(xdrs, (int *)&truefalse))
 				return (FALSE);
 			gesp->n4g_pc4.pc4_link_support =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 		if (resbmap & FATTR4_SYMLINK_SUPPORT_MASK) {
 			if (!XDR_GETINT32(xdrs, (int *)&truefalse))
 				return (FALSE);
 			gesp->n4g_pc4.pc4_symlink_support =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 		if (resbmap & FATTR4_NAMED_ATTR_MASK) {
 			if (!XDR_GETINT32(xdrs, (int *)&truefalse))
 				return (FALSE);
 			gesp->n4g_pc4.pc4_xattr_exists = TRUE;
 			gesp->n4g_pc4.pc4_xattr_exists =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 	}
 	if (resbmap &
@@ -808,18 +808,18 @@
 	    FATTR4_RDATTR_ERROR_MASK)) {
 
 		if (resbmap & FATTR4_FSID_MASK) {
-		    if ((!xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&garp->n4g_fsid.major)) ||
-			(!xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&garp->n4g_fsid.minor)))
+			if ((!xdr_u_longlong_t(xdrs,
+			    (u_longlong_t *)&garp->n4g_fsid.major)) ||
+			    (!xdr_u_longlong_t(xdrs,
+			    (u_longlong_t *)&garp->n4g_fsid.minor)))
 				return (FALSE);
-		    garp->n4g_fsid_valid = 1;
+			garp->n4g_fsid_valid = 1;
 		}
 		if (resbmap & FATTR4_UNIQUE_HANDLES_MASK) {
 			if (!XDR_GETINT32(xdrs, (int *)&truefalse))
 				return (FALSE);
 			gesp->n4g_pc4.pc4_unique_handles =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 		if (resbmap & FATTR4_LEASE_TIME_MASK) {
 			if (!XDR_GETINT32(xdrs, (int *)&gesp->n4g_leasetime))
@@ -827,7 +827,7 @@
 		}
 		if (resbmap & FATTR4_RDATTR_ERROR_MASK) {
 			if (!XDR_GETINT32(xdrs,
-					(int *)&gesp->n4g_rdattr_error))
+			    (int *)&gesp->n4g_rdattr_error))
 				return (FALSE);
 		}
 	}
@@ -863,7 +863,7 @@
 			if (!XDR_GETINT32(xdrs, (int *)&truefalse))
 				return (FALSE);
 			gesp->n4g_pc4.pc4_cansettime =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 	}
 	if (resbmap &
@@ -880,24 +880,24 @@
 			if (!XDR_GETINT32(xdrs, (int *)&truefalse))
 				return (FALSE);
 			gesp->n4g_pc4.pc4_case_insensitive =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 		if (resbmap & FATTR4_CASE_PRESERVING_MASK) {
 			if (!XDR_GETINT32(xdrs, (int *)&truefalse))
 				return (FALSE);
 			gesp->n4g_pc4.pc4_case_preserving =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 		if (resbmap & FATTR4_CHOWN_RESTRICTED_MASK) {
 			if (!XDR_GETINT32(xdrs, (int *)&truefalse))
 				return (FALSE);
 			gesp->n4g_pc4.pc4_chown_restricted =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 		if (resbmap & FATTR4_FILEHANDLE_MASK) {
 			gesp->n4g_fh_u.nfs_fh4_alt.len = 0;
 			gesp->n4g_fh_u.nfs_fh4_alt.val =
-				gesp->n4g_fh_u.nfs_fh4_alt.data;
+			    gesp->n4g_fh_u.nfs_fh4_alt.data;
 			if (!xdr_bytes(xdrs,
 			    (char **)&gesp->n4g_fh_u.n4g_fh.nfs_fh4_val,
 			    (uint_t *)&gesp->n4g_fh_u.n4g_fh.nfs_fh4_len,
@@ -906,23 +906,23 @@
 		}
 		if (resbmap & FATTR4_FILEID_MASK) {
 			if (!xdr_u_longlong_t(xdrs,
-					(u_longlong_t *)&vap->va_nodeid))
+			    (u_longlong_t *)&vap->va_nodeid))
 				return (FALSE);
 			vap->va_mask |= AT_NODEID;
 		}
 		if (resbmap & FATTR4_FILES_AVAIL_MASK) {
 			if (!xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&gesp->n4g_sb.f_favail))
+			    (u_longlong_t *)&gesp->n4g_sb.f_favail))
 				return (FALSE);
 		}
 		if (resbmap & FATTR4_FILES_FREE_MASK) {
 			if (!xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&gesp->n4g_sb.f_ffree))
+			    (u_longlong_t *)&gesp->n4g_sb.f_ffree))
 				return (FALSE);
 		}
 		if (resbmap & FATTR4_FILES_TOTAL_MASK) {
 			if (!xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&gesp->n4g_sb.f_files))
+			    (u_longlong_t *)&gesp->n4g_sb.f_files))
 				return (FALSE);
 		}
 	}
@@ -941,7 +941,7 @@
 			if (!XDR_GETINT32(xdrs, (int *)&truefalse))
 				return (FALSE);
 			gesp->n4g_pc4.pc4_homogeneous =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 	}
 	if (resbmap &
@@ -953,28 +953,28 @@
 
 		if (resbmap & FATTR4_MAXFILESIZE_MASK) {
 			if (!xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&gesp->n4g_maxfilesize))
+			    (u_longlong_t *)&gesp->n4g_maxfilesize))
 				return (FALSE);
 		}
 		if (resbmap & FATTR4_MAXLINK_MASK) {
 			if (!XDR_GETINT32(xdrs,
-					(int *)&gesp->n4g_pc4.pc4_link_max))
+			    (int *)&gesp->n4g_pc4.pc4_link_max))
 				return (FALSE);
 		}
 		if (resbmap & FATTR4_MAXNAME_MASK) {
 			if (!XDR_GETINT32(xdrs,
-					(int *)&gesp->n4g_pc4.pc4_name_max))
+			    (int *)&gesp->n4g_pc4.pc4_name_max))
 				return (FALSE);
 			gesp->n4g_sb.f_namemax = gesp->n4g_pc4.pc4_name_max;
 		}
 		if (resbmap & FATTR4_MAXREAD_MASK) {
 			if (!xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&gesp->n4g_maxread))
+			    (u_longlong_t *)&gesp->n4g_maxread))
 				return (FALSE);
 		}
 		if (resbmap & FATTR4_MAXWRITE_MASK) {
 			if (!xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&gesp->n4g_maxwrite))
+			    (u_longlong_t *)&gesp->n4g_maxwrite))
 				return (FALSE);
 		}
 	}
@@ -996,7 +996,7 @@
 			if (!XDR_GETINT32(xdrs, (int *)&truefalse))
 				return (FALSE);
 			gesp->n4g_pc4.pc4_no_trunc =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 		if (resbmap & FATTR4_NUMLINKS_MASK) {
 			if (!XDR_GETINT32(xdrs, (int *)&vap->va_nlink))
@@ -1032,7 +1032,7 @@
 			/* find memory to store the decode */
 			if (*owner_length > MAX_OG_NAME || pug == NULL)
 				owner_val = owner_alloc =
-					kmem_alloc(*owner_length, KM_SLEEP);
+				    kmem_alloc(*owner_length, KM_SLEEP);
 			else
 				owner_val = pug->u_curr.utf8string_val;
 
@@ -1047,7 +1047,7 @@
 			if (pug &&
 			    *owner_length == pug->u_last.utf8string_len &&
 			    bcmp(owner_val, pug->u_last.utf8string_val,
-					*owner_length) == 0) {
+			    *owner_length) == 0) {
 				vap->va_uid = pug->uid;
 				vap->va_mask |= AT_UID;
 			} else {
@@ -1064,13 +1064,13 @@
 				if (error == ENOTSUP) {
 					error = 0;
 					garp->n4g_attrwhy =
-						NFS4_GETATTR_NOCACHE_OK;
+					    NFS4_GETATTR_NOCACHE_OK;
 				}
 
 				if (error) {
 					garp->n4g_attrerr = error;
 					garp->n4g_attrwhy =
-						NFS4_GETATTR_ATUID_ERR;
+					    NFS4_GETATTR_ATUID_ERR;
 				} else {
 					vap->va_uid = uid;
 					vap->va_mask |= AT_UID;
@@ -1105,7 +1105,7 @@
 			/* find memory to store the decode */
 			if (*group_length > MAX_OG_NAME || pug == NULL)
 				group_val = group_alloc =
-					kmem_alloc(*group_length, KM_SLEEP);
+				    kmem_alloc(*group_length, KM_SLEEP);
 			else
 				group_val = pug->g_curr.utf8string_val;
 
@@ -1120,7 +1120,7 @@
 			if (pug &&
 			    *group_length == pug->g_last.utf8string_len &&
 			    bcmp(group_val, pug->g_last.utf8string_val,
-					*group_length) == 0) {
+			    *group_length) == 0) {
 				vap->va_gid = pug->gid;
 				vap->va_mask |= AT_GID;
 			} else {
@@ -1137,13 +1137,13 @@
 				if (error == ENOTSUP) {
 					error = 0;
 					garp->n4g_attrwhy =
-						NFS4_GETATTR_NOCACHE_OK;
+					    NFS4_GETATTR_NOCACHE_OK;
 				}
 
 				if (error) {
 					garp->n4g_attrerr = error;
 					garp->n4g_attrwhy =
-						NFS4_GETATTR_ATGID_ERR;
+					    NFS4_GETATTR_ATGID_ERR;
 				} else {
 					vap->va_gid = gid;
 					vap->va_mask |= AT_GID;
@@ -1182,7 +1182,7 @@
 
 			if (vap->va_type == VCHR || vap->va_type == VBLK) {
 				vap->va_rdev = makedevice(rawdev.specdata1,
-							rawdev.specdata2);
+				    rawdev.specdata2);
 			} else {
 				vap->va_rdev = 0;
 			}
@@ -1190,26 +1190,26 @@
 		}
 		if (resbmap & FATTR4_SPACE_AVAIL_MASK) {
 			if (!xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&gesp->n4g_sb.f_bavail))
+			    (u_longlong_t *)&gesp->n4g_sb.f_bavail))
 				return (FALSE);
 			gesp->n4g_sb.f_bavail /= DEV_BSIZE;
 		}
 		if (resbmap & FATTR4_SPACE_FREE_MASK) {
 			if (!xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&gesp->n4g_sb.f_bfree))
+			    (u_longlong_t *)&gesp->n4g_sb.f_bfree))
 				return (FALSE);
 			gesp->n4g_sb.f_bfree /= DEV_BSIZE;
 		}
 		if (resbmap & FATTR4_SPACE_TOTAL_MASK) {
 			if (!xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&gesp->n4g_sb.f_blocks))
+			    (u_longlong_t *)&gesp->n4g_sb.f_blocks))
 				return (FALSE);
 			gesp->n4g_sb.f_blocks /= DEV_BSIZE;
 		}
 		if (resbmap & FATTR4_SPACE_USED_MASK) {
 			uint64_t space_used;
 			if (!xdr_u_longlong_t(xdrs,
-						(u_longlong_t *)&space_used))
+			    (u_longlong_t *)&space_used))
 				return (FALSE);
 
 			/* Compute space depending on device type */
@@ -1217,8 +1217,8 @@
 			if (vap->va_type == VREG || vap->va_type == VDIR ||
 			    vap->va_type == VLNK) {
 				vap->va_nblocks = (u_longlong_t)
-					((space_used + (offset4)DEV_BSIZE -
-					(offset4)1) / (offset4)DEV_BSIZE);
+				    ((space_used + (offset4)DEV_BSIZE -
+				    (offset4)1) / (offset4)DEV_BSIZE);
 			} else {
 				vap->va_nblocks = 0;
 			}
@@ -1244,7 +1244,7 @@
 			int error;
 
 			if (!xdr_longlong_t(xdrs,
-					    (longlong_t *)&atime.seconds))
+			    (longlong_t *)&atime.seconds))
 				return (FALSE);
 			if (!XDR_GETINT32(xdrs, (int *)&atime.nseconds))
 				return (FALSE);
@@ -1290,7 +1290,7 @@
 			int error;
 
 			if (!xdr_longlong_t(xdrs,
-					    (longlong_t *)&mtime.seconds))
+			    (longlong_t *)&mtime.seconds))
 				return (FALSE);
 			if (!XDR_GETINT32(xdrs, (int32_t *)&mtime.nseconds))
 				return (FALSE);
@@ -1306,7 +1306,7 @@
 		}
 		if (resbmap & FATTR4_MOUNTED_ON_FILEID_MASK) {
 			if (!xdr_u_longlong_t(xdrs,
-					(u_longlong_t *)&garp->n4g_mon_fid))
+			    (u_longlong_t *)&garp->n4g_mon_fid))
 				return (FALSE);
 			garp->n4g_mon_fid_valid = 1;
 		}
@@ -1448,18 +1448,18 @@
 		if (resbmap & FATTR4_LINK_SUPPORT_MASK) {
 			truefalse = IXDR_GET_U_INT32(ptr);
 			gesp->n4g_pc4.pc4_link_support =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 		if (resbmap & FATTR4_SYMLINK_SUPPORT_MASK) {
 			truefalse = IXDR_GET_U_INT32(ptr);
 			gesp->n4g_pc4.pc4_symlink_support =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 		if (resbmap & FATTR4_NAMED_ATTR_MASK) {
 			truefalse = IXDR_GET_U_INT32(ptr);
 			gesp->n4g_pc4.pc4_xattr_exists = TRUE;
 			gesp->n4g_pc4.pc4_xattr_exists =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 	}
 	if (resbmap &
@@ -1476,7 +1476,7 @@
 		if (resbmap & FATTR4_UNIQUE_HANDLES_MASK) {
 			truefalse = IXDR_GET_U_INT32(ptr);
 			gesp->n4g_pc4.pc4_unique_handles =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 		if (resbmap & FATTR4_LEASE_TIME_MASK) {
 			gesp->n4g_leasetime = IXDR_GET_U_INT32(ptr);
@@ -1503,7 +1503,7 @@
 		if (resbmap & FATTR4_CANSETTIME_MASK) {
 			truefalse = IXDR_GET_U_INT32(ptr);
 			gesp->n4g_pc4.pc4_cansettime =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 	}
 	if (resbmap &
@@ -1519,24 +1519,24 @@
 		if (resbmap & FATTR4_CASE_INSENSITIVE_MASK) {
 			truefalse = IXDR_GET_U_INT32(ptr);
 			gesp->n4g_pc4.pc4_case_insensitive =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 		if (resbmap & FATTR4_CASE_PRESERVING_MASK) {
 			truefalse = IXDR_GET_U_INT32(ptr);
 			gesp->n4g_pc4.pc4_case_preserving =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 		if (resbmap & FATTR4_CHOWN_RESTRICTED_MASK) {
 			truefalse = IXDR_GET_U_INT32(ptr);
 			gesp->n4g_pc4.pc4_chown_restricted =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 		if (resbmap & FATTR4_FILEHANDLE_MASK) {
 			int len = IXDR_GET_U_INT32(ptr);
 
 			gesp->n4g_fh_u.nfs_fh4_alt.len = 0;
 			gesp->n4g_fh_u.nfs_fh4_alt.val =
-				gesp->n4g_fh_u.nfs_fh4_alt.data;
+			    gesp->n4g_fh_u.nfs_fh4_alt.data;
 			gesp->n4g_fh_u.n4g_fh.nfs_fh4_len = len;
 
 			bcopy(ptr, gesp->n4g_fh_u.n4g_fh.nfs_fh4_val, len);
@@ -1571,7 +1571,7 @@
 		if (resbmap & FATTR4_HOMOGENEOUS_MASK) {
 			truefalse = IXDR_GET_U_INT32(ptr);
 			gesp->n4g_pc4.pc4_homogeneous =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 	}
 	if (resbmap &
@@ -1614,7 +1614,7 @@
 		if (resbmap & FATTR4_NO_TRUNC_MASK) {
 			truefalse = IXDR_GET_U_INT32(ptr);
 			gesp->n4g_pc4.pc4_no_trunc =
-				(truefalse ? TRUE : FALSE);
+			    (truefalse ? TRUE : FALSE);
 		}
 		if (resbmap & FATTR4_NUMLINKS_MASK) {
 			vap->va_nlink = IXDR_GET_U_INT32(ptr);
@@ -1654,7 +1654,7 @@
 			if (pug &&
 			    *owner_length == pug->u_last.utf8string_len &&
 			    bcmp(owner_val, pug->u_last.utf8string_val,
-					*owner_length) == 0) {
+			    *owner_length) == 0) {
 				vap->va_uid = pug->uid;
 				vap->va_mask |= AT_UID;
 			} else {
@@ -1671,13 +1671,13 @@
 				if (error == ENOTSUP) {
 					error = 0;
 					garp->n4g_attrwhy =
-						NFS4_GETATTR_NOCACHE_OK;
+					    NFS4_GETATTR_NOCACHE_OK;
 				}
 
 				if (error) {
 					garp->n4g_attrerr = error;
 					garp->n4g_attrwhy =
-						NFS4_GETATTR_ATUID_ERR;
+					    NFS4_GETATTR_ATUID_ERR;
 				} else {
 					vap->va_uid = uid;
 					vap->va_mask |= AT_UID;
@@ -1685,9 +1685,10 @@
 					if (pug && ol <= MAX_OG_NAME) {
 						pug->uid = uid;
 						pug->u_curr.utf8string_len =
-							ov.utf8string_len;
+						    ov.utf8string_len;
 						bcopy(owner_val,
-						pug->u_curr.utf8string_val, ol);
+						    pug->u_curr.utf8string_val,
+						    ol);
 						U_SWAP_CURR_LAST(pug);
 					}
 				}
@@ -1721,7 +1722,7 @@
 			if (pug &&
 			    *group_length == pug->g_last.utf8string_len &&
 			    bcmp(group_val, pug->g_last.utf8string_val,
-					*group_length) == 0) {
+			    *group_length) == 0) {
 				vap->va_gid = pug->gid;
 				vap->va_mask |= AT_GID;
 			} else {
@@ -1738,20 +1739,20 @@
 				if (error == ENOTSUP) {
 					error = 0;
 					garp->n4g_attrwhy =
-						NFS4_GETATTR_NOCACHE_OK;
+					    NFS4_GETATTR_NOCACHE_OK;
 				}
 
 				if (error) {
 					garp->n4g_attrerr = error;
 					garp->n4g_attrwhy =
-						NFS4_GETATTR_ATGID_ERR;
+					    NFS4_GETATTR_ATGID_ERR;
 				} else {
 					vap->va_gid = gid;
 					vap->va_mask |= AT_GID;
 					if (pug && gl <= MAX_OG_NAME) {
 						pug->gid = gid;
 						pug->g_curr.utf8string_len =
-							gv.utf8string_len;
+						    gv.utf8string_len;
 						bcopy(group_val,
 						    pug->g_curr.utf8string_val,
 						    gl);
@@ -1787,7 +1788,7 @@
 
 			if (vap->va_type == VCHR || vap->va_type == VBLK) {
 				vap->va_rdev = makedevice(rawdev.specdata1,
-							rawdev.specdata2);
+				    rawdev.specdata2);
 			} else {
 				vap->va_rdev = 0;
 			}
@@ -1814,8 +1815,8 @@
 			if (vap->va_type == VREG || vap->va_type == VDIR ||
 			    vap->va_type == VLNK) {
 				vap->va_nblocks = (u_longlong_t)
-					((space_used + (offset4)DEV_BSIZE -
-					(offset4)1) / (offset4)DEV_BSIZE);
+				    ((space_used + (offset4)DEV_BSIZE -
+				    (offset4)1) / (offset4)DEV_BSIZE);
 			} else {
 				vap->va_nblocks = 0;
 			}
@@ -2011,26 +2012,26 @@
 
 	/* Fill in dot and dot-dot if needed */
 	if (rdc->nfs4_cookie == (nfs_cookie4) 0 ||
-		    rdc->nfs4_cookie == (nfs_cookie4) 1) {
+	    rdc->nfs4_cookie == (nfs_cookie4) 1) {
 
 		if (rdc->nfs4_cookie == (nfs_cookie4)0) {
 			bcopy(nfs4_dot_entries, rdc->entries,
-				    DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2));
+			    DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2));
 			objp->dotp = dp;
 			dp = (struct dirent64 *)(((char *)dp) +
-							DIRENT64_RECLEN(1));
+			    DIRENT64_RECLEN(1));
 			objp->dotdotp = dp;
 			dp = (struct dirent64 *)(((char *)dp) +
-							DIRENT64_RECLEN(2));
+			    DIRENT64_RECLEN(2));
 			space_left -= DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2);
 
 		} else	{	/* for ".." entry */
 			bcopy(nfs4_dot_dot_entry, rdc->entries,
-						DIRENT64_RECLEN(2));
+			    DIRENT64_RECLEN(2));
 			objp->dotp = NULL;
 			objp->dotdotp = dp;
 			dp = (struct dirent64 *)(((char *)dp) +
-							DIRENT64_RECLEN(2));
+			    DIRENT64_RECLEN(2));
 			space_left -= DIRENT64_RECLEN(2);
 		}
 		/* Magic NFSv4 number for entry after start */
@@ -2083,7 +2084,7 @@
 			if (!xdr_opaque(xdrs, dp->d_name, namelen))
 				goto noentries;
 			bzero(&dp->d_name[namelen],
-				DIRENT64_NAMELEN(entry_length) - namelen);
+			    DIRENT64_NAMELEN(entry_length) - namelen);
 			dp->d_off = last_cookie = cookie;
 			dp->d_reclen = entry_length;
 		} else {
@@ -2106,13 +2107,11 @@
 			    (ptr = (uint32_t *)XDR_INLINE(xdrs, attrlen))
 			    != NULL) {
 				if (!xdr_ga_fattr_res_inline(ptr, &gar, resbmap,
-							aobjp->attr_request,
-							aobjp->mi, pug))
+				    aobjp->attr_request, aobjp->mi, pug))
 					goto noentries;
 			} else {
 				if (!xdr_ga_fattr_res(xdrs, &gar, resbmap,
-							aobjp->attr_request,
-							aobjp->mi, pug))
+				    aobjp->attr_request, aobjp->mi, pug))
 					goto noentries;
 			}
 
@@ -2139,14 +2138,14 @@
 				vnode_t *vp;
 
 				sfhp = sfh4_put(&ges.n4g_fh_u.n4g_fh,
-							aobjp->mi, NULL);
+				    aobjp->mi, NULL);
 				vp = makenfs4node(sfhp, &gar,
-					aobjp->dvp->v_vfsp,
-					aobjp->t,
-					aobjp->cr,
-					aobjp->dvp,
-					fn_get(VTOSV(aobjp->dvp)->sv_name,
-						dp->d_name));
+				    aobjp->dvp->v_vfsp,
+				    aobjp->t,
+				    aobjp->cr,
+				    aobjp->dvp,
+				    fn_get(VTOSV(aobjp->dvp)->sv_name,
+				    dp->d_name));
 				sfh4_rele(&sfhp);
 				dnlc_update(aobjp->dvp, dp->d_name, vp);
 				VN_RELE(vp);
@@ -2264,7 +2263,7 @@
 
 	/* Initialize objp attribute error values */
 	objp->ga_res.n4g_attrerr =
-		objp->ga_res.n4g_attrwhy = NFS4_GETATTR_OP_OK;
+	    objp->ga_res.n4g_attrwhy = NFS4_GETATTR_OP_OK;
 
 	if (!xdr_bitmap4(xdrs, &resbmap))
 		return (FALSE);
@@ -2301,7 +2300,7 @@
 		 * specific routines that decode the server response.
 		 */
 		deltabmap = ((aobjp->attr_request ^ resbmap)
-				& aobjp->attr_request);
+		    & aobjp->attr_request);
 		if ((deltabmap & FATTR4_MANDATTR_MASK)) {
 			objp->ga_res.n4g_attrerr = EINVAL;
 			objp->ga_res.n4g_attrwhy = NFS4_GETATTR_MANDATTR_ERR;
@@ -2313,12 +2312,10 @@
 	if (!(resbmap & FATTR4_ACL_MASK) &&
 	    (ptr = (uint32_t *)XDR_INLINE(xdrs, attrlen)) != NULL)
 		return (xdr_ga_fattr_res_inline(ptr, &objp->ga_res,
-					resbmap, aobjp->attr_request,
-					aobjp->mi, NULL));
+		    resbmap, aobjp->attr_request, aobjp->mi, NULL));
 	else
 		return (xdr_ga_fattr_res(xdrs, &objp->ga_res,
-					resbmap, aobjp->attr_request,
-					aobjp->mi, NULL));
+		    resbmap, aobjp->attr_request, aobjp->mi, NULL));
 }
 
 #if defined(DEBUG) && !defined(lint)
@@ -2354,7 +2351,7 @@
 		if (!xdr_bitmap4(xdrs, &objp->attrmask))
 			return (FALSE);
 		return (xdr_bytes(xdrs, (char **)&objp->attrlist4,
-			(uint_t *)&objp->attrlist4_len, NFS4_FATTR4_LIMIT));
+		    (uint_t *)&objp->attrlist4_len, NFS4_FATTR4_LIMIT));
 	}
 
 	/*
@@ -2427,8 +2424,8 @@
 			break;	/* server should return NFS4ERR_BADTYPE */
 		}
 		if (!xdr_bytes(xdrs, (char **)&objp->objname.utf8string_val,
-				(uint_t *)&objp->objname.utf8string_len,
-				NFS4_MAX_UTF8STRING))
+		    (uint_t *)&objp->objname.utf8string_len,
+		    NFS4_MAX_UTF8STRING))
 			return (FALSE);
 		return (xdr_fattr4(xdrs, &objp->createattrs));
 	}
@@ -2439,11 +2436,11 @@
 	if (objp->type == NF4LNK) {
 		if (objp->ftype4_u.linkdata.utf8string_val != NULL)
 			kmem_free(objp->ftype4_u.linkdata.utf8string_val,
-				objp->ftype4_u.linkdata.utf8string_len);
+			    objp->ftype4_u.linkdata.utf8string_len);
 	}
 	if (objp->objname.utf8string_val != NULL)
 		kmem_free(objp->objname.utf8string_val,
-			objp->objname.utf8string_len);
+		    objp->objname.utf8string_len);
 	return (xdr_fattr4(xdrs, &objp->createattrs));
 }
 
@@ -2469,10 +2466,10 @@
 	case NF4BLK:
 	case NF4CHR:
 		if (!XDR_PUTINT32(xdrs,
-				(int32_t *)&objp->ftype4_u.devdata.specdata1))
+		    (int32_t *)&objp->ftype4_u.devdata.specdata1))
 			return (FALSE);
 		if (!XDR_PUTINT32(xdrs,
-				(int32_t *)&objp->ftype4_u.devdata.specdata2))
+		    (int32_t *)&objp->ftype4_u.devdata.specdata2))
 			return (FALSE);
 		break;
 	case NF4SOCK:
@@ -2539,42 +2536,40 @@
 			return (FALSE);
 		if (objp->locker.new_lock_owner == TRUE) {
 			if (!xdr_u_int(xdrs, &objp->locker.locker4_u.open_owner.
-							open_seqid))
+			    open_seqid))
 				return (FALSE);
 			if (!xdr_u_int(xdrs, &objp->locker.locker4_u.open_owner.
-							open_stateid.seqid))
+			    open_stateid.seqid))
 				return (FALSE);
 			if (!xdr_opaque(xdrs, objp->locker.locker4_u.open_owner.
-							open_stateid.other,
-				    12))
+			    open_stateid.other, 12))
 				return (FALSE);
 			if (!xdr_u_int(xdrs, &objp->locker.locker4_u.open_owner.
-							lock_seqid))
+			    lock_seqid))
 				return (FALSE);
 			if (!xdr_u_longlong_t(xdrs,
-				    (u_longlong_t *)&objp->locker.locker4_u.
-					open_owner.lock_owner.clientid))
+			    (u_longlong_t *)&objp->locker.locker4_u.
+			    open_owner.lock_owner.clientid))
 				return (FALSE);
 			return (xdr_bytes(xdrs,
-				(char **)&objp->locker.locker4_u.open_owner.
-				    lock_owner.owner_val,
-				(uint_t *)&objp->locker.locker4_u.open_owner.
-				    lock_owner.owner_len,
-				NFS4_OPAQUE_LIMIT));
+			    (char **)&objp->locker.locker4_u.open_owner.
+			    lock_owner.owner_val,
+			    (uint_t *)&objp->locker.locker4_u.open_owner.
+			    lock_owner.owner_len,
+			    NFS4_OPAQUE_LIMIT));
 		}
 
 		if (objp->locker.new_lock_owner != FALSE)
 			return (FALSE);
 
 		if (!xdr_u_int(xdrs, &objp->locker.locker4_u.lock_owner.
-							lock_stateid.seqid))
+		    lock_stateid.seqid))
 			return (FALSE);
 		if (!xdr_opaque(xdrs, objp->locker.locker4_u.lock_owner.
-							lock_stateid.other,
-			    12))
+		    lock_stateid.other, 12))
 			return (FALSE);
 		return (xdr_u_int(xdrs, &objp->locker.locker4_u.lock_owner.
-							lock_seqid));
+		    lock_seqid));
 	}
 
 	/*
@@ -2582,11 +2577,11 @@
 	 */
 	if (objp->locker.new_lock_owner == TRUE) {
 		if (objp->locker.locker4_u.open_owner.lock_owner.owner_val !=
-								NULL) {
+		    NULL) {
 			kmem_free(objp->locker.locker4_u.open_owner.lock_owner.
-							owner_val,
-				objp->locker.locker4_u.open_owner.lock_owner.
-							owner_len);
+			    owner_val,
+			    objp->locker.locker4_u.open_owner.lock_owner.
+			    owner_len);
 		}
 	}
 
@@ -2601,29 +2596,29 @@
 			return (FALSE);
 		if (objp->status == NFS4_OK) {
 			if (!xdr_u_int(xdrs,
-					&objp->LOCK4res_u.lock_stateid.seqid))
+			    &objp->LOCK4res_u.lock_stateid.seqid))
 				return (FALSE);
 			return (xdr_opaque(xdrs,
-				objp->LOCK4res_u.lock_stateid.other, 12));
+			    objp->LOCK4res_u.lock_stateid.other, 12));
 		}
 		if (objp->status != NFS4ERR_DENIED)
 			return (TRUE);
 
 		if (!xdr_u_longlong_t(xdrs, (u_longlong_t *)&objp->LOCK4res_u.
-				denied.offset))
+		    denied.offset))
 			return (FALSE);
 		if (!xdr_u_longlong_t(xdrs, (u_longlong_t *)&objp->LOCK4res_u.
-				denied.length))
+		    denied.length))
 			return (FALSE);
 		if (!xdr_int(xdrs, (int *)&objp->LOCK4res_u.denied.locktype))
 			return (FALSE);
 		if (!xdr_u_longlong_t(xdrs, (u_longlong_t *)&objp->LOCK4res_u.
-				denied.owner.clientid))
+		    denied.owner.clientid))
 			return (FALSE);
 		return (xdr_bytes(xdrs,
-			    (char **)&objp->LOCK4res_u.denied.owner.owner_val,
-			    (uint_t *)&objp->LOCK4res_u.denied.owner.owner_len,
-			    NFS4_OPAQUE_LIMIT));
+		    (char **)&objp->LOCK4res_u.denied.owner.owner_val,
+		    (uint_t *)&objp->LOCK4res_u.denied.owner.owner_len,
+		    NFS4_OPAQUE_LIMIT));
 	}
 
 	/*
@@ -2634,7 +2629,7 @@
 
 	if (objp->LOCK4res_u.denied.owner.owner_val != NULL)
 		kmem_free(objp->LOCK4res_u.denied.owner.owner_val,
-			objp->LOCK4res_u.denied.owner.owner_len);
+		    objp->LOCK4res_u.denied.owner.owner_len);
 	return (TRUE);
 }
 
@@ -2649,11 +2644,11 @@
 		if (!xdr_u_longlong_t(xdrs, (u_longlong_t *)&objp->length))
 			return (FALSE);
 		if (!xdr_u_longlong_t(xdrs,
-					(u_longlong_t *)&objp->owner.clientid))
+		    (u_longlong_t *)&objp->owner.clientid))
 			return (FALSE);
 		return (xdr_bytes(xdrs, (char **)&objp->owner.owner_val,
-			(uint_t *)&objp->owner.owner_len,
-			NFS4_OPAQUE_LIMIT));
+		    (uint_t *)&objp->owner.owner_len,
+		    NFS4_OPAQUE_LIMIT));
 	}
 
 	/*
@@ -2676,15 +2671,15 @@
 			return (TRUE);
 		/* xdr_LOCK4denied */
 		if (!xdr_u_longlong_t(xdrs,
-					(u_longlong_t *)&objp->denied.offset))
+		    (u_longlong_t *)&objp->denied.offset))
 			return (FALSE);
 		if (!xdr_u_longlong_t(xdrs,
-					(u_longlong_t *)&objp->denied.length))
+		    (u_longlong_t *)&objp->denied.length))
 			return (FALSE);
 		if (!xdr_int(xdrs, (int *)&objp->denied.locktype))
 			return (FALSE);
 		if (!xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&objp->denied.owner.clientid))
+		    (u_longlong_t *)&objp->denied.owner.clientid))
 			return (FALSE);
 		return (xdr_bytes(xdrs,
 		    (char **)&objp->denied.owner.owner_val,
@@ -2699,7 +2694,7 @@
 		return (TRUE);
 	if (objp->denied.owner.owner_val != NULL)
 		kmem_free(objp->denied.owner.owner_val,
-				objp->denied.owner.owner_len);
+		    objp->denied.owner.owner_len);
 	return (TRUE);
 }
 
@@ -2732,11 +2727,11 @@
 
 		/* xdr_open_owner4 */
 		if (!xdr_u_longlong_t(xdrs,
-					(u_longlong_t *)&objp->owner.clientid))
+		    (u_longlong_t *)&objp->owner.clientid))
 			return (FALSE);
 		if (!xdr_bytes(xdrs, (char **)&objp->owner.owner_val,
-				(uint_t *)&objp->owner.owner_len,
-				NFS4_OPAQUE_LIMIT))
+		    (uint_t *)&objp->owner.owner_len,
+		    NFS4_OPAQUE_LIMIT))
 			return (FALSE);
 
 		/* xdr_openflag4 */
@@ -2751,7 +2746,7 @@
 			case UNCHECKED4:
 			case GUARDED4:
 				if (!xdr_fattr4(xdrs,
-					    &objp->createhow4_u.createattrs))
+				    &objp->createhow4_u.createattrs))
 					return (FALSE);
 				break;
 			case EXCLUSIVE4:
@@ -2772,31 +2767,31 @@
 		switch (objp->claim) {
 		case CLAIM_NULL:
 			return (xdr_bytes(xdrs, (char **)&objp->open_claim4_u.
-					file.utf8string_val,
-				(uint_t *)&objp->open_claim4_u.file.
-					utf8string_len,
-				NFS4_MAX_UTF8STRING));
+			    file.utf8string_val,
+			    (uint_t *)&objp->open_claim4_u.file.
+			    utf8string_len,
+			    NFS4_MAX_UTF8STRING));
 		case CLAIM_PREVIOUS:
 			return (xdr_int(xdrs,
-				(int *)&objp->open_claim4_u.delegate_type));
+			    (int *)&objp->open_claim4_u.delegate_type));
 		case CLAIM_DELEGATE_CUR:
 			if (!xdr_u_int(xdrs, (uint_t *)&objp->open_claim4_u.
-				    delegate_cur_info.delegate_stateid.seqid))
+			    delegate_cur_info.delegate_stateid.seqid))
 				return (FALSE);
 			if (!xdr_opaque(xdrs, objp->open_claim4_u.
-				    delegate_cur_info.delegate_stateid.other,
-				    12))
+			    delegate_cur_info.delegate_stateid.other,
+			    12))
 				return (FALSE);
 			return (xdr_bytes(xdrs, (char **)&objp->open_claim4_u.
-				delegate_cur_info.file.utf8string_val,
+			    delegate_cur_info.file.utf8string_val,
 			    (uint_t *)&objp->open_claim4_u.
-				delegate_cur_info.file.utf8string_len,
+			    delegate_cur_info.file.utf8string_len,
 			    NFS4_MAX_UTF8STRING));
 		case CLAIM_DELEGATE_PREV:
 			return (xdr_bytes(xdrs, (char **)&objp->open_claim4_u.
-				file_delegate_prev.utf8string_val,
+			    file_delegate_prev.utf8string_val,
 			    (uint_t *)&objp->open_claim4_u.
-				file_delegate_prev.utf8string_len,
+			    file_delegate_prev.utf8string_len,
 			    NFS4_MAX_UTF8STRING));
 		default:
 			return (FALSE);
@@ -2814,7 +2809,7 @@
 		case UNCHECKED4:
 		case GUARDED4:
 			(void) xdr_fattr4(xdrs,
-					&objp->createhow4_u.createattrs);
+			    &objp->createhow4_u.createattrs);
 			break;
 		case EXCLUSIVE4:
 		default:
@@ -2826,26 +2821,26 @@
 	case CLAIM_NULL:
 		if (objp->open_claim4_u.file.utf8string_val != NULL)
 			kmem_free(objp->open_claim4_u.file.utf8string_val,
-				objp->open_claim4_u.file.utf8string_len);
+			    objp->open_claim4_u.file.utf8string_len);
 		return (TRUE);
 	case CLAIM_PREVIOUS:
 		return (TRUE);
 	case CLAIM_DELEGATE_CUR:
 		if (objp->open_claim4_u.delegate_cur_info.file.utf8string_val !=
-								NULL) {
+		    NULL) {
 			kmem_free(objp->open_claim4_u.delegate_cur_info.file.
-							utf8string_val,
-				objp->open_claim4_u.delegate_cur_info.file.
-							utf8string_len);
+			    utf8string_val,
+			    objp->open_claim4_u.delegate_cur_info.file.
+			    utf8string_len);
 		}
 		return (TRUE);
 	case CLAIM_DELEGATE_PREV:
 		if (objp->open_claim4_u.file_delegate_prev.utf8string_val !=
-								NULL) {
+		    NULL) {
 			kmem_free(objp->open_claim4_u.file_delegate_prev.
-							utf8string_val,
-				objp->open_claim4_u.file_delegate_prev.
-							utf8string_len);
+			    utf8string_val,
+			    objp->open_claim4_u.file_delegate_prev.
+			    utf8string_len);
 		}
 		return (TRUE);
 	default:
@@ -2903,12 +2898,12 @@
 
 		/* xdr_open_owner4 */
 		if (!xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&objp->owner.clientid))
+		    (u_longlong_t *)&objp->owner.clientid))
 			return (FALSE);
 		if (!XDR_PUTINT32(xdrs, (int32_t *)&objp->owner.owner_len))
 			return (FALSE);
 		if (!xdr_opaque(xdrs, objp->owner.owner_val,
-				objp->owner.owner_len))
+		    objp->owner.owner_len))
 			return (FALSE);
 
 		/* xdr_openflag4 */
@@ -2924,7 +2919,7 @@
 		case UNCHECKED4:
 		case GUARDED4:
 			if (!xdr_fattr4(xdrs,
-				    &objp->createhow4_u.createattrs))
+			    &objp->createhow4_u.createattrs))
 				return (FALSE);
 			break;
 		case EXCLUSIVE4:
@@ -2949,27 +2944,27 @@
 			return (FALSE);
 		if (XDR_PUTINT32(xdrs, &len)) {
 			return (xdr_opaque(xdrs,
-				objp->open_claim4_u.cfile, len));
+			    objp->open_claim4_u.cfile, len));
 		}
 		return (FALSE);
 	case CLAIM_PREVIOUS:
 		return (XDR_PUTINT32(xdrs,
-			(int32_t *)&objp->open_claim4_u.delegate_type));
+		    (int32_t *)&objp->open_claim4_u.delegate_type));
 	case CLAIM_DELEGATE_CUR:
 		if (!XDR_PUTINT32(xdrs, (int32_t *)&objp->open_claim4_u.
-				delegate_cur_info.delegate_stateid.seqid))
+		    delegate_cur_info.delegate_stateid.seqid))
 			return (FALSE);
 		if (!xdr_opaque(xdrs, objp->open_claim4_u.
-				delegate_cur_info.delegate_stateid.other,
-				12))
+		    delegate_cur_info.delegate_stateid.other,
+		    12))
 			return (FALSE);
 		len = strlen(objp->open_claim4_u.delegate_cur_info.cfile);
 		if (len > NFS4_MAX_UTF8STRING)
 			return (FALSE);
 		if (XDR_PUTINT32(xdrs, &len)) {
 			return (xdr_opaque(xdrs,
-				objp->open_claim4_u.delegate_cur_info.cfile,
-				len));
+			    objp->open_claim4_u.delegate_cur_info.cfile,
+			    len));
 		}
 		return (FALSE);
 	case CLAIM_DELEGATE_PREV:
@@ -2978,7 +2973,7 @@
 			return (FALSE);
 		if (XDR_PUTINT32(xdrs, &len)) {
 			return (xdr_opaque(xdrs,
-				objp->open_claim4_u.cfile_delegate_prev, len));
+			    objp->open_claim4_u.cfile_delegate_prev, len));
 		}
 		return (FALSE);
 	default:
@@ -3001,7 +2996,7 @@
 		if (!xdr_bool(xdrs, &objp->cinfo.atomic))
 			return (FALSE);
 		if (!xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&objp->cinfo.before))
+		    (u_longlong_t *)&objp->cinfo.before))
 			return (FALSE);
 		if (!xdr_u_longlong_t(xdrs, (u_longlong_t *)&objp->cinfo.after))
 			return (FALSE);
@@ -3010,42 +3005,42 @@
 		if (!xdr_bitmap4(xdrs, &objp->attrset))
 			return (FALSE);
 		if (!xdr_int(xdrs,
-			    (int *)&objp->delegation.delegation_type))
+		    (int *)&objp->delegation.delegation_type))
 			return (FALSE);
 		switch (objp->delegation.delegation_type) {
 		case OPEN_DELEGATE_NONE:
 			return (TRUE);
 		case OPEN_DELEGATE_READ:
 			if (!xdr_u_int(xdrs, &objp->delegation.
-					open_delegation4_u.read.stateid.seqid))
+			    open_delegation4_u.read.stateid.seqid))
 				return (FALSE);
 			if (!xdr_opaque(xdrs, objp->delegation.
-					open_delegation4_u.read.stateid.other,
-					12))
+			    open_delegation4_u.read.stateid.other,
+			    12))
 				return (FALSE);
 			if (!xdr_bool(xdrs, &objp->delegation.
-					open_delegation4_u.read.recall))
+			    open_delegation4_u.read.recall))
 				return (FALSE);
 			return (xdr_nfsace4(xdrs, &objp->delegation.
-					open_delegation4_u.read.permissions));
+			    open_delegation4_u.read.permissions));
 		case OPEN_DELEGATE_WRITE:
 			if (!xdr_u_int(xdrs, &objp->delegation.
-					open_delegation4_u.write.stateid.seqid))
+			    open_delegation4_u.write.stateid.seqid))
 				return (FALSE);
 			if (!xdr_opaque(xdrs, objp->delegation.
-					open_delegation4_u.write.stateid.other,
-					12))
+			    open_delegation4_u.write.stateid.other,
+			    12))
 				return (FALSE);
 			if (!xdr_bool(xdrs, &objp->delegation.
-					open_delegation4_u.write.recall))
+			    open_delegation4_u.write.recall))
 				return (FALSE);
 			if (!xdr_int(xdrs, (int *)&objp->delegation.
-					open_delegation4_u.write.space_limit.
-					limitby))
+			    open_delegation4_u.write.space_limit.
+			    limitby))
 				return (FALSE);
 			switch (objp->delegation.
-					open_delegation4_u.write.space_limit.
-					limitby) {
+			    open_delegation4_u.write.space_limit.
+			    limitby) {
 			case NFS_LIMIT_SIZE:
 				if (!xdr_u_longlong_t(xdrs,
 				    (u_longlong_t *)&objp->delegation.
@@ -3085,10 +3080,10 @@
 		return (TRUE);
 	case OPEN_DELEGATE_READ:
 		return (xdr_nfsace4(xdrs, &objp->delegation.
-					open_delegation4_u.read.permissions));
+		    open_delegation4_u.read.permissions));
 	case OPEN_DELEGATE_WRITE:
 		switch (objp->delegation.
-				open_delegation4_u.write.space_limit.limitby) {
+		    open_delegation4_u.write.space_limit.limitby) {
 		case NFS_LIMIT_SIZE:
 		case NFS_LIMIT_BLOCKS:
 			break;
@@ -3096,7 +3091,7 @@
 			return (FALSE);
 		}
 		return (xdr_nfsace4(xdrs, &objp->delegation.
-				open_delegation4_u.write.permissions));
+		    open_delegation4_u.write.permissions));
 	}
 	return (FALSE);
 }
@@ -3142,13 +3137,55 @@
 static bool_t
 xdr_READ4args(XDR *xdrs, READ4args *objp)
 {
+	rdma_chunkinfo_t rci;
+	rdma_wlist_conn_info_t rwci;
+	struct xdr_ops *xops = xdrrdma_xops();
+
 	if (!xdr_u_int(xdrs, &objp->stateid.seqid))
 		return (FALSE);
 	if (!xdr_opaque(xdrs, objp->stateid.other, 12))
 		return (FALSE);
 	if (!xdr_u_longlong_t(xdrs, (u_longlong_t *)&objp->offset))
 		return (FALSE);
-	return (xdr_u_int(xdrs, &objp->count));
+	if (!xdr_u_int(xdrs, &objp->count))
+		return (FALSE);
+
+	DTRACE_PROBE1(xdr__i__read4args_buf_len,
+	    int, objp->count);
+
+	objp->wlist = NULL;
+
+	if (xdrs->x_ops == xops && xdrs->x_op == XDR_ENCODE) {
+		rci.rci_type = RCI_WRITE_ADDR_CHUNK;
+		rci.rci_len = objp->count;
+		(void) XDR_CONTROL(xdrs, XDR_RDMA_ADD_CHUNK, &rci);
+	}
+
+	if (xdrs->x_ops != &xdrrdma_ops || xdrs->x_op == XDR_FREE)
+		return (TRUE);
+
+	if (xdrs->x_op == XDR_ENCODE) {
+		if (objp->res_uiop != NULL) {
+			rci.rci_type = RCI_WRITE_UIO_CHUNK;
+			rci.rci_a.rci_uiop = objp->res_uiop;
+			rci.rci_len = objp->count;
+			rci.rci_clpp = &objp->wlist;
+		} else {
+			rci.rci_type = RCI_WRITE_ADDR_CHUNK;
+			rci.rci_a.rci_addr = objp->res_data_val_alt;
+			rci.rci_len = objp->count;
+			rci.rci_clpp = &objp->wlist;
+		}
+
+		return (XDR_CONTROL(xdrs, XDR_RDMA_ADD_CHUNK, &rci));
+	}
+
+	/* XDR_DECODE case */
+	(void) XDR_CONTROL(xdrs, XDR_RDMA_GET_WCINFO, &rwci);
+	objp->wlist = rwci.rwci_wlist;
+	objp->conn = rwci.rwci_conn;
+
+	return (TRUE);
 }
 
 static bool_t
@@ -3184,7 +3221,7 @@
 	if (mp != NULL && xdrs->x_ops == &xdrmblk_ops) {
 		mp->b_wptr += objp->data_len;
 		rndup = BYTES_PER_XDR_UNIT -
-			(objp->data_len % BYTES_PER_XDR_UNIT);
+		    (objp->data_len % BYTES_PER_XDR_UNIT);
 		if (rndup != BYTES_PER_XDR_UNIT)
 			for (i = 0; i < rndup; i++)
 				*mp->b_wptr++ = '\0';
@@ -3192,10 +3229,31 @@
 			objp->mblk = NULL;
 			return (TRUE);
 		}
+	} else if (mp == NULL) {
+		if (xdr_u_int(xdrs, &objp->data_len) == FALSE) {
+			return (FALSE);
+		}
+		/*
+		 * If read data sent by wlist (RDMA_WRITE), don't do
+		 * xdr_bytes() below. RDMA_WRITE transfers the data.
+		 * Note: this is encode-only because the client code
+		 * uses xdr_READ4res_clnt to decode results.
+		 */
+		if (objp->wlist) {
+			if (objp->wlist->c_len != objp->data_len) {
+				objp->wlist->c_len = objp->data_len;
+			}
+			if (objp->data_len != 0) {
+				return (xdrrdma_send_read_data(
+				    xdrs, objp->wlist));
+			}
+			return (TRUE);
+		}
 	}
+
 	return (xdr_bytes(xdrs, (char **)&objp->data_val,
-			(uint_t *)&objp->data_len,
-			objp->data_len));
+	    (uint_t *)&objp->data_len,
+	    objp->data_len));
 }
 
 static bool_t
@@ -3205,6 +3263,7 @@
 	size_t n;
 	int error;
 	uint_t size = aobjp->res_maxsize;
+	count4 ocount;
 
 	if (xdrs->x_op == XDR_ENCODE)
 		return (FALSE);
@@ -3254,7 +3313,7 @@
 				if ((n = MIN(uiop->uio_resid, n)) != 0) {
 
 					error =	uiomove((char *)mp->b_rptr, n,
-							UIO_READ, uiop);
+					    UIO_READ, uiop);
 					if (error)
 						return (FALSE);
 					mp->b_rptr += n;
@@ -3268,9 +3327,46 @@
 			return (TRUE);
 		}
 
+		if (xdrs->x_ops == &xdrrdma_ops) {
+			struct clist *cl;
+
+			XDR_CONTROL(xdrs, XDR_RDMA_GET_WLIST, &cl);
+
+			objp->wlist = cl;
+
+			if (objp->wlist) {
+				/* opaque count */
+				if (!xdr_u_int(xdrs, &ocount)) {
+					objp->wlist = NULL;
+					return (FALSE);
+				}
+
+				objp->wlist_len = cl->c_len;
+				objp->data_len = objp->wlist_len;
+
+				if (ocount != objp->data_len) {
+					DTRACE_PROBE2(
+					    xdr__e__read4resuio_clnt_fail,
+					    int, ocount,
+					    int, objp->data_len);
+					objp->wlist = NULL;
+					return (FALSE);
+				}
+
+				uiop->uio_resid -= objp->data_len;
+				uiop->uio_iov->iov_len -= objp->data_len;
+				uiop->uio_iov->iov_base += objp->data_len;
+				uiop->uio_loffset += objp->data_len;
+
+				objp->wlist = NULL;
+				return (TRUE);
+			}
+		}
+
 		/*
-		 * This isn't an xdrmblk stream.   Handle the likely
-		 * case that it can be inlined (ex. xdrmem).
+		 * This is neither an xdrmblk stream nor RDMA.
+		 * Handle the likely case that it can be
+		 * inlined (ex. xdrmem).
 		 */
 		if (!XDR_GETINT32(xdrs, (int32_t *)&objp->data_len))
 			return (FALSE);
@@ -3284,7 +3380,7 @@
 		size = (int)objp->data_len;
 		if ((ptr = XDR_INLINE(xdrs, size)) != NULL)
 			return (uiomove(ptr, size, UIO_READ, uiop) ?
-				FALSE : TRUE);
+			    FALSE : TRUE);
 
 		/*
 		 * Handle some other (unlikely) stream type that will
@@ -3307,18 +3403,67 @@
 	 * Check for the other special case of the caller providing
 	 * the target area for the data.
 	 */
-	if (aobjp->res_data_val_alt)
-		return (xdr_bytes(xdrs, (char **)&aobjp->res_data_val_alt,
-				(uint_t *)&objp->data_len,
-				aobjp->res_maxsize));
-
-	/* caller didn't set things up right if we got this far */
-	return (FALSE);
+	if (aobjp->res_data_val_alt == NULL)
+		return (FALSE);
+
+	/*
+	 * If read data received via RDMA_WRITE, don't do xdr_bytes().
+	 * RDMA_WRITE already moved the data so decode length of
+	 * RDMA_WRITE.
+	 */
+	if (xdrs->x_ops == &xdrrdma_ops) {
+		struct clist *cl;
+
+		XDR_CONTROL(xdrs, XDR_RDMA_GET_WLIST, &cl);
+
+		objp->wlist = cl;
+
+		/*
+		 * Data was transferred inline if
+		 * objp->wlist == NULL.
+		 */
+		if (objp->wlist) {
+			/* opaque count */
+			if (!xdr_u_int(xdrs, &ocount)) {
+				objp->wlist = NULL;
+				return (FALSE);
+			}
+
+			objp->wlist_len = cl->c_len;
+			objp->data_len = objp->wlist_len;
+
+			if (ocount != objp->data_len) {
+				DTRACE_PROBE2(
+				    xdr__e__read4res_clnt_fail,
+				    int, ocount,
+				    int, objp->data_len);
+				objp->wlist = NULL;
+				return (FALSE);
+			}
+
+			objp->wlist = NULL;
+			return (TRUE);
+		}
+	}
+
+	return (xdr_bytes(xdrs, (char **)&aobjp->res_data_val_alt,
+	    (uint_t *)&objp->data_len,
+	    aobjp->res_maxsize));
 }
 
 static bool_t
 xdr_READDIR4args(XDR *xdrs, READDIR4args *objp)
 {
+	rdma_chunkinfo_t rci;
+	struct xdr_ops *xops = xdrrdma_xops();
+
+	if ((xdrs->x_ops == &xdrrdma_ops || xdrs->x_ops == xops) &&
+	    xdrs->x_op == XDR_ENCODE) {
+		rci.rci_type = RCI_REPLY_CHUNK;
+		rci.rci_len = objp->maxcount;
+		XDR_CONTROL(xdrs, XDR_RDMA_ADD_CHUNK, &rci);
+	}
+
 	if (!xdr_u_longlong_t(xdrs, (u_longlong_t *)&objp->cookie))
 		return (FALSE);
 	if (!xdr_u_longlong_t(xdrs, (u_longlong_t *)&objp->cookieverf))
@@ -3362,7 +3507,7 @@
 
 	if (xdrs->x_ops == &xdrmblk_ops) {
 		if (xdrmblk_putmblk_rd(xdrs, mp)
-				    == TRUE) {
+		    == TRUE) {
 			/* mblk successfully inserted into outgoing chain */
 			objp->mblk = NULL;
 			return (TRUE);
@@ -3372,20 +3517,27 @@
 	ASSERT(mp->b_cont == NULL);
 
 	/*
-	 * If running over RDMA, the pre-encoded m_blk needs to be moved
+	 * If transport is RDMA, the pre-encoded m_blk needs to be moved
 	 * without being chunked.
-	 * Check if chunking is disabled for this xdr stream. If not disable
-	 * it for this op and then enable it back on.
+	 * Check if chunking is enabled for the xdr stream.
+	 * If it is enabled, disable it temporarily for this op,
+	 * then re-enable.
 	 */
-	XDR_CONTROL(xdrs, XDR_RDMAGET, &flags);
-	if (flags & RDMA_NOCHUNK)
+	XDR_CONTROL(xdrs, XDR_RDMA_GET_FLAGS, &flags);
+
+	if (!(flags & XDR_RDMA_CHUNK))
 		return (xdr_opaque(xdrs, (char *)mp->b_rptr, objp->data_len));
 
-	flags |= RDMA_NOCHUNK;
-	(void) XDR_CONTROL(xdrs, XDR_RDMASET, &flags);
+	flags &= ~XDR_RDMA_CHUNK;
+
+	(void) XDR_CONTROL(xdrs, XDR_RDMA_SET_FLAGS, &flags);
+
 	ret_val = xdr_opaque(xdrs, (char *)mp->b_rptr, objp->data_len);
-	flags &= ~RDMA_NOCHUNK;
-	(void) XDR_CONTROL(xdrs, XDR_RDMASET, &flags);
+
+	flags |= XDR_RDMA_CHUNK;
+
+	(void) XDR_CONTROL(xdrs, XDR_RDMA_SET_FLAGS, &flags);
+
 	return (ret_val);
 }
 
@@ -3398,8 +3550,8 @@
 		if (objp->status != NFS4_OK)
 			return (TRUE);
 		return (xdr_bytes(xdrs, (char **)&objp->link.utf8string_val,
-			(uint_t *)&objp->link.utf8string_len,
-			NFS4_MAX_UTF8STRING));
+		    (uint_t *)&objp->link.utf8string_len,
+		    NFS4_MAX_UTF8STRING));
 	}
 
 	/*
@@ -3424,7 +3576,7 @@
 	if (!xdr_u_longlong_t(xdrs, (u_longlong_t *)&objp->cinfo.before))
 		return (FALSE);
 	return (xdr_u_longlong_t(xdrs,
-		(u_longlong_t *)&objp->cinfo.after));
+	    (u_longlong_t *)&objp->cinfo.after));
 }
 
 static bool_t
@@ -3448,7 +3600,7 @@
 	    (u_longlong_t *)&objp->target_cinfo.before))
 		return (FALSE);
 	return (xdr_u_longlong_t(xdrs,
-		(u_longlong_t *)&objp->target_cinfo.after));
+	    (u_longlong_t *)&objp->target_cinfo.after));
 }
 
 static bool_t
@@ -3460,9 +3612,9 @@
 		if (objp->flavor != RPCSEC_GSS)
 			return (TRUE);
 		if (!xdr_bytes(xdrs,
-			    (char **)&objp->flavor_info.oid.sec_oid4_val,
-			    (uint_t *)&objp->flavor_info.oid.sec_oid4_len,
-			    NFS4_MAX_SECOID4))
+		    (char **)&objp->flavor_info.oid.sec_oid4_val,
+		    (uint_t *)&objp->flavor_info.oid.sec_oid4_len,
+		    NFS4_MAX_SECOID4))
 			return (FALSE);
 		if (!xdr_u_int(xdrs, &objp->flavor_info.qop))
 			return (FALSE);
@@ -3477,7 +3629,7 @@
 
 	if (objp->flavor_info.oid.sec_oid4_val != NULL)
 		kmem_free(objp->flavor_info.oid.sec_oid4_val,
-			objp->flavor_info.oid.sec_oid4_len);
+		    objp->flavor_info.oid.sec_oid4_len);
 	return (TRUE);
 }
 
@@ -3486,18 +3638,18 @@
 {
 	if (xdrs->x_op != XDR_FREE) {
 		if (!xdr_u_longlong_t(xdrs,
-					(u_longlong_t *)&objp->client.verifier))
+		    (u_longlong_t *)&objp->client.verifier))
 			return (FALSE);
 		if (!xdr_bytes(xdrs, (char **)&objp->client.id_val,
-			    (uint_t *)&objp->client.id_len, NFS4_OPAQUE_LIMIT))
+		    (uint_t *)&objp->client.id_len, NFS4_OPAQUE_LIMIT))
 			return (FALSE);
 		if (!xdr_u_int(xdrs, &objp->callback.cb_program))
 			return (FALSE);
 		if (!xdr_string(xdrs, &objp->callback.cb_location.r_netid,
-						NFS4_OPAQUE_LIMIT))
+		    NFS4_OPAQUE_LIMIT))
 			return (FALSE);
 		if (!xdr_string(xdrs, &objp->callback.cb_location.r_addr,
-						NFS4_OPAQUE_LIMIT))
+		    NFS4_OPAQUE_LIMIT))
 			return (FALSE);
 		return (xdr_u_int(xdrs, &objp->callback_ident));
 	}
@@ -3508,9 +3660,9 @@
 	if (objp->client.id_val != NULL)
 		kmem_free(objp->client.id_val, objp->client.id_len);
 	(void) xdr_string(xdrs, &objp->callback.cb_location.r_netid,
-							NFS4_OPAQUE_LIMIT);
+	    NFS4_OPAQUE_LIMIT);
 	return (xdr_string(xdrs, &objp->callback.cb_location.r_addr,
-							NFS4_OPAQUE_LIMIT));
+	    NFS4_OPAQUE_LIMIT));
 }
 
 static bool_t
@@ -3527,15 +3679,15 @@
 				return (FALSE);
 			return (xdr_u_longlong_t(xdrs,
 			    (u_longlong_t *)&objp->SETCLIENTID4res_u.
-						resok4.setclientid_confirm));
+			    resok4.setclientid_confirm));
 		case NFS4ERR_CLID_INUSE:
 			if (!xdr_string(xdrs,
-				    &objp->SETCLIENTID4res_u.client_using.
-				    r_netid, NFS4_OPAQUE_LIMIT))
+			    &objp->SETCLIENTID4res_u.client_using.
+			    r_netid, NFS4_OPAQUE_LIMIT))
 				return (FALSE);
 			return (xdr_string(xdrs,
-				    &objp->SETCLIENTID4res_u.client_using.
-				    r_addr, NFS4_OPAQUE_LIMIT));
+			    &objp->SETCLIENTID4res_u.client_using.
+			    r_addr, NFS4_OPAQUE_LIMIT));
 		}
 		return (TRUE);
 	}
@@ -3547,10 +3699,10 @@
 		return (TRUE);
 
 	if (!xdr_string(xdrs, &objp->SETCLIENTID4res_u.client_using.r_netid,
-							NFS4_OPAQUE_LIMIT))
+	    NFS4_OPAQUE_LIMIT))
 		return (FALSE);
 	return (xdr_string(xdrs, &objp->SETCLIENTID4res_u.client_using.r_addr,
-							NFS4_OPAQUE_LIMIT));
+	    NFS4_OPAQUE_LIMIT));
 }
 
 static bool_t
@@ -3569,13 +3721,31 @@
 			if (xdrs->x_ops == &xdrmblk_ops) {
 				objp->data_val = NULL;
 				return (xdrmblk_getmblk(xdrs, &objp->mblk,
-							&objp->data_len));
+				    &objp->data_len));
 			}
-			/* Else fall thru for the xdr_bytes(). */
 			objp->mblk = NULL;
+			if (xdrs->x_ops == &xdrrdmablk_ops) {
+				int retval;
+				retval = xdrrdma_getrdmablk(xdrs,
+				    &objp->rlist,
+				    &objp->data_len,
+				    &objp->conn, NFS4_DATA_LIMIT);
+				if (retval == FALSE)
+					return (FALSE);
+				return (xdrrdma_read_from_client(&objp->rlist,
+				    &objp->conn, objp->data_len));
+			}
 		}
+		/* Else fall thru for the xdr_bytes(). */
 		return (xdr_bytes(xdrs, (char **)&objp->data_val,
-				(uint_t *)&objp->data_len, NFS4_DATA_LIMIT));
+		    (uint_t *)&objp->data_len, NFS4_DATA_LIMIT));
+	}
+	if (objp->rlist != NULL) {
+		(void) xdrrdma_free_clist(objp->conn, objp->rlist);
+		objp->rlist = NULL;
+		objp->data_val = NULL;
+
+		return (TRUE);
 	}
 
 	/*
@@ -3598,7 +3768,7 @@
 	if (!xdr_int(xdrs, (int *)&objp->committed))
 		return (FALSE);
 	return (xdr_u_longlong_t(xdrs,
-			(u_longlong_t *)&objp->writeverf));
+	    (u_longlong_t *)&objp->writeverf));
 }
 
 static bool_t
@@ -3625,11 +3795,11 @@
 		switch (array[i].argop) {
 		case OP_PUTFH:
 			if (array[i].nfs_argop4_u.opputfh.object.nfs_fh4_val !=
-								NULL) {
+			    NULL) {
 				kmem_free(array[i].nfs_argop4_u.opputfh.object.
-								nfs_fh4_val,
-					array[i].nfs_argop4_u.opputfh.object.
-								nfs_fh4_len);
+				    nfs_fh4_val,
+				    array[i].nfs_argop4_u.opputfh.object.
+				    nfs_fh4_len);
 			}
 			continue;
 		case OP_GETATTR:
@@ -3637,16 +3807,16 @@
 			continue;
 		case OP_LOOKUP:
 			if (array[i].nfs_argop4_u.oplookup.objname.
-						utf8string_val != NULL) {
+			    utf8string_val != NULL) {
 				kmem_free(array[i].nfs_argop4_u.oplookup.
-						objname.utf8string_val,
-					array[i].nfs_argop4_u.oplookup.
-						objname.utf8string_len);
+				    objname.utf8string_val,
+				    array[i].nfs_argop4_u.oplookup.
+				    objname.utf8string_len);
 			}
 			continue;
 		case OP_OPEN:
 			(void) xdr_OPEN4args(xdrs,
-						&array[i].nfs_argop4_u.opopen);
+			    &array[i].nfs_argop4_u.opopen);
 			continue;
 		case OP_CLOSE:
 		case OP_ACCESS:
@@ -3654,7 +3824,7 @@
 			continue;
 		case OP_WRITE:
 			(void) xdr_WRITE4args(xdrs,
-						&array[i].nfs_argop4_u.opwrite);
+			    &array[i].nfs_argop4_u.opwrite);
 			continue;
 		case OP_DELEGRETURN:
 		case OP_LOOKUPP:
@@ -3662,37 +3832,37 @@
 			continue;
 		case OP_REMOVE:
 			if (array[i].nfs_argop4_u.opremove.target.
-						utf8string_val != NULL) {
+			    utf8string_val != NULL) {
 				kmem_free(array[i].nfs_argop4_u.opremove.target.
-							utf8string_val,
-					array[i].nfs_argop4_u.opremove.target.
-							utf8string_len);
+				    utf8string_val,
+				    array[i].nfs_argop4_u.opremove.target.
+				    utf8string_len);
 			}
 			continue;
 		case OP_COMMIT:
 			continue;
 		case OP_CREATE:
 			(void) xdr_CREATE4args(xdrs,
-					&array[i].nfs_argop4_u.opcreate);
+			    &array[i].nfs_argop4_u.opcreate);
 			continue;
 		case OP_DELEGPURGE:
 			continue;
 		case OP_LINK:
 			if (array[i].nfs_argop4_u.oplink.newname.
-						utf8string_val != NULL) {
+			    utf8string_val != NULL) {
 				kmem_free(array[i].nfs_argop4_u.oplink.newname.
-							utf8string_val,
-					array[i].nfs_argop4_u.oplink.newname.
-							utf8string_len);
+				    utf8string_val,
+				    array[i].nfs_argop4_u.oplink.newname.
+				    utf8string_len);
 			}
 			continue;
 		case OP_LOCK:
 			(void) xdr_LOCK4args(xdrs,
-						&array[i].nfs_argop4_u.oplock);
+			    &array[i].nfs_argop4_u.oplock);
 			continue;
 		case OP_LOCKT:
 			(void) xdr_LOCKT4args(xdrs,
-						&array[i].nfs_argop4_u.oplockt);
+			    &array[i].nfs_argop4_u.oplockt);
 			continue;
 		case OP_LOCKU:
 			continue;
@@ -3709,18 +3879,18 @@
 			continue;
 		case OP_RENAME:
 			if (array[i].nfs_argop4_u.oprename.oldname.
-						utf8string_val != NULL) {
+			    utf8string_val != NULL) {
 				kmem_free(array[i].nfs_argop4_u.oprename.
-						oldname.utf8string_val,
-					array[i].nfs_argop4_u.oprename.
-						oldname.utf8string_len);
+				    oldname.utf8string_val,
+				    array[i].nfs_argop4_u.oprename.
+				    oldname.utf8string_len);
 			}
 			if (array[i].nfs_argop4_u.oprename.newname.
-						utf8string_val != NULL) {
+			    utf8string_val != NULL) {
 				kmem_free(array[i].nfs_argop4_u.oprename.
-						newname.utf8string_val,
-					array[i].nfs_argop4_u.oprename.
-						newname.utf8string_len);
+				    newname.utf8string_val,
+				    array[i].nfs_argop4_u.oprename.
+				    newname.utf8string_len);
 			}
 			continue;
 		case OP_RENEW:
@@ -3729,11 +3899,11 @@
 			continue;
 		case OP_SECINFO:
 			if (array[i].nfs_argop4_u.opsecinfo.name.
-						utf8string_val != NULL) {
+			    utf8string_val != NULL) {
 				kmem_free(array[i].nfs_argop4_u.opsecinfo.name.
-							utf8string_val,
-					array[i].nfs_argop4_u.opsecinfo.name.
-							utf8string_len);
+				    utf8string_val,
+				    array[i].nfs_argop4_u.opsecinfo.name.
+				    utf8string_len);
 			}
 			continue;
 		case OP_SETATTR:
@@ -3742,20 +3912,20 @@
 			continue;
 		case OP_SETCLIENTID:
 			(void) xdr_SETCLIENTID4args(xdrs,
-					&array[i].nfs_argop4_u.opsetclientid);
+			    &array[i].nfs_argop4_u.opsetclientid);
 			continue;
 		case OP_SETCLIENTID_CONFIRM:
 			continue;
 		case OP_VERIFY:
 			(void) xdr_fattr4(xdrs,
-				&array[i].nfs_argop4_u.opverify.obj_attributes);
+			    &array[i].nfs_argop4_u.opverify.obj_attributes);
 			continue;
 		case OP_RELEASE_LOCKOWNER:
 			if (array[i].nfs_argop4_u.oprelease_lockowner.
-						lock_owner.owner_val != NULL) {
+			    lock_owner.owner_val != NULL) {
 				kmem_free(array[i].nfs_argop4_u.
 				    oprelease_lockowner.lock_owner.owner_val,
-					array[i].nfs_argop4_u.
+				    array[i].nfs_argop4_u.
 				    oprelease_lockowner.lock_owner.owner_len);
 			}
 			continue;
@@ -3780,6 +3950,9 @@
 static bool_t
 xdr_nfs_argop4(XDR *xdrs, nfs_argop4 *objp)
 {
+	rdma_chunkinfo_t rci;
+	struct xdr_ops *xops = xdrrdma_xops();
+
 	/*
 	 * These should be ordered by frequency of use
 	 */
@@ -3791,22 +3964,22 @@
 		    NFS4_FHSIZE));
 	case OP_GETATTR:
 		return (xdr_bitmap4(xdrs,
-				&objp->nfs_argop4_u.opgetattr.attr_request));
+		    &objp->nfs_argop4_u.opgetattr.attr_request));
 	case OP_GETFH:
 		return (TRUE);
 	case OP_LOOKUP:
 		return (xdr_bytes(xdrs, (char **)&objp->nfs_argop4_u.oplookup.
-				objname.utf8string_val,
-			(uint_t *)&objp->nfs_argop4_u.oplookup.
-				objname.utf8string_len,
-			NFS4_MAX_UTF8STRING));
+		    objname.utf8string_val,
+		    (uint_t *)&objp->nfs_argop4_u.oplookup.
+		    objname.utf8string_len,
+		    NFS4_MAX_UTF8STRING));
 	case OP_OPEN:
 		return (xdr_OPEN4args(xdrs, &objp->nfs_argop4_u.opopen));
 	case OP_CLOSE:
 		return (xdr_CLOSE4args(xdrs, &objp->nfs_argop4_u.opclose));
 	case OP_ACCESS:
 		return (xdr_u_int(xdrs,
-				&objp->nfs_argop4_u.opaccess.access));
+		    &objp->nfs_argop4_u.opaccess.access));
 	case OP_READ:
 		return (xdr_READ4args(xdrs, &objp->nfs_argop4_u.opread));
 	case OP_WRITE:
@@ -3823,10 +3996,10 @@
 		return (xdr_READDIR4args(xdrs, &objp->nfs_argop4_u.opreaddir));
 	case OP_REMOVE:
 		return (xdr_bytes(xdrs, (char **)&objp->nfs_argop4_u.opremove.
-				target.utf8string_val,
-			(uint_t *)&objp->nfs_argop4_u.opremove.
-				target.utf8string_len,
-			NFS4_MAX_UTF8STRING));
+		    target.utf8string_val,
+		    (uint_t *)&objp->nfs_argop4_u.opremove.
+		    target.utf8string_len,
+		    NFS4_MAX_UTF8STRING));
 	case OP_COMMIT:
 		if (!xdr_u_longlong_t(xdrs,
 		    (u_longlong_t *)&objp->nfs_argop4_u.opcommit.offset))
@@ -3850,43 +4023,49 @@
 		return (xdr_LOCKU4args(xdrs, &objp->nfs_argop4_u.oplocku));
 	case OP_NVERIFY:
 		return (xdr_fattr4(xdrs,
-				&objp->nfs_argop4_u.opnverify.obj_attributes));
+		    &objp->nfs_argop4_u.opnverify.obj_attributes));
 	case OP_OPENATTR:
 		return (xdr_bool(xdrs,
-				&objp->nfs_argop4_u.opopenattr.createdir));
+		    &objp->nfs_argop4_u.opopenattr.createdir));
 	case OP_OPEN_CONFIRM:
 		if (!xdr_u_int(xdrs, &objp->nfs_argop4_u.opopen_confirm.
-						open_stateid.seqid))
+		    open_stateid.seqid))
 			return (FALSE);
 		if (!xdr_opaque(xdrs, objp->nfs_argop4_u.opopen_confirm.
-						open_stateid.other, 12))
+		    open_stateid.other, 12))
 			return (FALSE);
 		return (xdr_u_int(xdrs, &objp->nfs_argop4_u.opopen_confirm.
-						seqid));
+		    seqid));
 	case OP_OPEN_DOWNGRADE:
 		return (xdr_OPEN_DOWNGRADE4args(xdrs,
-				&objp->nfs_argop4_u.opopen_downgrade));
+		    &objp->nfs_argop4_u.opopen_downgrade));
 	case OP_PUTPUBFH:
 		return (TRUE);
 	case OP_PUTROOTFH:
 		return (TRUE);
 	case OP_READLINK:
+		if ((xdrs->x_ops == &xdrrdma_ops || xdrs->x_ops == xops) &&
+		    xdrs->x_op == XDR_ENCODE) {
+			rci.rci_type = RCI_REPLY_CHUNK;
+			rci.rci_len = MAXPATHLEN;
+			XDR_CONTROL(xdrs, XDR_RDMA_ADD_CHUNK, &rci);
+		}
 		return (TRUE);
 	case OP_RENAME:
 		if (!xdr_bytes(xdrs, (char **)&objp->nfs_argop4_u.oprename.
-						oldname.utf8string_val,
-			    (uint_t *)&objp->nfs_argop4_u.oprename.
-						oldname.utf8string_len,
-			    NFS4_MAX_UTF8STRING))
+		    oldname.utf8string_val,
+		    (uint_t *)&objp->nfs_argop4_u.oprename.
+		    oldname.utf8string_len,
+		    NFS4_MAX_UTF8STRING))
 			return (FALSE);
 		return (xdr_bytes(xdrs, (char **)&objp->nfs_argop4_u.oprename.
-						newname.utf8string_val,
-			    (uint_t *)&objp->nfs_argop4_u.oprename.
-						newname.utf8string_len,
-			    NFS4_MAX_UTF8STRING));
+		    newname.utf8string_val,
+		    (uint_t *)&objp->nfs_argop4_u.oprename.
+		    newname.utf8string_len,
+		    NFS4_MAX_UTF8STRING));
 	case OP_RENEW:
 		return (xdr_u_longlong_t(xdrs,
-			(u_longlong_t *)&objp->nfs_argop4_u.oprenew.clientid));
+		    (u_longlong_t *)&objp->nfs_argop4_u.oprenew.clientid));
 	case OP_RESTOREFH:
 		return (TRUE);
 	case OP_SAVEFH:
@@ -3898,36 +4077,36 @@
 		    NFS4_MAX_UTF8STRING));
 	case OP_SETATTR:
 		if (!xdr_u_int(xdrs, &objp->nfs_argop4_u.opsetattr.
-						stateid.seqid))
+		    stateid.seqid))
 			return (FALSE);
 		if (!xdr_opaque(xdrs, objp->nfs_argop4_u.opsetattr.
-						stateid.other, 12))
+		    stateid.other, 12))
 			return (FALSE);
 		return (xdr_fattr4(xdrs, &objp->nfs_argop4_u.opsetattr.
-						obj_attributes));
+		    obj_attributes));
 	case OP_SETCLIENTID:
 		return (xdr_SETCLIENTID4args(xdrs,
-				&objp->nfs_argop4_u.opsetclientid));
+		    &objp->nfs_argop4_u.opsetclientid));
 	case OP_SETCLIENTID_CONFIRM:
 		if (!xdr_u_longlong_t(xdrs, (u_longlong_t *)&objp->nfs_argop4_u.
-				opsetclientid_confirm.clientid))
+		    opsetclientid_confirm.clientid))
 			return (FALSE);
 		return (xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&objp->nfs_argop4_u.
-				opsetclientid_confirm.setclientid_confirm));
+		    (u_longlong_t *)&objp->nfs_argop4_u.
+		    opsetclientid_confirm.setclientid_confirm));
 	case OP_VERIFY:
 		return (xdr_fattr4(xdrs,
-				&objp->nfs_argop4_u.opverify.obj_attributes));
+		    &objp->nfs_argop4_u.opverify.obj_attributes));
 	case OP_RELEASE_LOCKOWNER:
 		if (!xdr_u_longlong_t(xdrs,
-			    (u_longlong_t *)&objp->nfs_argop4_u.
-				oprelease_lockowner.lock_owner.clientid))
+		    (u_longlong_t *)&objp->nfs_argop4_u.
+		    oprelease_lockowner.lock_owner.clientid))
 			return (FALSE);
 		return (xdr_bytes(xdrs,
-			(char **)&objp->nfs_argop4_u.oprelease_lockowner.
-				lock_owner.owner_val,
-			(uint_t *)&objp->nfs_argop4_u.oprelease_lockowner.
-				lock_owner.owner_len, NFS4_OPAQUE_LIMIT));
+		    (char **)&objp->nfs_argop4_u.oprelease_lockowner.
+		    lock_owner.owner_val,
+		    (uint_t *)&objp->nfs_argop4_u.oprelease_lockowner.
+		    lock_owner.owner_len, NFS4_OPAQUE_LIMIT));
 	case OP_ILLEGAL:
 		return (TRUE);
 	}
@@ -3952,7 +4131,7 @@
 	switch (objp->argop) {
 	case OP_PUTFH:
 		return (xdr_decode_nfs_fh4(xdrs,
-			&objp->nfs_argop4_u.opputfh.object));
+		    &objp->nfs_argop4_u.opputfh.object));
 	default:
 		return (xdr_nfs_argop4(xdrs, objp));
 	}
@@ -4000,7 +4179,7 @@
 		 */
 		if (!(len % BYTES_PER_XDR_UNIT) &&
 		    (ptr = XDR_INLINE(xdrs, 2 * BYTES_PER_XDR_UNIT + len)) !=
-									NULL) {
+		    NULL) {
 			IXDR_PUT_U_INT32(ptr, OP_PUTFH);
 			IXDR_PUT_U_INT32(ptr, len);
 			bcopy(sfh->sfh_fh.nfs_fh4_val, ptr, len);
@@ -4036,8 +4215,8 @@
 		if (XDR_PUTINT32(xdrs, &op)) {
 			if (XDR_PUTINT32(xdrs, &len)) {
 				return (xdr_opaque(xdrs,
-					objp->nfs_argop4_u.opclookup.cname,
-					len));
+				    objp->nfs_argop4_u.opclookup.cname,
+				    len));
 			}
 		}
 		return (FALSE);
@@ -4052,8 +4231,8 @@
 		if (XDR_PUTINT32(xdrs, &op)) {
 			if (XDR_PUTINT32(xdrs, &len)) {
 				return (xdr_opaque(xdrs,
-					objp->nfs_argop4_u.opcremove.ctarget,
-					len));
+				    objp->nfs_argop4_u.opcremove.ctarget,
+				    len));
 			}
 		}
 		return (FALSE);
@@ -4070,8 +4249,8 @@
 		if (XDR_PUTINT32(xdrs, &op)) {
 			if (XDR_PUTINT32(xdrs, &len)) {
 				return (xdr_opaque(xdrs,
-					objp->nfs_argop4_u.opclink.cnewname,
-					len));
+				    objp->nfs_argop4_u.opclink.cnewname,
+				    len));
 			}
 		}
 		return (FALSE);
@@ -4085,14 +4264,14 @@
 		if (!XDR_PUTINT32(xdrs, &len))
 			return (FALSE);
 		if (!xdr_opaque(xdrs,
-				objp->nfs_argop4_u.opcrename.coldname, len))
+		    objp->nfs_argop4_u.opcrename.coldname, len))
 			return (FALSE);
 		len = strlen(objp->nfs_argop4_u.opcrename.cnewname);
 		if (len > NFS4_MAX_UTF8STRING)
 			return (FALSE);
 		if (XDR_PUTINT32(xdrs, &len)) {
 			return (xdr_opaque(xdrs,
-				objp->nfs_argop4_u.opcrename.cnewname, len));
+			    objp->nfs_argop4_u.opcrename.cnewname, len));
 		}
 		return (FALSE);
 	case OP_CSECINFO:
@@ -4103,8 +4282,8 @@
 		if (XDR_PUTINT32(xdrs, &op)) {
 			if (XDR_PUTINT32(xdrs, &len)) {
 				return (xdr_opaque(xdrs,
-					objp->nfs_argop4_u.opcsecinfo.cname,
-					len));
+				    objp->nfs_argop4_u.opcsecinfo.cname,
+				    len));
 			}
 		}
 		return (FALSE);
@@ -4143,33 +4322,32 @@
 				continue;
 			if (array[i].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
 				kmem_free(array[i].nfs_resop4_u.opgetattr.
-					ga_res.n4g_ext_res,
-					sizeof (struct nfs4_ga_ext_res));
+				    ga_res.n4g_ext_res,
+				    sizeof (struct nfs4_ga_ext_res));
 			continue;
 		case OP_GETFH:
 			if (array[i].nfs_resop4_u.opgetfh.status != NFS4_OK)
 				continue;
 			if (array[i].nfs_resop4_u.opgetfh.object.nfs_fh4_val !=
-							NULL) {
+			    NULL) {
 				kmem_free(array[i].nfs_resop4_u.opgetfh.object.
-							nfs_fh4_val,
-					array[i].nfs_resop4_u.opgetfh.object.
-							nfs_fh4_len);
+				    nfs_fh4_val,
+				    array[i].nfs_resop4_u.opgetfh.object.
+				    nfs_fh4_len);
 			}
 			continue;
 		case OP_LOOKUP:
 			continue;
 		case OP_OPEN:
 			(void) xdr_OPEN4res(xdrs, &array[i].nfs_resop4_u.
-								opopen);
+			    opopen);
 			continue;
 		case OP_CLOSE:
 		case OP_ACCESS:
 			continue;
 		case OP_READ:
 			(void) xdr_READ4res(xdrs,
-					    &array[i].nfs_resop4_u.
-								opread);
+			    &array[i].nfs_resop4_u.opread);
 			continue;
 		case OP_WRITE:
 		case OP_DELEGRETURN:
@@ -4183,11 +4361,11 @@
 			continue;
 		case OP_LOCK:
 			(void) xdr_LOCK4res(xdrs, &array[i].nfs_resop4_u.
-								oplock);
+			    oplock);
 			continue;
 		case OP_LOCKT:
 			(void) xdr_LOCKT4res(xdrs, &array[i].nfs_resop4_u.
-								oplockt);
+			    oplockt);
 			continue;
 		case OP_LOCKU:
 		case OP_NVERIFY:
@@ -4203,20 +4381,20 @@
 			continue;
 		case OP_READLINK:
 			(void) xdr_READLINK4res(xdrs, &array[i].nfs_resop4_u.
-								opreadlink);
+			    opreadlink);
 			continue;
 		case OP_SECINFO:
 			(void) xdr_array(xdrs,
-				(char **)&array[i].nfs_resop4_u.opsecinfo.
-					SECINFO4resok_val,
-				(uint_t *)&array[i].nfs_resop4_u.opsecinfo.
-					SECINFO4resok_len,
-				NFS4_SECINFO_LIMIT, sizeof (secinfo4),
-				(xdrproc_t)xdr_secinfo4);
+			    (char **)&array[i].nfs_resop4_u.opsecinfo.
+			    SECINFO4resok_val,
+			    (uint_t *)&array[i].nfs_resop4_u.opsecinfo.
+			    SECINFO4resok_len,
+			    NFS4_SECINFO_LIMIT, sizeof (secinfo4),
+			    (xdrproc_t)xdr_secinfo4);
 			continue;
 		case OP_SETCLIENTID:
 			(void) xdr_SETCLIENTID4res(xdrs,
-					&array[i].nfs_resop4_u.opsetclientid);
+			    &array[i].nfs_resop4_u.opsetclientid);
 			continue;
 		case OP_SETATTR:
 		case OP_SETCLIENTID_CONFIRM:
@@ -4255,18 +4433,18 @@
 	switch (objp->resop) {
 	case OP_PUTFH:
 		return (xdr_int(xdrs,
-				(int32_t *)&objp->nfs_resop4_u.opputfh.status));
+		    (int32_t *)&objp->nfs_resop4_u.opputfh.status));
 	case OP_GETATTR:
 		if (!xdr_int(xdrs,
-			    (int32_t *)&objp->nfs_resop4_u.opgetattr.status))
+		    (int32_t *)&objp->nfs_resop4_u.opgetattr.status))
 			return (FALSE);
 		if (objp->nfs_resop4_u.opgetattr.status != NFS4_OK)
 			return (TRUE);
 		return (xdr_fattr4(xdrs,
-			    &objp->nfs_resop4_u.opgetattr.obj_attributes));
+		    &objp->nfs_resop4_u.opgetattr.obj_attributes));
 	case OP_GETFH:
 		if (!xdr_int(xdrs,
-				(int32_t *)&objp->nfs_resop4_u.opgetfh.status))
+		    (int32_t *)&objp->nfs_resop4_u.opgetfh.status))
 			return (FALSE);
 		if (objp->nfs_resop4_u.opgetfh.status != NFS4_OK)
 			return (TRUE);
@@ -4276,7 +4454,7 @@
 		    NFS4_FHSIZE));
 	case OP_LOOKUP:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.oplookup.status));
+		    (int32_t *)&objp->nfs_resop4_u.oplookup.status));
 	case OP_OPEN:
 		return (xdr_OPEN4res(xdrs, &objp->nfs_resop4_u.opopen));
 	case OP_CLOSE:
@@ -4289,10 +4467,10 @@
 		return (xdr_WRITE4res(xdrs, &objp->nfs_resop4_u.opwrite));
 	case OP_DELEGRETURN:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opdelegreturn.status));
+		    (int32_t *)&objp->nfs_resop4_u.opdelegreturn.status));
 	case OP_LOOKUPP:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.oplookupp.status));
+		    (int32_t *)&objp->nfs_resop4_u.oplookupp.status));
 	case OP_READDIR:
 		return (xdr_READDIR4res(xdrs, &objp->nfs_resop4_u.opreaddir));
 	case OP_REMOVE:
@@ -4300,18 +4478,18 @@
 
 	case OP_COMMIT:
 		if (!xdr_int(xdrs,
-			    (int32_t *)&objp->nfs_resop4_u.opcommit.status))
+		    (int32_t *)&objp->nfs_resop4_u.opcommit.status))
 			return (FALSE);
 		if (objp->nfs_resop4_u.opcommit.status != NFS4_OK)
 			return (TRUE);
 		return (xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&objp->nfs_resop4_u.opcommit.
-						writeverf));
+		    (u_longlong_t *)&objp->nfs_resop4_u.opcommit.
+		    writeverf));
 	case OP_CREATE:
 		return (xdr_CREATE4res(xdrs, &objp->nfs_resop4_u.opcreate));
 	case OP_DELEGPURGE:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opdelegpurge.status));
+		    (int32_t *)&objp->nfs_resop4_u.opdelegpurge.status));
 	case OP_LINK:
 		return (xdr_LINK4res(xdrs, &objp->nfs_resop4_u.oplink));
 	case OP_LOCK:
@@ -4320,7 +4498,7 @@
 		return (xdr_LOCKT4res(xdrs, &objp->nfs_resop4_u.oplockt));
 	case OP_LOCKU:
 		if (!xdr_int(xdrs,
-				(int32_t *)&objp->nfs_resop4_u.oplocku.status))
+		    (int32_t *)&objp->nfs_resop4_u.oplocku.status))
 			return (FALSE);
 		if (objp->nfs_resop4_u.oplocku.status != NFS4_OK)
 			return (TRUE);
@@ -4332,69 +4510,69 @@
 		    12));
 	case OP_NVERIFY:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opnverify.status));
+		    (int32_t *)&objp->nfs_resop4_u.opnverify.status));
 	case OP_OPENATTR:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opopenattr.status));
+		    (int32_t *)&objp->nfs_resop4_u.opopenattr.status));
 	case OP_OPEN_CONFIRM:
 		return (xdr_OPEN_CONFIRM4res(xdrs,
-				&objp->nfs_resop4_u.opopen_confirm));
+		    &objp->nfs_resop4_u.opopen_confirm));
 	case OP_OPEN_DOWNGRADE:
 		return (xdr_OPEN_DOWNGRADE4res(xdrs,
-				&objp->nfs_resop4_u.opopen_downgrade));
+		    &objp->nfs_resop4_u.opopen_downgrade));
 	case OP_PUTPUBFH:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opputpubfh.status));
+		    (int32_t *)&objp->nfs_resop4_u.opputpubfh.status));
 	case OP_PUTROOTFH:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opputrootfh.status));
+		    (int32_t *)&objp->nfs_resop4_u.opputrootfh.status));
 	case OP_READLINK:
 		return (xdr_READLINK4res(xdrs, &objp->nfs_resop4_u.opreadlink));
 	case OP_RENAME:
 		return (xdr_RENAME4res(xdrs, &objp->nfs_resop4_u.oprename));
 	case OP_RENEW:
 		return (xdr_int(xdrs,
-				(int32_t *)&objp->nfs_resop4_u.oprenew.status));
+		    (int32_t *)&objp->nfs_resop4_u.oprenew.status));
 	case OP_RESTOREFH:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.oprestorefh.status));
+		    (int32_t *)&objp->nfs_resop4_u.oprestorefh.status));
 	case OP_SAVEFH:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opsavefh.status));
+		    (int32_t *)&objp->nfs_resop4_u.opsavefh.status));
 	case OP_SECINFO:
 		if (!xdr_int(xdrs, (int32_t *)&objp->nfs_resop4_u.opsecinfo.
-					status))
+		    status))
 			return (FALSE);
 		if (objp->nfs_resop4_u.opsecinfo.status != NFS4_OK)
 			return (TRUE);
 		return (xdr_array(xdrs, (char **)&objp->nfs_resop4_u.opsecinfo.
-					SECINFO4resok_val,
-			(uint_t *)&objp->nfs_resop4_u.opsecinfo.
-					SECINFO4resok_len,
-			NFS4_SECINFO_LIMIT, sizeof (secinfo4),
-			(xdrproc_t)xdr_secinfo4));
+		    SECINFO4resok_val,
+		    (uint_t *)&objp->nfs_resop4_u.opsecinfo.
+		    SECINFO4resok_len,
+		    NFS4_SECINFO_LIMIT, sizeof (secinfo4),
+		    (xdrproc_t)xdr_secinfo4));
 	case OP_SETATTR:
 		if (!xdr_int(xdrs, (int32_t *)&objp->nfs_resop4_u.opsetattr.
-						status))
+		    status))
 			return (FALSE);
 		return (xdr_bitmap4(xdrs,
-				&objp->nfs_resop4_u.opsetattr.attrsset));
+		    &objp->nfs_resop4_u.opsetattr.attrsset));
 	case OP_SETCLIENTID:
 		return (xdr_SETCLIENTID4res(xdrs,
-				&objp->nfs_resop4_u.opsetclientid));
+		    &objp->nfs_resop4_u.opsetclientid));
 	case OP_SETCLIENTID_CONFIRM:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opsetclientid_confirm.
-					status));
+		    (int32_t *)&objp->nfs_resop4_u.opsetclientid_confirm.
+		    status));
 	case OP_VERIFY:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opverify.status));
+		    (int32_t *)&objp->nfs_resop4_u.opverify.status));
 	case OP_RELEASE_LOCKOWNER:
 		return (xdr_int(xdrs,
 		    (int32_t *)&objp->nfs_resop4_u.oprelease_lockowner.status));
 	case OP_ILLEGAL:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opillegal.status));
+		    (int32_t *)&objp->nfs_resop4_u.opillegal.status));
 	}
 	return (FALSE);
 }
@@ -4430,19 +4608,19 @@
 	switch (objp->resop) {
 	case OP_PUTFH:
 		return (xdr_int(xdrs,
-				(int32_t *)&objp->nfs_resop4_u.opputfh.status));
+		    (int32_t *)&objp->nfs_resop4_u.opputfh.status));
 	case OP_GETATTR:
 		if (!xdr_int(xdrs,
-			    (int32_t *)&objp->nfs_resop4_u.opgetattr.status))
+		    (int32_t *)&objp->nfs_resop4_u.opgetattr.status))
 			return (FALSE);
 		if (objp->nfs_resop4_u.opgetattr.status != NFS4_OK)
 			return (TRUE);
 		return (xdr_ga_res(xdrs,
-				(GETATTR4res *)&objp->nfs_resop4_u.opgetattr,
-				&aobjp->nfs_argop4_u.opgetattr));
+		    (GETATTR4res *)&objp->nfs_resop4_u.opgetattr,
+		    &aobjp->nfs_argop4_u.opgetattr));
 	case OP_GETFH:
 		if (!xdr_int(xdrs,
-				(int32_t *)&objp->nfs_resop4_u.opgetfh.status))
+		    (int32_t *)&objp->nfs_resop4_u.opgetfh.status))
 			return (FALSE);
 		if (objp->nfs_resop4_u.opgetfh.status != NFS4_OK)
 			return (TRUE);
@@ -4452,10 +4630,10 @@
 		    NFS4_FHSIZE));
 	case OP_LOOKUP:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.oplookup.status));
+		    (int32_t *)&objp->nfs_resop4_u.oplookup.status));
 	case OP_NVERIFY:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opnverify.status));
+		    (int32_t *)&objp->nfs_resop4_u.opnverify.status));
 	case OP_OPEN:
 		return (xdr_OPEN4res(xdrs, &objp->nfs_resop4_u.opopen));
 	case OP_CLOSE:
@@ -4464,36 +4642,36 @@
 		return (xdr_ACCESS4res(xdrs, &objp->nfs_resop4_u.opaccess));
 	case OP_READ:
 		return (xdr_READ4res_clnt(xdrs, &objp->nfs_resop4_u.opread,
-					&aobjp->nfs_argop4_u.opread));
+		    &aobjp->nfs_argop4_u.opread));
 	case OP_WRITE:
 		return (xdr_WRITE4res(xdrs, &objp->nfs_resop4_u.opwrite));
 	case OP_DELEGRETURN:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opdelegreturn.status));
+		    (int32_t *)&objp->nfs_resop4_u.opdelegreturn.status));
 	case OP_LOOKUPP:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.oplookupp.status));
+		    (int32_t *)&objp->nfs_resop4_u.oplookupp.status));
 	case OP_READDIR:
 		return (xdr_READDIR4res_clnt(xdrs,
-				&objp->nfs_resop4_u.opreaddirclnt,
-				&aobjp->nfs_argop4_u.opreaddir));
+		    &objp->nfs_resop4_u.opreaddirclnt,
+		    &aobjp->nfs_argop4_u.opreaddir));
 	case OP_REMOVE:
 		return (xdr_REMOVE4res(xdrs, &objp->nfs_resop4_u.opremove));
 
 	case OP_COMMIT:
 		if (!xdr_int(xdrs,
-			    (int32_t *)&objp->nfs_resop4_u.opcommit.status))
+		    (int32_t *)&objp->nfs_resop4_u.opcommit.status))
 			return (FALSE);
 		if (objp->nfs_resop4_u.opcommit.status != NFS4_OK)
 			return (TRUE);
 		return (xdr_u_longlong_t(xdrs,
-				(u_longlong_t *)&objp->nfs_resop4_u.opcommit.
-						writeverf));
+		    (u_longlong_t *)&objp->nfs_resop4_u.opcommit.
+		    writeverf));
 	case OP_CREATE:
 		return (xdr_CREATE4res(xdrs, &objp->nfs_resop4_u.opcreate));
 	case OP_DELEGPURGE:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opdelegpurge.status));
+		    (int32_t *)&objp->nfs_resop4_u.opdelegpurge.status));
 	case OP_LINK:
 		return (xdr_LINK4res(xdrs, &objp->nfs_resop4_u.oplink));
 	case OP_LOCK:
@@ -4502,7 +4680,7 @@
 		return (xdr_LOCKT4res(xdrs, &objp->nfs_resop4_u.oplockt));
 	case OP_LOCKU:
 		if (!xdr_int(xdrs,
-				(int32_t *)&objp->nfs_resop4_u.oplocku.status))
+		    (int32_t *)&objp->nfs_resop4_u.oplocku.status))
 			return (FALSE);
 		if (objp->nfs_resop4_u.oplocku.status != NFS4_OK)
 			return (TRUE);
@@ -4514,65 +4692,65 @@
 		    12));
 	case OP_OPENATTR:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opopenattr.status));
+		    (int32_t *)&objp->nfs_resop4_u.opopenattr.status));
 	case OP_OPEN_CONFIRM:
 		return (xdr_OPEN_CONFIRM4res(xdrs,
-				&objp->nfs_resop4_u.opopen_confirm));
+		    &objp->nfs_resop4_u.opopen_confirm));
 	case OP_OPEN_DOWNGRADE:
 		return (xdr_OPEN_DOWNGRADE4res(xdrs,
-				&objp->nfs_resop4_u.opopen_downgrade));
+		    &objp->nfs_resop4_u.opopen_downgrade));
 	case OP_PUTPUBFH:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opputpubfh.status));
+		    (int32_t *)&objp->nfs_resop4_u.opputpubfh.status));
 	case OP_PUTROOTFH:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opputrootfh.status));
+		    (int32_t *)&objp->nfs_resop4_u.opputrootfh.status));
 	case OP_READLINK:
 		return (xdr_READLINK4res(xdrs, &objp->nfs_resop4_u.opreadlink));
 	case OP_RENAME:
 		return (xdr_RENAME4res(xdrs, &objp->nfs_resop4_u.oprename));
 	case OP_RENEW:
 		return (xdr_int(xdrs,
-				(int32_t *)&objp->nfs_resop4_u.oprenew.status));
+		    (int32_t *)&objp->nfs_resop4_u.oprenew.status));
 	case OP_RESTOREFH:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.oprestorefh.status));
+		    (int32_t *)&objp->nfs_resop4_u.oprestorefh.status));
 	case OP_SAVEFH:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opsavefh.status));
+		    (int32_t *)&objp->nfs_resop4_u.opsavefh.status));
 	case OP_SECINFO:
 		if (!xdr_int(xdrs, (int32_t *)&objp->nfs_resop4_u.opsecinfo.
-					status))
+		    status))
 			return (FALSE);
 		if (objp->nfs_resop4_u.opsecinfo.status != NFS4_OK)
 			return (TRUE);
 		return (xdr_array(xdrs, (char **)&objp->nfs_resop4_u.opsecinfo.
-					SECINFO4resok_val,
-			(uint_t *)&objp->nfs_resop4_u.opsecinfo.
-					SECINFO4resok_len,
-			~0, sizeof (secinfo4), (xdrproc_t)xdr_secinfo4));
+		    SECINFO4resok_val,
+		    (uint_t *)&objp->nfs_resop4_u.opsecinfo.
+		    SECINFO4resok_len,
+		    ~0, sizeof (secinfo4), (xdrproc_t)xdr_secinfo4));
 	case OP_SETATTR:
 		if (!xdr_int(xdrs, (int32_t *)&objp->nfs_resop4_u.opsetattr.
-						status))
+		    status))
 			return (FALSE);
 		return (xdr_bitmap4(xdrs,
-				&objp->nfs_resop4_u.opsetattr.attrsset));
+		    &objp->nfs_resop4_u.opsetattr.attrsset));
 	case OP_SETCLIENTID:
 		return (xdr_SETCLIENTID4res(xdrs,
-				&objp->nfs_resop4_u.opsetclientid));
+		    &objp->nfs_resop4_u.opsetclientid));
 	case OP_SETCLIENTID_CONFIRM:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opsetclientid_confirm.
-					status));
+		    (int32_t *)&objp->nfs_resop4_u.opsetclientid_confirm.
+		    status));
 	case OP_VERIFY:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opverify.status));
+		    (int32_t *)&objp->nfs_resop4_u.opverify.status));
 	case OP_RELEASE_LOCKOWNER:
 		return (xdr_int(xdrs,
 		    (int32_t *)&objp->nfs_resop4_u.oprelease_lockowner.status));
 	case OP_ILLEGAL:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_resop4_u.opillegal.status));
+		    (int32_t *)&objp->nfs_resop4_u.opillegal.status));
 	}
 	return (FALSE);
 }
@@ -4584,6 +4762,8 @@
 	static int32_t minorversion = NFS4_MINORVERSION;
 	uint32_t *ctagp;
 	rpc_inline_t *ptr;
+	rdma_chunkinfo_t rci;
+	struct xdr_ops *xops = xdrrdma_xops();
 
 	/*
 	 * XDR_ENCODE only
@@ -4621,25 +4801,30 @@
 		if (!XDR_PUTINT32(xdrs, (int32_t *)&minorversion))
 			return (FALSE);
 	}
+	if (xdrs->x_ops == &xdrrdma_ops || xdrs->x_ops == xops) {
+		rci.rci_type = RCI_REPLY_CHUNK;
+		rci.rci_len = MAXPATHLEN * 2;
+		XDR_CONTROL(xdrs, XDR_RDMA_ADD_CHUNK, &rci);
+	}
 
 	return (xdr_array(xdrs, (char **)&objp->array,
-			(uint_t *)&objp->array_len, NFS4_COMPOUND_LIMIT,
-			sizeof (nfs_argop4), (xdrproc_t)xdr_cnfs_argop4));
+	    (uint_t *)&objp->array_len, NFS4_COMPOUND_LIMIT,
+	    sizeof (nfs_argop4), (xdrproc_t)xdr_cnfs_argop4));
 }
 
 bool_t
 xdr_COMPOUND4args_srv(XDR *xdrs, COMPOUND4args *objp)
 {
 	if (!xdr_bytes(xdrs, (char **)&objp->tag.utf8string_val,
-			(uint_t *)&objp->tag.utf8string_len,
-			NFS4_MAX_UTF8STRING))
+	    (uint_t *)&objp->tag.utf8string_len,
+	    NFS4_MAX_UTF8STRING))
 		return (FALSE);
 	if (!xdr_u_int(xdrs, &objp->minorversion))
 		return (FALSE);
 	if (xdrs->x_op != XDR_FREE)
 		return (xdr_array(xdrs, (char **)&objp->array,
-			(uint_t *)&objp->array_len, NFS4_COMPOUND_LIMIT,
-			sizeof (nfs_argop4), (xdrproc_t)xdr_snfs_argop4));
+		    (uint_t *)&objp->array_len, NFS4_COMPOUND_LIMIT,
+		    sizeof (nfs_argop4), (xdrproc_t)xdr_snfs_argop4));
 
 	return (xdr_snfs_argop4_free(xdrs, &objp->array, objp->array_len));
 }
@@ -4693,7 +4878,7 @@
 		objp->array = resop = kmem_zalloc(len, KM_SLEEP);
 
 		for (len = 0; len < objp->array_len;
-			len++, resop++, argop++, objp->decode_len++) {
+		    len++, resop++, argop++, objp->decode_len++) {
 			if (!xdr_nfs_resop4_clnt(xdrs, resop, argop)) {
 				/*
 				 * Make sure to free anything that may
@@ -4701,15 +4886,15 @@
 				 */
 				xdrs->x_op = XDR_FREE;
 				(void) xdr_nfs_resop4_free(xdrs, &objp->array,
-							    objp->array_len,
-							    objp->decode_len);
+				    objp->array_len,
+				    objp->decode_len);
 				return (FALSE);
 			}
 		}
 		return (TRUE);
 	}
 	return (xdr_nfs_resop4_free(xdrs, &objp->array,
-				    objp->array_len, objp->decode_len));
+	    objp->array_len, objp->decode_len));
 }
 
 bool_t
@@ -4718,17 +4903,17 @@
 	if (!xdr_int(xdrs, (int32_t *)&objp->status))
 		return (FALSE);
 	if (!xdr_bytes(xdrs, (char **)&objp->tag.utf8string_val,
-			(uint_t *)&objp->tag.utf8string_len,
-			NFS4_MAX_UTF8STRING))
+	    (uint_t *)&objp->tag.utf8string_len,
+	    NFS4_MAX_UTF8STRING))
 		return (FALSE);
 
 	if (xdrs->x_op != XDR_FREE)
 		return (xdr_array(xdrs, (char **)&objp->array,
-			(uint_t *)&objp->array_len, NFS4_COMPOUND_LIMIT,
-			sizeof (nfs_resop4), (xdrproc_t)xdr_snfs_resop4));
+		    (uint_t *)&objp->array_len, NFS4_COMPOUND_LIMIT,
+		    sizeof (nfs_resop4), (xdrproc_t)xdr_snfs_resop4));
 
 	return (xdr_snfs_resop4_free(xdrs, &objp->array,
-				    objp->array_len, objp->array_len));
+	    objp->array_len, objp->array_len));
 }
 
 /*
@@ -4817,20 +5002,20 @@
 	switch (objp->resop) {
 	case OP_CB_GETATTR:
 		if (!xdr_int(xdrs,
-				(int32_t *)&objp->nfs_cb_resop4_u.opcbgetattr.
-				    status))
+		    (int32_t *)&objp->nfs_cb_resop4_u.opcbgetattr.
+		    status))
 			return (FALSE);
 		if (objp->nfs_cb_resop4_u.opcbgetattr.status != NFS4_OK)
 			return (TRUE);
 		return (xdr_fattr4(xdrs,
-				&objp->nfs_cb_resop4_u.opcbgetattr.
-				    obj_attributes));
+		    &objp->nfs_cb_resop4_u.opcbgetattr.
+		    obj_attributes));
 	case OP_CB_RECALL:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_cb_resop4_u.opcbrecall.status));
+		    (int32_t *)&objp->nfs_cb_resop4_u.opcbrecall.status));
 	case OP_CB_ILLEGAL:
 		return (xdr_int(xdrs,
-			(int32_t *)&objp->nfs_cb_resop4_u.opcbillegal.status));
+		    (int32_t *)&objp->nfs_cb_resop4_u.opcbillegal.status));
 	}
 	return (FALSE);
 }
@@ -4842,16 +5027,16 @@
 xdr_CB_COMPOUND4args_clnt(XDR *xdrs, CB_COMPOUND4args *objp)
 {
 	if (!xdr_bytes(xdrs, (char **)&objp->tag.utf8string_val,
-			(uint_t *)&objp->tag.utf8string_len,
-			NFS4_MAX_UTF8STRING))
+	    (uint_t *)&objp->tag.utf8string_len,
+	    NFS4_MAX_UTF8STRING))
 		return (FALSE);
 	if (!xdr_u_int(xdrs, &objp->minorversion))
 		return (FALSE);
 	if (!xdr_u_int(xdrs, &objp->callback_ident))
 		return (FALSE);
 	return (xdr_array(xdrs, (char **)&objp->array,
-			(uint_t *)&objp->array_len, NFS4_COMPOUND_LIMIT,
-			sizeof (nfs_cb_argop4), (xdrproc_t)xdr_cnfs_cb_argop4));
+	    (uint_t *)&objp->array_len, NFS4_COMPOUND_LIMIT,
+	    sizeof (nfs_cb_argop4), (xdrproc_t)xdr_cnfs_cb_argop4));
 }
 
 /*
@@ -4861,16 +5046,16 @@
 xdr_CB_COMPOUND4args_srv(XDR *xdrs, CB_COMPOUND4args *objp)
 {
 	if (!xdr_bytes(xdrs, (char **)&objp->tag.utf8string_val,
-			(uint_t *)&objp->tag.utf8string_len,
-			NFS4_MAX_UTF8STRING))
+	    (uint_t *)&objp->tag.utf8string_len,
+	    NFS4_MAX_UTF8STRING))
 		return (FALSE);
 	if (!xdr_u_int(xdrs, &objp->minorversion))
 		return (FALSE);
 	if (!xdr_u_int(xdrs, &objp->callback_ident))
 		return (FALSE);
 	return (xdr_array(xdrs, (char **)&objp->array,
-			(uint_t *)&objp->array_len, NFS4_COMPOUND_LIMIT,
-			sizeof (nfs_cb_argop4), (xdrproc_t)xdr_snfs_cb_argop4));
+	    (uint_t *)&objp->array_len, NFS4_COMPOUND_LIMIT,
+	    sizeof (nfs_cb_argop4), (xdrproc_t)xdr_snfs_cb_argop4));
 }
 
 bool_t
@@ -4879,10 +5064,10 @@
 	if (!xdr_int(xdrs, (int32_t *)&objp->status))
 		return (FALSE);
 	if (!xdr_bytes(xdrs, (char **)&objp->tag.utf8string_val,
-			(uint_t *)&objp->tag.utf8string_len,
-			NFS4_MAX_UTF8STRING))
+	    (uint_t *)&objp->tag.utf8string_len,
+	    NFS4_MAX_UTF8STRING))
 		return (FALSE);
 	return (xdr_array(xdrs, (char **)&objp->array,
-			(uint_t *)&objp->array_len, NFS4_COMPOUND_LIMIT,
-			sizeof (nfs_cb_resop4), (xdrproc_t)xdr_nfs_cb_resop4));
+	    (uint_t *)&objp->array_len, NFS4_COMPOUND_LIMIT,
+	    sizeof (nfs_cb_resop4), (xdrproc_t)xdr_nfs_cb_resop4));
 }
--- a/usr/src/uts/common/fs/nfs/nfs_srv.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/fs/nfs/nfs_srv.c	Thu Aug 21 18:01:07 2008 -0500
@@ -28,8 +28,6 @@
  *	All rights reserved.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
@@ -52,6 +50,7 @@
 #include <sys/acl.h>
 #include <sys/nbmlock.h>
 #include <sys/policy.h>
+#include <sys/sdt.h>
 
 #include <rpc/types.h>
 #include <rpc/auth.h>
@@ -102,13 +101,9 @@
 	vnode_t *vp;
 	struct vattr va;
 
-	TRACE_0(TR_FAC_NFS, TR_RFS_GETATTR_START, "rfs_getattr_start:");
-
 	vp = nfs_fhtovp(fhp, exi);
 	if (vp == NULL) {
 		ns->ns_status = NFSERR_STALE;
-		TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END,
-		    "rfs_getattr_end:(%S)", "stale");
 		return;
 	}
 
@@ -116,9 +111,8 @@
 	 * Do the getattr.
 	 */
 	va.va_mask = AT_ALL;	/* we want all the attributes */
-	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
+
 	error = rfs4_delegated_getattr(vp, &va, 0, cr);
-	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
 
 	/* check for overflows */
 	if (!error) {
@@ -129,8 +123,6 @@
 	VN_RELE(vp);
 
 	ns->ns_status = puterrno(error);
-
-	TRACE_1(TR_FAC_NFS, TR_RFS_GETATTR_END, "rfs_getattr_end:(%S)", "done");
 }
 void *
 rfs_getattr_getfh(fhandle_t *fhp)
@@ -156,21 +148,16 @@
 	struct flock64 bf;
 	caller_context_t ct;
 
-	TRACE_0(TR_FAC_NFS, TR_RFS_SETATTR_START, "rfs_setattr_start:");
 
 	vp = nfs_fhtovp(&args->saa_fh, exi);
 	if (vp == NULL) {
 		ns->ns_status = NFSERR_STALE;
-		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
-		    "rfs_setattr_end:(%S)", "stale");
 		return;
 	}
 
 	if (rdonly(exi, req) || vn_is_readonly(vp)) {
 		VN_RELE(vp);
 		ns->ns_status = NFSERR_ROFS;
-		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
-		    "rfs_setattr_end:(%S)", "rofs");
 		return;
 	}
 
@@ -178,8 +165,6 @@
 	if (error) {
 		VN_RELE(vp);
 		ns->ns_status = puterrno(error);
-		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
-		    "rfs_setattr_end:(%S)", "sattr");
 		return;
 	}
 
@@ -243,16 +228,14 @@
 		}
 
 		bva.va_mask = AT_UID | AT_SIZE;
-		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
+
 		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
-		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
+
 		if (error) {
 			if (in_crit)
 				nbl_end_crit(vp);
 			VN_RELE(vp);
 			ns->ns_status = puterrno(error);
-			TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
-			    "rfs_setattr_end:(%S)", "getattr");
 			return;
 		}
 
@@ -282,11 +265,9 @@
 			bf.l_len = 0;
 			bf.l_sysid = 0;
 			bf.l_pid = 0;
-			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_START,
-			    "vop_space_start:");
+
 			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
 			    (offset_t)va.va_size, cr, &ct);
-			TRACE_0(TR_FAC_NFS, TR_VOP_SPACE_END, "vop_space_end:");
 		}
 		if (in_crit)
 			nbl_end_crit(vp);
@@ -297,9 +278,7 @@
 	 * Do the setattr.
 	 */
 	if (!error && va.va_mask) {
-		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_START, "vop_setattr_start:");
 		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
-		TRACE_0(TR_FAC_NFS, TR_VOP_SETATTR_END, "vop_setattr_end:");
 	}
 
 	/*
@@ -311,16 +290,13 @@
 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 		VN_RELE(vp);
 		curthread->t_flag |= T_WOULDBLOCK;
-		TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END,
-		    "rfs_setattr_end:(%S)", "delegated");
 		return;
 	}
 
 	if (!error) {
 		va.va_mask = AT_ALL;	/* get everything */
-		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
+
 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
-		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
 
 		/* check for overflows */
 		if (!error) {
@@ -339,8 +315,6 @@
 	VN_RELE(vp);
 
 	ns->ns_status = puterrno(error);
-
-	TRACE_1(TR_FAC_NFS, TR_RFS_SETATTR_END, "rfs_setattr_end:(%S)", "done");
 }
 void *
 rfs_setattr_getfh(struct nfssaargs *args)
@@ -365,8 +339,6 @@
 	struct sec_ol sec = {0, 0};
 	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
 
-	TRACE_0(TR_FAC_NFS, TR_RFS_LOOKUP_START, "rfs_lookup_start:");
-
 	/*
 	 * Trusted Extension doesn't support NFSv2. MOUNT
 	 * will reject v2 clients. Need to prevent v2 client
@@ -374,8 +346,6 @@
 	 */
 	if (is_system_labeled() && req->rq_vers == 2) {
 		dr->dr_status = NFSERR_ACCES;
-		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
-		    "rfs_lookup_end:(%S)", "access");
 		return;
 	}
 
@@ -384,8 +354,6 @@
 	 */
 	if (da->da_name == NULL || *da->da_name == '\0') {
 		dr->dr_status = NFSERR_ACCES;
-		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
-		    "rfs_lookup_end:(%S)", "access");
 		return;
 	}
 
@@ -400,8 +368,6 @@
 		dvp = nfs_fhtovp(fhp, exi);
 		if (dvp == NULL) {
 			dr->dr_status = NFSERR_STALE;
-			TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
-			    "rfs_lookup_end:(%S)", "stale");
 			return;
 		}
 	}
@@ -415,8 +381,6 @@
 	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
 		VN_RELE(dvp);
 		dr->dr_status = NFSERR_NOENT;
-		TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END,
-		    "rfs_lookup_end:(%S)", "noent");
 		return;
 	}
 
@@ -437,17 +401,15 @@
 		/*
 		 * Do a normal single component lookup.
 		 */
-		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
 		error = VOP_LOOKUP(dvp, da->da_name, &vp, NULL, 0, NULL, cr,
 		    NULL, NULL, NULL);
-		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
 	}
 
 	if (!error) {
 		va.va_mask = AT_ALL;	/* we want everything */
-		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
+
 		error = rfs4_delegated_getattr(vp, &va, 0, cr);
-		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
+
 		/* check for overflows */
 		if (!error) {
 			acl_perm(vp, exi, &va, cr);
@@ -488,8 +450,6 @@
 		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
 	else
 		dr->dr_status = puterrno(error);
-
-	TRACE_1(TR_FAC_NFS, TR_RFS_LOOKUP_END, "rfs_lookup_end:(%S)", "done");
 }
 void *
 rfs_lookup_getfh(struct nfsdiropargs *da)
@@ -512,28 +472,21 @@
 	vnode_t *vp;
 	struct vattr va;
 
-	TRACE_0(TR_FAC_NFS, TR_RFS_READLINK_START, "rfs_readlink_start:");
-
 	vp = nfs_fhtovp(fhp, exi);
 	if (vp == NULL) {
 		rl->rl_data = NULL;
 		rl->rl_status = NFSERR_STALE;
-		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
-		    "rfs_readlink_end:(%S)", "stale");
 		return;
 	}
 
 	va.va_mask = AT_MODE;
-	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
+
 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
-	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
 
 	if (error) {
 		VN_RELE(vp);
 		rl->rl_data = NULL;
 		rl->rl_status = puterrno(error);
-		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
-		    "rfs_readlink_end:(%S)", "getattr error");
 		return;
 	}
 
@@ -541,8 +494,6 @@
 		VN_RELE(vp);
 		rl->rl_data = NULL;
 		rl->rl_status = NFSERR_ACCES;
-		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
-		    "rfs_readlink_end:(%S)", "access");
 		return;
 	}
 
@@ -554,8 +505,6 @@
 		VN_RELE(vp);
 		rl->rl_data = NULL;
 		rl->rl_status = NFSERR_NXIO;
-		TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
-		    "rfs_readlink_end:(%S)", "nxio");
 		return;
 	}
 
@@ -579,21 +528,7 @@
 	/*
 	 * Do the readlink.
 	 */
-	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_START, "vop_readlink_start:");
 	error = VOP_READLINK(vp, &uio, cr, NULL);
-	TRACE_0(TR_FAC_NFS, TR_VOP_READLINK_END, "vop_readlink_end:");
-
-#if 0 /* notyet */
-	/*
-	 * Don't do this.  It causes local disk writes when just
-	 * reading the file and the overhead is deemed larger
-	 * than the benefit.
-	 */
-	/*
-	 * Force modified metadata out to stable storage.
-	 */
-	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
-#endif
 
 	VN_RELE(vp);
 
@@ -609,8 +544,6 @@
 	else
 		rl->rl_status = puterrno(error);
 
-	TRACE_1(TR_FAC_NFS, TR_RFS_READLINK_END,
-	    "rfs_readlink_end:(%S)", "done");
 }
 void *
 rfs_readlink_getfh(fhandle_t *fhp)
@@ -627,6 +560,8 @@
 		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 }
 
+static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
+
 /*
  * Read data.
  * Returns some data read from the file at the given fhandle.
@@ -646,14 +581,10 @@
 	int in_crit = 0;
 	caller_context_t ct;
 
-	TRACE_0(TR_FAC_NFS, TR_RFS_READ_START, "rfs_read_start:");
-
 	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
 	if (vp == NULL) {
 		rr->rr_data = NULL;
 		rr->rr_status = NFSERR_STALE;
-		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
-		    "rfs_read_end:(%S)", "stale");
 		return;
 	}
 
@@ -661,8 +592,6 @@
 		VN_RELE(vp);
 		rr->rr_data = NULL;
 		rr->rr_status = NFSERR_ISDIR;
-		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
-		    "rfs_read_end:(%S)", "isdir");
 		return;
 	}
 
@@ -683,45 +612,36 @@
 			VN_RELE(vp);
 			rr->rr_data = NULL;
 			rr->rr_status = NFSERR_ACCES;
-			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
-			    "rfs_read_end:(%S)", " csf access error");
 			return;
 		}
 		in_crit = 1;
 	}
 
-	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
 	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
-	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");
 
 	/* check if a monitor detected a delegation conflict */
 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 		VN_RELE(vp);
 		/* mark as wouldblock so response is dropped */
 		curthread->t_flag |= T_WOULDBLOCK;
-		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
-		    "rfs_read_end:(%S)", "delegated");
+
 		rr->rr_data = NULL;
 		return;
 	}
 
 	va.va_mask = AT_ALL;
-	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
+
 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
-	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
 
 	if (error) {
-		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
-		    "vop_rwunlock_start:");
 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 		if (in_crit)
 			nbl_end_crit(vp);
-		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
+
 		VN_RELE(vp);
 		rr->rr_data = NULL;
 		rr->rr_status = puterrno(error);
-		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
-		    "rfs_read_end:(%S)", "getattr error");
+
 		return;
 	}
 
@@ -731,52 +651,42 @@
 	 * is always allowed to read it.
 	 */
 	if (crgetuid(cr) != va.va_uid) {
-		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
 		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
-		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
+
 		if (error) {
 			/*
 			 * Exec is the same as read over the net because
 			 * of demand loading.
 			 */
-			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
-			    "vop_access_start:");
 			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
-			TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
-			    "vop_access_end:");
 		}
 		if (error) {
-			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
-			    "vop_rwunlock_start:");
 			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 			if (in_crit)
 				nbl_end_crit(vp);
-			TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
-			    "vop_rwunlock_end:");
 			VN_RELE(vp);
 			rr->rr_data = NULL;
 			rr->rr_status = puterrno(error);
-			TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
-			    "rfs_read_end:(%S)", "access error");
+
 			return;
 		}
 	}
 
 	if (MANDLOCK(vp, va.va_mode)) {
-		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
-		    "vop_rwunlock_start:");
 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 		if (in_crit)
 			nbl_end_crit(vp);
-		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
+
 		VN_RELE(vp);
 		rr->rr_data = NULL;
 		rr->rr_status = NFSERR_ACCES;
-		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
-		    "rfs_read_end:(%S)", "mand lock");
+
 		return;
 	}
 
+	rr->rr_ok.rrok_wlist_len = 0;
+	rr->rr_ok.rrok_wlist = NULL;
+
 	if ((u_offset_t)ra->ra_offset >= va.va_size) {
 		rr->rr_count = 0;
 		rr->rr_data = NULL;
@@ -788,25 +698,32 @@
 		goto done;
 	}
 
-	/*
-	 * mp will contain the data to be sent out in the read reply.
-	 * This will be freed after the reply has been sent out (by the
-	 * driver).
-	 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
-	 * that the call to xdrmblk_putmblk() never fails.
-	 */
-	mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
-	    &alloc_err);
-	ASSERT(mp != NULL);
-	ASSERT(alloc_err == 0);
-
-	rr->rr_mp = mp;
-
-	/*
-	 * Set up io vector
-	 */
-	iov.iov_base = (caddr_t)mp->b_datap->db_base;
-	iov.iov_len = ra->ra_count;
+	if (ra->ra_wlist) {
+		mp = NULL;
+		rr->rr_mp = NULL;
+		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
+	} else {
+		/*
+		 * mp will contain the data to be sent out in the read reply.
+		 * This will be freed after the reply has been sent out (by the
+		 * driver).
+		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
+		 * that the call to xdrmblk_putmblk() never fails.
+		 */
+		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
+		    &alloc_err);
+		ASSERT(mp != NULL);
+		ASSERT(alloc_err == 0);
+
+		rr->rr_mp = mp;
+
+		/*
+		 * Set up io vector
+		 */
+		iov.iov_base = (caddr_t)mp->b_datap->db_base;
+		iov.iov_len = ra->ra_count;
+	}
+
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_segflg = UIO_SYSSPACE;
@@ -814,12 +731,11 @@
 	uio.uio_loffset = (offset_t)ra->ra_offset;
 	uio.uio_resid = ra->ra_count;
 
-	TRACE_0(TR_FAC_NFS, TR_VOP_READ_START, "vop_read_start:");
 	error = VOP_READ(vp, &uio, 0, cr, &ct);
-	TRACE_0(TR_FAC_NFS, TR_VOP_READ_END, "vop_read_end:");
 
 	if (error) {
-		freeb(mp);
+		if (mp)
+			freeb(mp);
 
 		/*
 		 * check if a monitor detected a delegation conflict and
@@ -830,16 +746,13 @@
 		else
 			rr->rr_status = puterrno(error);
 
-		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
-		    "vop_rwunlock_start:");
 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 		if (in_crit)
 			nbl_end_crit(vp);
-		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
+
 		VN_RELE(vp);
 		rr->rr_data = NULL;
-		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
-		    "rfs_read_end:(%S)", "read error");
+
 		return;
 	}
 
@@ -848,59 +761,50 @@
 	 * time to the client side for his cache.
 	 */
 	va.va_mask = AT_ALL;
-	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
+
 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
-	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
+
 	if (error) {
-		freeb(mp);
-		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START,
-		    "vop_rwunlock_start:");
+		if (mp)
+			freeb(mp);
+
 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 		if (in_crit)
 			nbl_end_crit(vp);
-		TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END,
-		    "vop_rwunlock_end:");
+
 		VN_RELE(vp);
 		rr->rr_data = NULL;
 		rr->rr_status = puterrno(error);
-		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
-		    "rfs_read_end:(%S)", "read error");
+
 		return;
 	}
 
 	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
 
-	rr->rr_data = (char *)mp->b_datap->db_base;
-
+	if (mp) {
+		rr->rr_data = (char *)mp->b_datap->db_base;
+	} else {
+		if (ra->ra_wlist) {
+			rr->rr_data = (caddr_t)iov.iov_base;
+			if (!rdma_setup_read_data2(ra, rr)) {
+				rr->rr_data = NULL;
+				rr->rr_status = puterrno(NFSERR_INVAL);
+			}
+		}
+	}
 done:
-	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 	if (in_crit)
 		nbl_end_crit(vp);
-	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
 
 	acl_perm(vp, exi, &va, cr);
 
 	/* check for overflows */
 	error = vattr_to_nattr(&va, &rr->rr_attr);
 
-#if 0 /* notyet */
-	/*
-	 * Don't do this.  It causes local disk writes when just
-	 * reading the file and the overhead is deemed larger
-	 * than the benefit.
-	 */
-	/*
-	 * Force modified metadata out to stable storage.
-	 */
-	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
-#endif
-
 	VN_RELE(vp);
 
 	rr->rr_status = puterrno(error);
-
-	TRACE_1(TR_FAC_NFS, TR_RFS_READ_END, "rfs_read_end:(%S)", "done");
 }
 
 /*
@@ -955,29 +859,21 @@
 	int in_crit = 0;
 	caller_context_t ct;
 
-	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START, "rfs_write_start:(%S)", "sync");
-
 	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
 	if (vp == NULL) {
 		ns->ns_status = NFSERR_STALE;
-		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
-		    "rfs_write_end:(%S)", "stale");
 		return;
 	}
 
 	if (rdonly(exi, req)) {
 		VN_RELE(vp);
 		ns->ns_status = NFSERR_ROFS;
-		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
-		    "rfs_write_end:(%S)", "rofs");
 		return;
 	}
 
 	if (vp->v_type != VREG) {
 		VN_RELE(vp);
 		ns->ns_status = NFSERR_ISDIR;
-		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
-		    "rfs_write_end:(%S)", "isdir");
 		return;
 	}
 
@@ -987,15 +883,13 @@
 	ct.cc_flags = CC_DONTBLOCK;
 
 	va.va_mask = AT_UID|AT_MODE;
-	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
+
 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
-	TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
 
 	if (error) {
 		VN_RELE(vp);
 		ns->ns_status = puterrno(error);
-		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
-		    "rfs_write_end:(%S)", "getattr error");
+
 		return;
 	}
 
@@ -1005,14 +899,11 @@
 		 * with read only permission.  The owner of the file
 		 * is always allowed to write it.
 		 */
-		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
 		error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
-		TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
+
 		if (error) {
 			VN_RELE(vp);
 			ns->ns_status = puterrno(error);
-			TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
-			    "rfs_write_end:(%S)", "access error");
 			return;
 		}
 	}
@@ -1025,8 +916,6 @@
 	if (MANDLOCK(vp, va.va_mode)) {
 		VN_RELE(vp);
 		ns->ns_status = NFSERR_ACCES;
-		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
-		    "rfs_write_end:(%S)", "mand lock");
 		return;
 	}
 
@@ -1044,23 +933,25 @@
 		}
 	}
 
-	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
-	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");
 
 	/* check if a monitor detected a delegation conflict */
 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 		VN_RELE(vp);
 		/* mark as wouldblock so response is dropped */
 		curthread->t_flag |= T_WOULDBLOCK;
-		TRACE_1(TR_FAC_NFS, TR_RFS_READ_END,
-		    "rfs_write_end:(%S)", "delegated");
 		return;
 	}
 
-	if (wa->wa_data) {
-		iov[0].iov_base = wa->wa_data;
-		iov[0].iov_len = wa->wa_count;
+	if (wa->wa_data || wa->wa_rlist) {
+		/* Do the RDMA thing if necessary */
+		if (wa->wa_rlist) {
+			iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
+			iov[0].iov_len = wa->wa_count;
+		} else  {
+			iov[0].iov_base = wa->wa_data;
+			iov[0].iov_len = wa->wa_count;
+		}
 		uio.uio_iov = iov;
 		uio.uio_iovcnt = 1;
 		uio.uio_segflg = UIO_SYSSPACE;
@@ -1079,8 +970,6 @@
 		/*
 		 * for now we assume no append mode
 		 */
-		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
-		    "vop_write_start:(%S)", "sync");
 		/*
 		 * We're changing creds because VM may fault and we need
 		 * the cred of the current thread to be used if quota
@@ -1090,7 +979,6 @@
 		curthread->t_cred = cr;
 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
 		curthread->t_cred = savecred;
-		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");
 	} else {
 		iovcnt = 0;
 		for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
@@ -1125,8 +1013,6 @@
 		/*
 		 * For now we assume no append mode.
 		 */
-		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
-		    "vop_write_start:(%S)", "iov sync");
 		/*
 		 * We're changing creds because VM may fault and we need
 		 * the cred of the current thread to be used if quota
@@ -1136,15 +1022,12 @@
 		curthread->t_cred = cr;
 		error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
 		curthread->t_cred = savecred;
-		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");
 
 		if (iovp != iov)
 			kmem_free(iovp, sizeof (*iovp) * iovcnt);
 	}
 
-	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
-	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
 
 	if (!error) {
 		/*
@@ -1152,9 +1035,9 @@
 		 * time to the client side for his cache.
 		 */
 		va.va_mask = AT_ALL;	/* now we want everything */
-		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
+
 		error = VOP_GETATTR(vp, &va, 0, cr, &ct);
-		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
+
 		/* check for overflows */
 		if (!error) {
 			acl_perm(vp, exi, &va, cr);
@@ -1174,7 +1057,6 @@
 	else
 		ns->ns_status = puterrno(error);
 
-	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "sync");
 }
 
 struct rfs_async_write {
@@ -1246,9 +1128,6 @@
 		return;
 	}
 
-	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_START,
-	    "rfs_write_start:(%S)", "async");
-
 	/*
 	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
 	 * is considered an OK.
@@ -1297,8 +1176,7 @@
 		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
 			cv_wait(&lp->cv, &rfs_async_write_lock);
 		mutex_exit(&rfs_async_write_lock);
-		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
-		    "rfs_write_end:(%S)", "cluster child");
+
 		return;
 	}
 
@@ -1346,8 +1224,7 @@
 		}
 		cv_broadcast(&nlp->cv);
 		mutex_exit(&rfs_async_write_lock);
-		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
-		    "rfs_write_end:(%S)", "stale");
+
 		return;
 	}
 
@@ -1373,8 +1250,7 @@
 		}
 		cv_broadcast(&nlp->cv);
 		mutex_exit(&rfs_async_write_lock);
-		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
-		    "rfs_write_end:(%S)", "isdir");
+
 		return;
 	}
 
@@ -1396,9 +1272,7 @@
 	 * Lock the file for writing.  This operation provides
 	 * the delay which allows clusters to grow.
 	 */
-	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_wrlock_start:");
 	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
-	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_wrlock_end");
 
 	/* check if a monitor detected a delegation conflict */
 	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
@@ -1414,8 +1288,7 @@
 		}
 		cv_broadcast(&nlp->cv);
 		mutex_exit(&rfs_async_write_lock);
-		TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END,
-		    "rfs_write_end:(%S)", "delegated");
+
 		return;
 	}
 
@@ -1470,9 +1343,9 @@
 		}
 
 		va.va_mask = AT_UID|AT_MODE;
-		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
+
 		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
-		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
+
 		if (!error) {
 			if (crgetuid(rp->cr) != va.va_uid) {
 				/*
@@ -1481,11 +1354,7 @@
 				 * owner of the file is always allowed to
 				 * write it.
 				 */
-				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START,
-				    "vop_access_start:");
 				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
-				TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END,
-				    "vop_access_end:");
 			}
 			if (!error && MANDLOCK(vp, va.va_mode))
 				error = EACCES;
@@ -1538,7 +1407,7 @@
 		iovcnt = 0;
 		lrp = rp;
 		for (;;) {
-			if (lrp->wa->wa_data)
+			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
 				iovcnt++;
 			else {
 				m = lrp->wa->wa_mblk;
@@ -1575,9 +1444,16 @@
 		trp = rp;
 		count = 0;
 		do {
-			if (trp->wa->wa_data) {
-				iovp->iov_base = trp->wa->wa_data;
-				iovp->iov_len = trp->wa->wa_count;
+			if (trp->wa->wa_data || trp->wa->wa_rlist) {
+				if (trp->wa->wa_rlist) {
+					iovp->iov_base =
+					    (char *)((trp->wa->wa_rlist)->
+					    u.c_daddr3);
+					iovp->iov_len = trp->wa->wa_count;
+				} else  {
+					iovp->iov_base = trp->wa->wa_data;
+					iovp->iov_len = trp->wa->wa_count;
+				}
 				iovp++;
 			} else {
 				m = trp->wa->wa_mblk;
@@ -1616,8 +1492,6 @@
 		/*
 		 * For now we assume no append mode.
 		 */
-		TRACE_1(TR_FAC_NFS, TR_VOP_WRITE_START,
-		    "vop_write_start:(%S)", "async");
 
 		/*
 		 * We're changing creds because VM may fault
@@ -1629,7 +1503,6 @@
 		curthread->t_cred = cr;
 		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
 		curthread->t_cred = savecred;
-		TRACE_0(TR_FAC_NFS, TR_VOP_WRITE_END, "vop_write_end:");
 
 		/* check if a monitor detected a delegation conflict */
 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
@@ -1646,11 +1519,9 @@
 			 * time to the client side for his cache.
 			 */
 			va.va_mask = AT_ALL;	/* now we want everything */
-			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
-			    "vop_getattr_start:");
+
 			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
-			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
-			    "vop_getattr_end:");
+
 			if (!error)
 				acl_perm(vp, exi, &va, rp->cr);
 		}
@@ -1678,20 +1549,14 @@
 	 * the data and metadata to stable storage.
 	 */
 	if (data_written) {
-		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_START, "vop_putpage_start:");
 		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
-		TRACE_0(TR_FAC_NFS, TR_VOP_PUTPAGE_END, "vop_putpage_end:");
+
 		if (!error) {
-			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_START,
-			    "vop_fsync_start:");
 			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
-			TRACE_0(TR_FAC_NFS, TR_VOP_FSYNC_END, "vop_fsync_end:");
 		}
 	}
 
-	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
-	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
 
 	if (in_crit)
 		nbl_end_crit(vp);
@@ -1708,7 +1573,6 @@
 	cv_broadcast(&nlp->cv);
 	mutex_exit(&rfs_async_write_lock);
 
-	TRACE_1(TR_FAC_NFS, TR_RFS_WRITE_END, "rfs_write_end:(%S)", "async");
 }
 
 void *
@@ -1739,31 +1603,23 @@
 	int lookup_ok;
 	bool_t trunc;
 
-	TRACE_0(TR_FAC_NFS, TR_RFS_CREATE_START, "rfs_create_start:");
-
 	/*
 	 * Disallow NULL paths
 	 */
 	if (name == NULL || *name == '\0') {
 		dr->dr_status = NFSERR_ACCES;
-		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
-		    "rfs_create_end:(%S)", "access");
 		return;
 	}
 
 	dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
 	if (dvp == NULL) {
 		dr->dr_status = NFSERR_STALE;
-		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
-		    "rfs_create_end:(%S)", "stale");
 		return;
 	}
 
 	error = sattr_to_vattr(args->ca_sa, &va);
 	if (error) {
 		dr->dr_status = puterrno(error);
-		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
-		    "rfs_create_end:(%S)", "sattr");
 		return;
 	}
 
@@ -1773,8 +1629,6 @@
 	if (!(va.va_mask & AT_MODE)) {
 		VN_RELE(dvp);
 		dr->dr_status = NFSERR_INVAL;
-		TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END,
-		    "rfs_create_end:(%S)", "no mode");
 		return;
 	}
 
@@ -1827,20 +1681,14 @@
 	lookup_ok = 0;
 	mode = VWRITE;
 	if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
-		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
 		error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
 		    NULL, NULL, NULL);
-		TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
 		if (!error) {
 			struct vattr at;
 
 			lookup_ok = 1;
 			at.va_mask = AT_MODE;
-			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
-			    "vop_getattr_start:");
 			error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
-			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
-			    "vop_getattr_end:");
 			if (!error)
 				mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
 			VN_RELE(tvp);
@@ -1925,10 +1773,8 @@
 		    exi->exi_export.ex_flags & EX_NOSUID)
 			va.va_mode &= ~(VSUID | VSGID);
 
-		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_START, "vop_create_start:");
 		error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
 		    NULL, NULL);
-		TRACE_0(TR_FAC_NFS, TR_VOP_CREATE_END, "vop_create_end:");
 
 		if (!error) {
 
@@ -1943,11 +1789,9 @@
 				goto out;
 			}
 			va.va_mask = AT_ALL;
-			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START,
-			    "vop_getattr_start:");
+
 			error = VOP_GETATTR(vp, &va, 0, cr, NULL);
-			TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END,
-			    "vop_getattr_end:");
+
 			/* check for overflows */
 			if (!error) {
 				acl_perm(vp, exi, &va, cr);
@@ -1986,7 +1830,6 @@
 
 	dr->dr_status = puterrno(error);
 
-	TRACE_1(TR_FAC_NFS, TR_RFS_CREATE_END, "rfs_create_end:(%S)", "done");
 }
 void *
 rfs_create_getfh(struct nfscreatargs *args)
@@ -2007,31 +1850,23 @@
 	vnode_t *targvp;
 	int in_crit = 0;
 
-	TRACE_0(TR_FAC_NFS, TR_RFS_REMOVE_START, "rfs_remove_start:");
-
 	/*
 	 * Disallow NULL paths
 	 */
 	if (da->da_name == NULL || *da->da_name == '\0') {
 		*status = NFSERR_ACCES;
-		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
-		    "rfs_remove_end:(%S)", "access");
 		return;
 	}
 
 	vp = nfs_fhtovp(da->da_fhandle, exi);
 	if (vp == NULL) {
 		*status = NFSERR_STALE;
-		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
-		    "rfs_remove_end:(%S)", "stale");
 		return;
 	}
 
 	if (rdonly(exi, req)) {
 		VN_RELE(vp);
 		*status = NFSERR_ROFS;
-		TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END,
-		    "rfs_remove_end:(%S)", "rofs");
 		return;
 	}
 
@@ -2070,9 +1905,7 @@
 		}
 	}
 
-	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_START, "vop_remove_start:");
 	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
-	TRACE_0(TR_FAC_NFS, TR_VOP_REMOVE_END, "vop_remove_end:");
 
 	/*
 	 * Force modified data and metadata out to stable storage.
@@ -2087,7 +1920,6 @@
 
 	*status = puterrno(error);
 
-	TRACE_1(TR_FAC_NFS, TR_RFS_REMOVE_END, "rfs_remove_end:(%S)", "done");
 }
 
 void *
@@ -2113,13 +1945,9 @@
 	vnode_t *targvp;
 	int in_crit = 0;
 
-	TRACE_0(TR_FAC_NFS, TR_RFS_RENAME_START, "rfs_rename_start:");
-
 	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
 	if (fromvp == NULL) {
 		*status = NFSERR_STALE;
-		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
-		    "rfs_rename_end:(%S)", "from stale");
 		return;
 	}
 
@@ -2128,8 +1956,6 @@
 	if (to_exi == NULL) {
 		VN_RELE(fromvp);
 		*status = NFSERR_ACCES;
-		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
-		    "rfs_rename_end:(%S)", "cross device");
 		return;
 	}
 	exi_rele(to_exi);
@@ -2137,8 +1963,6 @@
 	if (to_exi != exi) {
 		VN_RELE(fromvp);
 		*status = NFSERR_XDEV;
-		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
-		    "rfs_rename_end:(%S)", "from stale");
 		return;
 	}
 
@@ -2146,16 +1970,12 @@
 	if (tovp == NULL) {
 		VN_RELE(fromvp);
 		*status = NFSERR_STALE;
-		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
-		    "rfs_rename_end:(%S)", "to stale");
 		return;
 	}
 
 	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
 		VN_RELE(tovp);
 		VN_RELE(fromvp);
-		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
-		    "rfs_rename_end:(%S)", "not dir");
 		*status = NFSERR_NOTDIR;
 		return;
 	}
@@ -2168,8 +1988,6 @@
 		VN_RELE(tovp);
 		VN_RELE(fromvp);
 		*status = NFSERR_ACCES;
-		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
-		    "rfs_rename_end:(%S)", "access");
 		return;
 	}
 
@@ -2177,8 +1995,6 @@
 		VN_RELE(tovp);
 		VN_RELE(fromvp);
 		*status = NFSERR_ROFS;
-		TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END,
-		    "rfs_rename_end:(%S)", "rofs");
 		return;
 	}
 
@@ -2231,10 +2047,8 @@
 		}
 	}
 
-	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_START, "vop_rename_start:");
 	error = VOP_RENAME(fromvp, args->rna_from.da_name,
 	    tovp, args->rna_to.da_name, cr, NULL, 0);
-	TRACE_0(TR_FAC_NFS, TR_VOP_RENAME_END, "vop_rename_end:");
 
 	if (error == 0)
 		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
@@ -2255,7 +2069,6 @@
 
 	*status = puterrno(error);
 
-	TRACE_1(TR_FAC_NFS, TR_RFS_RENAME_END, "rfs_rename_end:(%S)", "done");
 }
 void *
 rfs_rename_getfh(struct nfsrnmargs *args)
@@ -2277,13 +2090,9 @@
 	struct exportinfo *to_exi;
 	fhandle_t *fh;
 
-	TRACE_0(TR_FAC_NFS, TR_RFS_LINK_START, "rfs_link_start:");
-
 	fromvp = nfs_fhtovp(args->la_from, exi);
 	if (fromvp == NULL) {
 		*status = NFSERR_STALE;
-		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
-		    "rfs_link_end:(%S)", "from stale");
 		return;
 	}
 
@@ -2292,8 +2101,6 @@
 	if (to_exi == NULL) {
 		VN_RELE(fromvp);
 		*status = NFSERR_ACCES;
-		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
-		    "rfs_link_end:(%S)", "cross device");
 		return;
 	}
 	exi_rele(to_exi);
@@ -2301,8 +2108,6 @@
 	if (to_exi != exi) {
 		VN_RELE(fromvp);
 		*status = NFSERR_XDEV;
-		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
-		    "rfs_link_end:(%S)", "cross device");
 		return;
 	}
 
@@ -2310,8 +2115,6 @@
 	if (tovp == NULL) {
 		VN_RELE(fromvp);
 		*status = NFSERR_STALE;
-		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
-		    "rfs_link_end:(%S)", "to stale");
 		return;
 	}
 
@@ -2319,8 +2122,6 @@
 		VN_RELE(tovp);
 		VN_RELE(fromvp);
 		*status = NFSERR_NOTDIR;
-		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
-		    "rfs_link_end:(%S)", "not dir");
 		return;
 	}
 	/*
@@ -2330,8 +2131,6 @@
 		VN_RELE(tovp);
 		VN_RELE(fromvp);
 		*status = NFSERR_ACCES;
-		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
-		    "rfs_link_end:(%S)", "access");
 		return;
 	}
 
@@ -2339,14 +2138,10 @@
 		VN_RELE(tovp);
 		VN_RELE(fromvp);
 		*status = NFSERR_ROFS;
-		TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END,
-		    "rfs_link_end:(%S)", "rofs");
 		return;
 	}
 
-	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_START, "vop_link_start:");
 	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
-	TRACE_0(TR_FAC_NFS, TR_VOP_LINK_END, "vop_link_end:");
 
 	/*
 	 * Force modified data and metadata out to stable storage.
@@ -2359,7 +2154,6 @@
 
 	*status = puterrno(error);
 
-	TRACE_1(TR_FAC_NFS, TR_RFS_LINK_END, "rfs_link_end:(%S)", "done");
 }
 void *
 rfs_link_getfh(struct nfslinkargs *args)
@@ -2382,31 +2176,23 @@
 	vnode_t *svp;
 	int lerror;
 
-	TRACE_0(TR_FAC_NFS, TR_RFS_SYMLINK_START, "rfs_symlink_start:");
-
 	/*
 	 * Disallow NULL paths
 	 */
 	if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
 		*status = NFSERR_ACCES;
-		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
-		    "rfs_symlink_end:(%S)", "access");
 		return;
 	}
 
 	vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
 	if (vp == NULL) {
 		*status = NFSERR_STALE;
-		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
-		    "rfs_symlink_end:(%S)", "stale");
 		return;
 	}
 
 	if (rdonly(exi, req)) {
 		VN_RELE(vp);
 		*status = NFSERR_ROFS;
-		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
-		    "rfs_symlink_end:(%S)", "rofs");
 		return;
 	}
 
@@ -2414,34 +2200,27 @@
 	if (error) {
 		VN_RELE(vp);
 		*status = puterrno(error);
-		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
-		    "rfs_symlink_end:(%S)", "sattr");
 		return;
 	}
 
 	if (!(va.va_mask & AT_MODE)) {
 		VN_RELE(vp);
 		*status = NFSERR_INVAL;
-		TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END,
-		    "rfs_symlink_end:(%S)", "no mode");
 		return;
 	}
 
 	va.va_type = VLNK;
 	va.va_mask |= AT_TYPE;
 
-	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_START, "vop_symlink_start:");
 	error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, args->sla_tnm, cr,
 	    NULL, 0);
-	TRACE_0(TR_FAC_NFS, TR_VOP_SYMLINK_END, "vop_symlink_end:");
 
 	/*
 	 * Force new data and metadata out to stable storage.
 	 */
-	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_START, "vop_lookup_start:");
 	lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL,
 	    0, NULL, cr, NULL, NULL, NULL);
-	TRACE_0(TR_FAC_NFS, TR_VOP_LOOKUP_END, "vop_lookup_end:");
+
 	if (!lerror) {
 		(void) VOP_FSYNC(svp, 0, cr, NULL);
 		VN_RELE(svp);
@@ -2456,7 +2235,6 @@
 
 	*status = puterrno(error);
 
-	TRACE_1(TR_FAC_NFS, TR_RFS_SYMLINK_END, "rfs_symlink_end:(%S)", "done");
 }
 void *
 rfs_symlink_getfh(struct nfsslargs *args)
@@ -2479,31 +2257,23 @@
 	vnode_t *vp;
 	char *name = args->ca_da.da_name;
 
-	TRACE_0(TR_FAC_NFS, TR_RFS_MKDIR_START, "rfs_mkdir_start:");
-
 	/*
 	 * Disallow NULL paths
 	 */
 	if (name == NULL || *name == '\0') {
 		dr->dr_status = NFSERR_ACCES;
-		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
-		    "rfs_mkdir_end:(%S)", "access");
 		return;
 	}
 
 	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
 	if (vp == NULL) {
 		dr->dr_status = NFSERR_STALE;
-		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
-		    "rfs_mkdir_end:(%S)", "stale");
 		return;
 	}
 
 	if (rdonly(exi, req)) {
 		VN_RELE(vp);
 		dr->dr_status = NFSERR_ROFS;
-		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
-		    "rfs_mkdir_end:(%S)", "rofs");
 		return;
 	}
 
@@ -2511,25 +2281,19 @@
 	if (error) {
 		VN_RELE(vp);
 		dr->dr_status = puterrno(error);
-		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
-		    "rfs_mkdir_end:(%S)", "sattr");
 		return;
 	}
 
 	if (!(va.va_mask & AT_MODE)) {
 		VN_RELE(vp);
 		dr->dr_status = NFSERR_INVAL;
-		TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END,
-		    "rfs_mkdir_end:(%S)", "no mode");
 		return;
 	}
 
 	va.va_type = VDIR;
 	va.va_mask |= AT_TYPE;
 
-	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_START, "vop_mkdir_start:");
 	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
-	TRACE_0(TR_FAC_NFS, TR_VOP_MKDIR_END, "vop_mkdir_end:");
 
 	if (!error) {
 		/*
@@ -2537,9 +2301,8 @@
 		 * be returned to the client.
 		 */
 		va.va_mask = AT_ALL; /* We want everything */
-		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_START, "vop_getattr_start:");
 		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
-		TRACE_0(TR_FAC_NFS, TR_VOP_GETATTR_END, "vop_getattr_end:");
+
 		/* check for overflows */
 		if (!error) {
 			acl_perm(vp, exi, &va, cr);
@@ -2564,7 +2327,6 @@
 
 	dr->dr_status = puterrno(error);
 
-	TRACE_1(TR_FAC_NFS, TR_RFS_MKDIR_END, "rfs_mkdir_end:(%S)", "done");
 }
 void *
 rfs_mkdir_getfh(struct nfscreatargs *args)
@@ -2583,31 +2345,24 @@
 	int error;
 	vnode_t *vp;
 
-	TRACE_0(TR_FAC_NFS, TR_RFS_RMDIR_START, "rfs_rmdir_start:");
 
 	/*
 	 * Disallow NULL paths
 	 */
 	if (da->da_name == NULL || *da->da_name == '\0') {
 		*status = NFSERR_ACCES;
-		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
-		    "rfs_rmdir_end:(%S)", "access");
 		return;
 	}
 
 	vp = nfs_fhtovp(da->da_fhandle, exi);
 	if (vp == NULL) {
 		*status = NFSERR_STALE;
-		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
-		    "rfs_rmdir_end:(%S)", "stale");
 		return;
 	}
 
 	if (rdonly(exi, req)) {
 		VN_RELE(vp);
 		*status = NFSERR_ROFS;
-		TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END,
-		    "rfs_rmdir_end:(%S)", "rofs");
 		return;
 	}
 
@@ -2620,9 +2375,7 @@
 	 * supplying a vnode known to exist and illegal to
 	 * remove.
 	 */
-	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_START, "vop_rmdir_start:");
 	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
-	TRACE_0(TR_FAC_NFS, TR_VOP_RMDIR_END, "vop_rmdir_end:");
 
 	/*
 	 * Force modified data and metadata out to stable storage.
@@ -2642,7 +2395,6 @@
 	else
 		*status = puterrno(error);
 
-	TRACE_1(TR_FAC_NFS, TR_RFS_RMDIR_END, "rfs_rmdir_end:(%S)", "done");
 }
 void *
 rfs_rmdir_getfh(struct nfsdiropargs *da)
@@ -2661,14 +2413,10 @@
 	struct uio uio;
 	vnode_t *vp;
 
-	TRACE_0(TR_FAC_NFS, TR_RFS_READDIR_START, "rfs_readdir_start:");
-
 	vp = nfs_fhtovp(&rda->rda_fh, exi);
 	if (vp == NULL) {
 		rd->rd_entries = NULL;
 		rd->rd_status = NFSERR_STALE;
-		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
-		    "rfs_readdir_end:(%S)", "stale");
 		return;
 	}
 
@@ -2676,18 +2424,13 @@
 		VN_RELE(vp);
 		rd->rd_entries = NULL;
 		rd->rd_status = NFSERR_NOTDIR;
-		TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END,
-		    "rfs_readdir_end:(%S)", "notdir");
 		return;
 	}
 
-	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_START, "vop_rwlock_start:");
 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
-	TRACE_0(TR_FAC_NFS, TR_VOP_RWLOCK_END, "vop_rwlock_end:");
-
-	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_START, "vop_access_start:");
+
 	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
-	TRACE_0(TR_FAC_NFS, TR_VOP_ACCESS_END, "vop_access_end:");
+
 	if (error) {
 		rd->rd_entries = NULL;
 		goto bad;
@@ -2723,9 +2466,7 @@
 	/*
 	 * read directory
 	 */
-	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_START, "vop_readdir_start:");
 	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
-	TRACE_0(TR_FAC_NFS, TR_VOP_READDIR_END, "vop_readdir_end:");
 
 	/*
 	 * Clean up
@@ -2745,9 +2486,7 @@
 	}
 
 bad:
-	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_START, "vop_rwunlock_start:");
 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
-	TRACE_0(TR_FAC_NFS, TR_VOP_RWUNLOCK_END, "vop_rwunlock_end:");
 
 #if 0 /* notyet */
 	/*
@@ -2765,7 +2504,6 @@
 
 	rd->rd_status = puterrno(error);
 
-	TRACE_1(TR_FAC_NFS, TR_RFS_READDIR_END, "rfs_readdir_end:(%S)", "done");
 }
 void *
 rfs_readdir_getfh(struct nfsrddirargs *rda)
@@ -2788,13 +2526,9 @@
 	struct statvfs64 sb;
 	vnode_t *vp;
 
-	TRACE_0(TR_FAC_NFS, TR_RFS_STATFS_START, "rfs_statfs_start:");
-
 	vp = nfs_fhtovp(fh, exi);
 	if (vp == NULL) {
 		fs->fs_status = NFSERR_STALE;
-		TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END,
-		    "rfs_statfs_end:(%S)", "stale");
 		return;
 	}
 
@@ -2812,7 +2546,6 @@
 
 	fs->fs_status = puterrno(error);
 
-	TRACE_1(TR_FAC_NFS, TR_RFS_STATFS_END, "rfs_statfs_end:(%S)", "done");
 }
 void *
 rfs_statfs_getfh(fhandle_t *fh)
@@ -3115,3 +2848,47 @@
 {
 	mutex_destroy(&rfs_async_write_lock);
 }
+
+static int
+rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
+{
+	struct clist	*wcl;
+	int		data_len, avail_len, num;
+	uint32_t	count = rr->rr_count;
+
+	data_len = num = avail_len = 0;
+
+	wcl = ra->ra_wlist;
+	while (wcl != NULL) {
+		if (wcl->c_dmemhandle.mrc_rmr == 0)
+			break;
+
+		avail_len += wcl->c_len;
+		if (wcl->c_len < count) {
+			data_len += wcl->c_len;
+		} else {
+			/* Can make the rest chunks all 0-len */
+			data_len += count;
+			wcl->c_len = count;
+		}
+		count -= wcl->c_len;
+		num++;
+		wcl = wcl->c_next;
+	}
+
+	/*
+	 * MUST fail if there is still data remaining
+	 */
+	if (count > 0) {
+		DTRACE_PROBE2(nfss__e__read__wlist__fail,
+		    int, data_len, int, count);
+		return (FALSE);
+	}
+
+	wcl = ra->ra_wlist;
+	rr->rr_count = data_len;
+	rr->rr_ok.rrok_wlist_len = data_len;
+	rr->rr_ok.rrok_wlist = wcl;
+
+	return (TRUE);
+}
--- a/usr/src/uts/common/fs/nfs/nfs_vnops.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/fs/nfs/nfs_vnops.c	Thu Aug 21 18:01:07 2008 -0500
@@ -26,8 +26,6 @@
  *	All rights reserved.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
@@ -904,8 +902,8 @@
  * Read from a file.  Reads data in largest chunks our interface can handle.
  */
 static int
-nfsread(vnode_t *vp, caddr_t base, uint_t offset, int count, size_t *residp,
-	cred_t *cr)
+nfsread(vnode_t *vp, caddr_t base, uint_t offset,
+    int count, size_t *residp, cred_t *cr)
 {
 	mntinfo_t *mi;
 	struct nfsreadargs ra;
@@ -946,6 +944,7 @@
 			ra.ra_offset = offset;
 			ra.ra_totcount = tsize;
 			ra.ra_count = tsize;
+			ra.ra_data = base;
 			t = gethrtime();
 			error = rfs2call(mi, RFS_READ,
 			    xdr_readargs, (caddr_t)&ra,
@@ -1466,7 +1465,7 @@
 	douprintf = 1;
 
 	error = rfs2call(VTOMI(vp), RFS_READLINK,
-	    xdr_fhandle, (caddr_t)VTOFH(vp),
+	    xdr_readlink, (caddr_t)VTOFH(vp),
 	    xdr_rdlnres, (caddr_t)&rl, cr,
 	    &douprintf, &rl.rl_status, 0, &fi);
 
@@ -3356,6 +3355,7 @@
 	read_again:
 		error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr,
 		    offset, bp->b_bcount, &bp->b_resid, cred);
+
 		crfree(cred);
 		if (!error) {
 			if (bp->b_resid) {
--- a/usr/src/uts/common/fs/nfs/nfs_xdr.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/fs/nfs/nfs_xdr.c	Thu Aug 21 18:01:07 2008 -0500
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
 /* All Rights Reserved */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
@@ -41,6 +38,7 @@
 #include <sys/strsubr.h>
 #include <sys/debug.h>
 #include <sys/t_lock.h>
+#include <sys/sdt.h>
 
 #include <rpc/types.h>
 #include <rpc/xdr.h>
@@ -128,7 +126,8 @@
 	int32_t *ptr;
 	int32_t *fhp;
 
-	if (xdrs->x_op == XDR_DECODE) {
+	switch (xdrs->x_op) {
+	case XDR_DECODE:
 		wa->wa_args = &wa->wa_args_buf;
 		ptr = XDR_INLINE(xdrs, RNDUP(sizeof (fhandle_t)) +
 		    3 * BYTES_PER_XDR_UNIT);
@@ -153,11 +152,44 @@
 			 * an array of unknown length as to inline copy it.
 			 */
 			return (xdr_bytes(xdrs, &wa->wa_data,
-				    &wa->wa_count, NFS_MAXDATA));
+			    &wa->wa_count, NFS_MAXDATA));
 		}
-	}
+		if (xdr_fhandle(xdrs, &wa->wa_fhandle) &&
+		    xdr_u_int(xdrs, &wa->wa_begoff) &&
+		    xdr_u_int(xdrs, &wa->wa_offset) &&
+		    xdr_u_int(xdrs, &wa->wa_totcount)) {
+			/* deal with the variety of data transfer types */
+
+			wa->wa_mblk = NULL;
+			wa->wa_data = NULL;
+			wa->wa_rlist = NULL;
+			wa->wa_conn = NULL;
 
-	if (xdrs->x_op == XDR_ENCODE) {
+			if (xdrs->x_ops == &xdrmblk_ops) {
+				if (xdrmblk_getmblk(xdrs, &wa->wa_mblk,
+				    &wa->wa_count) == TRUE)
+					return (TRUE);
+			} else {
+				if (xdrs->x_ops == &xdrrdmablk_ops) {
+					if (xdrrdma_getrdmablk(xdrs,
+					    &wa->wa_rlist,
+					    &wa->wa_count,
+					    &wa->wa_conn,
+					    NFS_MAXDATA) == TRUE)
+					return (xdrrdma_read_from_client(
+					    &wa->wa_rlist,
+					    &wa->wa_conn,
+					    wa->wa_count));
+
+					wa->wa_rlist = NULL;
+					wa->wa_conn = NULL;
+				}
+			}
+			return (xdr_bytes(xdrs, &wa->wa_data,
+			    &wa->wa_count, NFS_MAXDATA));
+		}
+		return (FALSE);
+	case XDR_ENCODE:
 		ptr = XDR_INLINE(xdrs, RNDUP(sizeof (fhandle_t)) +
 		    3 * BYTES_PER_XDR_UNIT);
 		if (ptr != NULL) {
@@ -180,44 +212,21 @@
 			    xdr_u_int(xdrs, &wa->wa_totcount)))
 				return (FALSE);
 		}
-#if 0 /* notdef */
-		if (wa->wa_mblk != NULL && xdrs->x_ops == &xdrmblk_ops) {
-			mblk_t *mp;
 
-			mp = dupb(wa->wa_mblk);
-			if (mp != NULL) {
-				mp->b_wptr += wa->wa_count;
-				if (xdrmblk_putmblk(xdrs, mp,
-				    wa->wa_count) == TRUE) {
-					return (TRUE);
-				} else
-					freeb(mp);
-			}
-			/* else Fall thru for the xdr_bytes() */
-		}
-		/* wa_mblk == NULL || xdrs->x_ops != &xdrmblk_ops Fall thru */
-#endif /* notdef */
 		return (xdr_bytes(xdrs, &wa->wa_data, &wa->wa_count,
 		    NFS_MAXDATA));
-	}
+	case XDR_FREE:
+		if (wa->wa_rlist) {
+			(void) xdrrdma_free_clist(wa->wa_conn, wa->wa_rlist);
+			wa->wa_rlist = NULL;
+		}
 
-	if (xdrs->x_op == XDR_FREE) {
 		if (wa->wa_data != NULL) {
 			kmem_free(wa->wa_data, wa->wa_count);
 			wa->wa_data = NULL;
 		}
 		return (TRUE);
 	}
-
-	if (xdr_fhandle(xdrs, &wa->wa_fhandle) &&
-	    xdr_u_int(xdrs, &wa->wa_begoff) &&
-	    xdr_u_int(xdrs, &wa->wa_offset) &&
-	    xdr_u_int(xdrs, &wa->wa_totcount) &&
-	    (xdrs->x_op == XDR_DECODE && xdrs->x_ops == &xdrmblk_ops) ?
-	    xdrmblk_getmblk(xdrs, &wa->wa_mblk, &wa->wa_count) :
-	    xdr_bytes(xdrs, &wa->wa_data, &wa->wa_count, NFS_MAXDATA)) {
-		return (TRUE);
-	}
 	return (FALSE);
 }
 
@@ -324,6 +333,25 @@
 }
 #endif
 
+bool_t
+xdr_readlink(XDR *xdrs, fhandle_t *fh)
+{
+	rdma_chunkinfo_t rci;
+	struct xdr_ops *xops = xdrrdma_xops();
+
+	if (xdr_fhandle(xdrs, fh)) {
+		if ((xdrs->x_ops == &xdrrdma_ops || xdrs->x_ops == xops) &&
+		    xdrs->x_op == XDR_ENCODE) {
+			rci.rci_type = RCI_REPLY_CHUNK;
+			rci.rci_len = MAXPATHLEN;
+			XDR_CONTROL(xdrs, XDR_RDMA_ADD_CHUNK, &rci);
+		}
+
+		return (TRUE);
+	}
+	return (FALSE);
+}
+
 /*
  * Arguments to remote read
  */
@@ -332,12 +360,15 @@
 {
 	int32_t *ptr;
 	int32_t *fhp;
+	rdma_chunkinfo_t rci;
+	rdma_wlist_conn_info_t rwci;
+	struct xdr_ops *xops = xdrrdma_xops();
 
 	if (xdrs->x_op == XDR_FREE)
 		return (TRUE);
 
 	ptr = XDR_INLINE(xdrs,
-			RNDUP(sizeof (fhandle_t)) + 3 * BYTES_PER_XDR_UNIT);
+	    RNDUP(sizeof (fhandle_t)) + 3 * BYTES_PER_XDR_UNIT);
 	if (ptr != NULL) {
 		if (xdrs->x_op == XDR_DECODE) {
 			fhp = (int32_t *)&ra->ra_fhandle;
@@ -366,20 +397,47 @@
 			IXDR_PUT_INT32(ptr, ra->ra_count);
 			IXDR_PUT_INT32(ptr, ra->ra_totcount);
 		}
-		if (ra->ra_count > NFS_MAXDATA)
+	} else {
+		if (!xdr_fhandle(xdrs, &ra->ra_fhandle) ||
+		    !xdr_u_int(xdrs, &ra->ra_offset) ||
+		    !xdr_u_int(xdrs, &ra->ra_count) ||
+		    !xdr_u_int(xdrs, &ra->ra_totcount)) {
 			return (FALSE);
-		return (TRUE);
+		}
 	}
 
-	if (xdr_fhandle(xdrs, &ra->ra_fhandle) &&
-	    xdr_u_int(xdrs, &ra->ra_offset) &&
-	    xdr_u_int(xdrs, &ra->ra_count) &&
-	    xdr_u_int(xdrs, &ra->ra_totcount)) {
-		if (ra->ra_count > NFS_MAXDATA)
-			return (FALSE);
+	if (ra->ra_count > NFS_MAXDATA)
+		return (FALSE);
+
+	ra->ra_wlist = NULL;
+	ra->ra_conn = NULL;
+
+	/* If this is xdrrdma_sizeof, record the expected response size */
+	if (xdrs->x_ops == xops && xdrs->x_op == XDR_ENCODE) {
+		rci.rci_type = RCI_WRITE_ADDR_CHUNK;
+		rci.rci_len = ra->ra_count;
+		(void) XDR_CONTROL(xdrs, XDR_RDMA_ADD_CHUNK, &rci);
+	}
+	/* Nothing special to do, return */
+	if (xdrs->x_ops != &xdrrdma_ops || xdrs->x_op == XDR_FREE)
 		return (TRUE);
+
+	if (xdrs->x_op == XDR_ENCODE) {
+		/* Place the target data location into the RDMA header */
+		rci.rci_type = RCI_WRITE_ADDR_CHUNK;
+		rci.rci_a.rci_addr = ra->ra_data;
+		rci.rci_len = ra->ra_count;
+		rci.rci_clpp = &ra->ra_wlist;
+
+		return (XDR_CONTROL(xdrs, XDR_RDMA_ADD_CHUNK, &rci));
 	}
-	return (FALSE);
+
+	/* XDR_DECODE case */
+	(void) XDR_CONTROL(xdrs, XDR_RDMA_GET_WCINFO, &rwci);
+	ra->ra_wlist = rwci.rwci_wlist;
+	ra->ra_conn = rwci.rwci_conn;
+
+	return (TRUE);
 }
 
 
@@ -391,10 +449,69 @@
 {
 	bool_t ret;
 	mblk_t *mp;
+	struct xdr_ops *xops = xdrrdma_xops();
 
 	if (xdr_fattr(xdrs, &rrok->rrok_attr) == FALSE)
 		return (FALSE);
 
+	/* deal with RDMA separately */
+	if (xdrs->x_ops == &xdrrdma_ops || xdrs->x_ops == xops) {
+		if (xdrs->x_op == XDR_ENCODE &&
+		    rrok->rrok_mp != NULL) {
+			ret = xdr_bytes(xdrs, (char **)&rrok->rrok_data,
+			    &rrok->rrok_count, NFS_MAXDATA);
+			return (ret);
+		}
+
+		if (xdrs->x_op == XDR_ENCODE) {
+			if (xdr_u_int(xdrs, &rrok->rrok_count) == FALSE) {
+				return (FALSE);
+			}
+			/*
+			 * If read data sent by wlist (RDMA_WRITE), don't do
+			 * xdr_bytes() below.  RDMA_WRITE transfers the data.
+			 */
+			if (rrok->rrok_wlist) {
+				/* adjust length to match in the rdma header */
+				if (rrok->rrok_wlist->c_len !=
+				    rrok->rrok_count) {
+					rrok->rrok_wlist->c_len =
+					    rrok->rrok_count;
+				}
+				if (rrok->rrok_count != 0) {
+					return (xdrrdma_send_read_data(
+					    xdrs, rrok->rrok_wlist));
+				}
+				return (TRUE);
+			}
+			if (rrok->rrok_count == 0) {
+				return (TRUE);
+			}
+		} else {
+			struct clist *cl;
+			uint32_t count;
+
+			XDR_CONTROL(xdrs, XDR_RDMA_GET_WLIST, &cl);
+
+			if (cl) {
+				if (!xdr_u_int(xdrs, &count))
+					return (FALSE);
+				if (count == 0) {
+					rrok->rrok_wlist_len = 0;
+					rrok->rrok_count = 0;
+				} else {
+					rrok->rrok_wlist_len = cl->c_len;
+					rrok->rrok_count = cl->c_len;
+				}
+				return (TRUE);
+			}
+		}
+		ret = xdr_bytes(xdrs, (char **)&rrok->rrok_data,
+		    &rrok->rrok_count, NFS_MAXDATA);
+
+		return (ret);
+	}
+
 	if (xdrs->x_op == XDR_ENCODE) {
 		int i, rndup;
 
@@ -402,16 +519,17 @@
 		if (mp != NULL && xdrs->x_ops == &xdrmblk_ops) {
 			mp->b_wptr += rrok->rrok_count;
 			rndup = BYTES_PER_XDR_UNIT -
-				(rrok->rrok_count % BYTES_PER_XDR_UNIT);
+			    (rrok->rrok_count % BYTES_PER_XDR_UNIT);
 			if (rndup != BYTES_PER_XDR_UNIT)
 				for (i = 0; i < rndup; i++)
 					*mp->b_wptr++ = '\0';
 			if (xdrmblk_putmblk(xdrs, mp,
-					    rrok->rrok_count) == TRUE) {
+			    rrok->rrok_count) == TRUE) {
 				rrok->rrok_mp = NULL;
 				return (TRUE);
 			}
 		}
+
 		/*
 		 * Fall thru for the xdr_bytes()
 		 *
@@ -531,12 +649,22 @@
 {
 	int32_t *ptr;
 	int32_t *fhp;
+	rdma_chunkinfo_t rci;
+	struct xdr_ops *xops = xdrrdma_xops();
 
 	if (xdrs->x_op == XDR_FREE)
 		return (TRUE);
 
 	ptr = XDR_INLINE(xdrs,
 	    RNDUP(sizeof (fhandle_t)) + 2 * BYTES_PER_XDR_UNIT);
+
+	if ((xdrs->x_ops == &xdrrdma_ops || xdrs->x_ops == xops) &&
+	    xdrs->x_op == XDR_ENCODE) {
+		rci.rci_type = RCI_REPLY_CHUNK;
+		rci.rci_len = rda->rda_count;
+		XDR_CONTROL(xdrs, XDR_RDMA_ADD_CHUNK, &rci);
+	}
+
 	if (ptr != NULL) {
 		if (xdrs->x_op == XDR_DECODE) {
 			fhp = (int32_t *)&rda->rda_fh;
@@ -640,8 +768,8 @@
 
 	bufsize = 1 * BYTES_PER_XDR_UNIT;
 	for (size = rd->rd_size, dp = rd->rd_entries;
-		size > 0;
-		size -= dp->d_reclen, dp = nextdp(dp)) {
+	    size > 0;
+	    size -= dp->d_reclen, dp = nextdp(dp)) {
 		if (dp->d_reclen == 0 /* || DIRSIZ(dp) > dp->d_reclen */)
 			return (FALSE);
 		if (dp->d_ino == 0)
@@ -826,7 +954,7 @@
 			IXDR_PUT_U_INT32(ptr, (uint32_t)size);
 			bcopy(da->da_name, ptr, size);
 			rndup = BYTES_PER_XDR_UNIT -
-				(size % BYTES_PER_XDR_UNIT);
+			    (size % BYTES_PER_XDR_UNIT);
 			if (rndup != BYTES_PER_XDR_UNIT) {
 				cptr = (char *)ptr + size;
 				for (i = 0; i < rndup; i++)
--- a/usr/src/uts/common/io/lvm/md/md_med.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/io/lvm/md/md_med.c	Thu Aug 21 18:01:07 2008 -0500
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
@@ -225,7 +223,7 @@
 /*
  * unrecoverable RPC status codes; cf. rfscall()
  */
-#define	IS_UNRECOVERABLE_RPC(s)	(((s) == RPC_AUTHERROR) || \
+#define	MED_IS_UNRECOVERABLE_RPC(s)	(((s) == RPC_AUTHERROR) || \
 	((s) == RPC_CANTENCODEARGS) || \
 	((s) == RPC_CANTDECODERES) || \
 	((s) == RPC_VERSMISMATCH) || \
@@ -385,7 +383,7 @@
 	 */
 	if ((dot = strchr(addr->buf, '.')) == (char *)NULL) {
 		TRIVIA(("put_loopb_port - malformed loopback addr %s\n",
-			addr->buf));
+		    addr->buf));
 		return;
 	}
 
@@ -448,8 +446,8 @@
 		} else if (*univp == '\\') {
 			/* octal character */
 			*transp = (((*(univp+1) - '0') & 3) << 6) +
-				(((*(univp+2) - '0') & 7) << 3) +
-				((*(univp+3) - '0') & 7);
+			    (((*(univp+2) - '0') & 7) << 3) +
+			    ((*(univp+3) - '0') & 7);
 			univp += 4;
 		} else {
 			*transp = *univp;
@@ -509,8 +507,8 @@
 	/*LINTED*/
 	if (1 || error && error != EINTR) {
 		TRIVIA(("rel_client - destroying addr = (%p, %u %u)\n",
-			(void *) medc->addr.buf, medc->addr.len,
-			medc->addr.maxlen));
+		    (void *) medc->addr.buf, medc->addr.len,
+		    medc->addr.maxlen));
 		med_clnt_destroy(&medc->client);
 		if (medc->addr.buf) {
 			kmem_free(medc->addr.buf, medc->addr.maxlen);
@@ -553,13 +551,13 @@
 		med_put_inet_port(addr, htons(PMAPPORT));
 	} else {
 		TRIVIA(("get_pmap_addr - unsupported protofmly %s\n",
-			kncfp->knc_protofmly));
+		    kncfp->knc_protofmly));
 		status = RPC_UNKNOWNPROTO;
 		goto out;
 	}
 
 	TRIVIA(("get_pmap_addr - semantics=%u, protofmly=%s, proto=%s\n",
-		kncfp->knc_semantics, kncfp->knc_protofmly, kncfp->knc_proto));
+	    kncfp->knc_semantics, kncfp->knc_protofmly, kncfp->knc_proto));
 
 	/*
 	 * Mask signals for the duration of the handle creation and
@@ -668,11 +666,11 @@
 		TRIVIA((
 		    "get_rpcb_addr - semantics=%s, protofmly=%s, proto=%s\n",
 		    (kncfp->knc_semantics == NC_TPI_CLTS ?
-			"NC_TPI_CLTS" : "?"),
+		    "NC_TPI_CLTS" : "?"),
 		    kncfp->knc_protofmly, kncfp->knc_proto));
 	} else {
 		TRIVIA(("get_rpcb_addr - unsupported protofmly %s\n",
-			kncfp->knc_protofmly));
+		    kncfp->knc_protofmly));
 		status = RPC_UNKNOWNPROTO;
 		goto out;
 	}
@@ -703,12 +701,11 @@
 	client->cl_auth = authkern_create();
 
 	if ((status = CLNT_CALL(client, RPCBPROC_GETADDR,
-				xdr_rpcb, (char *)&parms,
-				xdr_wrapstring, (char *)&ua,
-				tmo)) != RPC_SUCCESS) {
+	    xdr_rpcb, (char *)&parms, xdr_wrapstring, (char *)&ua,
+	    tmo)) != RPC_SUCCESS) {
 		sigreplace(&oldmask, (k_sigset_t *)NULL);
 		MINUTE(("get_rpcb_addr - CLNT_CALL(GETADDR) returned %d\n",
-			status));
+		    status));
 		goto out;
 	}
 
@@ -780,7 +777,7 @@
 	 */
 	BSTAMP
 	status = med_get_pmap_addr(kncfp, prog, vers, addrp);
-	if (IS_UNRECOVERABLE_RPC(status) && status != RPC_UNKNOWNPROTO &&
+	if (MED_IS_UNRECOVERABLE_RPC(status) && status != RPC_UNKNOWNPROTO &&
 	    ! PMAP_WRONG_VERSION(status)) {
 		status = RPC_RPCBFAILURE;
 		goto bailout;
@@ -942,7 +939,7 @@
 	while (tries--) {
 		error = 0;
 		cl_stat = med_get_client(kncfp, addrp, prog, vers, &med_clnt);
-		if (IS_UNRECOVERABLE_RPC(cl_stat)) {
+		if (MED_IS_UNRECOVERABLE_RPC(cl_stat)) {
 			error = EINVAL;
 			goto rel_client;
 		} else if (cl_stat != RPC_SUCCESS) {
@@ -955,7 +952,7 @@
 
 		sigreplace(&newmask, &oldmask);
 		cl_stat = CLNT_CALL(med_clnt->client, proc, inproc, in,
-				outproc, out, *timout);
+		    outproc, out, *timout);
 		sigreplace(&oldmask, (k_sigset_t *)NULL);
 
 		switch (cl_stat) {
@@ -979,7 +976,7 @@
 		case RPC_CANTSEND:
 		case RPC_XPRTFAILED:
 		default:
-			if (IS_UNRECOVERABLE_RPC(cl_stat)) {
+			if (MED_IS_UNRECOVERABLE_RPC(cl_stat)) {
 				error = EINVAL;
 			} else {
 				error = EIO;
@@ -1403,7 +1400,7 @@
 	 * Register cpr callback
 	 */
 	CALLB_CPR_INIT(&cprinfo, &mtaap->mtaa_mthap->mtha_a_mx,
-		callb_generic_cpr, "med_a_thr");
+	    callb_generic_cpr, "med_a_thr");
 
 	mutex_enter(&mtaap->mtaa_mthap->mtha_a_mx);
 	if (mtaap->mtaa_mthap->mtha_flags & MDT_H_OK)
@@ -1457,7 +1454,7 @@
 	 * Register cpr callback
 	 */
 	CALLB_CPR_INIT(&cprinfo, &mthap->mtha_mtp->mt_mx, callb_generic_cpr,
-		"med_a_thr");
+	    "med_a_thr");
 	/*
 	 * Lock mthap->mtha_mtp->mt_mx is held early to avoid releasing the
 	 * locks out of order.
@@ -1756,7 +1753,7 @@
 		(void) strncpy(tpp->med_tp_ents[uapi].med_te_nm,
 		    uap->ua_devname, MED_TE_NM_LEN);
 		tpp->med_tp_ents[uapi].med_te_dev =
-			(md_dev64_t)uap->ua_kn.knc_rdev;
+		    (md_dev64_t)uap->ua_kn.knc_rdev;
 	}
 
 	tpp->med_tp_nents = med_addr_tab_nents;
@@ -1780,7 +1777,7 @@
 
 		mutex_enter(&uap->ua_mutex);
 		uap->ua_kn.knc_rdev = md_dev64_to_dev(
-			tpp->med_tp_ents[uapi].med_te_dev);
+		    tpp->med_tp_ents[uapi].med_te_dev);
 		mutex_exit(&uap->ua_mutex);
 	}
 
--- a/usr/src/uts/common/nfs/nfs.h	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/nfs/nfs.h	Thu Aug 21 18:01:07 2008 -0500
@@ -29,7 +29,6 @@
 #ifndef	_NFS_NFS_H
 #define	_NFS_NFS_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
 /*	nfs.h 2.38 88/08/19 SMI 	*/
 
 #include <sys/isa_defs.h>
@@ -38,6 +37,7 @@
 #include <rpc/types.h>
 #include <sys/types32.h>
 #ifdef _KERNEL
+#include <rpc/rpc_rdma.h>
 #include <rpc/rpc.h>
 #include <sys/fcntl.h>
 #include <sys/kstat.h>
@@ -470,6 +470,11 @@
 	uint32_t	wa_count;
 	char		*wa_data;	/* data to write (up to NFS_MAXDATA) */
 	mblk_t		*wa_mblk;	/* pointer to mblks containing data */
+#ifdef _KERNEL
+	/* rdma related info */
+	struct clist	*wa_rlist;
+	CONN		*wa_conn;
+#endif /* _KERNEL */
 };
 #define	wa_fhandle	wa_args->otw_wa_fhandle
 #define	wa_begoff	wa_args->otw_wa_begoff
@@ -517,6 +522,12 @@
 	uint32_t	ra_offset;	/* byte offset in file */
 	uint32_t	ra_count;	/* immediate read count */
 	uint32_t	ra_totcount;	/* total read cnt (from this offset) */
+#ifdef _KERNEL
+	/* used in rdma transports */
+	caddr_t		ra_data;	/* destination for read data */
+	struct clist	*ra_wlist;
+	CONN		*ra_conn;
+#endif
 };
 
 /*
@@ -528,6 +539,10 @@
 	char		*rrok_data;	/* data (up to NFS_MAXDATA bytes) */
 	uint_t		rrok_bufsize;	/* size of kmem_alloc'd buffer */
 	mblk_t		*rrok_mp;	/* mblk_t contains data for reply */
+#ifdef _KERNEL
+	uint_t		rrok_wlist_len;
+	struct clist    *rrok_wlist;
+#endif
 };
 
 /*
@@ -759,6 +774,7 @@
 extern bool_t	xdr_rdlnres(XDR *, struct nfsrdlnres *);
 extern bool_t	xdr_rdresult(XDR *, struct nfsrdresult *);
 extern bool_t	xdr_readargs(XDR *, struct nfsreadargs *);
+extern bool_t	xdr_readlink(XDR *, fhandle_t *);
 extern bool_t	xdr_rnmargs(XDR *, struct nfsrnmargs *);
 extern bool_t	xdr_rrok(XDR *, struct nfsrrok *);
 extern bool_t	xdr_saargs(XDR *, struct nfssaargs *);
@@ -1408,6 +1424,13 @@
 	nfs_fh3 file;
 	offset3 offset;
 	count3 count;
+#ifdef _KERNEL
+	/* for read using rdma */
+	char *res_data_val_alt;
+	struct uio *res_uiop;
+	struct clist *wlist;
+	CONN *conn;
+#endif
 };
 typedef struct READ3args READ3args;
 
@@ -1421,6 +1444,10 @@
 		mblk_t *mp;
 	} data;
 	uint_t size;
+#ifdef _KERNEL
+	uint_t wlist_len;
+	struct clist *wlist;
+#endif
 };
 typedef struct READ3resok READ3resok;
 
@@ -1452,6 +1479,9 @@
 		char *data_val;
 	} data;
 	uint_t size;
+
+	uint_t wlist_len;
+	struct clist *wlist;
 };
 typedef struct READ3vres READ3vres;
 #endif /* _KERNEL */
@@ -1466,6 +1496,10 @@
 	bool_t eof;
 	struct uio *uiop;
 	uint_t size;		/* maximum reply size */
+#ifdef _KERNEL
+	uint_t wlist_len;
+	struct clist *wlist;
+#endif
 };
 typedef struct READ3uiores READ3uiores;
 
@@ -1486,6 +1520,10 @@
 		char *data_val;
 	} data;
 	mblk_t *mblk;
+#ifdef _KERNEL
+	struct clist *rlist;
+	CONN *conn;
+#endif
 };
 typedef struct WRITE3args WRITE3args;
 
--- a/usr/src/uts/common/nfs/nfs4_kprot.h	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/nfs/nfs4_kprot.h	Thu Aug 21 18:01:07 2008 -0500
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _NFS4_KPROT_H
 #define	_NFS4_KPROT_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Kernel specific version.
  * NFS Version 4 protocol definitions.  From nfs4_prot.x rev 1.119.
@@ -39,6 +36,9 @@
 #endif
 
 #include <rpc/rpc.h>
+#ifdef _KERNEL
+#include <rpc/rpc_rdma.h>
+#endif
 #include <sys/stream.h>
 
 #define	NFS4_FHSIZE 128
@@ -1091,7 +1091,10 @@
 	mblk_t *res_mblk;
 	struct uio *res_uiop;
 	uint_t res_maxsize;
-
+#ifdef _KERNEL
+	struct clist *wlist;
+	CONN *conn;
+#endif
 };
 typedef struct READ4args READ4args;
 
@@ -1101,6 +1104,10 @@
 	uint_t data_len;
 	char *data_val;
 	mblk_t *mblk;
+#ifdef _KERNEL
+	struct clist *wlist;
+	uint_t wlist_len;
+#endif
 };
 typedef struct READ4res READ4res;
 
@@ -1309,6 +1316,10 @@
 	uint_t data_len;
 	char *data_val;
 	mblk_t *mblk;
+#ifdef _KERNEL
+	struct clist *rlist;
+	CONN *conn;
+#endif
 };
 typedef struct WRITE4args WRITE4args;
 
--- a/usr/src/uts/common/rpc/clnt.h	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/rpc/clnt.h	Thu Aug 21 18:01:07 2008 -0500
@@ -37,8 +37,6 @@
 #ifndef	_RPC_CLNT_H
 #define	_RPC_CLNT_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <rpc/rpc_com.h>
 #include <rpc/clnt_stat.h>
@@ -78,6 +76,7 @@
 	((s) == RPC_PROCUNAVAIL) || \
 	((s) == RPC_PROGUNAVAIL) || \
 	((s) == RPC_PROGVERSMISMATCH) || \
+	((s) == RPC_SYSTEMERROR) || \
 	((s) == RPC_CANTDECODEARGS))
 
 /* Maximum rpc backoff time */
--- a/usr/src/uts/common/rpc/clnt_rdma.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/rpc/clnt_rdma.c	Thu Aug 21 18:01:07 2008 -0500
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -31,8 +30,6 @@
  * California.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/user.h>
@@ -49,6 +46,7 @@
 #include <sys/time.h>
 #include <sys/isa_defs.h>
 #include <sys/zone.h>
+#include <sys/sdt.h>
 
 #include <rpc/types.h>
 #include <rpc/xdr.h>
@@ -56,7 +54,26 @@
 #include <rpc/clnt.h>
 #include <rpc/rpc_msg.h>
 #include <rpc/rpc_rdma.h>
+#include <nfs/nfs.h>
+#include <nfs/nfs4_kprot.h>
 
+static uint32_t rdma_bufs_rqst = RDMA_BUFS_RQST;
+
+static int clnt_compose_rpcmsg(CLIENT *, rpcproc_t, rdma_buf_t *,
+			    XDR *, xdrproc_t, caddr_t);
+static int  clnt_compose_rdma_header(CONN *, CLIENT *, rdma_buf_t *,
+		    XDR **, uint_t *);
+static int clnt_setup_rlist(CONN *, XDR *, XDR *);
+static int clnt_setup_wlist(CONN *, XDR *, XDR *);
+static int clnt_setup_long_reply(CONN *, struct clist **, uint_t);
+static void clnt_check_credit(CONN *);
+static void clnt_return_credit(CONN *);
+static void clnt_decode_long_reply(CONN *, struct clist *,
+		struct clist *, XDR *, XDR **, struct clist *,
+		struct clist *, uint_t, uint_t);
+
+static void clnt_update_credit(CONN *, uint32_t);
+static void check_dereg_wlist(CONN *, struct clist *);
 
 static enum clnt_stat clnt_rdma_kcallit(CLIENT *, rpcproc_t, xdrproc_t,
     caddr_t, xdrproc_t, caddr_t, struct timeval);
@@ -85,6 +102,13 @@
  * The size of the preserialized RPC header information.
  */
 #define	CKU_HDRSIZE	20
+#define	CLNT_RDMA_SUCCESS 0
+#define	CLNT_RDMA_FAIL (-1)
+
+#define	AUTH_REFRESH_COUNT 2
+
+#define	IS_RPCSEC_GSS(authh)			\
+	(authh->cl_auth->ah_cred.oa_flavor == RPCSEC_GSS)
 
 /*
  * Per RPC RDMA endpoint details
@@ -155,6 +179,28 @@
 #define	ptoh(p)		(&((p)->cku_client))
 #define	htop(h)		((cku_private_t *)((h)->cl_private))
 
+uint_t
+calc_length(uint_t len)
+{
+	len = RNDUP(len);
+
+	if (len <= 64 * 1024) {
+		if (len > 32 * 1024) {
+			len = 64 * 1024;
+		} else {
+			if (len > 16 * 1024) {
+				len = 32 * 1024;
+			} else {
+				if (len > 8 * 1024) {
+					len = 16 * 1024;
+				} else {
+					len = 8 * 1024;
+				}
+			}
+		}
+	}
+	return (len);
+}
 int
 clnt_rdma_kcreate(char *proto, void *handle, struct netbuf *raddr, int family,
     rpcprog_t pgm, rpcvers_t vers, struct cred *cred, CLIENT **cl)
@@ -285,28 +331,253 @@
 	h->cl_ops = &rdma_clnt_ops;
 }
 
+static int
+clnt_compose_rpcmsg(CLIENT *h, rpcproc_t procnum,
+    rdma_buf_t *rpcmsg, XDR *xdrs,
+    xdrproc_t xdr_args, caddr_t argsp)
+{
+	cku_private_t *p = htop(h);
+
+	if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
+		/*
+		 * Copy in the preserialized RPC header
+		 * information.
+		 */
+		bcopy(p->cku_rpchdr, rpcmsg->addr, CKU_HDRSIZE);
+
+		/*
+		 * transaction id is the 1st thing in the output
+		 * buffer.
+		 */
+		/* LINTED pointer alignment */
+		(*(uint32_t *)(rpcmsg->addr)) = p->cku_xid;
+
+		/* Skip the preserialized stuff. */
+		XDR_SETPOS(xdrs, CKU_HDRSIZE);
+
+		/* Serialize dynamic stuff into the output buffer. */
+		if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) ||
+		    (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) ||
+		    (!(*xdr_args)(xdrs, argsp))) {
+			DTRACE_PROBE(krpc__e__clntrdma__rpcmsg__dynargs);
+			return (CLNT_RDMA_FAIL);
+		}
+		p->cku_outsz = XDR_GETPOS(xdrs);
+	} else {
+		uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[CKU_HDRSIZE];
+		IXDR_PUT_U_INT32(uproc, procnum);
+		(*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid;
+		XDR_SETPOS(xdrs, 0);
+
+		/* Serialize the procedure number and the arguments. */
+		if (!AUTH_WRAP(h->cl_auth, (caddr_t)p->cku_rpchdr,
+		    CKU_HDRSIZE+4, xdrs, xdr_args, argsp)) {
+			if (rpcmsg->addr != xdrs->x_base) {
+				rpcmsg->addr = xdrs->x_base;
+				rpcmsg->len = xdr_getbufsize(xdrs);
+			}
+			DTRACE_PROBE(krpc__e__clntrdma__rpcmsg__procnum);
+			return (CLNT_RDMA_FAIL);
+		}
+		/*
+		 * If we had to allocate a new buffer while encoding
+		 * then update the addr and len.
+		 */
+		if (rpcmsg->addr != xdrs->x_base) {
+			rpcmsg->addr = xdrs->x_base;
+			rpcmsg->len = xdr_getbufsize(xdrs);
+		}
+
+		p->cku_outsz = XDR_GETPOS(xdrs);
+		DTRACE_PROBE1(krpc__i__compose__size__sec, int, p->cku_outsz)
+	}
+
+	return (CLNT_RDMA_SUCCESS);
+}
+
+static int
+clnt_compose_rdma_header(CONN *conn, CLIENT *h, rdma_buf_t *clmsg,
+    XDR **xdrs, uint_t *op)
+{
+	cku_private_t *p = htop(h);
+	uint_t vers;
+	uint32_t rdma_credit = rdma_bufs_rqst;
+
+	vers = RPCRDMA_VERS;
+	clmsg->type = SEND_BUFFER;
+
+	if (rdma_buf_alloc(conn, clmsg)) {
+		return (CLNT_RDMA_FAIL);
+	}
+
+	*xdrs = &p->cku_outxdr;
+	xdrmem_create(*xdrs, clmsg->addr, clmsg->len, XDR_ENCODE);
+
+	(*(uint32_t *)clmsg->addr) = p->cku_xid;
+	XDR_SETPOS(*xdrs, sizeof (uint32_t));
+	(void) xdr_u_int(*xdrs, &vers);
+	(void) xdr_u_int(*xdrs, &rdma_credit);
+	(void) xdr_u_int(*xdrs, op);
+
+	return (CLNT_RDMA_SUCCESS);
+}
+
+/*
+ * If xp_cl is NULL value, then the RPC payload will NOT carry
+ * an RDMA READ chunk list, in this case we insert FALSE into
+ * the XDR stream. Otherwise we use the clist and RDMA register
+ * the memory and encode the clist into the outbound XDR stream.
+ */
+static int
+clnt_setup_rlist(CONN *conn, XDR *xdrs, XDR *call_xdrp)
+{
+	int status;
+	struct clist *rclp;
+	int32_t xdr_flag = XDR_RDMA_RLIST_REG;
+
+	XDR_CONTROL(call_xdrp, XDR_RDMA_GET_RLIST, &rclp);
+
+	if (rclp != NULL) {
+		status = clist_register(conn, rclp, CLIST_REG_SOURCE);
+		if (status != RDMA_SUCCESS) {
+			return (CLNT_RDMA_FAIL);
+		}
+		XDR_CONTROL(call_xdrp, XDR_RDMA_SET_FLAGS, &xdr_flag);
+	}
+	(void) xdr_do_clist(xdrs, &rclp);
+
+	return (CLNT_RDMA_SUCCESS);
+}
+
+/*
+ * If xp_wcl is NULL value, then the RPC payload will NOT carry
+ * an RDMA WRITE chunk list; in this case we insert FALSE into
+ * the XDR stream. Otherwise we use the clist and RDMA register
+ * the memory and encode the clist into the outbound XDR stream.
+ */
+static int
+clnt_setup_wlist(CONN *conn, XDR *xdrs, XDR *call_xdrp)
+{
+	int status;
+	struct clist *wlist;
+	int32_t xdr_flag = XDR_RDMA_WLIST_REG;
+
+	XDR_CONTROL(call_xdrp, XDR_RDMA_GET_WLIST, &wlist);
+
+	if (wlist != NULL) {
+		status = clist_register(conn, wlist, CLIST_REG_DST);
+		if (status != RDMA_SUCCESS) {
+			return (CLNT_RDMA_FAIL);
+		}
+		XDR_CONTROL(call_xdrp, XDR_RDMA_SET_FLAGS, &xdr_flag);
+	}
+
+	if (!xdr_encode_wlist(xdrs, wlist))
+		return (CLNT_RDMA_FAIL);
+
+	return (CLNT_RDMA_SUCCESS);
+}
+
+static int
+clnt_setup_long_reply(CONN *conn, struct clist **clpp, uint_t length)
+{
+	if (length == 0) {
+		*clpp = NULL;
+		return (CLNT_RDMA_SUCCESS);
+	}
+
+	*clpp = clist_alloc();
+
+	(*clpp)->rb_longbuf.len = calc_length(length);
+	(*clpp)->rb_longbuf.type = RDMA_LONG_BUFFER;
+
+	if (rdma_buf_alloc(conn, &((*clpp)->rb_longbuf))) {
+		clist_free(*clpp);
+		*clpp = NULL;
+		return (CLNT_RDMA_FAIL);
+	}
+
+	(*clpp)->u.c_daddr3 = (*clpp)->rb_longbuf.addr;
+	(*clpp)->c_len = (*clpp)->rb_longbuf.len;
+	(*clpp)->c_next = NULL;
+	(*clpp)->c_dmemhandle = (*clpp)->rb_longbuf.handle;
+
+	if (clist_register(conn, *clpp, CLIST_REG_DST)) {
+		DTRACE_PROBE(krpc__e__clntrdma__longrep_regbuf);
+		rdma_buf_free(conn, &((*clpp)->rb_longbuf));
+		clist_free(*clpp);
+		return (CLNT_RDMA_FAIL);
+	}
+
+	return (CLNT_RDMA_SUCCESS);
+}
+
 /* ARGSUSED */
 static enum clnt_stat
 clnt_rdma_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args,
-    caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, struct timeval wait)
+    caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp,
+    struct timeval wait)
 {
 	cku_private_t *p = htop(h);
+
+	int 	try_call_again;
+	int	refresh_attempt = AUTH_REFRESH_COUNT;
 	int 	status;
-	XDR 	*xdrs;
-	XDR	*cxdrp = NULL, callxdr;	/* for xdrrdma encoding the RPC call */
-	XDR	*rxdrp = NULL, replxdr;	/* for xdrrdma decoding the RPC reply */
+	int 	msglen;
+
+	XDR	*call_xdrp, callxdr; /* for xdrrdma encoding the RPC call */
+	XDR	*reply_xdrp, replyxdr; /* for xdrrdma decoding the RPC reply */
+	XDR 	*rdmahdr_o_xdrs, *rdmahdr_i_xdrs;
+
 	struct rpc_msg 	reply_msg;
-	struct clist *sendlist, *recvlist = NULL;
-	struct clist *cl = NULL, *cle = NULL;
-	uint_t vers, op;
+
+	struct clist *cl_sendlist;
+	struct clist *cl_recvlist;
+	struct clist *cl;
+	struct clist *cl_rpcmsg;
+	struct clist *cl_rdma_reply;
+	struct clist *cl_rpcreply_wlist;
+	struct clist *cl_long_reply;
+
+	uint_t vers;
+	uint_t op;
 	uint_t off;
-	uint32_t xid;
+	uint32_t seg_array_len;
+	uint_t long_reply_len;
+	uint_t rpcsec_gss;
+	uint_t gss_i_or_p;
+
 	CONN *conn = NULL;
-	rdma_buf_t clmsg, rpcmsg, longmsg, rpcreply;
-	int msglen;
+	rdma_buf_t clmsg;
+	rdma_buf_t rpcmsg;
+	rdma_chunkinfo_lengths_t rcil;
+
 	clock_t	ticks;
+	bool_t wlist_exists_reply;
+
+	uint32_t rdma_credit = rdma_bufs_rqst;
 
 	RCSTAT_INCR(rccalls);
+
+call_again:
+
+	bzero(&clmsg, sizeof (clmsg));
+	bzero(&rpcmsg, sizeof (rpcmsg));
+	try_call_again = 0;
+	cl_sendlist = NULL;
+	cl_recvlist = NULL;
+	cl = NULL;
+	cl_rpcmsg = NULL;
+	cl_rdma_reply = NULL;
+	call_xdrp = NULL;
+	reply_xdrp = NULL;
+	wlist_exists_reply  = FALSE;
+	cl_rpcreply_wlist = NULL;
+	cl_long_reply = NULL;
+	rcil.rcil_len = 0;
+	rcil.rcil_len_alt = 0;
+	long_reply_len = 0;
+
 	/*
 	 * Get unique xid
 	 */
@@ -316,6 +587,11 @@
 	status = RDMA_GET_CONN(p->cku_rd_mod->rdma_ops, &p->cku_addr,
 	    p->cku_addrfmly, p->cku_rd_handle, &conn);
 
+	/*
+	 * If there is a problem with the connection, reflect the issue
+	 * back to the higher level to address; we MAY delay for a short
+	 * period so that we are kind to the transport.
+	 */
 	if (conn == NULL) {
 		/*
 		 * Connect failed to server. Could be because of one
@@ -363,182 +639,118 @@
 
 		return (p->cku_err.re_status);
 	}
-	/*
-	 * Get the size of the rpc call message. Need this
-	 * to determine if the rpc call message will fit in
-	 * the pre-allocated RDMA buffers. If the rpc call
-	 * message length is greater that the pre-allocated
-	 * buffers then, it is a Long RPC. A one time use
-	 * buffer is allocated and registered for the Long
-	 * RPC call.
-	 */
-	xdrs = &callxdr;
-	msglen = CKU_HDRSIZE + BYTES_PER_XDR_UNIT;
-	if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
-		msglen += xdrrdma_authsize(h->cl_auth, p->cku_cred,
-				rdma_minchunk);
-		msglen += xdrrdma_sizeof(xdr_args, argsp, rdma_minchunk);
+
+	clnt_check_credit(conn);
 
-		if (msglen > RPC_MSG_SZ) {
+	status = CLNT_RDMA_FAIL;
+
+	rpcsec_gss = gss_i_or_p = FALSE;
 
-			/*
-			 * Long RPC. Allocate one time use custom buffer.
-			 */
-			rpcmsg.type = CHUNK_BUFFER;
-			rpcmsg.addr = kmem_zalloc(msglen, KM_SLEEP);
-			cle = kmem_zalloc(sizeof (*cle), KM_SLEEP);
-			cle->c_xdroff = 0;
-			cle->c_len  = rpcmsg.len = msglen;
-			cle->c_saddr = (uint64)(uintptr_t)rpcmsg.addr;
-			cle->c_next = NULL;
-			xdrrdma_create(xdrs, rpcmsg.addr, msglen,
-			    rdma_minchunk, cle, XDR_ENCODE, NULL);
-			cxdrp = xdrs;
-			op = RDMA_NOMSG;
-		} else {
-			/*
-			 * Get a pre-allocated buffer for rpc call
-			 */
-			rpcmsg.type = SEND_BUFFER;
-			if (RDMA_BUF_ALLOC(conn, &rpcmsg)) {
-				p->cku_err.re_status = RPC_CANTSEND;
-				p->cku_err.re_errno = EIO;
-				RCSTAT_INCR(rcnomem);
-				cmn_err(CE_WARN,
-				    "clnt_rdma_kcallit: no buffers!");
-				goto done;
-			}
-			xdrrdma_create(xdrs, rpcmsg.addr, rpcmsg.len,
-			    rdma_minchunk, NULL, XDR_ENCODE, NULL);
-			cxdrp = xdrs;
-			op = RDMA_MSG;
-		}
-	} else {
-		/*
-		 * For RPCSEC_GSS since we cannot accurately presize the
-		 * buffer required for encoding, we assume that its going
-		 * to be a Long RPC to start with. We also create the
-		 * the XDR stream with min_chunk set to 0 which instructs
-		 * the XDR layer to not chunk the incoming byte stream.
-		 */
-
-		msglen += 2 * MAX_AUTH_BYTES + 2 * sizeof (struct opaque_auth);
-		msglen += xdr_sizeof(xdr_args, argsp);
-
-		/*
-		 * Long RPC. Allocate one time use custom buffer.
-		 */
-		longmsg.type = CHUNK_BUFFER;
-		longmsg.addr = kmem_zalloc(msglen, KM_SLEEP);
-		cle = kmem_zalloc(sizeof (*cle), KM_SLEEP);
-		cle->c_xdroff = 0;
-		cle->c_len  = longmsg.len = msglen;
-		cle->c_saddr = (uint64)(uintptr_t)longmsg.addr;
-		cle->c_next = NULL;
-		xdrrdma_create(xdrs, longmsg.addr, msglen, 0, cle,
-		    XDR_ENCODE, NULL);
-		cxdrp = xdrs;
-		op = RDMA_NOMSG;
+	if (IS_RPCSEC_GSS(h)) {
+		rpcsec_gss = TRUE;
+		if (rpc_gss_get_service_type(h->cl_auth) ==
+		    rpc_gss_svc_integrity ||
+		    rpc_gss_get_service_type(h->cl_auth) ==
+		    rpc_gss_svc_privacy)
+			gss_i_or_p = TRUE;
 	}
 
-	if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
+	/*
+	 * Try a regular RDMA message if RPCSEC_GSS is not being used
+	 * or if RPCSEC_GSS is being used for authentication only.
+	 */
+	if (rpcsec_gss == FALSE ||
+	    (rpcsec_gss == TRUE && gss_i_or_p == FALSE)) {
 		/*
-		 * Copy in the preserialized RPC header
-		 * information.
-		 */
-		bcopy(p->cku_rpchdr, rpcmsg.addr, CKU_HDRSIZE);
-
-		/*
-		 * transaction id is the 1st thing in the output
-		 * buffer.
+		 * Grab a send buffer for the request.  Try to
+		 * encode it to see if it fits. If not, then it
+		 * needs to be sent in a chunk.
 		 */
-		/* LINTED pointer alignment */
-		(*(uint32_t *)(rpcmsg.addr)) = p->cku_xid;
-
-		/* Skip the preserialized stuff. */
-		XDR_SETPOS(xdrs, CKU_HDRSIZE);
-
-		/* Serialize dynamic stuff into the output buffer. */
-		if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) ||
-		    (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) ||
-		    (!(*xdr_args)(xdrs, argsp))) {
-			rdma_buf_free(conn, &rpcmsg);
-			if (cle)
-				clist_free(cle);
-			p->cku_err.re_status = RPC_CANTENCODEARGS;
-			p->cku_err.re_errno = EIO;
-			cmn_err(CE_WARN,
-	"clnt_rdma_kcallit: XDR_PUTINT32/AUTH_MARSHAL/xdr_args failed");
+		rpcmsg.type = SEND_BUFFER;
+		if (rdma_buf_alloc(conn, &rpcmsg)) {
+			DTRACE_PROBE(krpc__e__clntrdma__callit_nobufs);
 			goto done;
 		}
-		p->cku_outsz = XDR_GETPOS(xdrs);
-	} else {
-		uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[CKU_HDRSIZE];
-		IXDR_PUT_U_INT32(uproc, procnum);
-		(*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid;
-		XDR_SETPOS(xdrs, 0);
+
+		/* First try to encode into regular send buffer */
+		op = RDMA_MSG;
 
-		/* Serialize the procedure number and the arguments. */
-		if (!AUTH_WRAP(h->cl_auth, (caddr_t)p->cku_rpchdr,
-		    CKU_HDRSIZE+4, xdrs, xdr_args, argsp)) {
-			if (longmsg.addr != xdrs->x_base) {
-				longmsg.addr = xdrs->x_base;
-				longmsg.len = xdr_getbufsize(xdrs);
-			}
-			rdma_buf_free(conn, &longmsg);
-			clist_free(cle);
-			p->cku_err.re_status = RPC_CANTENCODEARGS;
-			p->cku_err.re_errno = EIO;
-			cmn_err(CE_WARN,
-		"clnt_rdma_kcallit: AUTH_WRAP failed");
-			goto done;
-		}
-		/*
-		 * If we had to allocate a new buffer while encoding
-		 * then update the addr and len.
-		 */
-		if (longmsg.addr != xdrs->x_base) {
-			longmsg.addr = xdrs->x_base;
-			longmsg.len = xdr_getbufsize(xdrs);
-		}
+		call_xdrp = &callxdr;
+
+		xdrrdma_create(call_xdrp, rpcmsg.addr, rpcmsg.len,
+		    rdma_minchunk, NULL, XDR_ENCODE, conn);
 
-		/*
-		 * If it so happens that the encoded message is after all
-		 * not long enough to be a Long RPC then allocate a
-		 * SEND_BUFFER and copy the encoded message into it.
-		 */
-		p->cku_outsz = XDR_GETPOS(xdrs);
-		if (p->cku_outsz > RPC_MSG_SZ) {
-			rpcmsg.type = CHUNK_BUFFER;
-			rpcmsg.addr = longmsg.addr;
-			rpcmsg.len = longmsg.len;
+		status = clnt_compose_rpcmsg(h, procnum, &rpcmsg, call_xdrp,
+		    xdr_args, argsp);
+
+		if (status != CLNT_RDMA_SUCCESS) {
+			/* Clean up from previous encode attempt */
+			rdma_buf_free(conn, &rpcmsg);
+			XDR_DESTROY(call_xdrp);
 		} else {
-			clist_free(cle);
-			XDR_DESTROY(cxdrp);
-			cxdrp = NULL;
-			/*
-			 * Get a pre-allocated buffer for rpc call
-			 */
-			rpcmsg.type = SEND_BUFFER;
-			if (RDMA_BUF_ALLOC(conn, &rpcmsg)) {
-				p->cku_err.re_status = RPC_CANTSEND;
-				p->cku_err.re_errno = EIO;
-				RCSTAT_INCR(rcnomem);
-				cmn_err(CE_WARN,
-				    "clnt_rdma_kcallit: no buffers!");
-				rdma_buf_free(conn, &longmsg);
-				goto done;
-			}
-			bcopy(longmsg.addr, rpcmsg.addr, p->cku_outsz);
-			xdrrdma_create(xdrs, rpcmsg.addr, p->cku_outsz, 0,
-			    NULL, XDR_ENCODE, NULL);
-			cxdrp = xdrs;
-			rdma_buf_free(conn, &longmsg);
-			op = RDMA_MSG;
+			XDR_CONTROL(call_xdrp, XDR_RDMA_GET_CHUNK_LEN, &rcil);
 		}
 	}
 
-	cl = xdrrdma_clist(xdrs);
+	/* If the encode didn't work, then try a NOMSG */
+	if (status != CLNT_RDMA_SUCCESS) {
+
+		msglen = CKU_HDRSIZE + BYTES_PER_XDR_UNIT + MAX_AUTH_BYTES +
+		    xdr_sizeof(xdr_args, argsp);
+
+		msglen = calc_length(msglen);
+
+		/* pick up the lengths for the reply buffer needed */
+		(void) xdrrdma_sizeof(xdr_args, argsp, 0,
+		    &rcil.rcil_len, &rcil.rcil_len_alt);
+
+		/*
+		 * Construct a clist to describe the CHUNK_BUFFER
+		 * for the rpcmsg.
+		 */
+		cl_rpcmsg = clist_alloc();
+		cl_rpcmsg->c_len = msglen;
+		cl_rpcmsg->rb_longbuf.type = RDMA_LONG_BUFFER;
+		cl_rpcmsg->rb_longbuf.len = msglen;
+		if (rdma_buf_alloc(conn, &cl_rpcmsg->rb_longbuf)) {
+			clist_free(cl_rpcmsg);
+			goto done;
+		}
+		cl_rpcmsg->w.c_saddr3 = cl_rpcmsg->rb_longbuf.addr;
+
+		op = RDMA_NOMSG;
+		call_xdrp = &callxdr;
+
+		xdrrdma_create(call_xdrp, cl_rpcmsg->rb_longbuf.addr,
+		    cl_rpcmsg->rb_longbuf.len, 0,
+		    cl_rpcmsg, XDR_ENCODE, conn);
+
+		status = clnt_compose_rpcmsg(h, procnum, &rpcmsg, call_xdrp,
+		    xdr_args, argsp);
+
+		if (status != CLNT_RDMA_SUCCESS) {
+			p->cku_err.re_status = RPC_CANTENCODEARGS;
+			p->cku_err.re_errno = EIO;
+			DTRACE_PROBE(krpc__e__clntrdma__callit__composemsg);
+			goto done;
+		}
+	}
+
+	/*
+	 * During the XDR_ENCODE we may have "allocated" an RDMA READ or
+	 * RDMA WRITE clist.
+	 *
+	 * First pull the RDMA READ chunk list from the XDR private
+	 * area to keep it handy.
+	 */
+	XDR_CONTROL(call_xdrp, XDR_RDMA_GET_RLIST, &cl);
+
+	if (gss_i_or_p) {
+		long_reply_len = rcil.rcil_len + rcil.rcil_len_alt;
+		long_reply_len += MAX_AUTH_BYTES;
+	} else {
+		long_reply_len = rcil.rcil_len;
+	}
 
 	/*
 	 * Update the chunk size information for the Long RPC msg.
@@ -547,66 +759,85 @@
 		cl->c_len = p->cku_outsz;
 
 	/*
-	 * Set up the RDMA chunk message
+	 * Prepare the RDMA header. On success xdrs will hold the result
+	 * of xdrmem_create() for a SEND_BUFFER.
 	 */
-	vers = RPCRDMA_VERS;
-	clmsg.type = SEND_BUFFER;
-	if (RDMA_BUF_ALLOC(conn, &clmsg)) {
+	status = clnt_compose_rdma_header(conn, h, &clmsg,
+	    &rdmahdr_o_xdrs, &op);
+
+	if (status != CLNT_RDMA_SUCCESS) {
 		p->cku_err.re_status = RPC_CANTSEND;
 		p->cku_err.re_errno = EIO;
-		rdma_buf_free(conn, &rpcmsg);
 		RCSTAT_INCR(rcnomem);
-		cmn_err(CE_WARN, "clnt_rdma_kcallit: no free buffers!!");
+		DTRACE_PROBE(krpc__e__clntrdma__callit__nobufs2);
 		goto done;
 	}
-	xdrs = &p->cku_outxdr;
-	xdrmem_create(xdrs, clmsg.addr, clmsg.len, XDR_ENCODE);
+
 	/*
-	 * Treat xid as opaque (xid is the first entity
-	 * in the rpc rdma message).
+	 * Now insert the RDMA READ list iff present
 	 */
-	(*(uint32_t *)clmsg.addr) = p->cku_xid;
-	/* Skip xid and set the xdr position accordingly. */
-	XDR_SETPOS(xdrs, sizeof (uint32_t));
-	(void) xdr_u_int(xdrs, &vers);
-	(void) xdr_u_int(xdrs, &op);
+	status = clnt_setup_rlist(conn, rdmahdr_o_xdrs, call_xdrp);
+	if (status != CLNT_RDMA_SUCCESS) {
+		DTRACE_PROBE(krpc__e__clntrdma__callit__clistreg);
+		rdma_buf_free(conn, &clmsg);
+		p->cku_err.re_status = RPC_CANTSEND;
+		p->cku_err.re_errno = EIO;
+		goto done;
+	}
+
+	/*
+	 * Setup RDMA WRITE chunk list for nfs read operation
+	 * other operations will have a NULL which will result
+	 * as a NULL list in the XDR stream.
+	 */
+	status = clnt_setup_wlist(conn, rdmahdr_o_xdrs, call_xdrp);
+	if (status != CLNT_RDMA_SUCCESS) {
+		rdma_buf_free(conn, &clmsg);
+		p->cku_err.re_status = RPC_CANTSEND;
+		p->cku_err.re_errno = EIO;
+		goto done;
+	}
 
 	/*
-	 * Now XDR the chunk list
+	 * If NULL call and RPCSEC_GSS, provide a chunk such that
+	 * large responses can flow back to the client.
+	 * If RPCSEC_GSS with integrity or privacy is in use, get chunk.
 	 */
-	if (cl != NULL) {
+	if ((procnum == 0 && rpcsec_gss == TRUE) ||
+	    (rpcsec_gss == TRUE && gss_i_or_p == TRUE))
+		long_reply_len += 1024;
 
-		/*
-		 * Register the chunks in the list
-		 */
-		status = clist_register(conn, cl, 1);
-		if (status != RDMA_SUCCESS) {
-			cmn_err(CE_WARN,
-		"clnt_rdma_kcallit: clist register failed");
-			rdma_buf_free(conn, &clmsg);
-			rdma_buf_free(conn, &rpcmsg);
-			clist_free(cl);
-			p->cku_err.re_status = RPC_CANTSEND;
-			p->cku_err.re_errno = EIO;
-			goto done;
-		}
+	status = clnt_setup_long_reply(conn, &cl_long_reply, long_reply_len);
 
+	if (status != CLNT_RDMA_SUCCESS) {
+		rdma_buf_free(conn, &clmsg);
+		p->cku_err.re_status = RPC_CANTSEND;
+		p->cku_err.re_errno = EIO;
+		goto done;
 	}
-	(void) xdr_do_clist(xdrs, &cl);
 
 	/*
+	 * XDR encode the RDMA_REPLY write chunk
+	 */
+	seg_array_len = (cl_long_reply ? 1 : 0);
+	(void) xdr_encode_reply_wchunk(rdmahdr_o_xdrs, cl_long_reply,
+	    seg_array_len);
+
+	/*
+	 * Construct a clist in "sendlist" that represents what we
+	 * will push over the wire.
+	 *
 	 * Start with the RDMA header and clist (if any)
 	 */
-	sendlist = NULL;
-	clist_add(&sendlist, 0, XDR_GETPOS(xdrs), &clmsg.handle,
-		clmsg.addr, NULL, NULL);
+	clist_add(&cl_sendlist, 0, XDR_GETPOS(rdmahdr_o_xdrs), &clmsg.handle,
+	    clmsg.addr, NULL, NULL);
 
 	/*
-	 * Put the RPC call message in the send list if small RPC
+	 * Put the RPC call message in sendlist if small RPC
 	 */
 	if (op == RDMA_MSG) {
-		clist_add(&sendlist, 0, p->cku_outsz, &rpcmsg.handle,
-			rpcmsg.addr, NULL, NULL);
+		clist_add(&cl_sendlist, 0, p->cku_outsz, &rpcmsg.handle,
+		    rpcmsg.addr, NULL, NULL);
 	} else {
 		/* Long RPC already in chunk list */
 		RCSTAT_INCR(rclongrpcs);
@@ -618,27 +849,19 @@
 	status = rdma_clnt_postrecv(conn, p->cku_xid);
 	if (status != RDMA_SUCCESS) {
 		rdma_buf_free(conn, &clmsg);
-		rdma_buf_free(conn, &rpcmsg);
-		if (cl) {
-			(void) clist_deregister(conn, cl, 1);
-			clist_free(cl);
-		}
-		clist_free(sendlist);
 		p->cku_err.re_status = RPC_CANTSEND;
 		p->cku_err.re_errno = EIO;
 		goto done;
 	}
+
 	/*
 	 * sync the memory for dma
 	 */
 	if (cl != NULL) {
-		status = clist_syncmem(conn, cl, 1);
+		status = clist_syncmem(conn, cl, CLIST_REG_SOURCE);
 		if (status != RDMA_SUCCESS) {
+			(void) rdma_clnt_postrecv_remove(conn, p->cku_xid);
 			rdma_buf_free(conn, &clmsg);
-			rdma_buf_free(conn, &rpcmsg);
-			(void) clist_deregister(conn, cl, 1);
-			clist_free(cl);
-			clist_free(sendlist);
 			p->cku_err.re_status = RPC_CANTSEND;
 			p->cku_err.re_errno = EIO;
 			goto done;
@@ -646,72 +869,33 @@
 	}
 
 	/*
-	 * Send the call message to the server
+	 * Send the RDMA Header and RPC call message to the server
 	 */
-	status = RDMA_SEND(conn, sendlist, p->cku_xid);
+	status = RDMA_SEND(conn, cl_sendlist, p->cku_xid);
 	if (status != RDMA_SUCCESS) {
-		if (cl) {
-			(void) clist_deregister(conn, cl, 1);
-			clist_free(cl);
-			/*
-			 * If this was a long RPC message, need
-			 * to free that buffer.
-			 */
-			if (rpcmsg.type == CHUNK_BUFFER)
-				rdma_buf_free(conn, &rpcmsg);
-		}
-		clist_free(sendlist);
+		(void) rdma_clnt_postrecv_remove(conn, p->cku_xid);
 		p->cku_err.re_status = RPC_CANTSEND;
 		p->cku_err.re_errno = EIO;
 		goto done;
-	} else {
-		/*
-		 * RDMA plugin now owns the send msg buffers.
-		 * Clear them out and don't free them here.
-		 */
-		clmsg.addr = NULL;
-		if (rpcmsg.type == SEND_BUFFER)
-			rpcmsg.addr = NULL;
 	}
-	clist_free(sendlist);
-#ifdef DEBUG
-if (rdma_clnt_debug) {
-		printf("clnt_rdma_kcallit: send request xid %u\n", p->cku_xid);
-	}
-#endif
+
+	/*
+	 * RDMA plugin now owns the send msg buffers.
+	 * Clear them out and don't free them.
+	 */
+	clmsg.addr = NULL;
+	if (rpcmsg.type == SEND_BUFFER)
+		rpcmsg.addr = NULL;
 
 	/*
 	 * Recv rpc reply
 	 */
-	status = RDMA_RECV(conn, &recvlist, p->cku_xid);
-
-	/*
-	 * Deregister chunks sent. Do this only after the reply
-	 * is received as that is a sure indication that the
-	 * remote end has completed RDMA of the chunks.
-	 */
-	if (cl != NULL) {
-		/*
-		 * Deregister the chunks
-		 */
-		(void) clist_deregister(conn, cl, 1);
-		clist_free(cl);
-		/*
-		 * If long RPC free chunk
-		 */
-		rdma_buf_free(conn, &rpcmsg);
-	}
+	status = RDMA_RECV(conn, &cl_recvlist, p->cku_xid);
 
 	/*
 	 * Now check recv status
 	 */
 	if (status != 0) {
-#ifdef DEBUG
-		if (rdma_clnt_debug)
-			cmn_err(CE_NOTE,
-			    "clnt_rdma_kcallit: reply failed %u status %d",
-			    p->cku_xid, status);
-#endif
 		if (status == RDMA_INTR) {
 			p->cku_err.re_status = RPC_INTR;
 			p->cku_err.re_errno = EINTR;
@@ -726,112 +910,65 @@
 		}
 		goto done;
 	}
-#ifdef DEBUG
-	if (rdma_clnt_debug)
-		printf("clnt_rdma_kcallit: got response xid %u\n", p->cku_xid);
-#endif
+
 	/*
 	 * Process the reply message.
 	 *
 	 * First the chunk list (if any)
 	 */
-	xdrs = &(p->cku_inxdr);
-	xdrmem_create(xdrs, (caddr_t)(uintptr_t)recvlist->c_saddr,
-	    recvlist->c_len, XDR_DECODE);
+	rdmahdr_i_xdrs = &(p->cku_inxdr);
+	xdrmem_create(rdmahdr_i_xdrs,
+	    (caddr_t)(uintptr_t)cl_recvlist->w.c_saddr3,
+	    cl_recvlist->c_len, XDR_DECODE);
+
 	/*
 	 * Treat xid as opaque (xid is the first entity
 	 * in the rpc rdma message).
+	 * Skip xid and set the xdr position accordingly.
 	 */
-	xid = *(uint32_t *)(uintptr_t)recvlist->c_saddr;
-	/* Skip xid and set the xdr position accordingly. */
-	XDR_SETPOS(xdrs, sizeof (uint32_t));
-	(void) xdr_u_int(xdrs, &vers);
-	(void) xdr_u_int(xdrs, &op);
-	(void) xdr_do_clist(xdrs, &cl);
-	off = xdr_getpos(xdrs);
+	XDR_SETPOS(rdmahdr_i_xdrs, sizeof (uint32_t));
+	(void) xdr_u_int(rdmahdr_i_xdrs, &vers);
+	(void) xdr_u_int(rdmahdr_i_xdrs, &rdma_credit);
+	(void) xdr_u_int(rdmahdr_i_xdrs, &op);
+	(void) xdr_do_clist(rdmahdr_i_xdrs, &cl);
+
+	clnt_update_credit(conn, rdma_credit);
+
+	wlist_exists_reply = FALSE;
+	if (! xdr_decode_wlist(rdmahdr_i_xdrs, &cl_rpcreply_wlist,
+	    &wlist_exists_reply)) {
+		DTRACE_PROBE(krpc__e__clntrdma__callit__wlist_decode);
+		p->cku_err.re_status = RPC_CANTDECODERES;
+		p->cku_err.re_errno = EIO;
+		goto done;
+	}
 
 	/*
-	 * Now the RPC reply message itself. If the reply
-	 * came as a chunk item, then RDMA the reply over.
+	 * The server shouldn't have sent a RDMA_SEND that
+	 * the client needs to RDMA_WRITE a reply back to
+	 * the server.  So silently ignoring what the
+	 * server returns in the rdma_reply section of the
+	 * header.
 	 */
-	xdrs = &replxdr;
-	if (cl && op == RDMA_NOMSG) {
-		struct clist		*cle = cl;
-
-		rpcreply.type = CHUNK_BUFFER;
-		rpcreply.addr = kmem_alloc(cle->c_len, KM_SLEEP);
-		rpcreply.len = cle->c_len;
-		cle->c_daddr = (uint64)(uintptr_t)rpcreply.addr;
-		cl = cl->c_next;
-		cle->c_next = NULL;
-
-		/*
-		 * Register the rpc reply chunk destination
-		 */
-		status = clist_register(conn, cle, 0);
-		if (status) {
-			rdma_buf_free(conn, &rpcreply);
-			clist_free(cle);
-			p->cku_err.re_status = RPC_CANTDECODERES;
-			p->cku_err.re_errno = EIO;
-			cmn_err(CE_WARN,
-			    "clnt_rdma_kcallit: clist_register failed");
-			goto rdma_done;
-		}
+	(void) xdr_decode_reply_wchunk(rdmahdr_i_xdrs, &cl_rdma_reply);
+	off = xdr_getpos(rdmahdr_i_xdrs);
 
-		/*
-		 * Now read rpc reply in
-		 */
-#ifdef DEBUG
-	if (rdma_clnt_debug)
-		printf("clnt_rdma_kcallit: read chunk, len %d, xid %u, \
-			reply xid %u\n", cle->c_len, p->cku_xid, xid);
-#endif
-		status = RDMA_READ(conn, cle, WAIT);
-		if (status) {
-			(void) clist_deregister(conn, cle, 0);
-			rdma_buf_free(conn, &rpcreply);
-			clist_free(cle);
-			p->cku_err.re_status = RPC_CANTDECODERES;
-			p->cku_err.re_errno = EIO;
-			cmn_err(CE_WARN,
-				"clnt_rdma_kcallit: RDMA_READ failed");
-			goto rdma_done;
-		}
+	clnt_decode_long_reply(conn, cl_long_reply,
+	    cl_rdma_reply, &replyxdr, &reply_xdrp,
+	    cl, cl_recvlist, op, off);
 
-		/*
-		 * sync the memory for dma
-		 */
-		status = clist_syncmem(conn, cle, 0);
-		if (status != RDMA_SUCCESS) {
-			(void) clist_deregister(conn, cle, 0);
-			rdma_buf_free(conn, &rpcreply);
-			clist_free(cle);
-			p->cku_err.re_status = RPC_CANTDECODERES;
-			p->cku_err.re_errno = EIO;
-			goto rdma_done;
-		}
+	if (reply_xdrp == NULL)
+		goto done;
 
-		/*
-		 * Deregister the Long RPC chunk
-		 */
-		(void) clist_deregister(conn, cle, 0);
-		clist_free(cle);
-		xdrrdma_create(xdrs, rpcreply.addr, rpcreply.len, 0, cl,
-			XDR_DECODE, conn);
-		rxdrp = xdrs;
-	} else {
-		rpcreply.addr = NULL;
-		xdrrdma_create(xdrs,
-		    (caddr_t)(uintptr_t)(recvlist->c_saddr + off),
-		    recvlist->c_len - off, 0, cl, XDR_DECODE, conn);
-		rxdrp = xdrs;
+	if (wlist_exists_reply) {
+		XDR_CONTROL(reply_xdrp, XDR_RDMA_SET_WLIST, cl_rpcreply_wlist);
 	}
 
 	reply_msg.rm_direction = REPLY;
 	reply_msg.rm_reply.rp_stat = MSG_ACCEPTED;
 	reply_msg.acpted_rply.ar_stat = SUCCESS;
 	reply_msg.acpted_rply.ar_verf = _null_auth;
+
 	/*
 	 *  xdr_results will be done in AUTH_UNWRAP.
 	 */
@@ -841,7 +978,7 @@
 	/*
 	 * Decode and validate the response.
 	 */
-	if (xdr_replymsg(xdrs, &reply_msg)) {
+	if (xdr_replymsg(reply_xdrp, &reply_msg)) {
 		enum clnt_stat re_status;
 
 		_seterr_reply(&reply_msg, &(p->cku_err));
@@ -856,14 +993,14 @@
 				p->cku_err.re_status = RPC_AUTHERROR;
 				p->cku_err.re_why = AUTH_INVALIDRESP;
 				RCSTAT_INCR(rcbadverfs);
-				cmn_err(CE_WARN,
-			    "clnt_rdma_kcallit: AUTH_VALIDATE failed");
-			} else if (!AUTH_UNWRAP(h->cl_auth, xdrs,
+				DTRACE_PROBE(
+				    krpc__e__clntrdma__callit__authvalidate);
+			} else if (!AUTH_UNWRAP(h->cl_auth, reply_xdrp,
 			    xdr_results, resultsp)) {
 				p->cku_err.re_status = RPC_CANTDECODERES;
 				p->cku_err.re_errno = EIO;
-				cmn_err(CE_WARN,
-				    "clnt_rdma_kcallit: AUTH_UNWRAP failed");
+				DTRACE_PROBE(
+				    krpc__e__clntrdma__callit__authunwrap);
 			}
 		} else {
 			/* set errno in case we can't recover */
@@ -873,6 +1010,24 @@
 				p->cku_err.re_errno = EIO;
 
 			if (re_status == RPC_AUTHERROR) {
+				if ((refresh_attempt > 0) &&
+				    AUTH_REFRESH(h->cl_auth, &reply_msg,
+				    p->cku_cred)) {
+					refresh_attempt--;
+					try_call_again = 1;
+					goto done;
+				}
+
+				try_call_again = 0;
+
+				/*
+				 * We have used the client handle to
+				 * do an AUTH_REFRESH and the RPC status may
+				 * be set to RPC_SUCCESS; Let's make sure to
+				 * set it to RPC_AUTHERROR.
+				 */
+				p->cku_err.re_status = RPC_AUTHERROR;
+
 				/*
 				 * Map recoverable and unrecoverable
 				 * authentication errors to appropriate
@@ -894,113 +1049,150 @@
 					p->cku_err.re_errno = EIO;
 					break;
 				}
-				RPCLOG(1, "clnt_rdma_kcallit : "
-				    "authentication failed with "
-				    "RPC_AUTHERROR of type %d\n",
-				    p->cku_err.re_why);
 			}
-			cmn_err(CE_WARN,
-				    "clnt_rdma_kcallit: RPC failed");
-
+			DTRACE_PROBE1(krpc__e__clntrdma__callit__rpcfailed,
+			    int, p->cku_err.re_why);
 		}
 	} else {
 		p->cku_err.re_status = RPC_CANTDECODERES;
 		p->cku_err.re_errno = EIO;
-		cmn_err(CE_WARN, "clnt_rdma_kcallit: xdr_replymsg failed");
+		DTRACE_PROBE(krpc__e__clntrdma__callit__replymsg);
 	}
 
+done:
+	clnt_return_credit(conn);
+
+	if (cl_sendlist != NULL)
+		clist_free(cl_sendlist);
+
 	/*
 	 * If rpc reply is in a chunk, free it now.
 	 */
-	if (rpcreply.addr != NULL)
-		rdma_buf_free(conn, &rpcreply);
-
-rdma_done:
-	if ((cl != NULL) || (op == RDMA_NOMSG)) {
-		rdma_buf_t	donemsg;
-
-		/*
-		 * Free the list holding the chunk info
-		 */
-		if (cl) {
-			clist_free(cl);
-			cl = NULL;
-		}
-
-		/*
-		 * Tell the server that the reads are done
-		 */
-		donemsg.type = SEND_BUFFER;
-		if (RDMA_BUF_ALLOC(conn, &donemsg)) {
-			p->cku_err.re_status = RPC_CANTSEND;
-			p->cku_err.re_errno = EIO;
-			RCSTAT_INCR(rcnomem);
-			cmn_err(CE_WARN, "clnt_rdma_kcallit: no free buffer");
-			goto done;
-		}
-		xdrs = &p->cku_outxdr;
-		xdrmem_create(xdrs, donemsg.addr, donemsg.len, XDR_ENCODE);
-		vers = RPCRDMA_VERS;
-		op = RDMA_DONE;
+	if (cl_long_reply) {
+		(void) clist_deregister(conn, cl_long_reply, CLIST_REG_DST);
+		rdma_buf_free(conn, &cl_long_reply->rb_longbuf);
+		clist_free(cl_long_reply);
+	}
 
-		/*
-		 * Treat xid as opaque (xid is the first entity
-		 * in the rpc rdma message).
-		 */
-		(*(uint32_t *)donemsg.addr) = p->cku_xid;
-		/* Skip xid and set the xdr position accordingly. */
-		XDR_SETPOS(xdrs, sizeof (uint32_t));
-		if (!xdr_u_int(xdrs, &vers) ||
-		    !xdr_u_int(xdrs, &op)) {
-			cmn_err(CE_WARN,
-				"clnt_rdma_kcallit: xdr_u_int failed");
-			rdma_buf_free(conn, &donemsg);
-			goto done;
-		}
+	if (call_xdrp)
+		XDR_DESTROY(call_xdrp);
 
-		sendlist = NULL;
-		clist_add(&sendlist, 0, XDR_GETPOS(xdrs), &donemsg.handle,
-			donemsg.addr, NULL, NULL);
-
-		status = RDMA_SEND(conn, sendlist, p->cku_xid);
-		if (status != RDMA_SUCCESS) {
-			cmn_err(CE_WARN,
-				"clnt_rdma_kcallit: RDMA_SEND failed xid %u",
-					p->cku_xid);
-		}
-#ifdef DEBUG
-		else {
-		if (rdma_clnt_debug)
-			printf("clnt_rdma_kcallit: sent RDMA_DONE xid %u\n",
-				p->cku_xid);
-		}
-#endif
-		clist_free(sendlist);
+	if (reply_xdrp) {
+		(void) xdr_rpc_free_verifier(reply_xdrp, &reply_msg);
+		XDR_DESTROY(reply_xdrp);
 	}
 
-done:
-	if (cxdrp)
-		XDR_DESTROY(cxdrp);
-	if (rxdrp) {
-		(void) xdr_rpc_free_verifier(rxdrp, &reply_msg);
-		XDR_DESTROY(rxdrp);
+	if (cl_rdma_reply) {
+		clist_free(cl_rdma_reply);
 	}
 
-	if (recvlist) {
-		rdma_buf_t	recvmsg;
-
-		recvmsg.addr = (caddr_t)(uintptr_t)recvlist->c_saddr;
+	if (cl_recvlist) {
+		rdma_buf_t	recvmsg = {0};
+		recvmsg.addr = (caddr_t)(uintptr_t)cl_recvlist->w.c_saddr3;
 		recvmsg.type = RECV_BUFFER;
 		RDMA_BUF_FREE(conn, &recvmsg);
-		clist_free(recvlist);
+		clist_free(cl_recvlist);
 	}
+
 	RDMA_REL_CONN(conn);
+
+	if (try_call_again)
+		goto call_again;
+
 	if (p->cku_err.re_status != RPC_SUCCESS) {
 		RCSTAT_INCR(rcbadcalls);
 	}
 	return (p->cku_err.re_status);
 }
 
+
+static void
+clnt_decode_long_reply(CONN *conn,
+    struct clist *cl_long_reply,
+    struct clist *cl_rdma_reply, XDR *xdrs,
+    XDR **rxdrp, struct clist *cl,
+    struct clist *cl_recvlist,
+    uint_t  op, uint_t off)
+{
+	if (op != RDMA_NOMSG) {
+		DTRACE_PROBE1(krpc__i__longrepl__rdmamsg__len,
+		    int, cl_recvlist->c_len - off);
+		xdrrdma_create(xdrs,
+		    (caddr_t)(uintptr_t)(cl_recvlist->w.c_saddr3 + off),
+		    cl_recvlist->c_len - off, 0, cl, XDR_DECODE, conn);
+		*rxdrp = xdrs;
+		return;
+	}
+
+	/* op must be RDMA_NOMSG */
+	if (cl) {
+		DTRACE_PROBE(krpc__e__clntrdma__declongreply__serverreadlist);
+		return;
+	}
+
+	if (cl_long_reply->u.c_daddr) {
+		DTRACE_PROBE1(krpc__i__longrepl__rdmanomsg__len,
+		    int, cl_rdma_reply->c_len);
+
+		xdrrdma_create(xdrs, (caddr_t)cl_long_reply->u.c_daddr3,
+		    cl_rdma_reply->c_len, 0, NULL, XDR_DECODE, conn);
+
+		*rxdrp = xdrs;
+	}
+}
+
+static void
+clnt_return_credit(CONN *conn)
+{
+	rdma_clnt_cred_ctrl_t *cc_info = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
+
+	mutex_enter(&conn->c_lock);
+	cc_info->clnt_cc_in_flight_ops--;
+	cv_signal(&cc_info->clnt_cc_cv);
+	mutex_exit(&conn->c_lock);
+}
+
+static void
+clnt_update_credit(CONN *conn, uint32_t rdma_credit)
+{
+	rdma_clnt_cred_ctrl_t *cc_info = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
+
+	/*
+	 * If the granted has not altered, avoid taking the
+	 * mutex, to essentially do nothing..
+	 */
+	if (cc_info->clnt_cc_granted_ops == rdma_credit)
+		return;
+	/*
+	 * Get the granted number of buffers for credit control.
+	 */
+	mutex_enter(&conn->c_lock);
+	cc_info->clnt_cc_granted_ops = rdma_credit;
+	mutex_exit(&conn->c_lock);
+}
+
+static void
+clnt_check_credit(CONN *conn)
+{
+	rdma_clnt_cred_ctrl_t *cc_info = &conn->rdma_conn_cred_ctrl_u.c_clnt_cc;
+
+	/*
+	 * Make sure we are not going over our allowed buffer use
+	 * (and make sure we have gotten a granted value before).
+	 */
+	mutex_enter(&conn->c_lock);
+	while (cc_info->clnt_cc_in_flight_ops >= cc_info->clnt_cc_granted_ops &&
+	    cc_info->clnt_cc_granted_ops != 0) {
+		/*
+		 * Client has maxed out its granted buffers due to
+		 * credit control.  Current handling is to block and wait.
+		 */
+		cv_wait(&cc_info->clnt_cc_cv, &conn->c_lock);
+	}
+	cc_info->clnt_cc_in_flight_ops++;
+	mutex_exit(&conn->c_lock);
+}
+
 /* ARGSUSED */
 static void
 clnt_rdma_kabort(CLIENT *h)
@@ -1011,7 +1203,6 @@
 clnt_rdma_kerror(CLIENT *h, struct rpc_err *err)
 {
 	struct cku_private *p = htop(h);
-
 	*err = p->cku_err;
 }
 
@@ -1055,6 +1246,7 @@
 
 	if (!INGLOBALZONE(curproc))
 		return (-1);
+
 	/*
 	 * modload the RDMA plugins if not already done.
 	 */
@@ -1078,7 +1270,7 @@
 		    &handle);
 		if (status == RDMA_SUCCESS) {
 			knc = kmem_zalloc(sizeof (struct knetconfig),
-				KM_SLEEP);
+			    KM_SLEEP);
 			knc->knc_semantics = NC_TPI_RDMA;
 			pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 			p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
@@ -1103,3 +1295,22 @@
 	rw_exit(&rdma_lock);
 	return (-1);
 }
+
+static void
+check_dereg_wlist(CONN *conn, clist *rwc)
+{
+	int status;
+
+	if (rwc == NULL)
+		return;
+
+	if (rwc->c_dmemhandle.mrc_rmr && rwc->c_len) {
+
+		status = clist_deregister(conn, rwc, CLIST_REG_DST);
+
+		if (status != RDMA_SUCCESS) {
+			DTRACE_PROBE1(krpc__e__clntrdma__dereg_wlist,
+			    int, status);
+		}
+	}
+}
--- a/usr/src/uts/common/rpc/ib.h	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/rpc/ib.h	Thu Aug 21 18:01:07 2008 -0500
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,27 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2007, The Ohio State University. All rights reserved.
+ *
+ * Portions of this source code were developed by the team members of
+ * The Ohio State University's Network-Based Computing Laboratory (NBCL),
+ * headed by Professor Dhabaleswar K. (DK) Panda.
+ *
+ * Acknowledgements to contributions from developers:
+ *   Ranjit Noronha: [email protected]
+ *   Lei Chai      : [email protected]
+ *   Weikuan Yu    : [email protected]
+ *
+ */
+
 
 #ifndef _IB_H
 #define	_IB_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * ib.h, rpcib plugin interface.
  */
@@ -41,12 +52,14 @@
 #include <rpc/rpc.h>
 #include <rpc/rpc_rdma.h>
 #include <sys/ib/ibtl/ibti.h>
+#include <sys/avl.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define	MAX_BUFS	256	/* max no. of buffers per pool */
+#define	MAX_BUFS	1024	/* max no. of buffers per pool */
+
 #define	DEF_CQ_SIZE	4096 - 1	/* default CQ size */
 				/*
 				 * Tavor returns the next higher power of 2
@@ -60,8 +73,6 @@
 #define	DSEG_MAX	2
 #define	RQ_DSEG_MAX	1	/* default RQ data seg */
 #define	IBSRM_HB	0x8000	/* high order bit of pkey */
-#define	NFS_SEC_KEY0	0x6878	/* randomly selected NFS security key */
-#define	NFS_SEC_KEY1	0x8679
 
 /* max no. of refresh attempts on IBT_CM_CONN_STALE error */
 #define	REFRESH_ATTEMPTS	3
@@ -132,24 +143,6 @@
 #define	IBD_NAME	"ibd"
 #define	N_IBD_INSTANCES	4
 
-typedef struct rpcib_ats_s {
-	int			ras_inst;
-	ib_pkey_t		ras_pkey;
-	ib_gid_t		ras_port_gid;
-	sa_family_t		ras_inet_type;
-	union {
-		struct sockaddr_in	ras_sockaddr;
-		struct sockaddr_in6	ras_sockaddr6;
-	} ra_sin;
-#define	ras_sin			ra_sin.ras_sockaddr
-#define	ras_sin6		ra_sin.ras_sockaddr6
-} rpcib_ats_t;
-
-typedef struct rpcib_ibd_insts_s {
-	int			rib_ibd_alloc;
-	int			rib_ibd_cnt;
-	rpcib_ats_t		*rib_ats;
-} rpcib_ibd_insts_t;
 
 /*
  * Service types supported by RPCIB
@@ -199,25 +192,7 @@
 typedef struct rib_service rib_service_t;
 struct rib_service {
 	uint32_t		srv_type;	/* i.e, NFS, NLM, v4CBD */
-
-	/*
-	 * service name, i.e, <IP>::NFS or <IP>::NLM. Since
-	 * each type of service can be registered with many
-	 * IP addrs(srv_name) and is running on all ports
-	 * for all HCAs.
-	 */
-	char			*srv_name;
-
-	uint32_t		srv_port;	/* port on which registered */
-	ib_svc_id_t		srv_id;		/* from ibt_register call */
 	ibt_srv_hdl_t		srv_hdl;	/* from ibt_register call */
-	ibt_sbind_hdl_t		*srv_sbind_hdl;	/* from ibt_bind call */
-	ibt_ar_t		srv_ar;
-
-	/*
-	 * pointer to the next service registered on this
-	 * particular HCA
-	 */
 	rib_service_t		*srv_next;
 };
 
@@ -263,7 +238,6 @@
 	rib_service_t	*service_list;
 	krwlock_t		service_list_lock;
 
-	rib_service_t	*ats_list;		/* Service list for ATS */
 
 	rib_conn_list_t		cl_conn_list;	/* client conn list */
 	rib_conn_list_t		srv_conn_list;	/* server conn list */
@@ -279,6 +253,18 @@
 	rib_bufpool_t		*send_pool;	/* send buf pool */
 
 	void			*iblock;	/* interrupt cookie */
+
+	kmem_cache_t	*server_side_cache;	/* long reply pool */
+	avl_tree_t	avl_tree;
+	kmutex_t	avl_lock;
+	krwlock_t	avl_rw_lock;
+	volatile bool_t avl_init;
+	kmutex_t	cache_allocation;
+	ddi_taskq_t *reg_cache_clean_up;
+	ib_svc_id_t	srv_id;
+	ibt_srv_hdl_t 	srv_hdl;
+	uint_t		reg_state;
+
 };
 
 
@@ -294,6 +280,12 @@
 	rib_qp_t	*qp;
 	int		nsbufs;			/* # of send buffers posted */
 	uint64_t	sbufaddr[DSEG_MAX];	/* posted send buffers */
+	caddr_t		c;
+	caddr_t		c1;
+	int		l1;
+	caddr_t		c2;
+	int		l2;
+	int		wl, rl;
 };
 
 /*
@@ -362,6 +354,7 @@
 	kcondvar_t 		cb_conn_cv;
 
 	caddr_t			q;	/* upstream queue */
+	struct send_wid		wd;
 };
 
 #define	ctoqp(conn)	((rib_qp_t *)((conn)->c_private))
--- a/usr/src/uts/common/rpc/rdma_subr.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/rpc/rdma_subr.c	Thu Aug 21 18:01:07 2008 -0500
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,23 +19,32 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
+/*
+ * Copyright (c) 2007, The Ohio State University. All rights reserved.
+ *
+ * Portions of this source code were developed by the team members of
+ * The Ohio State University's Network-Based Computing Laboratory (NBCL),
+ * headed by Professor Dhabaleswar K. (DK) Panda.
+ *
+ * Acknowledgements for contributions from developers:
+ *   Ranjit Noronha: [email protected]
+ *   Lei Chai      : [email protected]
+ *   Weikuan Yu    : [email protected]
+ *
+ */
 
 #include <sys/systm.h>
 #include <sys/kstat.h>
 #include <sys/modctl.h>
+#include <sys/sdt.h>
 #include <rpc/rpc_rdma.h>
 
 #include <sys/ib/ibtl/ibti.h>
 
-/*
- * RDMA chunk size
- */
-#define	RDMA_MINCHUNK	1024
 uint_t rdma_minchunk = RDMA_MINCHUNK;
 
 /*
@@ -49,6 +57,8 @@
 krwlock_t	rdma_lock;		/* protects rdma_mod_head list */
 ldi_ident_t rpcmod_li = NULL;	/* identifies us with ldi_ framework */
 
+kmem_cache_t *clist_cache = NULL;
+
 /*
  * Statics
  */
@@ -153,6 +163,18 @@
 	return (RDMA_FAILED);
 }
 
+struct clist *
+clist_alloc(void)
+{
+	struct clist *clp;
+
+	clp = kmem_cache_alloc(clist_cache, KM_SLEEP);
+
+	bzero(clp, sizeof (*clp));
+
+	return (clp);
+}
+
 /*
  * Creates a new chunk list entry, and
  * adds it to the end of a chunk list.
@@ -169,13 +191,13 @@
 	while (*clp != NULL)
 		clp = &((*clp)->c_next);
 
-	cl = kmem_zalloc(sizeof (*cl), KM_SLEEP);
+	cl = clist_alloc();
 	cl->c_xdroff = xdroff;
 	cl->c_len = len;
-	cl->c_saddr = (uint64_t)(uintptr_t)saddr;
+	cl->w.c_saddr = (uint64_t)(uintptr_t)saddr;
 	if (shandle)
 		cl->c_smemhandle = *shandle;
-	cl->c_daddr = (uint64_t)(uintptr_t)daddr;
+	cl->u.c_daddr = (uint64_t)(uintptr_t)daddr;
 	if (dhandle)
 		cl->c_dmemhandle = *dhandle;
 	cl->c_next = NULL;
@@ -183,24 +205,35 @@
 	*clp = cl;
 }
 
-int
-clist_register(CONN *conn, struct clist *cl, bool_t src)
+rdma_stat
+clist_register(CONN *conn, struct clist *cl, clist_dstsrc dstsrc)
 {
 	struct clist *c;
 	int status;
 
 	for (c = cl; c; c = c->c_next) {
-		if (src) {
+		if (c->c_len <= 0)
+			continue;
+		switch (dstsrc) {
+		case CLIST_REG_SOURCE:
 			status = RDMA_REGMEMSYNC(conn,
-			    (caddr_t)(uintptr_t)c->c_saddr, c->c_len,
-			    &c->c_smemhandle, (void **)&c->c_ssynchandle);
-		} else {
+			    (caddr_t)(struct as *)cl->c_adspc,
+			    (caddr_t)(uintptr_t)c->w.c_saddr3, c->c_len,
+			    &c->c_smemhandle, (void **)&c->c_ssynchandle,
+			    (void *)c->rb_longbuf.rb_private);
+			break;
+		case CLIST_REG_DST:
 			status = RDMA_REGMEMSYNC(conn,
-			    (caddr_t)(uintptr_t)c->c_daddr, c->c_len,
-			    &c->c_dmemhandle, (void **)&c->c_dsynchandle);
+			    (caddr_t)(struct as *)cl->c_adspc,
+			    (caddr_t)(uintptr_t)c->u.c_daddr3, c->c_len,
+			    &c->c_dmemhandle, (void **)&c->c_dsynchandle,
+			    (void *)c->rb_longbuf.rb_private);
+			break;
+		default:
+			return (RDMA_INVAL);
 		}
 		if (status != RDMA_SUCCESS) {
-			(void) clist_deregister(conn, cl, src);
+			(void) clist_deregister(conn, cl, dstsrc);
 			return (status);
 		}
 	}
@@ -208,36 +241,84 @@
 	return (RDMA_SUCCESS);
 }
 
-int
-clist_deregister(CONN *conn, struct clist *cl, bool_t src)
+rdma_stat
+clist_deregister(CONN *conn, struct clist *cl, clist_dstsrc dstsrc)
 {
 	struct clist *c;
 
 	for (c = cl; c; c = c->c_next) {
-		if (src) {
+		switch (dstsrc) {
+		case CLIST_REG_SOURCE:
 			if (c->c_smemhandle.mrc_rmr != 0) {
 				(void) RDMA_DEREGMEMSYNC(conn,
-				    (caddr_t)(uintptr_t)c->c_saddr,
+				    (caddr_t)(uintptr_t)c->w.c_saddr3,
 				    c->c_smemhandle,
-				    (void *)(uintptr_t)c->c_ssynchandle);
+				    (void *)(uintptr_t)c->c_ssynchandle,
+				    (void *)c->rb_longbuf.rb_private);
 				c->c_smemhandle.mrc_rmr = 0;
 				c->c_ssynchandle = NULL;
 			}
-		} else {
+			break;
+		case CLIST_REG_DST:
 			if (c->c_dmemhandle.mrc_rmr != 0) {
 				(void) RDMA_DEREGMEMSYNC(conn,
-				    (caddr_t)(uintptr_t)c->c_daddr,
+				    (caddr_t)(uintptr_t)c->u.c_daddr3,
 				    c->c_dmemhandle,
-				    (void *)(uintptr_t)c->c_dsynchandle);
+				    (void *)(uintptr_t)c->c_dsynchandle,
+				    (void *)c->rb_longbuf.rb_private);
 				c->c_dmemhandle.mrc_rmr = 0;
 				c->c_dsynchandle = NULL;
 			}
+			break;
+		default:
+			return (RDMA_INVAL);
 		}
 	}
 
 	return (RDMA_SUCCESS);
 }
 
+rdma_stat
+clist_syncmem(CONN *conn, struct clist *cl, clist_dstsrc dstsrc)
+{
+	struct clist *c;
+	rdma_stat status;
+
+	c = cl;
+	switch (dstsrc) {
+	case CLIST_REG_SOURCE:
+		while (c != NULL) {
+			if (c->c_ssynchandle) {
+				status = RDMA_SYNCMEM(conn,
+				    (void *)(uintptr_t)c->c_ssynchandle,
+				    (caddr_t)(uintptr_t)c->w.c_saddr3,
+				    c->c_len, 0);
+				if (status != RDMA_SUCCESS)
+					return (status);
+			}
+			c = c->c_next;
+		}
+		break;
+	case CLIST_REG_DST:
+		while (c != NULL) {
+			if (c->c_ssynchandle) {
+				status = RDMA_SYNCMEM(conn,
+				    (void *)(uintptr_t)c->c_dsynchandle,
+				    (caddr_t)(uintptr_t)c->u.c_daddr3,
+				    c->c_len, 1);
+				if (status != RDMA_SUCCESS)
+					return (status);
+			}
+			c = c->c_next;
+		}
+		break;
+	default:
+		return (RDMA_INVAL);
+	}
+
+	return (RDMA_SUCCESS);
+}
+
 /*
  * Frees up entries in chunk list
  */
@@ -248,7 +329,7 @@
 
 	while (c != NULL) {
 		cl = cl->c_next;
-		kmem_free(c, sizeof (struct clist));
+		kmem_cache_free(clist_cache, c);
 		c = cl;
 	}
 }
@@ -258,33 +339,40 @@
 {
 	struct clist *cl = NULL;
 	rdma_stat retval;
-	rdma_buf_t rbuf;
+	rdma_buf_t rbuf = {0};
 
 	rbuf.type = RECV_BUFFER;
 	if (RDMA_BUF_ALLOC(conn, &rbuf)) {
-		retval = RDMA_NORESOURCE;
-	} else {
-		clist_add(&cl, 0, rbuf.len, &rbuf.handle, rbuf.addr,
-			NULL, NULL);
-		retval = RDMA_CLNT_RECVBUF(conn, cl, xid);
-		clist_free(cl);
+		return (RDMA_NORESOURCE);
 	}
+
+	clist_add(&cl, 0, rbuf.len, &rbuf.handle, rbuf.addr,
+	    NULL, NULL);
+	retval = RDMA_CLNT_RECVBUF(conn, cl, xid);
+	clist_free(cl);
+
 	return (retval);
 }
 
 rdma_stat
+rdma_clnt_postrecv_remove(CONN *conn, uint32_t xid)
+{
+	return (RDMA_CLNT_RECVBUF_REMOVE(conn, xid));
+}
+
+rdma_stat
 rdma_svc_postrecv(CONN *conn)
 {
 	struct clist *cl = NULL;
 	rdma_stat retval;
-	rdma_buf_t rbuf;
+	rdma_buf_t rbuf = {0};
 
 	rbuf.type = RECV_BUFFER;
 	if (RDMA_BUF_ALLOC(conn, &rbuf)) {
 		retval = RDMA_NORESOURCE;
 	} else {
 		clist_add(&cl, 0, rbuf.len, &rbuf.handle, rbuf.addr,
-			NULL, NULL);
+		    NULL, NULL);
 		retval = RDMA_SVC_RECVBUF(conn, cl);
 		clist_free(cl);
 	}
@@ -292,32 +380,9 @@
 }
 
 rdma_stat
-clist_syncmem(CONN *conn, struct clist *cl, bool_t src)
+rdma_buf_alloc(CONN *conn, rdma_buf_t *rbuf)
 {
-	struct clist *c;
-	rdma_stat status;
-
-	c = cl;
-	if (src) {
-		while (c != NULL) {
-			status = RDMA_SYNCMEM(conn,
-			    (void *)(uintptr_t)c->c_ssynchandle,
-			    (caddr_t)(uintptr_t)c->c_saddr, c->c_len, 0);
-			if (status != RDMA_SUCCESS)
-				return (status);
-			c = c->c_next;
-		}
-	} else {
-		while (c != NULL) {
-			status = RDMA_SYNCMEM(conn,
-			    (void *)(uintptr_t)c->c_dsynchandle,
-			    (caddr_t)(uintptr_t)c->c_daddr, c->c_len, 1);
-			if (status != RDMA_SUCCESS)
-				return (status);
-			c = c->c_next;
-		}
-	}
-	return (RDMA_SUCCESS);
+	return (RDMA_BUF_ALLOC(conn, rbuf));
 }
 
 void
@@ -326,14 +391,8 @@
 	if (!rbuf || rbuf->addr == NULL) {
 		return;
 	}
-	if (rbuf->type != CHUNK_BUFFER) {
-		/* pool buffer */
-		RDMA_BUF_FREE(conn, rbuf);
-	} else {
-		kmem_free(rbuf->addr, rbuf->len);
-	}
-	rbuf->addr = NULL;
-	rbuf->len = 0;
+	RDMA_BUF_FREE(conn, rbuf);
+	bzero(rbuf, sizeof (rdma_buf_t));
 }
 
 /*
@@ -369,6 +428,11 @@
 
 	/* success */
 	rdma_kstat_init();
+
+	clist_cache = kmem_cache_create("rdma_clist",
+	    sizeof (struct clist), _POINTER_ALIGNMENT, NULL,
+	    NULL, NULL, NULL, 0, 0);
+
 	return (0);
 }
 
--- a/usr/src/uts/common/rpc/rpc_prot.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/rpc/rpc_prot.c	Thu Aug 21 18:01:07 2008 -0500
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 1996 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -32,8 +31,6 @@
  * under license from the Regents of the University of California.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * rpc_prot.c
  * This set of routines implements the rpc message definition,
@@ -281,8 +278,8 @@
 			bcopy(oa->oa_base, buf, oa->oa_length);
 			buf = (int32_t *)(((caddr_t)buf) + oa->oa_length);
 			if ((rndup = (rndup - oa->oa_length)) > 0) {
-			    bzero(buf, rndup);
-			    buf = (int32_t *)(((caddr_t)buf) + rndup);
+				bzero(buf, rndup);
+				buf = (int32_t *)(((caddr_t)buf) + rndup);
 			}
 		}
 		/*
@@ -443,7 +440,6 @@
 {
 	if (msg->rm_direction == REPLY &&
 	    msg->rm_reply.rp_stat == MSG_ACCEPTED &&
-	    msg->acpted_rply.ar_stat == SUCCESS &&
 	    msg->acpted_rply.ar_verf.oa_base != NULL) {
 		xdrs->x_op = XDR_FREE;
 		return (xdr_opaque_auth(xdrs, &(msg->acpted_rply.ar_verf)));
--- a/usr/src/uts/common/rpc/rpc_rdma.h	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/rpc/rpc_rdma.h	Thu Aug 21 18:01:07 2008 -0500
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,27 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2007, The Ohio State University. All rights reserved.
+ *
+ * Portions of this source code were developed by the team members of
+ * The Ohio State University's Network-Based Computing Laboratory (NBCL),
+ * headed by Professor Dhabaleswar K. (DK) Panda.
+ *
+ * Acknowledgements for contributions from developers:
+ *   Ranjit Noronha: [email protected]
+ *   Lei Chai      : [email protected]
+ *   Weikuan Yu    : [email protected]
+ *
+ */
+
 #ifndef	_RPC_RPC_RDMA_H
 #define	_RPC_RPC_RDMA_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <rpc/rpc.h>
 #include <rpc/rpc_sztypes.h>
 #include <sys/sunddi.h>
@@ -38,14 +49,19 @@
 extern "C" {
 #endif
 
-#define	RPCRDMA_VERS	0	/* Version of the RPC over RDMA protocol */
+#define	RPCRDMA_VERS	1	/* Version of the RPC over RDMA protocol */
 #define	RDMATF_VERS	1	/* Version of the API used by RPC for RDMA */
 #define	RDMATF_VERS_1	1	/* Current version of RDMATF */
 
 /*
  * The size of an RPC call or reply message
  */
-#define	RPC_MSG_SZ  1024
+#define	RPC_MSG_SZ	1024
+
+/*
+ * RDMA chunk size
+ */
+#define	RDMA_MINCHUNK	1024
 
 /*
  * Storage for a chunk list
@@ -69,7 +85,58 @@
  * RDMA xdr buffer control and other control flags. Add new flags here,
  * set them in private structure for xdr over RDMA in xdr_rdma.c
  */
-#define	RDMA_NOCHUNK		0x1
+#define	XDR_RDMA_CHUNK			0x1
+#define	XDR_RDMA_WLIST_REG		0x2
+#define	XDR_RDMA_RLIST_REG		0x4
+
+#define	LONG_REPLY_LEN	65536
+#define	WCL_BUF_LEN	32768
+#define	RCL_BUF_LEN	32768
+
+
+#define	RDMA_BUFS_RQST	34	/* Num bufs requested by client */
+#define	RDMA_BUFS_GRANT	32	/* Num bufs granted by server */
+
+struct xdr_ops *xdrrdma_xops(void);
+
+/*
+ * Credit Control Structures.
+ */
+typedef enum rdma_cc_type {
+	RDMA_CC_CLNT,	/* CONN is for a client */
+	RDMA_CC_SRV	/* CONN is for a server */
+} rdma_cc_type_t;
+
+/*
+ * Client side credit control data structure.
+ */
+typedef struct rdma_clnt_cred_ctrl {
+	uint32_t	clnt_cc_granted_ops;
+	uint32_t	clnt_cc_in_flight_ops;
+	kcondvar_t	clnt_cc_cv;
+} rdma_clnt_cred_ctrl_t;
+
+/*
+ * Server side credit control data structure.
+ */
+typedef struct rdma_srv_cred_ctrl {
+	uint32_t	srv_cc_buffers_granted;
+	uint32_t	srv_cc_cur_buffers_used;
+	uint32_t	srv_cc_posted;
+	uint32_t	srv_cc_max_buf_size;	/* to be determined by CCP */
+	uint32_t	srv_cc_cur_buf_size;	/* to be determined by CCP */
+} rdma_srv_cred_ctrl_t;
+
+typedef enum {
+    RPCCALL_WLIST,
+    RPCCALL_WCHUNK,
+    RPCCALL_NOWRITE
+}rpccall_write_t;
+
+typedef enum {
+	CLIST_REG_SOURCE,
+	CLIST_REG_DST
+} clist_dstsrc;
 
 /*
  * Return codes from RDMA operations
@@ -133,6 +200,32 @@
 #define	mrc_linfo	lhdl.mr.linfo
 
 /*
+ * Memory management for the RDMA buffers
+ */
+/*
+ * RDMA buffer types
+ */
+typedef enum {
+	SEND_BUFFER,	/* buf for send msg */
+	SEND_DESCRIPTOR, /* buf used for send msg descriptor in plugins only */
+	RECV_BUFFER,	/* buf for recv msg */
+	RECV_DESCRIPTOR, /* buf used for recv msg descriptor in plugins only */
+	RDMA_LONG_BUFFER /* chunk buf used in RDMATF only and not in plugins */
+} rdma_btype;
+
+/*
+ * RDMA buffer information
+ */
+typedef struct rdma_buf {
+	rdma_btype	type;	/* buffer type */
+	uint_t		len;	/* length of buffer */
+	caddr_t		addr;	/* buffer address */
+	struct mrc	handle;	/* buffer registration handle */
+	caddr_t		rb_private;
+} rdma_buf_t;
+
+
+/*
  * The XDR offset value is used by the XDR
  * routine to identify the position in the
  * RPC message where the opaque object would
@@ -151,15 +244,30 @@
 	uint32		c_len;		/* Length */
 	struct mrc	c_smemhandle;	/* src memory handle */
 	uint64 		c_ssynchandle;	/* src sync handle */
-	uint64		c_saddr;	/* src address */
+	union {
+		uint64		c_saddr;	/* src address */
+		caddr_t 	c_saddr3;
+	} w;
 	struct mrc	c_dmemhandle;	/* dst memory handle */
 	uint64		c_dsynchandle;	/* dst sync handle */
-	uint64		c_daddr;	/* dst address */
+	union {
+		uint64	c_daddr;	/* dst address */
+		caddr_t	c_daddr3;
+	} u;
+	struct as	*c_adspc;	/* address space for saddr/daddr */
+	rdma_buf_t	rb_longbuf;	/* used for long requests/replies */
 	struct clist	*c_next;	/* Next chunk */
 };
 
 typedef struct clist clist;
 
+/*
+ * max 4M wlist xfer size
+ * This is defined because the rfs3_tsize service requires
+ * svc_req struct (which we don't have that in krecv).
+ */
+#define	MAX_SVC_XFER_SIZE (4*1024*1024)
+
 enum rdma_proc {
 	RDMA_MSG	= 0,	/* chunk list and RPC msg follow */
 	RDMA_NOMSG	= 1,	/* only chunk list follows */
@@ -206,6 +314,15 @@
 	uint_t  mtu;		/* native mtu size of unlerlying network */
 } rdma_info_t;
 
+typedef enum {
+	C_IDLE		= 0x00000001,
+	C_CONN_PEND	= 0x00000002,
+	C_CONNECTED	= 0x00000004,
+	C_ERROR_CONN	= 0x00000008,
+	C_DISCONN_PEND	= 0x00000010,
+	C_REMOTE_DOWN	= 0x00000020
+} conn_c_state;
+
 /*
  * RDMA Connection information
  */
@@ -217,52 +334,52 @@
 	struct conn	*c_next;	/* next in list of connections */
 	struct conn	*c_prev;	/* prev in list of connections */
 	caddr_t		c_private;	/* transport specific stuff */
-
-#define	C_IDLE		0x80000000
-#define	C_CONN_PEND	0x40000000
-#define	C_CONNECTED	0x20000000
-#define	C_ERROR		0x10000000
-#define	C_DISCONN_PEND	0x08000000
-#define	C_REMOTE_DOWN	0x04000000
-
-	uint_t		c_state;	/* state of connection */
+	conn_c_state	c_state;	/* state of connection */
+	rdma_cc_type_t	c_cc_type;	/* client or server, for credit cntrl */
+	union {
+		rdma_clnt_cred_ctrl_t	c_clnt_cc;
+		rdma_srv_cred_ctrl_t	c_srv_cc;
+	} rdma_conn_cred_ctrl_u;
 	kmutex_t	c_lock;		/* protect c_state and c_ref fields */
 	kcondvar_t	c_cv;		/* to signal when pending is done */
 } CONN;
 
 
 /*
- * Memory management for the RDMA buffers
- */
-/*
- * RDMA buffer types
- */
-typedef enum {
-	SEND_BUFFER,	/* buf for send msg */
-	SEND_DESCRIPTOR, /* buf used for send msg descriptor in plugins only */
-	RECV_BUFFER,	/* buf for recv msg */
-	RECV_DESCRIPTOR, /* buf used for recv msg descriptor in plugins only */
-	CHUNK_BUFFER	/* chunk buf used in RDMATF only and not in plugins */
-} rdma_btype;
-
-/*
- * RDMA buffer information
- */
-typedef struct rdma_buf {
-	rdma_btype	type;	/* buffer type */
-	int		len;	/* length of buffer */
-	caddr_t		addr;	/* buffer address */
-	struct mrc	handle;	/* buffer registration handle */
-} rdma_buf_t;
-
-/*
  * Data transferred from plugin interrupt to svc_queuereq()
  */
-struct recv_data {
+typedef struct rdma_recv_data {
 	CONN		*conn;
 	int		status;
 	rdma_buf_t	rpcmsg;
-};
+} rdma_recv_data_t;
+
+/* structure used to pass information for READ over rdma write */
+typedef enum {
+	RCI_WRITE_UIO_CHUNK = 1,
+	RCI_WRITE_ADDR_CHUNK = 2,
+	RCI_REPLY_CHUNK = 3
+} rci_type_t;
+
+typedef struct {
+	rci_type_t rci_type;
+	union {
+		struct uio *rci_uiop;
+		caddr_t    rci_addr;
+	} rci_a;
+	uint32	rci_len;
+	struct clist	**rci_clpp; /* point to write chunk list in readargs */
+} rdma_chunkinfo_t;
+
+typedef struct {
+	uint_t rcil_len;
+	uint_t rcil_len_alt;
+} rdma_chunkinfo_lengths_t;
+
+typedef struct {
+	struct	clist	*rwci_wlist;
+	CONN		*rwci_conn;
+} rdma_wlist_conn_info_t;
 
 /*
  * Operations vector for RDMA transports.
@@ -279,12 +396,13 @@
 	void		(*rdma_svc_listen)(struct rdma_svc_data *);
 	void		(*rdma_svc_stop)(struct rdma_svc_data *);
 	/* Memory */
-	rdma_stat	(*rdma_regmem)(CONN *, caddr_t, uint_t, struct mrc *);
+	rdma_stat	(*rdma_regmem)(CONN *, caddr_t, caddr_t,
+			    uint_t, struct mrc *);
 	rdma_stat	(*rdma_deregmem)(CONN *, caddr_t, struct mrc);
-	rdma_stat	(*rdma_regmemsync)(CONN *, caddr_t, uint_t,
-				struct mrc *, void **);
+	rdma_stat	(*rdma_regmemsync)(CONN *, caddr_t, caddr_t, uint_t,
+				struct mrc *, void **, void *);
 	rdma_stat	(*rdma_deregmemsync)(CONN *, caddr_t, struct mrc,
-				void *);
+			    void *, void *);
 	rdma_stat	(*rdma_syncmem)(CONN *, void *, caddr_t, int, int);
 	/* Buffer */
 	rdma_stat	(*rdma_buf_alloc)(CONN *, rdma_buf_t *);
@@ -293,6 +411,7 @@
 	rdma_stat	(*rdma_send)(CONN *, clist *, uint32_t);
 	rdma_stat	(*rdma_send_resp)(CONN *, clist *, uint32_t);
 	rdma_stat	(*rdma_clnt_recvbuf)(CONN *, clist *, uint32_t);
+	rdma_stat	(*rdma_clnt_recvbuf_remove)(CONN *, uint32_t);
 	rdma_stat	(*rdma_svc_recvbuf)(CONN *, clist *);
 	rdma_stat	(*rdma_recv)(CONN *, clist **, uint32_t);
 	/* RDMA */
@@ -300,7 +419,6 @@
 	rdma_stat	(*rdma_write)(CONN *, clist *, int);
 	/* INFO */
 	rdma_stat	(*rdma_getinfo)(rdma_info_t *info);
-
 } rdmaops_t;
 
 /*
@@ -315,19 +433,20 @@
 #define	RDMA_REL_CONN(conn)	\
 	(*(conn)->c_rdmamod->rdma_ops->rdma_rel_conn)(conn)
 
-#define	RDMA_REGMEM(conn, buff, len, handle)	\
-	(*(conn)->c_rdmamod->rdma_ops->rdma_regmem)(conn, buff, len, handle)
+#define	RDMA_REGMEM(conn, adsp, buff, len, handle)	\
+	(*(conn)->c_rdmamod->rdma_ops->rdma_regmem)(conn, adsp,	\
+		buff, len, handle)
 
 #define	RDMA_DEREGMEM(conn, buff, handle)	\
 	(*(conn)->c_rdmamod->rdma_ops->rdma_deregmem)(conn, buff, handle)
 
-#define	RDMA_REGMEMSYNC(conn, buff, len, handle, synchandle)	\
-	(*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, buff, \
-	    len, handle, synchandle)
+#define	RDMA_REGMEMSYNC(conn, adsp, buff, len, handle, synchandle, lrc)	\
+	(*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, adsp, buff, \
+	len, handle, synchandle, lrc)
 
-#define	RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle)	\
-	(*(conn)->c_rdmamod->rdma_ops->rdma_deregmemsync)(conn, buff, \
-	    handle, synchandle)
+#define	RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle, lrc)	\
+	(*(conn)->c_rdmamod->rdma_ops->rdma_deregmemsync)(conn, buff,	\
+	handle, synchandle, lrc)
 
 #define	RDMA_SYNCMEM(conn, handle, buff, len, direction)	\
 	(*(conn)->c_rdmamod->rdma_ops->rdma_syncmem)(conn, handle, \
@@ -348,6 +467,9 @@
 #define	RDMA_CLNT_RECVBUF(conn, cl, xid)	\
 	(*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf)(conn, cl, xid)
 
+#define	RDMA_CLNT_RECVBUF_REMOVE(conn, xid)	\
+	(*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf_remove)(conn, xid)
+
 #define	RDMA_SVC_RECVBUF(conn, cl)	\
 	(*(conn)->c_rdmamod->rdma_ops->rdma_svc_recvbuf)(conn, cl)
 
@@ -375,19 +497,22 @@
 /*
  * General RDMA routines
  */
-extern void clist_add(struct clist **clp, uint32_t xdroff, int len,
-	struct mrc *shandle, caddr_t saddr,
-	struct mrc *dhandle, caddr_t daddr);
-extern void clist_free(struct clist *cl);
-extern int clist_register(CONN *conn, struct clist *cl, bool_t src);
-extern int clist_deregister(CONN *conn, struct clist *cl, bool_t src);
-rdma_stat rdma_clnt_postrecv(CONN *conn, uint32_t xid);
-rdma_stat rdma_svc_postrecv(CONN *conn);
-extern rdma_stat clist_syncmem(CONN *conn, struct clist *cl, bool_t src);
+extern struct clist *clist_alloc(void);
+extern void clist_add(struct clist **, uint32_t, int,
+	struct mrc *, caddr_t, struct mrc *, caddr_t);
+extern void clist_free(struct clist *);
+extern rdma_stat clist_register(CONN *conn, struct clist *cl, clist_dstsrc);
+extern rdma_stat clist_deregister(CONN *conn, struct clist *cl, clist_dstsrc);
+extern rdma_stat clist_syncmem(CONN *conn, struct clist *cl, clist_dstsrc);
+extern rdma_stat rdma_clnt_postrecv(CONN *conn, uint32_t xid);
+extern rdma_stat rdma_clnt_postrecv_remove(CONN *conn, uint32_t xid);
+extern rdma_stat rdma_svc_postrecv(CONN *conn);
 extern rdma_stat rdma_register_mod(rdma_mod_t *mod);
 extern rdma_stat rdma_unregister_mod(rdma_mod_t *mod);
-extern void rdma_buf_free(CONN *conn, rdma_buf_t *rbuf);
+extern rdma_stat rdma_buf_alloc(CONN *, rdma_buf_t *);
+extern void rdma_buf_free(CONN *, rdma_buf_t *);
 extern int rdma_modload();
+extern bool_t   rdma_get_wchunk(struct svc_req *, iovec_t *, struct clist *);
 
 /*
  * RDMA XDR
@@ -395,14 +520,30 @@
 extern void xdrrdma_create(XDR *, caddr_t, uint_t, int, struct clist *,
 	enum xdr_op, CONN *);
 extern void xdrrdma_destroy(XDR *);
-extern struct clist *xdrrdma_clist(XDR *);
+
 extern uint_t xdrrdma_getpos(XDR *);
 extern bool_t xdrrdma_setpos(XDR *, uint_t);
 extern bool_t xdr_clist(XDR *, clist *);
 extern bool_t xdr_do_clist(XDR *, clist **);
 extern uint_t xdr_getbufsize(XDR *);
-unsigned int xdrrdma_sizeof(xdrproc_t func, void *data, int min_chunk);
-unsigned int xdrrdma_authsize(AUTH *auth, struct cred *cred, int min_chunk);
+extern unsigned int xdrrdma_sizeof(xdrproc_t, void *, int, uint_t *, uint_t *);
+extern unsigned int xdrrdma_authsize(AUTH *, struct cred *, int);
+
+extern void xdrrdma_store_wlist(XDR *, struct clist *);
+extern struct clist *xdrrdma_wclist(XDR *);
+extern bool_t xdr_decode_reply_wchunk(XDR *, struct clist **);
+extern bool_t xdr_decode_wlist(XDR *xdrs, struct clist **, bool_t *);
+extern bool_t xdr_decode_wlist_svc(XDR *xdrs, struct clist **, bool_t *,
+	uint32_t *, CONN *);
+extern bool_t xdr_encode_rlist_svc(XDR *, clist *);
+extern bool_t xdr_encode_wlist(XDR *, clist *);
+extern bool_t xdr_encode_reply_wchunk(XDR *, struct clist *,
+		uint32_t seg_array_len);
+bool_t xdrrdma_getrdmablk(XDR *, struct clist **, uint_t *,
+	CONN **conn, const uint_t);
+bool_t xdrrdma_read_from_client(struct clist **, CONN **, uint_t);
+bool_t xdrrdma_send_read_data(XDR *, struct clist *);
+bool_t xdrrdma_free_clist(CONN *, struct clist *);
 #endif /* _KERNEL */
 
 #ifdef __cplusplus
--- a/usr/src/uts/common/rpc/rpcib.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/rpc/rpcib.c	Thu Aug 21 18:01:07 2008 -0500
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,11 +19,24 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Copyright (c) 2007, The Ohio State University. All rights reserved.
+ *
+ * Portions of this source code were developed by the team members of
+ * The Ohio State University's Network-Based Computing Laboratory (NBCL),
+ * headed by Professor Dhabaleswar K. (DK) Panda.
+ *
+ * Acknowledgements for contributions from developers:
+ *   Ranjit Noronha: [email protected]
+ *   Lei Chai      : [email protected]
+ *   Weikuan Yu    : [email protected]
+ *
+ */
 
 /*
  * The rpcib plugin. Implements the interface for RDMATF's
@@ -56,7 +68,9 @@
 #include <sys/callb.h>
 #include <sys/sunddi.h>
 #include <sys/sunndi.h>
-
+#include <sys/sunldi.h>
+#include <sys/sdt.h>
+#include <sys/dlpi.h>
 #include <sys/ib/ibtl/ibti.h>
 #include <rpc/rpc.h>
 #include <rpc/ib.h>
@@ -70,7 +84,13 @@
 #include <sys/tiuser.h>
 #include <net/if.h>
 #include <sys/cred.h>
-
+#include <rpc/rpc_rdma.h>
+
+#include <nfs/nfs.h>
+#include <sys/kstat.h>
+#include <sys/atomic.h>
+
+#define	NFS_RDMA_PORT	2050
 
 extern char *inet_ntop(int, const void *, char *, int);
 
@@ -81,9 +101,30 @@
 
 static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
 static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
-			    void *, void **);
+				void *, void **);
 static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
-
+static int	rpcib_is_ib_interface(char *);
+static int	rpcib_dl_info(ldi_handle_t, dl_info_ack_t *);
+static int	rpcib_do_ip_ioctl(int, int, caddr_t);
+static boolean_t	rpcib_get_ib_addresses(struct sockaddr_in *,
+			struct sockaddr_in6 *, uint_t *, uint_t *);
+static	uint_t rpcib_get_number_interfaces(void);
+static int rpcib_cache_kstat_update(kstat_t *, int);
+static void rib_force_cleanup(void *);
+
+struct {
+	kstat_named_t cache_limit;
+	kstat_named_t cache_allocation;
+	kstat_named_t cache_hits;
+	kstat_named_t cache_misses;
+	kstat_named_t cache_misses_above_the_limit;
+} rpcib_kstat = {
+	{"cache_limit",			KSTAT_DATA_UINT64 },
+	{"cache_allocation",		KSTAT_DATA_UINT64 },
+	{"cache_hits",			KSTAT_DATA_UINT64 },
+	{"cache_misses",		KSTAT_DATA_UINT64 },
+	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
+};
 
 /* rpcib cb_ops */
 static struct cb_ops rpcib_cbops = {
@@ -107,6 +148,9 @@
 	nodev			/* int (*cb_awrite)() */
 };
 
+
+
+
 /*
  * Device options
  */
@@ -130,7 +174,7 @@
 
 static struct modldrv rib_modldrv = {
 	&mod_driverops,			    /* Driver module */
-	"RPCIB plugin driver, ver %I%", /* Driver name and version */
+	"RPCIB plugin driver, ver 1.30", /* Driver name and version */
 	&rpcib_ops,		    /* Driver ops */
 };
 
@@ -140,6 +184,41 @@
 	NULL
 };
 
+typedef struct rib_lrc_entry {
+	struct rib_lrc_entry *forw;
+	struct rib_lrc_entry *back;
+	char *lrc_buf;
+
+	uint32_t lrc_len;
+	void  *avl_node;
+	bool_t registered;
+
+	struct mrc lrc_mhandle;
+	bool_t lrc_on_freed_list;
+} rib_lrc_entry_t;
+
+typedef	struct cache_struct	{
+	rib_lrc_entry_t		r;
+	uint32_t		len;
+	uint32_t		elements;
+	kmutex_t		node_lock;
+	avl_node_t		avl_link;
+} cache_avl_struct_t;
+
+
+static uint64_t 	rib_total_buffers = 0;
+uint64_t	cache_limit = 100 * 1024 * 1024;
+static volatile uint64_t	cache_allocation = 0;
+static uint64_t	cache_watermark = 80 * 1024 * 1024;
+static uint64_t	cache_hits = 0;
+static uint64_t	cache_misses = 0;
+static uint64_t	cache_cold_misses = 0;
+static uint64_t	cache_hot_misses = 0;
+static uint64_t	cache_misses_above_the_limit = 0;
+static bool_t	stats_enabled = FALSE;
+
+static uint64_t max_unsignaled_rws = 5;
+
 /*
  * rib_stat: private data pointer used when registering
  *	with the IBTF.  It is returned to the consumer
@@ -147,10 +226,10 @@
  */
 static rpcib_state_t *rib_stat = NULL;
 
-#define	RNR_RETRIES	2
+#define	RNR_RETRIES	IBT_RNR_RETRY_1
 #define	MAX_PORTS	2
 
-int preposted_rbufs = 16;
+int preposted_rbufs = RDMA_BUFS_GRANT;
 int send_threshold = 1;
 
 /*
@@ -165,22 +244,31 @@
 int		plugin_state;
 kmutex_t	plugin_state_lock;
 
+ldi_ident_t rpcib_li;
 
 /*
  * RPCIB RDMATF operations
  */
+#if defined(MEASURE_POOL_DEPTH)
+static void rib_posted_rbufs(uint32_t x) { return; }
+#endif
 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
 static rdma_stat rib_disconnect(CONN *conn);
 static void rib_listen(struct rdma_svc_data *rd);
 static void rib_listen_stop(struct rdma_svc_data *rd);
-static rdma_stat rib_registermem(CONN *conn, caddr_t buf, uint_t buflen,
-	struct mrc *buf_handle);
+static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
+	uint_t buflen, struct mrc *buf_handle);
 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
 	struct mrc buf_handle);
-static rdma_stat rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen,
-	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle);
+static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
+		caddr_t buf, uint_t buflen, struct mrc *buf_handle);
+static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
+		struct mrc buf_handle);
+static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
+	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
+	void *lrc);
 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
-	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle);
+	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
 	caddr_t buf, int len, int cpu);
 
@@ -194,6 +282,7 @@
 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
+static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
@@ -202,20 +291,19 @@
 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
 static rdma_stat rib_conn_release(CONN *conn);
 static rdma_stat rib_getinfo(rdma_info_t *info);
-static rdma_stat rib_register_ats(rib_hca_t *);
-static void rib_deregister_ats();
+
+static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
+static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
+static void rib_destroy_cache(rib_hca_t *hca);
+static	void	rib_server_side_cache_reclaim(void *argp);
+static int avl_compare(const void *t1, const void *t2);
+
 static void rib_stop_services(rib_hca_t *);
+static void rib_close_channels(rib_conn_list_t *);
 
 /*
  * RPCIB addressing operations
  */
-char ** get_ip_addrs(int *count);
-int get_interfaces(TIUSER *tiptr, int *num);
-int find_addrs(TIUSER *tiptr, char **addrs, int num_ifs);
-int get_ibd_ipaddr(rpcib_ibd_insts_t *);
-rpcib_ats_t *get_ibd_entry(ib_gid_t *, ib_pkey_t, rpcib_ibd_insts_t *);
-void rib_get_ibd_insts(rpcib_ibd_insts_t *);
-
 
 /*
  * RDMA operations the RPCIB module exports
@@ -236,11 +324,12 @@
 	rib_send,
 	rib_send_resp,
 	rib_post_resp,
+	rib_post_resp_remove,
 	rib_post_recv,
 	rib_recv,
 	rib_read,
 	rib_write,
-	rib_getinfo
+	rib_getinfo,
 };
 
 /*
@@ -260,9 +349,12 @@
 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
-static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
-	ibt_mr_hdl_t *, ibt_mr_desc_t *);
-static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *);
+static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
+	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
+static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
+	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
+static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *,
+	ibt_ip_addr_t *, ibt_ip_addr_t *);
 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
 	rib_qp_t **);
 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
@@ -284,8 +376,8 @@
 static void rib_free_wid(struct recv_wid *);
 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
 static void rib_detach_hca(rib_hca_t *);
-static rdma_stat rib_chk_srv_ats(rib_hca_t *, struct netbuf *, int,
-	ibt_path_info_t *);
+static rdma_stat rib_chk_srv_ibaddr(struct netbuf *, int,
+	ibt_path_info_t *, ibt_ip_addr_t *, ibt_ip_addr_t *);
 
 /*
  * Registration with IBTF as a consumer
@@ -317,11 +409,12 @@
  */
 int rib_debug = 0;
 
-static int ats_running = 0;
+
 int
 _init(void)
 {
 	int		error;
+	int ret;
 
 	error = mod_install((struct modlinkage *)&rib_modlinkage);
 	if (error != 0) {
@@ -330,6 +423,9 @@
 		 */
 		return (error);
 	}
+	ret = ldi_ident_from_mod(&rib_modlinkage, &rpcib_li);
+	if (ret != 0)
+		rpcib_li = NULL;
 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
 
 	return (0);
@@ -344,8 +440,6 @@
 		return (EBUSY);
 	}
 
-	rib_deregister_ats();
-
 	/*
 	 * Remove module
 	 */
@@ -354,6 +448,7 @@
 		return (status);
 	}
 	mutex_destroy(&plugin_state_lock);
+	ldi_ident_release(rpcib_li);
 	return (0);
 }
 
@@ -444,7 +539,8 @@
 	}
 
 	ibt_status = ibt_attach(&rib_modinfo, dip,
-			(void *)rib_stat, &rib_stat->ibt_clnt_hdl);
+	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
+
 	if (ibt_status != IBT_SUCCESS) {
 		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
 		mutex_destroy(&rib_stat->open_hca_lock);
@@ -517,53 +613,6 @@
 }
 
 
-static void
-rib_deregister_ats()
-{
-	rib_hca_t		*hca;
-	rib_service_t		*srv_list, *to_remove;
-	ibt_status_t   		ibt_status;
-
-	/*
-	 * deregister the Address Translation Service.
-	 */
-	hca = rib_stat->hca;
-	rw_enter(&hca->service_list_lock, RW_WRITER);
-	srv_list = hca->ats_list;
-	while (srv_list != NULL) {
-		to_remove = srv_list;
-		srv_list = to_remove->srv_next;
-
-		ibt_status = ibt_deregister_ar(hca->ibt_clnt_hdl,
-				&to_remove->srv_ar);
-		if (ibt_status != IBT_SUCCESS) {
-#ifdef DEBUG
-		    if (rib_debug) {
-			cmn_err(CE_WARN, "_fini: "
-			    "ibt_deregister_ar FAILED"
-				" status: %d", ibt_status);
-		    }
-#endif
-		} else {
-		    mutex_enter(&rib_stat->open_hca_lock);
-		    ats_running = 0;
-		    mutex_exit(&rib_stat->open_hca_lock);
-#ifdef DEBUG
-		    if (rib_debug) {
-
-			cmn_err(CE_NOTE, "_fini: "
-			    "Successfully unregistered"
-			    " ATS service: %s",
-			    to_remove->srv_name);
-		    }
-#endif
-		}
-		kmem_free(to_remove, sizeof (rib_service_t));
-	}
-	hca->ats_list = NULL;
-	rw_exit(&hca->service_list_lock);
-}
-
 static void rib_rbufpool_free(rib_hca_t *, int);
 static void rib_rbufpool_deregister(rib_hca_t *, int);
 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
@@ -573,6 +622,7 @@
 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
 
+
 /*
  * One CQ pair per HCA
  */
@@ -594,7 +644,7 @@
 	    &real_size);
 	if (status != IBT_SUCCESS) {
 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
-				" status=%d", status);
+		    " status=%d", status);
 		error = RDMA_FAILED;
 		goto fail;
 	}
@@ -608,7 +658,7 @@
 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
 	if (status != IBT_SUCCESS) {
 		cmn_err(CE_WARN, "rib_create_cq: "
-			"enable_cq_notify failed, status %d", status);
+		    "enable_cq_notify failed, status %d", status);
 		error = RDMA_FAILED;
 		goto fail;
 	}
@@ -633,22 +683,24 @@
 	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
 	uint_t			size, cq_size;
 	int			i;
+	kstat_t *ksp;
+	cache_avl_struct_t example_avl_node;
+	char rssc_name[32];
 
 	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
+
 	if (ribstat->hcas == NULL)
 		ribstat->hcas = kmem_zalloc(ribstat->hca_count *
-				    sizeof (rib_hca_t), KM_SLEEP);
+		    sizeof (rib_hca_t), KM_SLEEP);
 
 	/*
 	 * Open a hca and setup for RDMA
 	 */
 	for (i = 0; i < ribstat->hca_count; i++) {
 		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
-				ribstat->hca_guids[i],
-				&ribstat->hcas[i].hca_hdl);
+		    ribstat->hca_guids[i],
+		    &ribstat->hcas[i].hca_hdl);
 		if (ibt_status != IBT_SUCCESS) {
-			cmn_err(CE_WARN, "open_hcas: ibt_open_hca (%d) "
-				"returned %d", i, ibt_status);
 			continue;
 		}
 		ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
@@ -661,9 +713,6 @@
 		 */
 		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
 		if (ibt_status != IBT_SUCCESS) {
-			cmn_err(CE_WARN, "open_hcas: ibt_query_hca "
-			    "returned %d (hca_guid 0x%llx)",
-			    ibt_status, (longlong_t)ribstat->hca_guids[i]);
 			goto fail1;
 		}
 
@@ -675,9 +724,6 @@
 		 */
 		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
 		if (ibt_status != IBT_SUCCESS) {
-			cmn_err(CE_WARN, "open_hcas: ibt_alloc_pd "
-				"returned %d (hca_guid 0x%llx)",
-				ibt_status, (longlong_t)ribstat->hca_guids[i]);
 			goto fail1;
 		}
 
@@ -685,12 +731,8 @@
 		 * query HCA ports
 		 */
 		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
-				0, &pinfop, &hca->hca_nports, &size);
+		    0, &pinfop, &hca->hca_nports, &size);
 		if (ibt_status != IBT_SUCCESS) {
-			cmn_err(CE_WARN, "open_hcas: "
-				"ibt_query_hca_ports returned %d "
-				"(hca_guid 0x%llx)",
-				ibt_status, (longlong_t)hca->hca_guid);
 			goto fail2;
 		}
 		hca->hca_ports = pinfop;
@@ -705,25 +747,25 @@
 		 * cq's will be needed.
 		 */
 		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
-				&hca->svc_rcq, ribstat);
+		    &hca->svc_rcq, ribstat);
 		if (status != RDMA_SUCCESS) {
 			goto fail3;
 		}
 
 		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
-				&hca->svc_scq, ribstat);
+		    &hca->svc_scq, ribstat);
 		if (status != RDMA_SUCCESS) {
 			goto fail3;
 		}
 
 		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
-				&hca->clnt_rcq, ribstat);
+		    &hca->clnt_rcq, ribstat);
 		if (status != RDMA_SUCCESS) {
 			goto fail3;
 		}
 
 		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
-				&hca->clnt_scq, ribstat);
+		    &hca->clnt_scq, ribstat);
 		if (status != RDMA_SUCCESS) {
 			goto fail3;
 		}
@@ -733,20 +775,63 @@
 		 * Note rib_rbuf_create also allocates memory windows.
 		 */
 		hca->recv_pool = rib_rbufpool_create(hca,
-					RECV_BUFFER, MAX_BUFS);
+		    RECV_BUFFER, MAX_BUFS);
 		if (hca->recv_pool == NULL) {
-			cmn_err(CE_WARN, "open_hcas: recv buf pool failed\n");
 			goto fail3;
 		}
 
 		hca->send_pool = rib_rbufpool_create(hca,
-					SEND_BUFFER, MAX_BUFS);
+		    SEND_BUFFER, MAX_BUFS);
 		if (hca->send_pool == NULL) {
-			cmn_err(CE_WARN, "open_hcas: send buf pool failed\n");
 			rib_rbufpool_destroy(hca, RECV_BUFFER);
 			goto fail3;
 		}
 
+		if (hca->server_side_cache == NULL) {
+			(void) sprintf(rssc_name,
+			    "rib_server_side_cache_%04d", i);
+			hca->server_side_cache = kmem_cache_create(
+			    rssc_name,
+			    sizeof (cache_avl_struct_t), 0,
+			    NULL,
+			    NULL,
+			    rib_server_side_cache_reclaim,
+			    hca, NULL, 0);
+		}
+
+		avl_create(&hca->avl_tree,
+		    avl_compare,
+		    sizeof (cache_avl_struct_t),
+		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
+		    (uint_t)(uintptr_t)&example_avl_node);
+
+		rw_init(&hca->avl_rw_lock,
+		    NULL, RW_DRIVER, hca->iblock);
+		mutex_init(&hca->cache_allocation,
+		    NULL, MUTEX_DRIVER, NULL);
+		hca->avl_init = TRUE;
+
+		/* Create kstats for the cache */
+		ASSERT(INGLOBALZONE(curproc));
+
+		if (!stats_enabled) {
+			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
+			    KSTAT_TYPE_NAMED,
+			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
+			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
+			    GLOBAL_ZONEID);
+			if (ksp) {
+				ksp->ks_data = (void *) &rpcib_kstat;
+				ksp->ks_update = rpcib_cache_kstat_update;
+				kstat_install(ksp);
+				stats_enabled = TRUE;
+			}
+		}
+		if (NULL == hca->reg_cache_clean_up) {
+			hca->reg_cache_clean_up = ddi_taskq_create(NULL,
+			    "REG_CACHE_CLEANUP", 1, TASKQ_DEFAULTPRI, 0);
+		}
+
 		/*
 		 * Initialize the registered service list and
 		 * the lock
@@ -757,9 +842,9 @@
 		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
 		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
 		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
-			hca->iblock);
+		    hca->iblock);
 		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
-			hca->iblock);
+		    hca->iblock);
 		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
 		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
 		hca->inuse = TRUE;
@@ -809,15 +894,15 @@
 
 	ibt_status = IBT_SUCCESS;
 	while (ibt_status != IBT_CQ_EMPTY) {
-	    bzero(&wc, sizeof (wc));
-	    ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
-	    if (ibt_status != IBT_SUCCESS)
+	bzero(&wc, sizeof (wc));
+	ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
+	if (ibt_status != IBT_SUCCESS)
 		return;
 
 	/*
 	 * Got a send completion
 	 */
-	    if (wc.wc_id != NULL) {	/* XXX can it be otherwise ???? */
+	if (wc.wc_id != NULL) {	/* XXX can it be otherwise ???? */
 		struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;
 		CONN	*conn = qptoc(wd->qp);
 
@@ -845,15 +930,6 @@
  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
  *    IBT_WC_WR_FLUSHED_ERR               None            None
  */
-#ifdef DEBUG
-	if (rib_debug > 1) {
-	    if (wc.wc_status != IBT_WC_SUCCESS) {
-		    cmn_err(CE_NOTE, "rib_clnt_scq_handler: "
-			"WR completed in error, wc.wc_status:%d, "
-			"wc_id:%llx\n", wc.wc_status, (longlong_t)wc.wc_id);
-	    }
-	}
-#endif
 			/*
 			 * Channel in error state. Set connection to
 			 * ERROR and cleanup will happen either from
@@ -862,10 +938,11 @@
 			wd->status = RDMA_FAILED;
 			mutex_enter(&conn->c_lock);
 			if (conn->c_state != C_DISCONN_PEND)
-				conn->c_state = C_ERROR;
+				conn->c_state = C_ERROR_CONN;
 			mutex_exit(&conn->c_lock);
 			break;
 		}
+
 		if (wd->cv_sig == 1) {
 			/*
 			 * Notify poster
@@ -879,12 +956,12 @@
 			 */
 			for (i = 0; i < wd->nsbufs; i++) {
 				rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
-					(void *)(uintptr_t)wd->sbufaddr[i]);
-			}
+				    (void *)(uintptr_t)wd->sbufaddr[i]);
+				}
 			mutex_exit(&wd->sendwait_lock);
 			(void) rib_free_sendwait(wd);
+			}
 		}
-	    }
 	}
 }
 
@@ -904,48 +981,42 @@
 
 	ibt_status = IBT_SUCCESS;
 	while (ibt_status != IBT_CQ_EMPTY) {
-	    bzero(&wc, sizeof (wc));
-	    ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
-	    if (ibt_status != IBT_SUCCESS)
-		return;
-
-	/*
-	 * Got a send completion
-	 */
-#ifdef DEBUG
-	    if (rib_debug > 1 && wc.wc_status != IBT_WC_SUCCESS) {
-		cmn_err(CE_NOTE, "rib_svc_scq_handler: WR completed in error "
-			"wc.wc_status:%d, wc_id:%llX",
-			wc.wc_status, (longlong_t)wc.wc_id);
-	    }
-#endif
-	    if (wc.wc_id != NULL) { /* XXX NULL possible ???? */
-		struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;
-
-		mutex_enter(&wd->sendwait_lock);
-		if (wd->cv_sig == 1) {
-			/*
-			 * Update completion status and notify poster
-			 */
-			if (wc.wc_status == IBT_WC_SUCCESS)
-				wd->status = RDMA_SUCCESS;
-			else
-				wd->status = RDMA_FAILED;
-			cv_signal(&wd->wait_cv);
-			mutex_exit(&wd->sendwait_lock);
-		} else {
-			/*
-			 * Poster not waiting for notification.
-			 * Free the send buffers and send_wid
-			 */
-			for (i = 0; i < wd->nsbufs; i++) {
-				rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
-					(void *)(uintptr_t)wd->sbufaddr[i]);
+		bzero(&wc, sizeof (wc));
+		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
+		if (ibt_status != IBT_SUCCESS)
+			return;
+
+		/*
+		 * Got a send completion
+		 */
+		if (wc.wc_id != NULL) { /* XXX NULL possible ???? */
+			struct send_wid *wd =
+			    (struct send_wid *)(uintptr_t)wc.wc_id;
+			mutex_enter(&wd->sendwait_lock);
+			if (wd->cv_sig == 1) {
+				/*
+				 * Update completion status and notify poster
+				 */
+				if (wc.wc_status == IBT_WC_SUCCESS)
+					wd->status = RDMA_SUCCESS;
+				else
+					wd->status = RDMA_FAILED;
+				cv_signal(&wd->wait_cv);
+				mutex_exit(&wd->sendwait_lock);
+			} else {
+				/*
+				 * Poster not waiting for notification.
+				 * Free the send buffers and send_wid
+				 */
+				for (i = 0; i < wd->nsbufs; i++) {
+					rib_rbuf_free(qptoc(wd->qp),
+					    SEND_BUFFER,
+					    (void *)(uintptr_t)wd->sbufaddr[i]);
+				}
+				mutex_exit(&wd->sendwait_lock);
+				(void) rib_free_sendwait(wd);
 			}
-			mutex_exit(&wd->sendwait_lock);
-			(void) rib_free_sendwait(wd);
 		}
-	    }
 	}
 }
 
@@ -972,77 +1043,83 @@
 		bzero(&wc, sizeof (wc));
 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
 		if (ibt_status != IBT_SUCCESS)
-		    return;
+			return;
 
 		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
 		qp = rwid->qp;
 		if (wc.wc_status == IBT_WC_SUCCESS) {
-		    XDR			inxdrs, *xdrs;
-		    uint_t		xid, vers, op, find_xid = 0;
-		    struct reply	*r;
-		    CONN *conn = qptoc(qp);
-
-		    xdrs = &inxdrs;
-		    xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
-			wc.wc_bytes_xfer, XDR_DECODE);
-		/*
-		 * Treat xid as opaque (xid is the first entity
-		 * in the rpc rdma message).
-		 */
-		    xid = *(uint32_t *)(uintptr_t)rwid->addr;
-		/* Skip xid and set the xdr position accordingly. */
-		    XDR_SETPOS(xdrs, sizeof (uint32_t));
-		    (void) xdr_u_int(xdrs, &vers);
-		    (void) xdr_u_int(xdrs, &op);
-		    XDR_DESTROY(xdrs);
-		    if (vers != RPCRDMA_VERS) {
+			XDR	inxdrs, *xdrs;
+			uint_t	xid, vers, op, find_xid = 0;
+			struct reply	*r;
+			CONN *conn = qptoc(qp);
+			uint32_t rdma_credit = 0;
+
+			xdrs = &inxdrs;
+			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
+			    wc.wc_bytes_xfer, XDR_DECODE);
 			/*
-			 * Invalid RPC/RDMA version. Cannot interoperate.
-			 * Set connection to ERROR state and bail out.
+			 * Treat xid as opaque (xid is the first entity
+			 * in the rpc rdma message).
 			 */
-			mutex_enter(&conn->c_lock);
-			if (conn->c_state != C_DISCONN_PEND)
-				conn->c_state = C_ERROR;
-			mutex_exit(&conn->c_lock);
-			rib_rbuf_free(conn, RECV_BUFFER,
-				(void *)(uintptr_t)rwid->addr);
-			rib_free_wid(rwid);
-			continue;
-		    }
-
-		    mutex_enter(&qp->replylist_lock);
-		    for (r = qp->replylist; r != NULL; r = r->next) {
-			if (r->xid == xid) {
-			    find_xid = 1;
-			    switch (op) {
-			    case RDMA_MSG:
-			    case RDMA_NOMSG:
-			    case RDMA_MSGP:
-				r->status = RDMA_SUCCESS;
-				r->vaddr_cq = rwid->addr;
-				r->bytes_xfer = wc.wc_bytes_xfer;
-				cv_signal(&r->wait_cv);
-				break;
-			    default:
+			xid = *(uint32_t *)(uintptr_t)rwid->addr;
+
+			/* Skip xid and set the xdr position accordingly. */
+			XDR_SETPOS(xdrs, sizeof (uint32_t));
+			(void) xdr_u_int(xdrs, &vers);
+			(void) xdr_u_int(xdrs, &rdma_credit);
+			(void) xdr_u_int(xdrs, &op);
+			XDR_DESTROY(xdrs);
+
+			if (vers != RPCRDMA_VERS) {
+				/*
+				 * Invalid RPC/RDMA version. Cannot
+				 * interoperate.  Set connection to
+				 * ERROR state and bail out.
+				 */
+				mutex_enter(&conn->c_lock);
+				if (conn->c_state != C_DISCONN_PEND)
+					conn->c_state = C_ERROR_CONN;
+				mutex_exit(&conn->c_lock);
+				rib_rbuf_free(conn, RECV_BUFFER,
+				    (void *)(uintptr_t)rwid->addr);
+				rib_free_wid(rwid);
+				continue;
+			}
+
+			mutex_enter(&qp->replylist_lock);
+			for (r = qp->replylist; r != NULL; r = r->next) {
+				if (r->xid == xid) {
+					find_xid = 1;
+					switch (op) {
+					case RDMA_MSG:
+					case RDMA_NOMSG:
+					case RDMA_MSGP:
+						r->status = RDMA_SUCCESS;
+						r->vaddr_cq = rwid->addr;
+						r->bytes_xfer =
+						    wc.wc_bytes_xfer;
+						cv_signal(&r->wait_cv);
+						break;
+					default:
+						rib_rbuf_free(qptoc(qp),
+						    RECV_BUFFER,
+						    (void *)(uintptr_t)
+						    rwid->addr);
+						break;
+					}
+					break;
+				}
+			}
+			mutex_exit(&qp->replylist_lock);
+			if (find_xid == 0) {
+				/* RPC caller not waiting for reply */
+
+				DTRACE_PROBE1(rpcib__i__nomatchxid1,
+				    int, xid);
+
 				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
-						(void *)(uintptr_t)rwid->addr);
-				break;
-			    }
-			    break;
+				    (void *)(uintptr_t)rwid->addr);
 			}
-		    }
-		    mutex_exit(&qp->replylist_lock);
-		    if (find_xid == 0) {
-			/* RPC caller not waiting for reply */
-#ifdef DEBUG
-			    if (rib_debug) {
-			cmn_err(CE_NOTE, "rib_clnt_rcq_handler: "
-			    "NO matching xid %u!\n", xid);
-			    }
-#endif
-			rib_rbuf_free(qptoc(qp), RECV_BUFFER,
-				(void *)(uintptr_t)rwid->addr);
-		    }
 		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
 			CONN *conn = qptoc(qp);
 
@@ -1051,7 +1128,7 @@
 			 * the posted buffer
 			 */
 			rib_rbuf_free(conn, RECV_BUFFER,
-				(void *)(uintptr_t)rwid->addr);
+			    (void *)(uintptr_t)rwid->addr);
 		} else {
 			CONN *conn = qptoc(qp);
 /*
@@ -1070,10 +1147,10 @@
 			 */
 			mutex_enter(&conn->c_lock);
 			if (conn->c_state != C_DISCONN_PEND)
-				conn->c_state = C_ERROR;
+				conn->c_state = C_ERROR_CONN;
 			mutex_exit(&conn->c_lock);
 			rib_rbuf_free(conn, RECV_BUFFER,
-				(void *)(uintptr_t)rwid->addr);
+			    (void *)(uintptr_t)rwid->addr);
 		}
 		rib_free_wid(rwid);
 	}
@@ -1084,7 +1161,7 @@
 static void
 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
 {
-	struct recv_data *rd;
+	rdma_recv_data_t *rdp;
 	rib_qp_t	*qp;
 	ibt_status_t	ibt_status;
 	ibt_wc_t	wc;
@@ -1103,110 +1180,114 @@
 		bzero(&wc, sizeof (wc));
 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
 		if (ibt_status != IBT_SUCCESS)
-		    return;
+			return;
 
 		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
 		qp = s_recvp->qp;
 		conn = qptoc(qp);
 		mutex_enter(&qp->posted_rbufs_lock);
 		qp->n_posted_rbufs--;
+#if defined(MEASURE_POOL_DEPTH)
+		rib_posted_rbufs(preposted_rbufs -  qp->n_posted_rbufs);
+#endif
 		if (qp->n_posted_rbufs == 0)
 			cv_signal(&qp->posted_rbufs_cv);
 		mutex_exit(&qp->posted_rbufs_lock);
 
 		if (wc.wc_status == IBT_WC_SUCCESS) {
-		    XDR		inxdrs, *xdrs;
-		    uint_t	xid, vers, op;
-
-		    xdrs = &inxdrs;
-		    /* s_recvp->vaddr stores data */
-		    xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
-			wc.wc_bytes_xfer, XDR_DECODE);
-
-		/*
-		 * Treat xid as opaque (xid is the first entity
-		 * in the rpc rdma message).
-		 */
-		    xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
-		/* Skip xid and set the xdr position accordingly. */
-		    XDR_SETPOS(xdrs, sizeof (uint32_t));
-		    if (!xdr_u_int(xdrs, &vers) ||
-			!xdr_u_int(xdrs, &op)) {
-			rib_rbuf_free(conn, RECV_BUFFER,
-				(void *)(uintptr_t)s_recvp->vaddr);
+			XDR	inxdrs, *xdrs;
+			uint_t	xid, vers, op;
+			uint32_t rdma_credit;
+
+			xdrs = &inxdrs;
+			/* s_recvp->vaddr stores data */
+			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
+			    wc.wc_bytes_xfer, XDR_DECODE);
+
+			/*
+			 * Treat xid as opaque (xid is the first entity
+			 * in the rpc rdma message).
+			 */
+			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
+			/* Skip xid and set the xdr position accordingly. */
+			XDR_SETPOS(xdrs, sizeof (uint32_t));
+			if (!xdr_u_int(xdrs, &vers) ||
+			    !xdr_u_int(xdrs, &rdma_credit) ||
+			    !xdr_u_int(xdrs, &op)) {
+				rib_rbuf_free(conn, RECV_BUFFER,
+				    (void *)(uintptr_t)s_recvp->vaddr);
+				XDR_DESTROY(xdrs);
+				(void) rib_free_svc_recv(s_recvp);
+				continue;
+			}
 			XDR_DESTROY(xdrs);
-#ifdef DEBUG
-			cmn_err(CE_NOTE, "rib_svc_rcq_handler: "
-			    "xdr_u_int failed for qp %p, wc_id=%llx",
-			    (void *)qp, (longlong_t)wc.wc_id);
-#endif
-			(void) rib_free_svc_recv(s_recvp);
-			continue;
-		    }
-		    XDR_DESTROY(xdrs);
-
-		    if (vers != RPCRDMA_VERS) {
-			/*
-			 * Invalid RPC/RDMA version. Drop rpc rdma message.
-			 */
-			rib_rbuf_free(conn, RECV_BUFFER,
-				(void *)(uintptr_t)s_recvp->vaddr);
-			(void) rib_free_svc_recv(s_recvp);
-			continue;
-		    }
+
+			if (vers != RPCRDMA_VERS) {
+				/*
+				 * Invalid RPC/RDMA version.
+				 * Drop rpc rdma message.
+				 */
+				rib_rbuf_free(conn, RECV_BUFFER,
+				    (void *)(uintptr_t)s_recvp->vaddr);
+				(void) rib_free_svc_recv(s_recvp);
+				continue;
+			}
 			/*
 			 * Is this for RDMA_DONE?
 			 */
-		    if (op == RDMA_DONE) {
-			rib_rbuf_free(conn, RECV_BUFFER,
-				(void *)(uintptr_t)s_recvp->vaddr);
-			/*
-			 * Wake up the thread waiting on
-			 * a RDMA_DONE for xid
-			 */
-			mutex_enter(&qp->rdlist_lock);
-			rdma_done_notify(qp, xid);
-			mutex_exit(&qp->rdlist_lock);
-			(void) rib_free_svc_recv(s_recvp);
-			continue;
-		    }
-
-		    mutex_enter(&plugin_state_lock);
-		    if (plugin_state == ACCEPT) {
-			while ((mp = allocb(sizeof (*rd), BPRI_LO)) == NULL)
-			    (void) strwaitbuf(sizeof (*rd), BPRI_LO);
-			/*
-			 * Plugin is in accept state, hence the master
-			 * transport queue for this is still accepting
-			 * requests. Hence we can call svc_queuereq to
-			 * queue this recieved msg.
-			 */
-			rd = (struct recv_data *)mp->b_rptr;
-			rd->conn = conn;
-			rd->rpcmsg.addr = (caddr_t)(uintptr_t)s_recvp->vaddr;
-			rd->rpcmsg.type = RECV_BUFFER;
-			rd->rpcmsg.len = wc.wc_bytes_xfer;
-			rd->status = wc.wc_status;
-			mutex_enter(&conn->c_lock);
-			conn->c_ref++;
-			mutex_exit(&conn->c_lock);
-			mp->b_wptr += sizeof (*rd);
-			svc_queuereq((queue_t *)rib_stat->q, mp);
-			mutex_exit(&plugin_state_lock);
-		    } else {
-			/*
-			 * The master transport for this is going
-			 * away and the queue is not accepting anymore
-			 * requests for krpc, so don't do anything, just
-			 * free the msg.
-			 */
-			mutex_exit(&plugin_state_lock);
-			rib_rbuf_free(conn, RECV_BUFFER,
-			(void *)(uintptr_t)s_recvp->vaddr);
-		    }
+			if (op == RDMA_DONE) {
+				rib_rbuf_free(conn, RECV_BUFFER,
+				    (void *)(uintptr_t)s_recvp->vaddr);
+				/*
+				 * Wake up the thread waiting on
+				 * a RDMA_DONE for xid
+				 */
+				mutex_enter(&qp->rdlist_lock);
+				rdma_done_notify(qp, xid);
+				mutex_exit(&qp->rdlist_lock);
+				(void) rib_free_svc_recv(s_recvp);
+				continue;
+			}
+
+			mutex_enter(&plugin_state_lock);
+			if (plugin_state == ACCEPT) {
+				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
+				    == NULL)
+					(void) strwaitbuf(
+					    sizeof (*rdp), BPRI_LO);
+				/*
+				 * Plugin is in accept state, hence the master
+				 * transport queue for this is still accepting
+				 * requests. Hence we can call svc_queuereq to
+				 * queue this received msg.
+				 */
+				rdp = (rdma_recv_data_t *)mp->b_rptr;
+				rdp->conn = conn;
+				rdp->rpcmsg.addr =
+				    (caddr_t)(uintptr_t)s_recvp->vaddr;
+				rdp->rpcmsg.type = RECV_BUFFER;
+				rdp->rpcmsg.len = wc.wc_bytes_xfer;
+				rdp->status = wc.wc_status;
+				mutex_enter(&conn->c_lock);
+				conn->c_ref++;
+				mutex_exit(&conn->c_lock);
+				mp->b_wptr += sizeof (*rdp);
+				svc_queuereq((queue_t *)rib_stat->q, mp);
+				mutex_exit(&plugin_state_lock);
+			} else {
+				/*
+				 * The master transport for this is going
+				 * away and the queue is not accepting anymore
+				 * requests for krpc, so don't do anything, just
+				 * free the msg.
+				 */
+				mutex_exit(&plugin_state_lock);
+				rib_rbuf_free(conn, RECV_BUFFER,
+				    (void *)(uintptr_t)s_recvp->vaddr);
+			}
 		} else {
 			rib_rbuf_free(conn, RECV_BUFFER,
-				(void *)(uintptr_t)s_recvp->vaddr);
+			    (void *)(uintptr_t)s_recvp->vaddr);
 		}
 		(void) rib_free_svc_recv(s_recvp);
 	}
@@ -1230,54 +1311,57 @@
 		ASSERT(rib_stat->hca->hca_hdl == hca_hdl);
 		rib_detach_hca(rib_stat->hca);
 #ifdef DEBUG
-	cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
+		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
 #endif
 		break;
 	}
 #ifdef DEBUG
 	case IBT_EVENT_PATH_MIGRATED:
-	cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PATH_MIGRATED\n");
+		cmn_err(CE_NOTE, "rib_async_handler(): "
+		    "IBT_EVENT_PATH_MIGRATED\n");
 		break;
 	case IBT_EVENT_SQD:
-	cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
+		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
 		break;
 	case IBT_EVENT_COM_EST:
-	cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
+		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
 		break;
 	case IBT_ERROR_CATASTROPHIC_CHAN:
-	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CATASTROPHIC_CHAN\n");
+		cmn_err(CE_NOTE, "rib_async_handler(): "
+		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
 		break;
 	case IBT_ERROR_INVALID_REQUEST_CHAN:
-	cmn_err(CE_NOTE, "rib_async_handler(): "
-		"IBT_ERROR_INVALID_REQUEST_CHAN\n");
+		cmn_err(CE_NOTE, "rib_async_handler(): "
+		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
 		break;
 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
-	cmn_err(CE_NOTE, "rib_async_handler(): "
-		"IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
+		cmn_err(CE_NOTE, "rib_async_handler(): "
+		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
 		break;
 	case IBT_ERROR_PATH_MIGRATE_REQ:
-	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PATH_MIGRATE_REQ\n");
+		cmn_err(CE_NOTE, "rib_async_handler(): "
+		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
 		break;
 	case IBT_ERROR_CQ:
-	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
+		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
 		break;
 	case IBT_ERROR_PORT_DOWN:
-	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
+		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
 		break;
 	case IBT_EVENT_PORT_UP:
-	cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
+		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
 		break;
 	case IBT_ASYNC_OPAQUE1:
-	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
+		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
 		break;
 	case IBT_ASYNC_OPAQUE2:
-	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
+		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
 		break;
 	case IBT_ASYNC_OPAQUE3:
-	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
+		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
 		break;
 	case IBT_ASYNC_OPAQUE4:
-	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
+		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
 		break;
 #endif
 	default:
@@ -1308,28 +1392,10 @@
 
 	if (status == RDMA_SUCCESS) {
 		*handle = (void *)hca;
-		/*
-		 * Register the Address translation service
-		 */
-		mutex_enter(&rib_stat->open_hca_lock);
-		if (ats_running == 0) {
-			if (rib_register_ats(rib_stat->hca)
-			    == RDMA_SUCCESS) {
-				ats_running = 1;
-				mutex_exit(&rib_stat->open_hca_lock);
-				return (RDMA_SUCCESS);
-			} else {
-				mutex_exit(&rib_stat->open_hca_lock);
-				return (RDMA_FAILED);
-			}
-		} else {
-			mutex_exit(&rib_stat->open_hca_lock);
-			return (RDMA_SUCCESS);
-		}
+		return (RDMA_SUCCESS);
 	} else {
 		*handle = NULL;
-		if (rib_debug > 2)
-		    cmn_err(CE_WARN, "rib_reachable(): ping_srv failed.\n");
+		DTRACE_PROBE(rpcib__i__pingfailed);
 		return (RDMA_FAILED);
 	}
 }
@@ -1340,6 +1406,7 @@
 {
 	rib_qp_t	*kqp = NULL;
 	CONN		*conn;
+	rdma_clnt_cred_ctrl_t *cc_info;
 
 	ASSERT(qp != NULL);
 	*qp = NULL;
@@ -1355,7 +1422,6 @@
 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
-
 	/*
 	 * Initialize
 	 */
@@ -1367,6 +1433,15 @@
 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
+	/*
+	 * Initialize the client credit control
+	 * portion of the rdmaconn struct.
+	 */
+	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
+	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
+	cc_info->clnt_cc_granted_ops = 0;
+	cc_info->clnt_cc_in_flight_ops = 0;
+	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
 
 	*qp = kqp;
 	return (RDMA_SUCCESS);
@@ -1380,8 +1455,8 @@
 	ibt_chan_sizes_t	chan_sizes;
 	ibt_rc_chan_alloc_args_t	qp_attr;
 	ibt_status_t		ibt_status;
-
-	ASSERT(qp != NULL);
+	rdma_srv_cred_ctrl_t *cc_info;
+
 	*qp = NULL;
 
 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
@@ -1409,8 +1484,8 @@
 	rw_enter(&hca->state_lock, RW_READER);
 	if (hca->state != HCA_DETACHED) {
 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
-			IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
-			&chan_sizes);
+		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
+		    &chan_sizes);
 	} else {
 		rw_exit(&hca->state_lock);
 		goto fail;
@@ -1418,9 +1493,8 @@
 	rw_exit(&hca->state_lock);
 
 	if (ibt_status != IBT_SUCCESS) {
-		cmn_err(CE_WARN, "rib_svc_create_chan: "
-			"ibt_alloc_rc_channel failed, ibt_status=%d.",
-			ibt_status);
+		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
+		    int, ibt_status);
 		goto fail;
 	}
 
@@ -1441,7 +1515,19 @@
 	 */
 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
 	kqp->rdmaconn.c_state = C_CONNECTED;
+
+	/*
+	 * Initialize the server credit control
+	 * portion of the rdmaconn struct.
+	 */
+	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
+	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
+	cc_info->srv_cc_buffers_granted = preposted_rbufs;
+	cc_info->srv_cc_cur_buffers_used = 0;
+	cc_info->srv_cc_posted = preposted_rbufs;
+
 	*qp = kqp;
+
 	return (RDMA_SUCCESS);
 fail:
 	if (kqp)
@@ -1450,52 +1536,6 @@
 	return (RDMA_FAILED);
 }
 
-void
-rib_dump_pathrec(ibt_path_info_t *path_rec)
-{
-	ib_pkey_t	pkey;
-
-	if (rib_debug > 1) {
-	    cmn_err(CE_NOTE, "Path Record:\n");
-
-	    cmn_err(CE_NOTE, "Source HCA GUID = %llx\n",
-		(longlong_t)path_rec->pi_hca_guid);
-	    cmn_err(CE_NOTE, "Dest Service ID = %llx\n",
-		(longlong_t)path_rec->pi_sid);
-	    cmn_err(CE_NOTE, "Port Num        = %02d\n",
-		path_rec->pi_prim_cep_path.cep_hca_port_num);
-	    cmn_err(CE_NOTE, "P_Key Index     = %04d\n",
-		path_rec->pi_prim_cep_path.cep_pkey_ix);
-
-	    (void) ibt_index2pkey_byguid(path_rec->pi_hca_guid,
-			path_rec->pi_prim_cep_path.cep_hca_port_num,
-			path_rec->pi_prim_cep_path.cep_pkey_ix, &pkey);
-	    cmn_err(CE_NOTE, "P_Key		= 0x%x\n", pkey);
-
-
-	    cmn_err(CE_NOTE, "SGID:           = %llx:%llx\n",
-		(longlong_t)
-		path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_prefix,
-		(longlong_t)
-		path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_guid);
-
-	    cmn_err(CE_NOTE, "DGID:           = %llx:%llx\n",
-		(longlong_t)
-		path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_prefix,
-		(longlong_t)
-		path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_guid);
-
-	    cmn_err(CE_NOTE, "Path Rate       = %02x\n",
-		path_rec->pi_prim_cep_path.cep_adds_vect.av_srate);
-	    cmn_err(CE_NOTE, "SL              = %02x\n",
-		path_rec->pi_prim_cep_path.cep_adds_vect.av_srvl);
-	    cmn_err(CE_NOTE, "Prim Packet LT  = %02x\n",
-		path_rec->pi_prim_pkt_lt);
-	    cmn_err(CE_NOTE, "Path MTU        = %02x\n",
-		path_rec->pi_path_mtu);
-	}
-}
-
 /* ARGSUSED */
 ibt_cm_status_t
 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
@@ -1545,7 +1585,7 @@
 				break;
 			}
 
-			conn->c_state = C_ERROR;
+			conn->c_state = C_ERROR_CONN;
 
 			/*
 			 * Free the rc_channel. Channel has already
@@ -1565,14 +1605,14 @@
 				conn->c_state = C_DISCONN_PEND;
 				mutex_exit(&conn->c_lock);
 				(void) rib_disconnect_channel(conn,
-					&hca->cl_conn_list);
+				    &hca->cl_conn_list);
 			} else {
 				mutex_exit(&conn->c_lock);
 			}
 #ifdef DEBUG
 			if (rib_debug)
 				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
-					"(CONN_CLOSED) channel disconnected");
+				    "(CONN_CLOSED) channel disconnected");
 #endif
 			break;
 		}
@@ -1584,111 +1624,73 @@
 	return (IBT_CM_ACCEPT);
 }
 
-
-/* Check if server has done ATS registration */
+/* Check server ib address */
 rdma_stat
-rib_chk_srv_ats(rib_hca_t *hca, struct netbuf *raddr,
-	int addr_type, ibt_path_info_t *path)
+rib_chk_srv_ibaddr(struct netbuf *raddr,
+	int addr_type, ibt_path_info_t *path, ibt_ip_addr_t *s_ip,
+	ibt_ip_addr_t *d_ip)
 {
 	struct sockaddr_in	*sin4;
 	struct sockaddr_in6	*sin6;
-	ibt_path_attr_t		path_attr;
 	ibt_status_t		ibt_status;
-	ib_pkey_t		pkey;
-	ibt_ar_t		ar_query, ar_result;
-	rib_service_t		*ats;
-	ib_gid_t		sgid;
-	ibt_path_info_t		paths[MAX_PORTS];
-	uint8_t			npaths, i;
-
-	(void) bzero(&path_attr, sizeof (ibt_path_attr_t));
+	ibt_ip_path_attr_t	ipattr;
+	uint8_t npaths = 0;
+	ibt_path_ip_src_t	srcip;
+
+	ASSERT(raddr->buf != NULL);
+
 	(void) bzero(path, sizeof (ibt_path_info_t));
 
-	/*
-	 * Construct svc name
-	 */
-	path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
 	switch (addr_type) {
 	case AF_INET:
 		sin4 = (struct sockaddr_in *)raddr->buf;
-		(void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname,
-		    IB_SVC_NAME_LEN);
+		d_ip->family = AF_INET;
+		d_ip->un.ip4addr = htonl(sin4->sin_addr.s_addr);
 		break;
 
 	case AF_INET6:
 		sin6 = (struct sockaddr_in6 *)raddr->buf;
-		(void) inet_ntop(AF_INET6, &sin6->sin6_addr,
-		    path_attr.pa_sname, IB_SVC_NAME_LEN);
+		d_ip->family = AF_INET6;
+		d_ip->un.ip6addr = sin6->sin6_addr;
 		break;
 
 	default:
-		kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
 		return (RDMA_INVAL);
 	}
-	(void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN);
-
-	/*
-	 * Attempt a path to the server on an ATS-registered port.
-	 * Try all ATS-registered ports until one succeeds.
-	 * The first one that succeeds will be used to connect
-	 * to the server.  If none of them succeed, return RDMA_FAILED.
-	 */
-	rw_enter(&hca->state_lock, RW_READER);
-	if (hca->state != HCA_DETACHED) {
-	    rw_enter(&hca->service_list_lock, RW_READER);
-	    for (ats = hca->ats_list; ats != NULL; ats = ats->srv_next) {
-		path_attr.pa_hca_guid = hca->hca_guid;
-		path_attr.pa_hca_port_num = ats->srv_port;
-		ibt_status = ibt_get_paths(hca->ibt_clnt_hdl,
-			IBT_PATH_MULTI_SVC_DEST, &path_attr, 2, paths, &npaths);
-		if (ibt_status == IBT_SUCCESS ||
-			ibt_status == IBT_INSUFF_DATA) {
-		    for (i = 0; i < npaths; i++) {
-			if (paths[i].pi_hca_guid) {
-			/*
-			 * do ibt_query_ar()
-			 */
-			    sgid =
-				paths[i].pi_prim_cep_path.cep_adds_vect.av_sgid;
-
-			    (void) ibt_index2pkey_byguid(paths[i].pi_hca_guid,
-				paths[i].pi_prim_cep_path.cep_hca_port_num,
-				paths[i].pi_prim_cep_path.cep_pkey_ix, &pkey);
-
-			    bzero(&ar_query, sizeof (ar_query));
-			    bzero(&ar_result, sizeof (ar_result));
-			    ar_query.ar_gid =
-				paths[i].pi_prim_cep_path.cep_adds_vect.av_dgid;
-			    ar_query.ar_pkey = pkey;
-			    ibt_status = ibt_query_ar(&sgid, &ar_query,
-					&ar_result);
-			    if (ibt_status == IBT_SUCCESS) {
-#ifdef DEBUG
-				if (rib_debug > 1)
-				    rib_dump_pathrec(&paths[i]);
-#endif
-				bcopy(&paths[i], path,
-					sizeof (ibt_path_info_t));
-				rw_exit(&hca->service_list_lock);
-				kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
-				rw_exit(&hca->state_lock);
-				return (RDMA_SUCCESS);
-			    }
-#ifdef DEBUG
-			    if (rib_debug) {
-				cmn_err(CE_NOTE, "rib_chk_srv_ats: "
-				    "ibt_query_ar FAILED, return\n");
-			    }
-#endif
-			}
-		    }
-		}
-	    }
-	    rw_exit(&hca->service_list_lock);
+
+	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
+	bzero(&srcip, sizeof (ibt_path_ip_src_t));
+
+	ipattr.ipa_dst_ip 	= d_ip;
+	ipattr.ipa_hca_guid 	= rib_stat->hca->hca_guid;
+	ipattr.ipa_ndst		= 1;
+	ipattr.ipa_max_paths	= 1;
+	npaths = 0;
+
+	ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
+	    IBT_PATH_NO_FLAGS,
+	    &ipattr,
+	    path,
+	    &npaths,
+	    &srcip);
+
+	if (ibt_status != IBT_SUCCESS ||
+	    npaths < 1 ||
+	    path->pi_hca_guid != rib_stat->hca->hca_guid) {
+
+		bzero(s_ip, sizeof (ibt_path_ip_src_t));
+		return (RDMA_FAILED);
 	}
-	kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
-	rw_exit(&hca->state_lock);
-	return (RDMA_FAILED);
+
+	if (srcip.ip_primary.family == AF_INET) {
+		s_ip->family = AF_INET;
+		s_ip->un.ip4addr = htonl(srcip.ip_primary.un.ip4addr);
+	} else {
+		s_ip->family = AF_INET6;
+		s_ip->un.ip6addr = srcip.ip_primary.un.ip6addr;
+	}
+
+	return (RDMA_SUCCESS);
 }
 
 
@@ -1696,7 +1698,8 @@
  * Connect to the server.
  */
 rdma_stat
-rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path)
+rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path,
+		ibt_ip_addr_t *s_ip, ibt_ip_addr_t *d_ip)
 {
 	ibt_chan_open_args_t	chan_args;	/* channel args */
 	ibt_chan_sizes_t	chan_sizes;
@@ -1704,9 +1707,41 @@
 	ibt_status_t		ibt_status;
 	ibt_rc_returns_t	ret_args;   	/* conn reject info */
 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
+	ibt_ip_cm_info_t	ipcm_info;
+	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
+
 
 	(void) bzero(&chan_args, sizeof (chan_args));
 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
+	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
+
+	switch (ipcm_info.src_addr.family = s_ip->family) {
+	case AF_INET:
+		ipcm_info.src_addr.un.ip4addr = s_ip->un.ip4addr;
+		break;
+	case AF_INET6:
+		ipcm_info.src_addr.un.ip6addr = s_ip->un.ip6addr;
+		break;
+	}
+
+	switch (ipcm_info.dst_addr.family = d_ip->family) {
+	case AF_INET:
+		ipcm_info.dst_addr.un.ip4addr = d_ip->un.ip4addr;
+		break;
+	case AF_INET6:
+		ipcm_info.dst_addr.un.ip6addr = d_ip->un.ip6addr;
+		break;
+	}
+
+	ipcm_info.src_port = NFS_RDMA_PORT;
+
+	ibt_status = ibt_format_ip_private_data(&ipcm_info,
+	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
+
+	if (ibt_status != IBT_SUCCESS) {
+		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
+		return (-1);
+	}
 
 	qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num;
 	/* Alloc a RC channel */
@@ -1721,20 +1756,24 @@
 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
 	qp_attr.rc_flags = IBT_WR_SIGNALED;
 
+	path->pi_sid = ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT);
 	chan_args.oc_path = path;
 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
 	chan_args.oc_cm_clnt_private = (void *)rib_stat;
-	chan_args.oc_rdma_ra_out = 1;
-	chan_args.oc_rdma_ra_in = 1;
+	chan_args.oc_rdma_ra_out = 4;
+	chan_args.oc_rdma_ra_in = 4;
 	chan_args.oc_path_retry_cnt = 2;
 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
+	chan_args.oc_priv_data = cmp_ip_pvt;
+	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
 
 refresh:
 	rw_enter(&hca->state_lock, RW_READER);
 	if (hca->state != HCA_DETACHED) {
 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
-			IBT_ACHAN_NO_FLAGS, &qp_attr, &qp->qp_hdl,
-			&chan_sizes);
+		    IBT_ACHAN_NO_FLAGS,
+		    &qp_attr, &qp->qp_hdl,
+		    &chan_sizes);
 	} else {
 		rw_exit(&hca->state_lock);
 		return (RDMA_FAILED);
@@ -1742,10 +1781,8 @@
 	rw_exit(&hca->state_lock);
 
 	if (ibt_status != IBT_SUCCESS) {
-#ifdef DEBUG
-		cmn_err(CE_WARN, "rib_conn_to_srv: alloc_rc_channel "
-		"failed, ibt_status=%d.", ibt_status);
-#endif
+		DTRACE_PROBE1(rpcib__i_conntosrv,
+		    int, ibt_status);
 		return (RDMA_FAILED);
 	}
 
@@ -1753,20 +1790,16 @@
 	(void) bzero(&ret_args, sizeof (ret_args));
 	mutex_enter(&qp->cb_lock);
 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
-			IBT_BLOCKING, &chan_args, &ret_args);
+	    IBT_BLOCKING, &chan_args, &ret_args);
 	if (ibt_status != IBT_SUCCESS) {
-#ifdef DEBUG
-		if (rib_debug)
-			cmn_err(CE_WARN, "rib_conn_to_srv: open_rc_channel"
-				" failed for qp %p, status=%d, "
-				"ret_args.rc_status=%d\n",
-				(void *)qp, ibt_status, ret_args.rc_status);
-#endif
+		DTRACE_PROBE2(rpcib__i_openrctosrv,
+		    int, ibt_status, int, ret_args.rc_status);
+
 		(void) ibt_free_channel(qp->qp_hdl);
 		qp->qp_hdl = NULL;
 		mutex_exit(&qp->cb_lock);
 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
-			ret_args.rc_status == IBT_CM_CONN_STALE) {
+		    ret_args.rc_status == IBT_CM_CONN_STALE) {
 			/*
 			 * Got IBT_CM_CONN_STALE probably because of stale
 			 * data on the passive end of a channel that existed
@@ -1789,58 +1822,123 @@
 rdma_stat
 rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca)
 {
-	struct sockaddr_in	*sin4;
-	struct sockaddr_in6	*sin6;
-	ibt_path_attr_t		path_attr;
+	struct sockaddr_in	*sin4, *sin4arr;
+	struct sockaddr_in6	*sin6, *sin6arr;
+	uint_t			nif, nif4, nif6, i;
 	ibt_path_info_t		path;
 	ibt_status_t		ibt_status;
+	uint8_t			num_paths_p;
+	ibt_ip_path_attr_t	ipattr;
+	ibt_ip_addr_t		dstip;
+	ibt_path_ip_src_t	srcip;
+
+
+	*hca = NULL;
 
 	ASSERT(raddr->buf != NULL);
 
-	bzero(&path_attr, sizeof (ibt_path_attr_t));
 	bzero(&path, sizeof (ibt_path_info_t));
-
-	/*
-	 * Conctruct svc name
-	 */
-	path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
+	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
+	bzero(&srcip, sizeof (ibt_path_ip_src_t));
+
+	/* Obtain the source IP addresses for the system */
+	nif = rpcib_get_number_interfaces();
+	sin4arr = (struct sockaddr_in *)
+	    kmem_zalloc(sizeof (struct sockaddr_in) * nif, KM_SLEEP);
+	sin6arr = (struct sockaddr_in6 *)
+	    kmem_zalloc(sizeof (struct sockaddr_in6) * nif, KM_SLEEP);
+
+	(void) rpcib_get_ib_addresses(sin4arr, sin6arr, &nif4, &nif6);
+
+	/* Are there really any IB interfaces available */
+	if (nif4 == 0 && nif6 == 0) {
+		kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif);
+		kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif);
+		return (RDMA_FAILED);
+	}
+
+	/* Prep the destination address */
 	switch (addr_type) {
 	case AF_INET:
 		sin4 = (struct sockaddr_in *)raddr->buf;
-		(void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname,
-		    IB_SVC_NAME_LEN);
+		dstip.family = AF_INET;
+		dstip.un.ip4addr = htonl(sin4->sin_addr.s_addr);
+
+		for (i = 0; i < nif4; i++) {
+			num_paths_p = 0;
+			ipattr.ipa_dst_ip 	= &dstip;
+			ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
+			ipattr.ipa_ndst		= 1;
+			ipattr.ipa_max_paths	= 1;
+			ipattr.ipa_src_ip.family = dstip.family;
+			ipattr.ipa_src_ip.un.ip4addr =
+			    htonl(sin4arr[i].sin_addr.s_addr);
+
+			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
+			    IBT_PATH_NO_FLAGS,
+			    &ipattr,
+			    &path,
+			    &num_paths_p,
+			    &srcip);
+			if (ibt_status == IBT_SUCCESS &&
+			    num_paths_p != 0 &&
+			    path.pi_hca_guid == rib_stat->hca->hca_guid) {
+				*hca = rib_stat->hca;
+
+				kmem_free(sin4arr,
+				    sizeof (struct sockaddr_in) * nif);
+				kmem_free(sin6arr,
+				    sizeof (struct sockaddr_in6) * nif);
+
+				return (RDMA_SUCCESS);
+			}
+		}
 		break;
 
 	case AF_INET6:
 		sin6 = (struct sockaddr_in6 *)raddr->buf;
-		(void) inet_ntop(AF_INET6, &sin6->sin6_addr,
-		    path_attr.pa_sname, IB_SVC_NAME_LEN);
+		dstip.family = AF_INET6;
+		dstip.un.ip6addr = sin6->sin6_addr;
+
+		for (i = 0; i < nif6; i++) {
+			num_paths_p = 0;
+			ipattr.ipa_dst_ip 	= &dstip;
+			ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
+			ipattr.ipa_ndst		= 1;
+			ipattr.ipa_max_paths	= 1;
+			ipattr.ipa_src_ip.family = dstip.family;
+			ipattr.ipa_src_ip.un.ip6addr = sin6arr[i].sin6_addr;
+
+			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
+			    IBT_PATH_NO_FLAGS,
+			    &ipattr,
+			    &path,
+			    &num_paths_p,
+			    &srcip);
+			if (ibt_status == IBT_SUCCESS &&
+			    num_paths_p != 0 &&
+			    path.pi_hca_guid == rib_stat->hca->hca_guid) {
+				*hca = rib_stat->hca;
+
+				kmem_free(sin4arr,
+				    sizeof (struct sockaddr_in) * nif);
+				kmem_free(sin6arr,
+				    sizeof (struct sockaddr_in6) * nif);
+
+				return (RDMA_SUCCESS);
+			}
+		}
+
 		break;
 
 	default:
-#ifdef	DEBUG
-	    if (rib_debug) {
-		cmn_err(CE_WARN, "rib_ping_srv: Address not recognized\n");
-	    }
-#endif
-		kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
+		kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif);
+		kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif);
 		return (RDMA_INVAL);
 	}
-	(void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN);
-
-	ibt_status = ibt_get_paths(rib_stat->ibt_clnt_hdl,
-		IBT_PATH_NO_FLAGS, &path_attr, 1, &path, NULL);
-	kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
-	if (ibt_status != IBT_SUCCESS) {
-	    if (rib_debug > 1) {
-		cmn_err(CE_WARN, "rib_ping_srv: ibt_get_paths FAILED!"
-			" status=%d\n", ibt_status);
-	    }
-	} else if (path.pi_hca_guid) {
-		ASSERT(path.pi_hca_guid == rib_stat->hca->hca_guid);
-		*hca = rib_stat->hca;
-		return (RDMA_SUCCESS);
-	}
+
+	kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif);
+	kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif);
 	return (RDMA_FAILED);
 }
 
@@ -1860,6 +1958,7 @@
 	hca = qp->hca;
 	if (conn_list != NULL)
 		(void) rib_rm_conn(conn, conn_list);
+
 	if (qp->qp_hdl != NULL) {
 		/*
 		 * If the channel has not been establised,
@@ -1868,10 +1967,10 @@
 		 * called.  The channel is then freed.
 		 */
 		if (conn_list != NULL)
-		    (void) ibt_close_rc_channel(qp->qp_hdl,
-			IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
+			(void) ibt_close_rc_channel(qp->qp_hdl,
+			    IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
 		else
-		    (void) ibt_flush_channel(qp->qp_hdl);
+			(void) ibt_flush_channel(qp->qp_hdl);
 
 		mutex_enter(&qp->posted_rbufs_lock);
 		while (qp->n_posted_rbufs)
@@ -1880,7 +1979,9 @@
 		(void) ibt_free_channel(qp->qp_hdl);
 		qp->qp_hdl = NULL;
 	}
+
 	ASSERT(qp->rdlist == NULL);
+
 	if (qp->replylist != NULL) {
 		(void) rib_rem_replylist(qp);
 	}
@@ -1902,6 +2003,16 @@
 	if (conn->c_laddr.buf != NULL) {
 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
 	}
+
+	/*
+	 * Credit control cleanup.
+	 */
+	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
+		rdma_clnt_cred_ctrl_t *cc_info;
+		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
+		cv_destroy(&cc_info->clnt_cc_cv);
+	}
+
 	kmem_free(qp, sizeof (rib_qp_t));
 
 	/*
@@ -1914,7 +2025,8 @@
 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
 			if (hca->srv_conn_list.conn_hd == NULL) {
 				rw_enter(&hca->cl_conn_list.conn_lock,
-					RW_READER);
+				    RW_READER);
+
 				if (hca->cl_conn_list.conn_hd == NULL) {
 					mutex_enter(&hca->inuse_lock);
 					hca->inuse = FALSE;
@@ -1927,6 +2039,7 @@
 		}
 		rw_exit(&hca->state_lock);
 	}
+
 	return (RDMA_SUCCESS);
 }
 
@@ -1950,18 +2063,16 @@
 	if (wd->status == (uint_t)SEND_WAIT) {
 		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
 		    ddi_get_lbolt();
+
 		if (qp->mode == RIB_SERVER) {
 			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
-				    &wd->sendwait_lock, timout)) > 0 &&
+			    &wd->sendwait_lock, timout)) > 0 &&
 			    wd->status == (uint_t)SEND_WAIT)
 				;
 			switch (cv_wait_ret) {
 			case -1:	/* timeout */
-#ifdef DEBUG
-				if (rib_debug > 2)
-					cmn_err(CE_WARN, "rib_sendwait: "
-					    "timed out qp %p\n", (void *)qp);
-#endif
+				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
+
 				wd->cv_sig = 0;		/* no signal needed */
 				error = RDMA_TIMEDOUT;
 				break;
@@ -1970,26 +2081,19 @@
 			}
 		} else {
 			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
-				    &wd->sendwait_lock, timout)) > 0 &&
+			    &wd->sendwait_lock, timout)) > 0 &&
 			    wd->status == (uint_t)SEND_WAIT)
 				;
 			switch (cv_wait_ret) {
 			case -1:	/* timeout */
-#ifdef DEBUG
-				if (rib_debug > 2)
-					cmn_err(CE_WARN, "rib_sendwait: "
-					    "timed out qp %p\n", (void *)qp);
-#endif
+				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
+
 				wd->cv_sig = 0;		/* no signal needed */
 				error = RDMA_TIMEDOUT;
 				break;
 			case 0:		/* interrupted */
-#ifdef DEBUG
-				if (rib_debug > 2)
-					cmn_err(CE_NOTE, "rib_sendwait:"
-					    " interrupted on qp %p\n",
-					    (void *)qp);
-#endif
+				DTRACE_PROBE(rpcib__i__clntsendwait__intr);
+
 				wd->cv_sig = 0;		/* no signal needed */
 				error = RDMA_INTR;
 				break;
@@ -2002,20 +2106,19 @@
 	if (wd->status != (uint_t)SEND_WAIT) {
 		/* got send completion */
 		if (wd->status != RDMA_SUCCESS) {
-		    error = wd->status;
-		    if (wd->status != RDMA_CONNLOST)
+			error = wd->status;
+		if (wd->status != RDMA_CONNLOST)
 			error = RDMA_FAILED;
 		}
 		for (i = 0; i < wd->nsbufs; i++) {
 			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
-				(void *)(uintptr_t)wd->sbufaddr[i]);
+			    (void *)(uintptr_t)wd->sbufaddr[i]);
 		}
 		mutex_exit(&wd->sendwait_lock);
 		(void) rib_free_sendwait(wd);
 	} else {
 		mutex_exit(&wd->sendwait_lock);
 	}
-
 	return (error);
 }
 
@@ -2050,9 +2153,9 @@
 {
 	mutex_enter(&qp->replylist_lock);
 	if (rep != NULL) {
-	    (void) rib_remreply(qp, rep);
-	    mutex_exit(&qp->replylist_lock);
-	    return (RDMA_SUCCESS);
+		(void) rib_remreply(qp, rep);
+		mutex_exit(&qp->replylist_lock);
+		return (RDMA_SUCCESS);
 	}
 	mutex_exit(&qp->replylist_lock);
 	return (RDMA_FAILED);
@@ -2065,7 +2168,7 @@
  */
 rdma_stat
 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
-	int send_sig, int cv_sig)
+	int send_sig, int cv_sig, caddr_t *swid)
 {
 	struct send_wid	*wdesc;
 	struct clist	*clp;
@@ -2075,7 +2178,9 @@
 	int		i, nds;
 	ibt_wr_ds_t	sgl[DSEG_MAX];
 	uint_t		total_msg_size;
-	rib_qp_t	*qp = ctoqp(conn);
+	rib_qp_t	*qp;
+
+	qp = ctoqp(conn);
 
 	ASSERT(cl != NULL);
 
@@ -2086,11 +2191,10 @@
 	clp = cl;
 	while (clp != NULL) {
 		if (nds >= DSEG_MAX) {
-			cmn_err(CE_WARN, "rib_send_and_wait: DSEG_MAX"
-			    " too small!");
+			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
 			return (RDMA_FAILED);
 		}
-		sgl[nds].ds_va = clp->c_saddr;
+		sgl[nds].ds_va = clp->w.c_saddr;
 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
 		sgl[nds].ds_len = clp->c_len;
 		total_msg_size += clp->c_len;
@@ -2102,9 +2206,11 @@
 		/* Set SEND_SIGNAL flag. */
 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
+		*swid = (caddr_t)wdesc;
 	} else {
 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
 		wdesc = rib_init_sendwait(msgid, 0, qp);
+		*swid = (caddr_t)wdesc;
 	}
 	wdesc->nsbufs = nds;
 	for (i = 0; i < nds; i++) {
@@ -2118,59 +2224,50 @@
 	tx_wr.wr_sgl = sgl;
 
 	mutex_enter(&conn->c_lock);
-	if (conn->c_state & C_CONNECTED) {
+	if (conn->c_state == C_CONNECTED) {
 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
 	}
-	if (((conn->c_state & C_CONNECTED) == 0) ||
-		ibt_status != IBT_SUCCESS) {
+	if (conn->c_state != C_CONNECTED ||
+	    ibt_status != IBT_SUCCESS) {
+		if (conn->c_state != C_DISCONN_PEND)
+			conn->c_state = C_ERROR_CONN;
 		mutex_exit(&conn->c_lock);
 		for (i = 0; i < nds; i++) {
 			rib_rbuf_free(conn, SEND_BUFFER,
-				(void *)(uintptr_t)wdesc->sbufaddr[i]);
+			    (void *)(uintptr_t)wdesc->sbufaddr[i]);
 		}
+
 		(void) rib_free_sendwait(wdesc);
-#ifdef DEBUG
-		if (rib_debug && ibt_status != IBT_SUCCESS)
-			cmn_err(CE_WARN, "rib_send_and_wait: ibt_post_send "
-				"failed! wr_id %llx on qpn %p, status=%d!",
-				(longlong_t)tx_wr.wr_id, (void *)qp,
-				ibt_status);
-#endif
-		return (RDMA_FAILED);
+
+		return (RDMA_CONNLOST);
 	}
 	mutex_exit(&conn->c_lock);
 
 	if (send_sig) {
-	    if (cv_sig) {
-		/*
-		 * cv_wait for send to complete.
-		 * We can fail due to a timeout or signal or
-		 * unsuccessful send.
-		 */
-		ret = rib_sendwait(qp, wdesc);
-#ifdef DEBUG
-	    if (rib_debug > 2)
-		if (ret != 0) {
-		    cmn_err(CE_WARN, "rib_send_and_wait: rib_sendwait "
-			"FAILED, rdma stat=%d, wr_id %llx, qp %p!",
-			ret, (longlong_t)tx_wr.wr_id, (void *)qp);
+		if (cv_sig) {
+			/*
+			 * cv_wait for send to complete.
+			 * We can fail due to a timeout or signal or
+			 * unsuccessful send.
+			 */
+			ret = rib_sendwait(qp, wdesc);
+
+			return (ret);
 		}
-#endif
-		return (ret);
-	    }
 	}
 
 	return (RDMA_SUCCESS);
 }
 
+
 rdma_stat
 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
 {
 	rdma_stat	ret;
+	caddr_t		wd;
 
 	/* send-wait & cv_signal */
-	ret = rib_send_and_wait(conn, cl, msgid, 1, 1);
-
+	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
 	return (ret);
 }
 
@@ -2184,42 +2281,34 @@
 	rdma_stat ret = RDMA_SUCCESS;
 	struct rdma_done_list *rd;
 	clock_t timout, cv_wait_ret;
+	caddr_t *wid = NULL;
 	rib_qp_t *qp = ctoqp(conn);
 
 	mutex_enter(&qp->rdlist_lock);
 	rd = rdma_done_add(qp, msgid);
 
 	/* No cv_signal (whether send-wait or no-send-wait) */
-	ret = rib_send_and_wait(conn, cl, msgid, 1, 0);
+	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
+
 	if (ret != RDMA_SUCCESS) {
-#ifdef DEBUG
-	    cmn_err(CE_WARN, "rib_send_resp: send_and_wait "
-		"failed, msgid %u, qp %p", msgid, (void *)qp);
-#endif
-	    rdma_done_rm(qp, rd);
-	    goto done;
+		rdma_done_rm(qp, rd);
+	} else {
+		/*
+		 * Wait for RDMA_DONE from remote end
+		 */
+		timout =
+		    drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
+		cv_wait_ret = cv_timedwait(&rd->rdma_done_cv,
+		    &qp->rdlist_lock,
+		    timout);
+
+		rdma_done_rm(qp, rd);
+
+		if (cv_wait_ret < 0) {
+			ret = RDMA_TIMEDOUT;
+		}
 	}
 
-	/*
-	 * Wait for RDMA_DONE from remote end
-	 */
-	timout = drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
-	cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock,
-	    timout);
-	rdma_done_rm(qp, rd);
-	if (cv_wait_ret < 0) {
-#ifdef DEBUG
-		if (rib_debug > 1) {
-			cmn_err(CE_WARN, "rib_send_resp: RDMA_DONE not"
-			    " recv'd for qp %p, xid:%u\n",
-			    (void *)qp, msgid);
-		}
-#endif
-		ret = RDMA_TIMEDOUT;
-		goto done;
-	}
-
-done:
 	mutex_exit(&qp->rdlist_lock);
 	return (ret);
 }
@@ -2263,11 +2352,10 @@
 	nds = 0;
 	while (cl != NULL) {
 		if (nds >= DSEG_MAX) {
-		    cmn_err(CE_WARN, "rib_clnt_post: DSEG_MAX too small!");
-		    ret = RDMA_FAILED;
-		    goto done;
+			ret = RDMA_FAILED;
+			goto done;
 		}
-		sgl[nds].ds_va = cl->c_saddr;
+		sgl[nds].ds_va = cl->w.c_saddr;
 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
 		sgl[nds].ds_len = cl->c_len;
 		cl = cl->c_next;
@@ -2275,45 +2363,42 @@
 	}
 
 	if (nds != 1) {
-	    cmn_err(CE_WARN, "rib_clnt_post: nds!=1\n");
-	    ret = RDMA_FAILED;
-	    goto done;
+		ret = RDMA_FAILED;
+		goto done;
 	}
+
 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
 	recv_wr.wr_nds = nds;
 	recv_wr.wr_sgl = sgl;
 
 	rwid = rib_create_wid(qp, &sgl[0], msgid);
 	if (rwid) {
-	    recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
+		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
 	} else {
-		cmn_err(CE_WARN, "rib_clnt_post: out of memory");
 		ret = RDMA_NORESOURCE;
 		goto done;
 	}
 	rep = rib_addreplylist(qp, msgid);
 	if (!rep) {
-		cmn_err(CE_WARN, "rib_clnt_post: out of memory");
 		rib_free_wid(rwid);
 		ret = RDMA_NORESOURCE;
 		goto done;
 	}
 
 	mutex_enter(&conn->c_lock);
-	if (conn->c_state & C_CONNECTED) {
+
+	if (conn->c_state == C_CONNECTED) {
 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
 	}
-	if (((conn->c_state & C_CONNECTED) == 0) ||
-		ibt_status != IBT_SUCCESS) {
+
+	if (conn->c_state != C_CONNECTED ||
+	    ibt_status != IBT_SUCCESS) {
+		if (conn->c_state != C_DISCONN_PEND)
+			conn->c_state = C_ERROR_CONN;
 		mutex_exit(&conn->c_lock);
-#ifdef DEBUG
-		cmn_err(CE_WARN, "rib_clnt_post: QPN %p failed in "
-		    "ibt_post_recv(), msgid=%d, status=%d",
-		    (void *)qp,  msgid, ibt_status);
-#endif
 		rib_free_wid(rwid);
 		(void) rib_rem_rep(qp, rep);
-		ret = RDMA_FAILED;
+		ret = RDMA_CONNLOST;
 		goto done;
 	}
 	mutex_exit(&conn->c_lock);
@@ -2321,8 +2406,9 @@
 
 done:
 	while (clp != NULL) {
-	    rib_rbuf_free(conn, RECV_BUFFER, (void *)(uintptr_t)clp->c_saddr);
-	    clp = clp->c_next;
+		rib_rbuf_free(conn, RECV_BUFFER,
+		    (void *)(uintptr_t)clp->w.c_saddr3);
+		clp = clp->c_next;
 	}
 	return (ret);
 }
@@ -2340,10 +2426,9 @@
 	nds = 0;
 	while (cl != NULL) {
 		if (nds >= DSEG_MAX) {
-		    cmn_err(CE_WARN, "rib_svc_post: DSEG_MAX too small!");
-		    return (RDMA_FAILED);
+			return (RDMA_FAILED);
 		}
-		sgl[nds].ds_va = cl->c_saddr;
+		sgl[nds].ds_va = cl->w.c_saddr;
 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
 		sgl[nds].ds_len = cl->c_len;
 		cl = cl->c_next;
@@ -2351,10 +2436,12 @@
 	}
 
 	if (nds != 1) {
-	    cmn_err(CE_WARN, "rib_svc_post: nds!=1\n");
-	    rib_rbuf_free(conn, RECV_BUFFER, (caddr_t)(uintptr_t)sgl[0].ds_va);
-	    return (RDMA_FAILED);
+		rib_rbuf_free(conn, RECV_BUFFER,
+		    (caddr_t)(uintptr_t)sgl[0].ds_va);
+
+		return (RDMA_FAILED);
 	}
+
 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
 	recv_wr.wr_nds = nds;
 	recv_wr.wr_sgl = sgl;
@@ -2363,21 +2450,19 @@
 	/* Use s_recvp's addr as wr id */
 	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
 	mutex_enter(&conn->c_lock);
-	if (conn->c_state & C_CONNECTED) {
+	if (conn->c_state == C_CONNECTED) {
 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
 	}
-	if (((conn->c_state & C_CONNECTED) == 0) ||
-		ibt_status != IBT_SUCCESS) {
+	if (conn->c_state != C_CONNECTED ||
+	    ibt_status != IBT_SUCCESS) {
+		if (conn->c_state != C_DISCONN_PEND)
+			conn->c_state = C_ERROR_CONN;
 		mutex_exit(&conn->c_lock);
-#ifdef DEBUG
-		cmn_err(CE_WARN, "rib_svc_post: QP %p failed in "
-		    "ibt_post_recv(), status=%d",
-		    (void *)qp, ibt_status);
-#endif
 		rib_rbuf_free(conn, RECV_BUFFER,
-			(caddr_t)(uintptr_t)sgl[0].ds_va);
+		    (caddr_t)(uintptr_t)sgl[0].ds_va);
 		(void) rib_free_svc_recv(s_recvp);
-		return (RDMA_FAILED);
+
+		return (RDMA_CONNLOST);
 	}
 	mutex_exit(&conn->c_lock);
 
@@ -2392,6 +2477,29 @@
 	return (rib_clnt_post(conn, cl, msgid));
 }
 
+/* Client */
+rdma_stat
+rib_post_resp_remove(CONN* conn, uint32_t msgid)
+{
+	rib_qp_t	*qp = ctoqp(conn);
+	struct reply	*rep;
+
+	mutex_enter(&qp->replylist_lock);
+	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
+		if (rep->xid == msgid) {
+			if (rep->vaddr_cq) {
+				rib_rbuf_free(conn, RECV_BUFFER,
+				    (caddr_t)(uintptr_t)rep->vaddr_cq);
+			}
+			(void) rib_remreply(qp, rep);
+			break;
+		}
+	}
+	mutex_exit(&qp->replylist_lock);
+
+	return (RDMA_SUCCESS);
+}
+
 /* Server */
 rdma_stat
 rib_post_recv(CONN *conn, struct clist *cl)
@@ -2425,9 +2533,10 @@
 	mutex_enter(&qp->replylist_lock);
 
 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
-	    if (rep->xid == msgid)
-		break;
+		if (rep->xid == msgid)
+			break;
 	}
+
 	if (rep != NULL) {
 		/*
 		 * If message not yet received, wait.
@@ -2435,9 +2544,11 @@
 		if (rep->status == (uint_t)REPLY_WAIT) {
 			timout = ddi_get_lbolt() +
 			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
+
 			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
-				    &qp->replylist_lock, timout)) > 0 &&
-			    rep->status == (uint_t)REPLY_WAIT);
+			    &qp->replylist_lock, timout)) > 0 &&
+			    rep->status == (uint_t)REPLY_WAIT)
+				;
 
 			switch (cv_wait_ret) {
 			case -1:	/* timeout */
@@ -2468,7 +2579,7 @@
 				 */
 				ret = rep->status;
 				rib_rbuf_free(conn, RECV_BUFFER,
-					(caddr_t)(uintptr_t)rep->vaddr_cq);
+				    (caddr_t)(uintptr_t)rep->vaddr_cq);
 			}
 		}
 		(void) rib_remreply(qp, rep);
@@ -2478,10 +2589,7 @@
 		 * reply wait list.
 		 */
 		ret = RDMA_INVAL;
-#ifdef DEBUG
-		cmn_err(CE_WARN, "rib_recv: no matching reply for "
-		    "xid %u, qp %p\n", msgid, (void *)qp);
-#endif
+		DTRACE_PROBE(rpcib__i__nomatchxid2);
 	}
 
 	/*
@@ -2498,74 +2606,90 @@
 rib_write(CONN *conn, struct clist *cl, int wait)
 {
 	ibt_send_wr_t	tx_wr;
-	int		nds;
 	int		cv_sig;
+	int		i;
 	ibt_wr_ds_t	sgl[DSEG_MAX];
 	struct send_wid	*wdesc;
 	ibt_status_t	ibt_status;
 	rdma_stat	ret = RDMA_SUCCESS;
 	rib_qp_t	*qp = ctoqp(conn);
+	uint64_t	n_writes = 0;
+	bool_t		force_wait = FALSE;
 
 	if (cl == NULL) {
-		cmn_err(CE_WARN, "rib_write: NULL clist\n");
 		return (RDMA_FAILED);
 	}
 
-	bzero(&tx_wr, sizeof (ibt_send_wr_t));
-	/*
-	 * Remote address is at the head chunk item in list.
-	 */
-	tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_daddr;
-	tx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_dmemhandle.mrc_rmr; /* rkey */
-
-	nds = 0;
-	while (cl != NULL) {
-		if (nds >= DSEG_MAX) {
-			cmn_err(CE_WARN, "rib_write: DSEG_MAX too small!");
-			return (RDMA_FAILED);
+
+	while ((cl != NULL)) {
+		if (cl->c_len > 0) {
+			bzero(&tx_wr, sizeof (ibt_send_wr_t));
+			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
+			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
+			    cl->c_dmemhandle.mrc_rmr; /* rkey */
+			sgl[0].ds_va = cl->w.c_saddr;
+			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
+			sgl[0].ds_len = cl->c_len;
+
+			if (wait) {
+				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
+				cv_sig = 1;
+			} else {
+				if (n_writes > max_unsignaled_rws) {
+					n_writes = 0;
+					force_wait = TRUE;
+					tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
+					cv_sig = 1;
+				} else {
+					tx_wr.wr_flags = IBT_WR_NO_FLAGS;
+					cv_sig = 0;
+				}
+			}
+
+			wdesc = rib_init_sendwait(0, cv_sig, qp);
+			tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
+			tx_wr.wr_opcode = IBT_WRC_RDMAW;
+			tx_wr.wr_trans = IBT_RC_SRV;
+			tx_wr.wr_nds = 1;
+			tx_wr.wr_sgl = sgl;
+
+			mutex_enter(&conn->c_lock);
+			if (conn->c_state == C_CONNECTED) {
+				ibt_status =
+				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
+			}
+			if (conn->c_state != C_CONNECTED ||
+			    ibt_status != IBT_SUCCESS) {
+				if (conn->c_state != C_DISCONN_PEND)
+					conn->c_state = C_ERROR_CONN;
+				mutex_exit(&conn->c_lock);
+				(void) rib_free_sendwait(wdesc);
+				return (RDMA_CONNLOST);
+			}
+			mutex_exit(&conn->c_lock);
+
+			/*
+			 * Wait for send to complete
+			 */
+			if (wait || force_wait) {
+				force_wait = FALSE;
+				ret = rib_sendwait(qp, wdesc);
+				if (ret != 0) {
+					return (ret);
+				}
+			} else {
+				mutex_enter(&wdesc->sendwait_lock);
+				for (i = 0; i < wdesc->nsbufs; i++) {
+					rib_rbuf_free(qptoc(qp), SEND_BUFFER,
+					    (void *)(uintptr_t)
+					    wdesc->sbufaddr[i]);
+				}
+				mutex_exit(&wdesc->sendwait_lock);
+				(void) rib_free_sendwait(wdesc);
+			}
+			n_writes ++;
 		}
-		sgl[nds].ds_va = cl->c_saddr;
-		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
-		sgl[nds].ds_len = cl->c_len;
 		cl = cl->c_next;
-		nds++;
-	}
-
-	if (wait) {
-		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
-		cv_sig = 1;
-	} else {
-		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
-		cv_sig = 0;
-	}
-
-	wdesc = rib_init_sendwait(0, cv_sig, qp);
-	tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
-	tx_wr.wr_opcode = IBT_WRC_RDMAW;
-	tx_wr.wr_trans = IBT_RC_SRV;
-	tx_wr.wr_nds = nds;
-	tx_wr.wr_sgl = sgl;
-
-	mutex_enter(&conn->c_lock);
-	if (conn->c_state & C_CONNECTED) {
-		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
-	}
-	if (((conn->c_state & C_CONNECTED) == 0) ||
-		ibt_status != IBT_SUCCESS) {
-		mutex_exit(&conn->c_lock);
-		(void) rib_free_sendwait(wdesc);
-		return (RDMA_FAILED);
-	}
-	mutex_exit(&conn->c_lock);
-
-	/*
-	 * Wait for send to complete
-	 */
-	if (wait) {
-		ret = rib_sendwait(qp, wdesc);
-		if (ret != 0) {
-			return (ret);
-		}
 	}
 	return (RDMA_SUCCESS);
 }
@@ -2577,97 +2701,82 @@
 rib_read(CONN *conn, struct clist *cl, int wait)
 {
 	ibt_send_wr_t	rx_wr;
-	int		nds;
 	int		cv_sig;
-	ibt_wr_ds_t	sgl[DSEG_MAX];	/* is 2 sufficient? */
+	int		i;
+	ibt_wr_ds_t	sgl;
 	struct send_wid	*wdesc;
 	ibt_status_t	ibt_status = IBT_SUCCESS;
 	rdma_stat	ret = RDMA_SUCCESS;
 	rib_qp_t	*qp = ctoqp(conn);
 
 	if (cl == NULL) {
-		cmn_err(CE_WARN, "rib_read: NULL clist\n");
 		return (RDMA_FAILED);
 	}
 
-	bzero(&rx_wr, sizeof (ibt_send_wr_t));
-	/*
-	 * Remote address is at the head chunk item in list.
-	 */
-	rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_saddr;
-	rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; /* rkey */
-
-	nds = 0;
 	while (cl != NULL) {
-		if (nds >= DSEG_MAX) {
-			cmn_err(CE_WARN, "rib_read: DSEG_MAX too small!");
-			return (RDMA_FAILED);
+		bzero(&rx_wr, sizeof (ibt_send_wr_t));
+		/*
+		 * Remote address is at the head chunk item in list.
+		 */
+		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
+		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
+
+		sgl.ds_va = cl->u.c_daddr;
+		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
+		sgl.ds_len = cl->c_len;
+
+		if (wait) {
+			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
+			cv_sig = 1;
+		} else {
+			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
+			cv_sig = 0;
 		}
-		sgl[nds].ds_va = cl->c_daddr;
-		sgl[nds].ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
-		sgl[nds].ds_len = cl->c_len;
+
+		wdesc = rib_init_sendwait(0, cv_sig, qp);
+		rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
+		rx_wr.wr_opcode = IBT_WRC_RDMAR;
+		rx_wr.wr_trans = IBT_RC_SRV;
+		rx_wr.wr_nds = 1;
+		rx_wr.wr_sgl = &sgl;
+
+		mutex_enter(&conn->c_lock);
+		if (conn->c_state == C_CONNECTED) {
+			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
+		}
+		if (conn->c_state != C_CONNECTED ||
+		    ibt_status != IBT_SUCCESS) {
+			if (conn->c_state != C_DISCONN_PEND)
+				conn->c_state = C_ERROR_CONN;
+			mutex_exit(&conn->c_lock);
+			(void) rib_free_sendwait(wdesc);
+			return (RDMA_CONNLOST);
+		}
+		mutex_exit(&conn->c_lock);
+
+		/*
+		 * Wait for send to complete if this is the
+		 * last item in the list.
+		 */
+		if (wait && cl->c_next == NULL) {
+			ret = rib_sendwait(qp, wdesc);
+			if (ret != 0) {
+				return (ret);
+			}
+		} else {
+			mutex_enter(&wdesc->sendwait_lock);
+			for (i = 0; i < wdesc->nsbufs; i++) {
+				rib_rbuf_free(qptoc(qp), SEND_BUFFER,
+				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
+			}
+			mutex_exit(&wdesc->sendwait_lock);
+			(void) rib_free_sendwait(wdesc);
+		}
 		cl = cl->c_next;
-		nds++;
-	}
-
-	if (wait) {
-		rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
-		cv_sig = 1;
-	} else {
-		rx_wr.wr_flags = IBT_WR_NO_FLAGS;
-		cv_sig = 0;
 	}
-
-	wdesc = rib_init_sendwait(0, cv_sig, qp);
-	rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
-	rx_wr.wr_opcode = IBT_WRC_RDMAR;
-	rx_wr.wr_trans = IBT_RC_SRV;
-	rx_wr.wr_nds = nds;
-	rx_wr.wr_sgl = sgl;
-
-	mutex_enter(&conn->c_lock);
-	if (conn->c_state & C_CONNECTED) {
-		ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
-	}
-	if (((conn->c_state & C_CONNECTED) == 0) ||
-		ibt_status != IBT_SUCCESS) {
-		mutex_exit(&conn->c_lock);
-#ifdef DEBUG
-		if (rib_debug && ibt_status != IBT_SUCCESS)
-			cmn_err(CE_WARN, "rib_read: FAILED post_sending RDMAR"
-				" wr_id %llx on qp %p, status=%d",
-				(longlong_t)rx_wr.wr_id, (void *)qp,
-				ibt_status);
-#endif
-		(void) rib_free_sendwait(wdesc);
-		return (RDMA_FAILED);
-	}
-	mutex_exit(&conn->c_lock);
-
-	/*
-	 * Wait for send to complete
-	 */
-	if (wait) {
-		ret = rib_sendwait(qp, wdesc);
-		if (ret != 0) {
-			return (ret);
-		}
-	}
-
 	return (RDMA_SUCCESS);
 }
 
-int
-is_for_ipv4(ibt_ar_t *result)
-{
-	int	i, size = sizeof (struct in_addr);
-	uint8_t	zero = 0;
-
-	for (i = 0; i < (ATS_AR_DATA_LEN - size); i++)
-		zero |= result->ar_data[i];
-	return (zero == 0);
-}
-
 /*
  * rib_srv_cm_handler()
  *    Connection Manager callback to handle RC connection requests.
@@ -2685,14 +2794,15 @@
 	rdma_stat	status = RDMA_SUCCESS;
 	int		i;
 	struct clist	cl;
-	rdma_buf_t	rdbuf;
+	rdma_buf_t	rdbuf = {0};
 	void		*buf = NULL;
-	ibt_cm_req_rcv_t	cm_req_rcv;
 	CONN		*conn;
-	ibt_status_t ibt_status;
-	ibt_ar_t	ar_query, ar_result;
-	ib_gid_t	sgid;
-
+	ibt_ip_cm_info_t	ipinfo;
+	struct sockaddr_in *s;
+	struct sockaddr_in6 *s6;
+	int sin_size = sizeof (struct sockaddr_in);
+	int in_size = sizeof (struct in_addr);
+	int sin6_size = sizeof (struct sockaddr_in6);
 
 	ASSERT(any != NULL);
 	ASSERT(event != NULL);
@@ -2719,59 +2829,22 @@
 		 * timeout on us.
 		 */
 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
-			    event->cm_event.req.req_timeout * 8, NULL, 0);
+		    event->cm_event.req.req_timeout * 8, NULL, 0);
 
 		mutex_enter(&rib_stat->open_hca_lock);
 		q = rib_stat->q;
 		mutex_exit(&rib_stat->open_hca_lock);
+
 		status = rib_svc_create_chan(hca, (caddr_t)q,
-			event->cm_event.req.req_prim_hca_port, &qp);
+		    event->cm_event.req.req_prim_hca_port, &qp);
+
 		if (status) {
-#ifdef DEBUG
-			cmn_err(CE_WARN, "rib_srv_cm_handler: "
-			    "create_channel failed %d", status);
-#endif
 			return (IBT_CM_REJECT);
 		}
-		cm_req_rcv = event->cm_event.req;
-
-#ifdef DEBUG
-		if (rib_debug > 2) {
-		    cmn_err(CE_NOTE, "rib_srv_cm_handler: "
-			"server recv'ed IBT_CM_EVENT_REQ_RCV\n");
-		    cmn_err(CE_NOTE, "\t\t SID:%llx\n",
-				(longlong_t)cm_req_rcv.req_service_id);
-		    cmn_err(CE_NOTE, "\t\t Local Port:%d\n",
-				cm_req_rcv.req_prim_hca_port);
-		    cmn_err(CE_NOTE,
-			"\t\t Remote GID:(prefix:%llx,guid:%llx)\n",
-			(longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_prefix,
-			(longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_guid);
-		    cmn_err(CE_NOTE, "\t\t Local GID:(prefix:%llx,guid:%llx)\n",
-			(longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_prefix,
-			(longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_guid);
-		    cmn_err(CE_NOTE, "\t\t Remote QPN:%u\n",
-			cm_req_rcv.req_remote_qpn);
-		    cmn_err(CE_NOTE, "\t\t Remote Q_Key:%x\n",
-			cm_req_rcv.req_remote_qkey);
-		    cmn_err(CE_NOTE, "\t\t Local QP %p (qp_hdl=%p)\n",
-			(void *)qp, (void *)qp->qp_hdl);
-		}
-
-		if (rib_debug > 2) {
-		    ibt_rc_chan_query_attr_t	chan_attrs;
-
-		    if (ibt_query_rc_channel(qp->qp_hdl, &chan_attrs)
-			== IBT_SUCCESS) {
-			cmn_err(CE_NOTE, "rib_svc_cm_handler: qp %p in "
-			    "CEP state %d\n", (void *)qp, chan_attrs.rc_state);
-		    }
-		}
-#endif
 
 		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
-		ret_args->cm_ret.rep.cm_rdma_ra_out = 1;
-		ret_args->cm_ret.rep.cm_rdma_ra_in = 1;
+		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
+		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
 		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
 
 		/*
@@ -2779,129 +2852,80 @@
 		 */
 		conn = qptoc(qp);
 		for (i = 0; i < preposted_rbufs; i++) {
-		    bzero(&rdbuf, sizeof (rdbuf));
-		    rdbuf.type = RECV_BUFFER;
-		    buf = rib_rbuf_alloc(conn, &rdbuf);
-		    if (buf == NULL) {
-			cmn_err(CE_WARN, "rib_svc_cm_handler: "
-			    "No RECV_BUFFER buf!\n");
-			(void) rib_disconnect_channel(conn, NULL);
-			return (IBT_CM_REJECT);
-		    }
-
-		    bzero(&cl, sizeof (cl));
-		    cl.c_saddr = (uintptr_t)rdbuf.addr;
-		    cl.c_len = rdbuf.len;
-		    cl.c_smemhandle.mrc_lmr = rdbuf.handle.mrc_lmr; /* lkey */
-		    cl.c_next = NULL;
-		    status = rib_post_recv(conn, &cl);
-		    if (status != RDMA_SUCCESS) {
-			cmn_err(CE_WARN, "rib_srv_cm_handler: failed "
-			    "posting RPC_REQ buf to qp %p!", (void *)qp);
-			(void) rib_disconnect_channel(conn, NULL);
-			return (IBT_CM_REJECT);
-		    }
+			bzero(&rdbuf, sizeof (rdbuf));
+			rdbuf.type = RECV_BUFFER;
+			buf = rib_rbuf_alloc(conn, &rdbuf);
+			if (buf == NULL) {
+				(void) rib_disconnect_channel(conn, NULL);
+				return (IBT_CM_REJECT);
+			}
+
+			bzero(&cl, sizeof (cl));
+			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
+			cl.c_len = rdbuf.len;
+			cl.c_smemhandle.mrc_lmr =
+			    rdbuf.handle.mrc_lmr; /* lkey */
+			cl.c_next = NULL;
+			status = rib_post_recv(conn, &cl);
+			if (status != RDMA_SUCCESS) {
+				(void) rib_disconnect_channel(conn, NULL);
+				return (IBT_CM_REJECT);
+			}
 		}
 		(void) rib_add_connlist(conn, &hca->srv_conn_list);
 
 		/*
-		 * Get the address translation service record from ATS
+		 * Get the address translation
 		 */
 		rw_enter(&hca->state_lock, RW_READER);
 		if (hca->state == HCA_DETACHED) {
-		    rw_exit(&hca->state_lock);
-		    return (IBT_CM_REJECT);
+			rw_exit(&hca->state_lock);
+			return (IBT_CM_REJECT);
 		}
 		rw_exit(&hca->state_lock);
 
-		for (i = 0; i < hca->hca_nports; i++) {
-		    ibt_status = ibt_get_port_state(hca->hca_hdl, i+1,
-					&sgid, NULL);
-		    if (ibt_status != IBT_SUCCESS) {
-			if (rib_debug) {
-			    cmn_err(CE_WARN, "rib_srv_cm_handler: "
-				"ibt_get_port_state FAILED!"
-				"status = %d\n", ibt_status);
-			}
-		    } else {
-			/*
-			 * do ibt_query_ar()
-			 */
-			bzero(&ar_query, sizeof (ar_query));
-			bzero(&ar_result, sizeof (ar_result));
-			ar_query.ar_gid = cm_req_rcv.req_prim_addr.av_dgid;
-			ar_query.ar_pkey = event->cm_event.req.req_pkey;
-			ibt_status = ibt_query_ar(&sgid, &ar_query,
-							&ar_result);
-			if (ibt_status != IBT_SUCCESS) {
-			    if (rib_debug) {
-				cmn_err(CE_WARN, "rib_srv_cm_handler: "
-				    "ibt_query_ar FAILED!"
-				    "status = %d\n", ibt_status);
-			    }
-			} else {
-			    conn = qptoc(qp);
-
-			    if (is_for_ipv4(&ar_result)) {
-				struct sockaddr_in *s;
-				int sin_size = sizeof (struct sockaddr_in);
-				int in_size = sizeof (struct in_addr);
-				uint8_t	*start_pos;
-
-				conn->c_raddr.maxlen =
-					conn->c_raddr.len = sin_size;
-				conn->c_raddr.buf = kmem_zalloc(sin_size,
-						KM_SLEEP);
-				s = (struct sockaddr_in *)conn->c_raddr.buf;
-				s->sin_family = AF_INET;
-				/*
-				 * For IPv4,  the IP addr is stored in
-				 * the last four bytes of ar_data.
-				 */
-				start_pos = ar_result.ar_data +
-					ATS_AR_DATA_LEN - in_size;
-				bcopy(start_pos, &s->sin_addr, in_size);
-				if (rib_debug > 1) {
-				    char print_addr[INET_ADDRSTRLEN];
-
-				    bzero(print_addr, INET_ADDRSTRLEN);
-				    (void) inet_ntop(AF_INET, &s->sin_addr,
-						print_addr, INET_ADDRSTRLEN);
-				    cmn_err(CE_NOTE, "rib_srv_cm_handler: "
-					"remote clnt_addr: %s\n", print_addr);
-				}
-			    } else {
-				struct sockaddr_in6 *s6;
-				int sin6_size = sizeof (struct sockaddr_in6);
-
-				conn->c_raddr.maxlen =
-					conn->c_raddr.len = sin6_size;
-				conn->c_raddr.buf = kmem_zalloc(sin6_size,
-					KM_SLEEP);
-
-				s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
-				s6->sin6_family = AF_INET6;
-				/* sin6_addr is stored in ar_data */
-				bcopy(ar_result.ar_data, &s6->sin6_addr,
-					sizeof (struct in6_addr));
-				if (rib_debug > 1) {
-				    char print_addr[INET6_ADDRSTRLEN];
-
-				    bzero(print_addr, INET6_ADDRSTRLEN);
-				    (void) inet_ntop(AF_INET6, &s6->sin6_addr,
-						print_addr, INET6_ADDRSTRLEN);
-				    cmn_err(CE_NOTE, "rib_srv_cm_handler: "
-					"remote clnt_addr: %s\n", print_addr);
-				}
-			    }
-			    return (IBT_CM_ACCEPT);
-			}
-		    }
+		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
+
+		if (ibt_get_ip_data(event->cm_priv_data_len,
+		    event->cm_priv_data,
+		    &ipinfo) != IBT_SUCCESS) {
+
+			return (IBT_CM_REJECT);
 		}
-		if (rib_debug > 1) {
-		    cmn_err(CE_WARN, "rib_srv_cm_handler: "
-				"address record query failed!");
+
+		switch (ipinfo.src_addr.family) {
+		case AF_INET:
+
+			conn->c_raddr.maxlen =
+			    conn->c_raddr.len = sin_size;
+			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
+
+			s = (struct sockaddr_in *)conn->c_raddr.buf;
+			s->sin_family = AF_INET;
+
+			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
+			    &s->sin_addr, in_size);
+
+			break;
+
+		case AF_INET6:
+
+			conn->c_raddr.maxlen =
+			    conn->c_raddr.len = sin6_size;
+			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
+
+			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
+			s6->sin6_family = AF_INET6;
+			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
+			    &s6->sin6_addr,
+			    sizeof (struct in6_addr));
+
+			break;
+
+		default:
+			return (IBT_CM_REJECT);
 		}
+
 		break;
 
 	case IBT_CM_EVENT_CONN_CLOSED:
@@ -2936,7 +2960,7 @@
 				mutex_exit(&conn->c_lock);
 				break;
 			}
-			conn->c_state = C_ERROR;
+			conn->c_state = C_ERROR_CONN;
 
 			/*
 			 * Free the rc_channel. Channel has already
@@ -2956,49 +2980,45 @@
 				conn->c_state = C_DISCONN_PEND;
 				mutex_exit(&conn->c_lock);
 				(void) rib_disconnect_channel(conn,
-					&hca->srv_conn_list);
+				    &hca->srv_conn_list);
 			} else {
 				mutex_exit(&conn->c_lock);
 			}
-#ifdef DEBUG
-			if (rib_debug)
-				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
-					" (CONN_CLOSED) channel disconnected");
-#endif
+			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
 			break;
 		}
 		break;
 	}
 	case IBT_CM_EVENT_CONN_EST:
-	/*
-	 * RTU received, hence connection established.
-	 */
+		/*
+		 * RTU received, hence connection established.
+		 */
 		if (rib_debug > 1)
 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
-				"(CONN_EST) channel established");
+			    "(CONN_EST) channel established");
 		break;
 
 	default:
-	    if (rib_debug > 2) {
-		/* Let CM handle the following events. */
-		if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
-			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
-			    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
-		} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
-			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
-			    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
-		} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
-			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
-			    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
-		} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
-			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
-			    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
-		} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
-			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
-			    "server recv'ed IBT_CM_EVENT_FAILURE\n");
+		if (rib_debug > 2) {
+			/* Let CM handle the following events. */
+			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
+				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
+				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
+			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
+				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
+				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
+			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
+				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
+				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
+			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
+				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
+				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
+			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
+				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
+				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
+			}
 		}
-	    }
-	    return (IBT_CM_REJECT);
+		return (IBT_CM_DEFAULT);
 	}
 
 	/* accept all other CM messages (i.e. let the CM handle them) */
@@ -3006,203 +3026,16 @@
 }
 
 static rdma_stat
-rib_register_ats(rib_hca_t *hca)
-{
-	ibt_hca_portinfo_t	*port_infop;
-	uint_t			port_size;
-	uint_t			pki, i, num_ports, nbinds;
-	ibt_status_t		ibt_status;
-	rib_service_t		*new_service, *temp_srv;
-	rpcib_ats_t		*atsp;
-	rpcib_ibd_insts_t	ibds;
-	ib_pkey_t		pkey;
-	ibt_ar_t		ar;	/* address record */
-
-	/*
-	 * Query all ports for the given HCA
-	 */
-	rw_enter(&hca->state_lock, RW_READER);
-	if (hca->state != HCA_DETACHED) {
-		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
-		    &num_ports, &port_size);
-		rw_exit(&hca->state_lock);
-	} else {
-		rw_exit(&hca->state_lock);
-		return (RDMA_FAILED);
-	}
-	if (ibt_status != IBT_SUCCESS) {
-#ifdef DEBUG
-	    if (rib_debug) {
-		cmn_err(CE_NOTE, "rib_register_ats: FAILED in "
-		    "ibt_query_hca_ports, status = %d\n", ibt_status);
-	    }
-#endif
-		return (RDMA_FAILED);
-	}
-
-#ifdef	DEBUG
-	if (rib_debug > 1) {
-		cmn_err(CE_NOTE, "rib_register_ats: Ports detected "
-		    "%d\n", num_ports);
-
-		for (i = 0; i < num_ports; i++) {
-			if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
-				cmn_err(CE_WARN, "rib_register_ats "
-				    "Port #: %d INACTIVE\n", i+1);
-			} else if (port_infop[i].p_linkstate ==
-			    IBT_PORT_ACTIVE) {
-				cmn_err(CE_NOTE, "rib_register_ats "
-				    "Port #: %d ACTIVE\n", i+1);
-			}
-		}
-	}
-#endif
-
-	ibds.rib_ibd_alloc = N_IBD_INSTANCES;
-	ibds.rib_ibd_cnt = 0;
-	ibds.rib_ats = (rpcib_ats_t *)kmem_zalloc(ibds.rib_ibd_alloc *
-			sizeof (rpcib_ats_t), KM_SLEEP);
-	rib_get_ibd_insts(&ibds);
-
-	if (ibds.rib_ibd_cnt == 0) {
-	    kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc *
-				sizeof (rpcib_ats_t));
-	    ibt_free_portinfo(port_infop, port_size);
-	    return (RDMA_FAILED);
-	}
-
-	/*
-	 * Get the IP addresses of active ports and
-	 * register them with ATS.  IPv4 addresses
-	 * have precedence over IPv6 addresses.
-	 */
-	if (get_ibd_ipaddr(&ibds) != 0) {
-#ifdef	DEBUG
-	    if (rib_debug > 1) {
-		cmn_err(CE_WARN, "rib_register_ats: "
-		    "get_ibd_ipaddr failed");
-	    }
-#endif
-	    kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc *
-				sizeof (rpcib_ats_t));
-	    ibt_free_portinfo(port_infop, port_size);
-	    return (RDMA_FAILED);
-	}
-
-	/*
-	 * Start ATS registration for active ports on this HCA.
-	 */
-	rw_enter(&hca->service_list_lock, RW_WRITER);
-	nbinds = 0;
-	new_service = NULL;
-	for (i = 0; i < num_ports; i++) {
-		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
-			continue;
-
-	    for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
-		pkey = port_infop[i].p_pkey_tbl[pki];
-		if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) {
-		    ar.ar_gid = port_infop[i].p_sgid_tbl[0];
-		    ar.ar_pkey = pkey;
-		    atsp = get_ibd_entry(&ar.ar_gid, pkey, &ibds);
-		    if (atsp == NULL)
-			continue;
-		/*
-		 * store the sin[6]_addr in ar_data
-		 */
-		    (void) bzero(ar.ar_data, ATS_AR_DATA_LEN);
-		    if (atsp->ras_inet_type == AF_INET) {
-			uint8_t *start_pos;
-
-			/*
-			 * The ipv4 addr goes into the last
-			 * four bytes of ar_data.
-			 */
-			start_pos = ar.ar_data + ATS_AR_DATA_LEN -
-				sizeof (struct in_addr);
-			bcopy(&atsp->ras_sin.sin_addr, start_pos,
-				sizeof (struct in_addr));
-		    } else if (atsp->ras_inet_type == AF_INET6) {
-			bcopy(&atsp->ras_sin6.sin6_addr, ar.ar_data,
-				sizeof (struct in6_addr));
-		    } else
-			continue;
-
-		    ibt_status = ibt_register_ar(hca->ibt_clnt_hdl, &ar);
-		    if (ibt_status == IBT_SUCCESS) {
-#ifdef	DEBUG
-			if (rib_debug > 1) {
-				cmn_err(CE_WARN, "rib_register_ats: "
-				    "ibt_register_ar OK on port %d", i+1);
-			}
-#endif
-			/*
-			 * Allocate and prepare a service entry
-			 */
-			new_service = kmem_zalloc(sizeof (rib_service_t),
-				KM_SLEEP);
-			new_service->srv_port = i + 1;
-			new_service->srv_ar = ar;
-			new_service->srv_next = NULL;
-
-			/*
-			 * Add to the service list for this HCA
-			 */
-			new_service->srv_next = hca->ats_list;
-			hca->ats_list = new_service;
-			new_service = NULL;
-			nbinds ++;
-		    } else {
-#ifdef	DEBUG
-			if (rib_debug > 1) {
-			    cmn_err(CE_WARN, "rib_register_ats: "
-			    "ibt_register_ar FAILED on port %d", i+1);
-			}
-#endif
-		    }
-		}
-	    }
-	}
-
-#ifdef	DEBUG
-	if (rib_debug > 1) {
-		for (temp_srv = hca->ats_list; temp_srv != NULL;
-			temp_srv = temp_srv->srv_next) {
-				cmn_err(CE_NOTE, "Service: ATS, active on"
-					" port: %d\n", temp_srv->srv_port);
-		}
-	}
-#endif
-
-	rw_exit(&hca->service_list_lock);
-	kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc * sizeof (rpcib_ats_t));
-	ibt_free_portinfo(port_infop, port_size);
-
-	if (nbinds == 0) {
-#ifdef	DEBUG
-	if (rib_debug > 1) {
-		cmn_err(CE_WARN, "rib_register_ats FAILED!\n");
-	}
-#endif
-		return (RDMA_FAILED);
-	}
-	return (RDMA_SUCCESS);
-}
-
-static rdma_stat
 rib_register_service(rib_hca_t *hca, int service_type)
 {
 	ibt_srv_desc_t		sdesc;
-	ibt_srv_bind_t		sbind;
 	ibt_hca_portinfo_t	*port_infop;
 	ib_svc_id_t		srv_id;
 	ibt_srv_hdl_t		srv_hdl;
 	uint_t			port_size;
-	uint_t			pki, i, j, num_ports, nbinds;
+	uint_t			pki, i, num_ports, nbinds;
 	ibt_status_t		ibt_status;
-	char			**addrs;
-	int			addr_count;
-	rib_service_t		*new_service, *temp_srv;
+	rib_service_t		*new_service;
 	ib_pkey_t		pkey;
 
 	/*
@@ -3218,30 +3051,22 @@
 		return (RDMA_FAILED);
 	}
 	if (ibt_status != IBT_SUCCESS) {
-#ifdef DEBUG
-		cmn_err(CE_NOTE, "rib_register_service: FAILED in "
-		    "ibt_query_hca_ports, status = %d\n", ibt_status);
-#endif
 		return (RDMA_FAILED);
 	}
 
-#ifdef	DEBUG
-	if (rib_debug > 1) {
-		cmn_err(CE_NOTE, "rib_register_service: Ports detected "
-		    "%d\n", num_ports);
-
-		for (i = 0; i < num_ports; i++) {
-			if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
-				cmn_err(CE_WARN, "rib_register_service "
-				    "Port #: %d INACTIVE\n", i+1);
-			} else if (port_infop[i].p_linkstate ==
-			    IBT_PORT_ACTIVE) {
-				cmn_err(CE_NOTE, "rib_register_service "
-				    "Port #: %d ACTIVE\n", i+1);
-			}
+	DTRACE_PROBE1(rpcib__i__regservice_numports,
+	    int, num_ports);
+
+	for (i = 0; i < num_ports; i++) {
+		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
+			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
+			    int, i+1);
+		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
+			DTRACE_PROBE1(rpcib__i__regservice__portactive,
+			    int, i+1);
 		}
 	}
-#endif
+
 	/*
 	 * Get all the IP addresses on this system to register the
 	 * given "service type" on all DNS recognized IP addrs.
@@ -3249,25 +3074,6 @@
 	 * IP addresses as its different names. For now the only
 	 * type of service we support in RPCIB is NFS.
 	 */
-	addrs = get_ip_addrs(&addr_count);
-	if (addrs == NULL) {
-#ifdef DEBUG
-		if (rib_debug) {
-		    cmn_err(CE_WARN, "rib_register_service: "
-			"get_ip_addrs failed\n");
-		}
-#endif
-		ibt_free_portinfo(port_infop, port_size);
-		return (RDMA_FAILED);
-	}
-
-#ifdef	DEBUG
-	if (rib_debug > 1) {
-		for (i = 0; i < addr_count; i++)
-			cmn_err(CE_NOTE, "addr %d: %s\n", i, addrs[i]);
-	}
-#endif
-
 	rw_enter(&hca->service_list_lock, RW_WRITER);
 	/*
 	 * Start registering and binding service to active
@@ -3282,149 +3088,65 @@
 	 * with CM to obtain a svc_id and svc_hdl.  We do not
 	 * register the service with machine's loopback address.
 	 */
-	for (j = 1; j < addr_count; j++) {
-	    (void) bzero(&srv_id, sizeof (ib_svc_id_t));
-	    (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
-	    (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
-
-	    sdesc.sd_handler = rib_srv_cm_handler;
-	    sdesc.sd_flags = 0;
-
-	    ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
-			    &sdesc, 0, 1, &srv_hdl, &srv_id);
-	    if (ibt_status != IBT_SUCCESS) {
-#ifdef DEBUG
-		if (rib_debug) {
-		    cmn_err(CE_WARN, "rib_register_service: "
-			"ibt_register_service FAILED, status "
-			"= %d\n", ibt_status);
-		}
-#endif
-		/*
-		 * No need to go on, since we failed to obtain
-		 * a srv_id and srv_hdl. Move on to the next
-		 * IP addr as a service name.
-		 */
-		continue;
-	    }
-	    for (i = 0; i < num_ports; i++) {
+	(void) bzero(&srv_id, sizeof (ib_svc_id_t));
+	(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
+	(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
+
+	sdesc.sd_handler = rib_srv_cm_handler;
+	sdesc.sd_flags = 0;
+	ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
+	    &sdesc, ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT),
+	    1, &srv_hdl, &srv_id);
+
+	for (i = 0; i < num_ports; i++) {
 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
 			continue;
 
 		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
-		    pkey = port_infop[i].p_pkey_tbl[pki];
-		    if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) {
-
-			/*
-			 * Allocate and prepare a service entry
-			 */
-			new_service = kmem_zalloc(1 * sizeof (rib_service_t),
-			    KM_SLEEP);
-			new_service->srv_type = service_type;
-			new_service->srv_port = i + 1;
-			new_service->srv_id = srv_id;
-			new_service->srv_hdl = srv_hdl;
-			new_service->srv_sbind_hdl = kmem_zalloc(1 *
-			    sizeof (ibt_sbind_hdl_t), KM_SLEEP);
-
-			new_service->srv_name = kmem_zalloc(IB_SVC_NAME_LEN,
-			    KM_SLEEP);
-			(void) bcopy(addrs[j], new_service->srv_name,
-			    IB_SVC_NAME_LEN);
-			(void) strlcat(new_service->srv_name, "::NFS",
-				IB_SVC_NAME_LEN);
-			new_service->srv_next = NULL;
-
-			/*
-			 * Bind the service, specified by the IP address,
-			 * to the port/pkey using the srv_hdl returned
-			 * from ibt_register_service().
-			 */
-			(void) bzero(&sbind, sizeof (ibt_srv_bind_t));
-			sbind.sb_pkey = pkey;
-			sbind.sb_lease = 0xFFFFFFFF;
-			sbind.sb_key[0] = NFS_SEC_KEY0;
-			sbind.sb_key[1] = NFS_SEC_KEY1;
-			sbind.sb_name = new_service->srv_name;
-
-#ifdef	DEBUG
-			if (rib_debug > 1) {
-				cmn_err(CE_NOTE, "rib_register_service: "
-				    "binding service using name: %s\n",
-				    sbind.sb_name);
+			pkey = port_infop[i].p_pkey_tbl[pki];
+			if ((pkey & IBSRM_HB) &&
+			    (pkey != IB_PKEY_INVALID_FULL)) {
+
+				/*
+				 * Allocate and prepare a service entry
+				 */
+				new_service =
+				    kmem_zalloc(1 * sizeof (rib_service_t),
+				    KM_SLEEP);
+
+				new_service->srv_type = service_type;
+				new_service->srv_hdl = srv_hdl;
+				new_service->srv_next = NULL;
+
+				ibt_status = ibt_bind_service(srv_hdl,
+				    port_infop[i].p_sgid_tbl[0],
+				    NULL, rib_stat, NULL);
+
+				DTRACE_PROBE1(rpcib__i__regservice__bindres,
+				    int, ibt_status);
+
+				if (ibt_status != IBT_SUCCESS) {
+					kmem_free(new_service,
+					    sizeof (rib_service_t));
+					new_service = NULL;
+					continue;
+				}
+
+				/*
+				 * Add to the service list for this HCA
+				 */
+				new_service->srv_next = hca->service_list;
+				hca->service_list = new_service;
+				new_service = NULL;
+				nbinds++;
 			}
-#endif
-			ibt_status = ibt_bind_service(srv_hdl,
-			    port_infop[i].p_sgid_tbl[0], &sbind, rib_stat,
-			    new_service->srv_sbind_hdl);
-			if (ibt_status != IBT_SUCCESS) {
-#ifdef	DEBUG
-			    if (rib_debug) {
-				cmn_err(CE_WARN, "rib_register_service: FAILED"
-				    " in ibt_bind_service, status = %d\n",
-				    ibt_status);
-			    }
-#endif
-				kmem_free(new_service->srv_sbind_hdl,
-				    sizeof (ibt_sbind_hdl_t));
-				kmem_free(new_service->srv_name,
-				    IB_SVC_NAME_LEN);
-				kmem_free(new_service,
-				    sizeof (rib_service_t));
-				new_service = NULL;
-				continue;
-			}
-#ifdef	DEBUG
-			if (rib_debug > 1) {
-				if (ibt_status == IBT_SUCCESS)
-					cmn_err(CE_NOTE, "rib_regstr_service: "
-					    "Serv: %s REGISTERED on port: %d",
-					    sbind.sb_name, i+1);
-			}
-#endif
-			/*
-			 * Add to the service list for this HCA
-			 */
-			new_service->srv_next = hca->service_list;
-			hca->service_list = new_service;
-			new_service = NULL;
-			nbinds ++;
-		    }
 		}
-	    }
 	}
 	rw_exit(&hca->service_list_lock);
 
-#ifdef	DEBUG
-	if (rib_debug > 1) {
-		/*
-		 * Change this print to a more generic one, as rpcib
-		 * is supposed to handle multiple service types.
-		 */
-		for (temp_srv = hca->service_list; temp_srv != NULL;
-			temp_srv = temp_srv->srv_next) {
-				cmn_err(CE_NOTE, "NFS-IB, active on port:"
-					" %d\n"
-					"Using name: %s", temp_srv->srv_port,
-					temp_srv->srv_name);
-		}
-	}
-#endif
-
 	ibt_free_portinfo(port_infop, port_size);
-	for (i = 0; i < addr_count; i++) {
-		if (addrs[i])
-			kmem_free(addrs[i], IB_SVC_NAME_LEN);
-	}
-	kmem_free(addrs, addr_count * sizeof (char *));
 
 	if (nbinds == 0) {
-#ifdef	DEBUG
-	    if (rib_debug) {
-		cmn_err(CE_WARN, "rib_register_service: "
-		    "bind_service FAILED!\n");
-	    }
-#endif
 		return (RDMA_FAILED);
 	} else {
 		/*
@@ -3458,26 +3180,6 @@
 
 	rib_stat->q = &rd->q;
 	/*
-	 * Register the Address translation service
-	 */
-	mutex_enter(&rib_stat->open_hca_lock);
-	if (ats_running == 0) {
-		if (rib_register_ats(rib_stat->hca) != RDMA_SUCCESS) {
-#ifdef	DEBUG
-		    if (rib_debug) {
-			cmn_err(CE_WARN,
-			    "rib_listen(): ats registration failed!");
-		    }
-#endif
-		    mutex_exit(&rib_stat->open_hca_lock);
-		    return;
-		} else {
-			ats_running = 1;
-		}
-	}
-	mutex_exit(&rib_stat->open_hca_lock);
-
-	/*
 	 * Right now the only service type is NFS. Hence force feed this
 	 * value. Ideally to communicate the service type it should be
 	 * passed down in rdma_svc_data.
@@ -3524,6 +3226,7 @@
 		rw_exit(&hca->state_lock);
 		return;
 	}
+	rib_close_channels(&hca->srv_conn_list);
 	rib_stop_services(hca);
 	rw_exit(&hca->state_lock);
 }
@@ -3549,7 +3252,6 @@
 rib_stop_services(rib_hca_t *hca)
 {
 	rib_service_t		*srv_list, *to_remove;
-	ibt_status_t   		ibt_status;
 
 	/*
 	 * unbind and deregister the services for this service type.
@@ -3564,35 +3266,10 @@
 		if (srv_list == NULL || bcmp(to_remove->srv_hdl,
 		    srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) {
 
-		    ibt_status = ibt_unbind_all_services(to_remove->srv_hdl);
-		    if (ibt_status != IBT_SUCCESS) {
-			cmn_err(CE_WARN, "rib_listen_stop: "
-			    "ibt_unbind_all_services FAILED"
-				" status: %d\n", ibt_status);
-		    }
-
-		    ibt_status =
-			ibt_deregister_service(hca->ibt_clnt_hdl,
-				to_remove->srv_hdl);
-		    if (ibt_status != IBT_SUCCESS) {
-			cmn_err(CE_WARN, "rib_listen_stop: "
-			    "ibt_deregister_service FAILED"
-				" status: %d\n", ibt_status);
-		    }
-
-#ifdef	DEBUG
-		    if (rib_debug > 1) {
-			if (ibt_status == IBT_SUCCESS)
-				cmn_err(CE_NOTE, "rib_listen_stop: "
-				    "Successfully stopped and"
-				    " UNREGISTERED service: %s\n",
-				    to_remove->srv_name);
-		    }
-#endif
+			(void) ibt_unbind_all_services(to_remove->srv_hdl);
+			(void) ibt_deregister_service(hca->ibt_clnt_hdl,
+			    to_remove->srv_hdl);
 		}
-		kmem_free(to_remove->srv_name, IB_SVC_NAME_LEN);
-		kmem_free(to_remove->srv_sbind_hdl,
-			sizeof (ibt_sbind_hdl_t));
 
 		kmem_free(to_remove, sizeof (rib_service_t));
 	}
@@ -3628,8 +3305,7 @@
 
 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
 	if (rep == NULL) {
-		mutex_exit(&qp->replylist_lock);
-		cmn_err(CE_WARN, "rib_addreplylist: no memory\n");
+		DTRACE_PROBE(rpcib__i__addrreply__nomem);
 		return (NULL);
 	}
 	rep->xid = msgid;
@@ -3645,9 +3321,10 @@
 		qp->replylist->prev = rep;
 	}
 	qp->rep_list_size++;
-	if (rib_debug > 1)
-	    cmn_err(CE_NOTE, "rib_addreplylist: qp:%p, rep_list_size:%d\n",
-		(void *)qp, qp->rep_list_size);
+
+	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
+	    int, qp->rep_list_size);
+
 	qp->replylist = rep;
 	mutex_exit(&qp->replylist_lock);
 
@@ -3685,9 +3362,9 @@
 
 	cv_destroy(&rep->wait_cv);
 	qp->rep_list_size--;
-	if (rib_debug > 1)
-	    cmn_err(CE_NOTE, "rib_remreply: qp:%p, rep_list_size:%d\n",
-		(void *)qp, qp->rep_list_size);
+
+	DTRACE_PROBE1(rpcib__i__remreply__listsize,
+	    int, qp->rep_list_size);
 
 	kmem_free(rep, sizeof (*rep));
 
@@ -3695,7 +3372,7 @@
 }
 
 rdma_stat
-rib_registermem(CONN *conn, caddr_t buf, uint_t buflen,
+rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
 	struct mrc *buf_handle)
 {
 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
@@ -3706,7 +3383,7 @@
 	/*
 	 * Note: ALL buffer pools use the same memory type RDMARW.
 	 */
-	status = rib_reg_mem(hca, buf, buflen, 0, &mr_hdl, &mr_desc);
+	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
 	if (status == RDMA_SUCCESS) {
 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
@@ -3720,15 +3397,15 @@
 }
 
 static rdma_stat
-rib_reg_mem(rib_hca_t *hca, caddr_t buf, uint_t size, ibt_mr_flags_t spec,
+rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
+	ibt_mr_flags_t spec,
 	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
 {
 	ibt_mr_attr_t	mem_attr;
 	ibt_status_t	ibt_status;
-
 	mem_attr.mr_vaddr = (uintptr_t)buf;
 	mem_attr.mr_len = (ib_msglen_t)size;
-	mem_attr.mr_as = NULL;
+	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
 	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
 	    IBT_MR_ENABLE_WINDOW_BIND | spec;
@@ -3736,7 +3413,7 @@
 	rw_enter(&hca->state_lock, RW_READER);
 	if (hca->state == HCA_INITED) {
 		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
-					&mem_attr, mr_hdlp, mr_descp);
+		    &mem_attr, mr_hdlp, mr_descp);
 		rw_exit(&hca->state_lock);
 	} else {
 		rw_exit(&hca->state_lock);
@@ -3744,19 +3421,17 @@
 	}
 
 	if (ibt_status != IBT_SUCCESS) {
-		cmn_err(CE_WARN, "rib_reg_mem: ibt_register_mr "
-			"(spec:%d) failed for addr %llX, status %d",
-			spec, (longlong_t)mem_attr.mr_vaddr, ibt_status);
 		return (RDMA_FAILED);
 	}
 	return (RDMA_SUCCESS);
 }
 
 rdma_stat
-rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen,
-	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle)
+rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
+	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
 {
 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
+	rib_lrc_entry_t *l;
 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
 	rdma_stat	status;
 	rib_hca_t	*hca = (ctoqp(conn))->hca;
@@ -3764,9 +3439,33 @@
 	/*
 	 * Non-coherent memory registration.
 	 */
-	status = rib_reg_mem(hca, buf, buflen, IBT_MR_NONCOHERENT, &mr_hdl,
-			&mr_desc);
+	l = (rib_lrc_entry_t *)lrc;
+	if (l) {
+		if (l->registered) {
+			buf_handle->mrc_linfo =
+			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
+			buf_handle->mrc_lmr =
+			    (uint32_t)l->lrc_mhandle.mrc_lmr;
+			buf_handle->mrc_rmr =
+			    (uint32_t)l->lrc_mhandle.mrc_rmr;
+			*sync_handle = (RIB_SYNCMEM_HANDLE)
+			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
+			return (RDMA_SUCCESS);
+		} else {
+			/* Always register the whole buffer */
+			buf = (caddr_t)l->lrc_buf;
+			buflen = l->lrc_len;
+		}
+	}
+	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
+
 	if (status == RDMA_SUCCESS) {
+		if (l) {
+			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
+			l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
+			l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
+			l->registered		 = TRUE;
+		}
 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
@@ -3784,7 +3483,6 @@
 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
 {
 	rib_hca_t *hca = (ctoqp(conn))->hca;
-
 	/*
 	 * Allow memory deregistration even if HCA is
 	 * getting detached. Need all outstanding
@@ -3792,15 +3490,21 @@
 	 * before HCA_DETACH_EVENT can be accepted.
 	 */
 	(void) ibt_deregister_mr(hca->hca_hdl,
-			(ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
+	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
 	return (RDMA_SUCCESS);
 }
 
 /* ARGSUSED */
 rdma_stat
 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
-		RIB_SYNCMEM_HANDLE sync_handle)
+		RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
 {
+	rib_lrc_entry_t *l;
+	l = (rib_lrc_entry_t *)lrc;
+	if (l)
+		if (l->registered)
+			return (RDMA_SUCCESS);
+
 	(void) rib_deregistermem(conn, buf, buf_handle);
 
 	return (RDMA_SUCCESS);
@@ -3837,10 +3541,6 @@
 	if (status == IBT_SUCCESS)
 		return (RDMA_SUCCESS);
 	else {
-#ifdef DEBUG
-		cmn_err(CE_WARN, "rib_syncmem: ibt_sync_mr failed with %d\n",
-			status);
-#endif
 		return (RDMA_FAILED);
 	}
 }
@@ -3874,23 +3574,22 @@
 	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
 
 	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
-			num * sizeof (void *), KM_SLEEP);
+	    num * sizeof (void *), KM_SLEEP);
 
 	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
 	bp->numelems = num;
 
+
 	switch (ptype) {
-	    case SEND_BUFFER:
+	case SEND_BUFFER:
 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
-		/* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */
 		bp->rsize = RPC_MSG_SZ;
 		break;
-	    case RECV_BUFFER:
+	case RECV_BUFFER:
 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
-		/* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */
 		bp->rsize = RPC_BUF_SIZE;
 		break;
-	    default:
+	default:
 		goto fail;
 	}
 
@@ -3900,33 +3599,35 @@
 	bp->bufsize = num * bp->rsize;
 	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
 	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
-			sizeof (ibt_mr_hdl_t), KM_SLEEP);
+	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
 	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
-			sizeof (ibt_mr_desc_t), KM_SLEEP);
-
+	    sizeof (ibt_mr_desc_t), KM_SLEEP);
 	rw_enter(&hca->state_lock, RW_READER);
+
 	if (hca->state != HCA_INITED) {
 		rw_exit(&hca->state_lock);
 		goto fail;
 	}
+
 	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
 		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
 		mem_attr.mr_vaddr = (uintptr_t)buf;
 		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
 		mem_attr.mr_as = NULL;
 		ibt_status = ibt_register_mr(hca->hca_hdl,
-			hca->pd_hdl, &mem_attr, &rbp->mr_hdl[i],
-			&rbp->mr_desc[i]);
+		    hca->pd_hdl, &mem_attr,
+		    &rbp->mr_hdl[i],
+		    &rbp->mr_desc[i]);
 		if (ibt_status != IBT_SUCCESS) {
-		    for (j = 0; j < i; j++) {
-			(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[j]);
-		    }
-		    rw_exit(&hca->state_lock);
-		    goto fail;
+			for (j = 0; j < i; j++) {
+				(void) ibt_deregister_mr(hca->hca_hdl,
+				    rbp->mr_hdl[j]);
+			}
+			rw_exit(&hca->state_lock);
+			goto fail;
 		}
 	}
 	rw_exit(&hca->state_lock);
-
 	buf = (caddr_t)bp->buf;
 	for (i = 0; i < num; i++, buf += bp->rsize) {
 		bp->buflist[i] = (void *)buf;
@@ -3937,16 +3638,16 @@
 	return (rbp);
 fail:
 	if (bp) {
-	    if (bp->buf)
-		kmem_free(bp->buf, bp->bufsize);
-	    kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
+		if (bp->buf)
+			kmem_free(bp->buf, bp->bufsize);
+		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
 	}
 	if (rbp) {
-	    if (rbp->mr_hdl)
-		kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
-	    if (rbp->mr_desc)
-		kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
-	    kmem_free(rbp, sizeof (rib_bufpool_t));
+		if (rbp->mr_hdl)
+			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
+		if (rbp->mr_desc)
+			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
+		kmem_free(rbp, sizeof (rib_bufpool_t));
 	}
 	return (NULL);
 }
@@ -4017,7 +3718,6 @@
 
 	if (rbp->mr_desc)
 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
-
 	if (bp->buf)
 		kmem_free(bp->buf, bp->bufsize);
 	mutex_destroy(&bp->buflock);
@@ -4041,6 +3741,15 @@
 static rdma_stat
 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
 {
+	rib_lrc_entry_t *rlep;
+
+	if (rdbuf->type ==  RDMA_LONG_BUFFER) {
+		rlep = rib_get_cache_buf(conn, rdbuf->len);
+		rdbuf->rb_private =  (caddr_t)rlep;
+		rdbuf->addr = rlep->lrc_buf;
+		rdbuf->handle = rlep->lrc_mhandle;
+		return (RDMA_SUCCESS);
+	}
 
 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
 	if (rdbuf->addr) {
@@ -4059,6 +3768,15 @@
 		return (RDMA_FAILED);
 }
 
+#if defined(MEASURE_POOL_DEPTH)
+static void rib_recv_bufs(uint32_t x) {
+
+}
+
+static void rib_send_bufs(uint32_t x) {
+
+}
+#endif
 
 /*
  * Fetch a buffer of specified type.
@@ -4079,14 +3797,14 @@
 	 * Obtain pool address based on type of pool
 	 */
 	switch (ptype) {
-		case SEND_BUFFER:
-			rbp = hca->send_pool;
-			break;
-		case RECV_BUFFER:
-			rbp = hca->recv_pool;
-			break;
-		default:
-			return (NULL);
+	case SEND_BUFFER:
+		rbp = hca->send_pool;
+		break;
+	case RECV_BUFFER:
+		rbp = hca->recv_pool;
+		break;
+	default:
+		return (NULL);
 	}
 	if (rbp == NULL)
 		return (NULL);
@@ -4095,7 +3813,6 @@
 
 	mutex_enter(&bp->buflock);
 	if (bp->buffree < 0) {
-		cmn_err(CE_WARN, "rib_rbuf_alloc: No free buffers!");
 		mutex_exit(&bp->buflock);
 		return (NULL);
 	}
@@ -4105,22 +3822,27 @@
 	rdbuf->addr = buf;
 	rdbuf->len = bp->rsize;
 	for (i = bp->numelems - 1; i >= 0; i--) {
-	    if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
-		rdbuf->handle.mrc_rmr = (uint32_t)rbp->mr_desc[i].md_rkey;
-		rdbuf->handle.mrc_linfo = (uintptr_t)rbp->mr_hdl[i];
-		rdbuf->handle.mrc_lmr = (uint32_t)rbp->mr_desc[i].md_lkey;
-		bp->buffree--;
-		if (rib_debug > 1)
-		    cmn_err(CE_NOTE, "rib_rbuf_alloc: %d free bufs "
-			"(type %d)\n", bp->buffree+1, ptype);
-
-		mutex_exit(&bp->buflock);
-
-		return (buf);
-	    }
+		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
+			rdbuf->handle.mrc_rmr =
+			    (uint32_t)rbp->mr_desc[i].md_rkey;
+			rdbuf->handle.mrc_linfo =
+			    (uintptr_t)rbp->mr_hdl[i];
+			rdbuf->handle.mrc_lmr =
+			    (uint32_t)rbp->mr_desc[i].md_lkey;
+#if defined(MEASURE_POOL_DEPTH)
+			if (ptype == SEND_BUFFER)
+				rib_send_bufs(MAX_BUFS - (bp->buffree+1));
+			if (ptype == RECV_BUFFER)
+				rib_recv_bufs(MAX_BUFS - (bp->buffree+1));
+#endif
+			bp->buffree--;
+
+			mutex_exit(&bp->buflock);
+
+			return (buf);
+		}
 	}
-	cmn_err(CE_WARN, "rib_rbuf_alloc: NO matching buf %p of "
-		"type %d found!", buf, ptype);
+
 	mutex_exit(&bp->buflock);
 
 	return (NULL);
@@ -4130,6 +3852,11 @@
 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
 {
 
+	if (rdbuf->type == RDMA_LONG_BUFFER) {
+		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
+		rdbuf->rb_private = NULL;
+		return;
+	}
 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
 }
 
@@ -4145,14 +3872,14 @@
 	 * Obtain pool address based on type of pool
 	 */
 	switch (ptype) {
-		case SEND_BUFFER:
-			rbp = hca->send_pool;
-			break;
-		case RECV_BUFFER:
-			rbp = hca->recv_pool;
-			break;
-		default:
-			return;
+	case SEND_BUFFER:
+		rbp = hca->send_pool;
+		break;
+	case RECV_BUFFER:
+		rbp = hca->recv_pool;
+		break;
+	default:
+		return;
 	}
 	if (rbp == NULL)
 		return;
@@ -4164,14 +3891,9 @@
 		/*
 		 * Should never happen
 		 */
-		cmn_err(CE_WARN, "rib_rbuf_free: One (type %d) "
-			"too many frees!", ptype);
 		bp->buffree--;
 	} else {
 		bp->buflist[bp->buffree] = buf;
-		if (rib_debug > 1)
-		    cmn_err(CE_NOTE, "rib_rbuf_free: %d free bufs "
-			"(type %d)\n", bp->buffree+1, ptype);
 	}
 	mutex_exit(&bp->buflock);
 }
@@ -4210,16 +3932,16 @@
 /*
  * Connection management.
  * IBTF does not support recycling of channels. So connections are only
- * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR or
+ * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
  * C_DISCONN_PEND state. No C_IDLE state.
  * C_CONN_PEND state: Connection establishment in progress to the server.
  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
  * It has an RC channel associated with it. ibt_post_send/recv are allowed
  * only in this state.
- * C_ERROR state: A connection transitions to this state when WRs on the
+ * C_ERROR_CONN state: A connection transitions to this state when WRs on the
  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
- * C_DISCONN_PEND state: When a connection is in C_ERROR state and when
+ * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
  * c_ref drops to 0 (this indicates that RPC has no more references to this
  * connection), the connection should be destroyed. A connection transitions
  * into this state when it is being destroyed.
@@ -4233,6 +3955,7 @@
 	rib_qp_t *qp;
 	clock_t cv_stat, timout;
 	ibt_path_info_t path;
+	ibt_ip_addr_t s_ip, d_ip;
 
 again:
 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
@@ -4242,7 +3965,7 @@
 		 * First, clear up any connection in the ERROR state
 		 */
 		mutex_enter(&cn->c_lock);
-		if (cn->c_state == C_ERROR) {
+		if (cn->c_state == C_ERROR_CONN) {
 			if (cn->c_ref == 0) {
 				/*
 				 * Remove connection from list and destroy it.
@@ -4257,7 +3980,8 @@
 			mutex_exit(&cn->c_lock);
 			cn = cn->c_next;
 			continue;
-		} else if (cn->c_state == C_DISCONN_PEND) {
+		}
+		if (cn->c_state == C_DISCONN_PEND) {
 			mutex_exit(&cn->c_lock);
 			cn = cn->c_next;
 			continue;
@@ -4284,8 +4008,8 @@
 				timout =  ddi_get_lbolt() +
 				    drv_usectohz(CONN_WAIT_TIME * 1000000);
 				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
-					&cn->c_lock, timout)) > 0 &&
-					cn->c_state == C_CONN_PEND)
+				    &cn->c_lock, timout)) > 0 &&
+				    cn->c_state == C_CONN_PEND)
 					;
 				if (cv_stat == 0) {
 					cn->c_ref--;
@@ -4313,14 +4037,12 @@
 	}
 	rw_exit(&hca->cl_conn_list.conn_lock);
 
-	status = rib_chk_srv_ats(hca, svcaddr, addr_type, &path);
+	bzero(&path, sizeof (ibt_path_info_t));
+	bzero(&s_ip, sizeof (ibt_ip_addr_t));
+	bzero(&d_ip, sizeof (ibt_ip_addr_t));
+
+	status = rib_chk_srv_ibaddr(svcaddr, addr_type, &path, &s_ip, &d_ip);
 	if (status != RDMA_SUCCESS) {
-#ifdef DEBUG
-		if (rib_debug) {
-			cmn_err(CE_WARN, "rib_conn_get: "
-				"No server ATS record!");
-		}
-#endif
 		return (RDMA_FAILED);
 	}
 
@@ -4345,20 +4067,14 @@
 	 * WRITER lock.
 	 */
 	(void) rib_add_connlist(cn, &hca->cl_conn_list);
-	status = rib_conn_to_srv(hca, qp, &path);
+	status = rib_conn_to_srv(hca, qp, &path, &s_ip, &d_ip);
 	mutex_enter(&cn->c_lock);
 	if (status == RDMA_SUCCESS) {
 		cn->c_state = C_CONNECTED;
 		*conn = cn;
 	} else {
-		cn->c_state = C_ERROR;
+		cn->c_state = C_ERROR_CONN;
 		cn->c_ref--;
-#ifdef DEBUG
-		if (rib_debug) {
-			cmn_err(CE_WARN, "rib_conn_get: FAILED creating"
-			    " a channel!");
-		}
-#endif
 	}
 	cv_broadcast(&cn->c_cv);
 	mutex_exit(&cn->c_lock);
@@ -4374,10 +4090,10 @@
 	conn->c_ref--;
 
 	/*
-	 * If a conn is C_ERROR, close the channel.
+	 * If a conn is C_ERROR_CONN, close the channel.
 	 * If it's CONNECTED, keep it that way.
 	 */
-	if (conn->c_ref == 0 && (conn->c_state &  C_ERROR)) {
+	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
 		conn->c_state = C_DISCONN_PEND;
 		mutex_exit(&conn->c_lock);
 		if (qp->mode == RIB_SERVER)
@@ -4466,351 +4182,10 @@
 			r = r->next;
 		}
 	}
-	if (rib_debug > 1) {
-	    cmn_err(CE_WARN, "rdma_done_notify: "
-		"No matching xid for %u, qp %p\n", xid, (void *)qp);
-	}
-}
-
-rpcib_ats_t *
-get_ibd_entry(ib_gid_t *gid, ib_pkey_t pkey, rpcib_ibd_insts_t *ibds)
-{
-	rpcib_ats_t		*atsp;
-	int			i;
-
-	for (i = 0, atsp = ibds->rib_ats; i < ibds->rib_ibd_cnt; i++, atsp++) {
-		if (atsp->ras_port_gid.gid_prefix == gid->gid_prefix &&
-		    atsp->ras_port_gid.gid_guid == gid->gid_guid &&
-		    atsp->ras_pkey == pkey) {
-			return (atsp);
-		}
-	}
-	return (NULL);
-}
-
-int
-rib_get_ibd_insts_cb(dev_info_t *dip, void *arg)
-{
-	rpcib_ibd_insts_t *ibds = (rpcib_ibd_insts_t *)arg;
-	rpcib_ats_t	*atsp;
-	ib_pkey_t	pkey;
-	uint8_t		port;
-	ib_guid_t	hca_guid;
-	ib_gid_t	port_gid;
-
-	if (i_ddi_devi_attached(dip) &&
-	    (strcmp(ddi_node_name(dip), "ibport") == 0) &&
-	    (strstr(ddi_get_name_addr(dip), "ipib") != NULL)) {
-
-		if (ibds->rib_ibd_cnt >= ibds->rib_ibd_alloc) {
-		    rpcib_ats_t	*tmp;
-
-		    tmp = (rpcib_ats_t *)kmem_zalloc((ibds->rib_ibd_alloc +
-			N_IBD_INSTANCES) * sizeof (rpcib_ats_t), KM_SLEEP);
-		    bcopy(ibds->rib_ats, tmp,
-			ibds->rib_ibd_alloc * sizeof (rpcib_ats_t));
-		    kmem_free(ibds->rib_ats,
-			ibds->rib_ibd_alloc * sizeof (rpcib_ats_t));
-		    ibds->rib_ats = tmp;
-		    ibds->rib_ibd_alloc += N_IBD_INSTANCES;
-		}
-		if (((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY,
-			dip, 0, "hca-guid", 0)) == 0) ||
-		    ((port = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
-			0, "port-number", 0)) == 0) ||
-		    (ibt_get_port_state_byguid(hca_guid, port,
-			&port_gid, NULL) != IBT_SUCCESS) ||
-		    ((pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
-			"port-pkey", IB_PKEY_INVALID_LIMITED)) <=
-			IB_PKEY_INVALID_FULL)) {
-		    return (DDI_WALK_CONTINUE);
-		}
-		atsp = &ibds->rib_ats[ibds->rib_ibd_cnt];
-		atsp->ras_inst = ddi_get_instance(dip);
-		atsp->ras_pkey = pkey;
-		atsp->ras_port_gid = port_gid;
-		ibds->rib_ibd_cnt++;
-	}
-	return (DDI_WALK_CONTINUE);
-}
-
-void
-rib_get_ibd_insts(rpcib_ibd_insts_t *ibds)
-{
-	ddi_walk_devs(ddi_root_node(), rib_get_ibd_insts_cb, ibds);
-}
-
-/*
- * Return ibd interfaces and ibd instances.
- */
-int
-get_ibd_ipaddr(rpcib_ibd_insts_t *ibds)
-{
-	TIUSER			*tiptr, *tiptr6;
-	vnode_t			*kvp, *kvp6;
-	vnode_t			*vp = NULL, *vp6 = NULL;
-	struct strioctl		iocb;
-	struct lifreq		lif_req;
-	int			k, ip_cnt;
-	rpcib_ats_t		*atsp;
-
-	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP,
-		&kvp) == 0) {
-	    if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
-		&tiptr, CRED()) == 0) {
-		vp = tiptr->fp->f_vnode;
-	    } else {
-		VN_RELE(kvp);
-	    }
-	}
-
-	if (lookupname("/dev/udp6", UIO_SYSSPACE, FOLLOW, NULLVPP,
-		&kvp6) == 0) {
-	    if (t_kopen((file_t *)NULL, kvp6->v_rdev, FREAD|FWRITE,
-		&tiptr6, CRED()) == 0) {
-		vp6 = tiptr6->fp->f_vnode;
-	    } else {
-		VN_RELE(kvp6);
-	    }
-	}
-
-	if (vp == NULL && vp6 == NULL)
-		return (-1);
-
-	/* Get ibd ip's */
-	ip_cnt = 0;
-	for (k = 0, atsp = ibds->rib_ats; k < ibds->rib_ibd_cnt; k++, atsp++) {
-		/* IPv4 */
-	    if (vp != NULL) {
-		(void) bzero((void *)&lif_req, sizeof (struct lifreq));
-		(void) snprintf(lif_req.lifr_name,
-			sizeof (lif_req.lifr_name), "%s%d",
-			IBD_NAME, atsp->ras_inst);
-
-		(void) bzero((void *)&iocb, sizeof (struct strioctl));
-		iocb.ic_cmd = SIOCGLIFADDR;
-		iocb.ic_timout = 0;
-		iocb.ic_len = sizeof (struct lifreq);
-		iocb.ic_dp = (caddr_t)&lif_req;
-		if (kstr_ioctl(vp, I_STR, (intptr_t)&iocb) == 0) {
-		    atsp->ras_inet_type = AF_INET;
-		    bcopy(&lif_req.lifr_addr, &atsp->ras_sin,
-			sizeof (struct sockaddr_in));
-		    ip_cnt++;
-		    continue;
-		}
-	    }
-		/* Try IPv6 */
-	    if (vp6 != NULL) {
-		(void) bzero((void *)&lif_req, sizeof (struct lifreq));
-		(void) snprintf(lif_req.lifr_name,
-			sizeof (lif_req.lifr_name), "%s%d",
-			IBD_NAME, atsp->ras_inst);
-
-		(void) bzero((void *)&iocb, sizeof (struct strioctl));
-		iocb.ic_cmd = SIOCGLIFADDR;
-		iocb.ic_timout = 0;
-		iocb.ic_len = sizeof (struct lifreq);
-		iocb.ic_dp = (caddr_t)&lif_req;
-		if (kstr_ioctl(vp6, I_STR, (intptr_t)&iocb) == 0) {
-
-		    atsp->ras_inet_type = AF_INET6;
-		    bcopy(&lif_req.lifr_addr, &atsp->ras_sin6,
-			    sizeof (struct sockaddr_in6));
-		    ip_cnt++;
-		}
-	    }
-	}
-
-	if (vp6 != NULL) {
-	    (void) t_kclose(tiptr6, 0);
-	    VN_RELE(kvp6);
-	}
-	if (vp != NULL) {
-	    (void) t_kclose(tiptr, 0);
-	    VN_RELE(kvp);
-	}
-
-	if (ip_cnt == 0)
-	    return (-1);
-	else
-	    return (0);
+	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
+	    int, xid);
 }
 
-char **
-get_ip_addrs(int *count)
-{
-	TIUSER			*tiptr;
-	vnode_t			*kvp;
-	int			num_of_ifs;
-	char			**addresses;
-	int			return_code;
-
-	/*
-	 * Open a device for doing down stream kernel ioctls
-	 */
-	return_code = lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW,
-	    NULLVPP, &kvp);
-	if (return_code != 0) {
-		cmn_err(CE_NOTE, "get_Ip_addrs: lookupname failed\n");
-		*count = -1;
-		return (NULL);
-	}
-
-	return_code = t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
-	    &tiptr, CRED());
-	if (return_code != 0) {
-		cmn_err(CE_NOTE, "get_Ip_addrs: t_kopen failed\n");
-		VN_RELE(kvp);
-		*count = -1;
-		return (NULL);
-	}
-
-	/*
-	 * Perform the first ioctl to get the number of interfaces
-	 */
-	return_code = get_interfaces(tiptr, &num_of_ifs);
-	if (return_code != 0 || num_of_ifs == 0) {
-		cmn_err(CE_NOTE, "get_Ip_addrs: get_interfaces failed\n");
-		(void) t_kclose(tiptr, 0);
-		VN_RELE(kvp);
-		*count = -1;
-		return (NULL);
-	}
-
-	/*
-	 * Perform the second ioctl to get the address on each interface
-	 * found.
-	 */
-	addresses = kmem_zalloc(num_of_ifs * sizeof (char *), KM_SLEEP);
-	return_code = find_addrs(tiptr, addresses, num_of_ifs);
-	if (return_code <= 0) {
-		cmn_err(CE_NOTE, "get_Ip_addrs: find_addrs failed\n");
-		(void) t_kclose(tiptr, 0);
-		kmem_free(addresses, num_of_ifs * sizeof (char *));
-		VN_RELE(kvp);
-		*count = -1;
-		return (NULL);
-	}
-
-	*count = return_code;
-	VN_RELE(kvp);
-	(void) t_kclose(tiptr, 0);
-	return (addresses);
-}
-
-int
-get_interfaces(TIUSER *tiptr, int *num)
-{
-	struct lifnum		if_buf;
-	struct strioctl		iocb;
-	vnode_t			*vp;
-	int			return_code;
-
-	/*
-	 * Prep the number of interfaces request buffer for ioctl
-	 */
-	(void) bzero((void *)&if_buf, sizeof (struct lifnum));
-	if_buf.lifn_family = AF_UNSPEC;
-	if_buf.lifn_flags = 0;
-
-	/*
-	 * Prep the kernel ioctl buffer and send it down stream
-	 */
-	(void) bzero((void *)&iocb, sizeof (struct strioctl));
-	iocb.ic_cmd = SIOCGLIFNUM;
-	iocb.ic_timout = 0;
-	iocb.ic_len = sizeof (if_buf);
-	iocb.ic_dp = (caddr_t)&if_buf;
-
-	vp = tiptr->fp->f_vnode;
-	return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
-	if (return_code != 0) {
-		cmn_err(CE_NOTE, "get_interfaces: kstr_ioctl failed\n");
-		*num = -1;
-		return (-1);
-	}
-
-	*num = if_buf.lifn_count;
-#ifdef	DEBUG
-	if (rib_debug > 1)
-		cmn_err(CE_NOTE, "Number of interfaces detected: %d\n",
-		    if_buf.lifn_count);
-#endif
-	return (0);
-}
-
-int
-find_addrs(TIUSER *tiptr, char **addrs, int num_ifs)
-{
-	struct lifconf		lifc;
-	struct lifreq		*if_data_buf;
-	struct strioctl		iocb;
-	caddr_t			request_buffer;
-	struct sockaddr_in	*sin4;
-	struct sockaddr_in6	*sin6;
-	vnode_t			*vp;
-	int			i, count, return_code;
-
-	/*
-	 * Prep the buffer for requesting all interface's info
-	 */
-	(void) bzero((void *)&lifc, sizeof (struct lifconf));
-	lifc.lifc_family = AF_UNSPEC;
-	lifc.lifc_flags = 0;
-	lifc.lifc_len = num_ifs * sizeof (struct lifreq);
-
-	request_buffer = kmem_zalloc(num_ifs * sizeof (struct lifreq),
-	    KM_SLEEP);
-
-	lifc.lifc_buf = request_buffer;
-
-	/*
-	 * Prep the kernel ioctl buffer and send it down stream
-	 */
-	(void) bzero((void *)&iocb, sizeof (struct strioctl));
-	iocb.ic_cmd = SIOCGLIFCONF;
-	iocb.ic_timout = 0;
-	iocb.ic_len = sizeof (struct lifconf);
-	iocb.ic_dp = (caddr_t)&lifc;
-
-	vp = tiptr->fp->f_vnode;
-	return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
-	if (return_code != 0) {
-		cmn_err(CE_NOTE, "find_addrs: kstr_ioctl failed\n");
-		kmem_free(request_buffer, num_ifs * sizeof (struct lifreq));
-		return (-1);
-	}
-
-	/*
-	 * Extract addresses and fill them in the requested array
-	 * IB_SVC_NAME_LEN is defined to be 64 so it  covers both IPv4 &
-	 * IPv6. Here count is the number of IP addresses collected.
-	 */
-	if_data_buf = lifc.lifc_req;
-	count = 0;
-	for (i = lifc.lifc_len / sizeof (struct lifreq); i > 0; i--,
-	if_data_buf++) {
-		if (if_data_buf->lifr_addr.ss_family == AF_INET) {
-			sin4 = (struct sockaddr_in *)&if_data_buf->lifr_addr;
-			addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
-			(void) inet_ntop(AF_INET, &sin4->sin_addr,
-			    addrs[count], IB_SVC_NAME_LEN);
-			count ++;
-		}
-
-		if (if_data_buf->lifr_addr.ss_family == AF_INET6) {
-			sin6 = (struct sockaddr_in6 *)&if_data_buf->lifr_addr;
-			addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
-			(void) inet_ntop(AF_INET6, &sin6->sin6_addr,
-			    addrs[count], IB_SVC_NAME_LEN);
-			count ++;
-		}
-	}
-
-	kmem_free(request_buffer, num_ifs * sizeof (struct lifreq));
-	return (count);
-}
 
 /*
  * Goes through all connections and closes the channel
@@ -4828,27 +4203,27 @@
 	while (conn != NULL) {
 		mutex_enter(&conn->c_lock);
 		qp = ctoqp(conn);
-		if (conn->c_state & C_CONNECTED) {
+		if (conn->c_state == C_CONNECTED) {
 			/*
 			 * Live connection in CONNECTED state.
 			 * Call ibt_close_rc_channel in nonblocking mode
 			 * with no callbacks.
 			 */
-			conn->c_state = C_ERROR;
+			conn->c_state = C_ERROR_CONN;
 			(void) ibt_close_rc_channel(qp->qp_hdl,
-				IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
+			    IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
 			(void) ibt_free_channel(qp->qp_hdl);
 			qp->qp_hdl = NULL;
 		} else {
-			if (conn->c_state == C_ERROR &&
-				qp->qp_hdl != NULL) {
+			if (conn->c_state == C_ERROR_CONN &&
+			    qp->qp_hdl != NULL) {
 				/*
 				 * Connection in ERROR state but
 				 * channel is not yet freed.
 				 */
 				(void) ibt_close_rc_channel(qp->qp_hdl,
-					IBT_NOCALLBACKS, NULL, 0, NULL,
-					NULL, 0);
+				    IBT_NOCALLBACKS, NULL, 0, NULL,
+				    NULL, 0);
 				(void) ibt_free_channel(qp->qp_hdl);
 				qp->qp_hdl = NULL;
 			}
@@ -4880,7 +4255,7 @@
 		 * If not and if c_ref is 0, then destroy the connection.
 		 */
 		if (conn->c_ref == 0 &&
-			conn->c_state != C_DISCONN_PEND) {
+		    conn->c_state != C_DISCONN_PEND) {
 			/*
 			 * Cull the connection
 			 */
@@ -4933,7 +4308,6 @@
 	rib_stat->nhca_inited--;
 
 	rib_stop_services(hca);
-	rib_deregister_ats();
 	rib_close_channels(&hca->cl_conn_list);
 	rib_close_channels(&hca->srv_conn_list);
 	rw_exit(&hca->state_lock);
@@ -4953,13 +4327,14 @@
 	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
 	if (hca->srv_conn_list.conn_hd == NULL &&
-		hca->cl_conn_list.conn_hd == NULL) {
+	    hca->cl_conn_list.conn_hd == NULL) {
 		/*
 		 * conn_lists are NULL, so destroy
 		 * buffers, close hca and be done.
 		 */
 		rib_rbufpool_destroy(hca, RECV_BUFFER);
 		rib_rbufpool_destroy(hca, SEND_BUFFER);
+		rib_destroy_cache(hca);
 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
 		(void) ibt_close_hca(hca->hca_hdl);
 		hca->hca_hdl = NULL;
@@ -4983,3 +4358,541 @@
 		hca->hca_hdl = NULL;
 	}
 }
+
+static void
+rib_server_side_cache_reclaim(void *argp)
+{
+	cache_avl_struct_t    *rcas;
+	rib_lrc_entry_t		*rb;
+	rib_hca_t *hca = (rib_hca_t *)argp;
+
+	rw_enter(&hca->avl_rw_lock, RW_WRITER);
+	rcas = avl_first(&hca->avl_tree);
+	if (rcas != NULL)
+		avl_remove(&hca->avl_tree, rcas);
+
+	while (rcas != NULL) {
+		while (rcas->r.forw != &rcas->r) {
+			rcas->elements--;
+			rib_total_buffers --;
+			rb = rcas->r.forw;
+			remque(rb);
+			if (rb->registered)
+				(void) rib_deregistermem_via_hca(hca,
+				    rb->lrc_buf, rb->lrc_mhandle);
+			cache_allocation -= rb->lrc_len;
+			kmem_free(rb->lrc_buf, rb->lrc_len);
+			kmem_free(rb, sizeof (rib_lrc_entry_t));
+		}
+		mutex_destroy(&rcas->node_lock);
+		kmem_cache_free(hca->server_side_cache, rcas);
+		rcas = avl_first(&hca->avl_tree);
+		if (rcas != NULL)
+			avl_remove(&hca->avl_tree, rcas);
+	}
+	rw_exit(&hca->avl_rw_lock);
+}
+
+static void
+rib_server_side_cache_cleanup(void *argp)
+{
+	cache_avl_struct_t    *rcas;
+	rib_lrc_entry_t		*rb;
+	rib_hca_t *hca = (rib_hca_t *)argp;
+
+	rw_enter(&hca->avl_rw_lock, RW_READER);
+	if (cache_allocation < cache_limit) {
+		rw_exit(&hca->avl_rw_lock);
+		return;
+	}
+	rw_exit(&hca->avl_rw_lock);
+
+	rw_enter(&hca->avl_rw_lock, RW_WRITER);
+	rcas = avl_last(&hca->avl_tree);
+	if (rcas != NULL)
+		avl_remove(&hca->avl_tree, rcas);
+
+	while (rcas != NULL) {
+		while (rcas->r.forw != &rcas->r) {
+			rcas->elements--;
+			rib_total_buffers --;
+			rb = rcas->r.forw;
+			remque(rb);
+			if (rb->registered)
+				(void) rib_deregistermem_via_hca(hca,
+				    rb->lrc_buf, rb->lrc_mhandle);
+			cache_allocation -= rb->lrc_len;
+			kmem_free(rb->lrc_buf, rb->lrc_len);
+			kmem_free(rb, sizeof (rib_lrc_entry_t));
+		}
+		mutex_destroy(&rcas->node_lock);
+		kmem_cache_free(hca->server_side_cache, rcas);
+		if ((cache_allocation) < cache_limit) {
+			rw_exit(&hca->avl_rw_lock);
+			return;
+		}
+
+		rcas = avl_last(&hca->avl_tree);
+		if (rcas != NULL)
+			avl_remove(&hca->avl_tree, rcas);
+	}
+	rw_exit(&hca->avl_rw_lock);
+}
+
+static int
+avl_compare(const void *t1, const void *t2)
+{
+	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
+		return (0);
+
+	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
+		return (-1);
+
+	return (1);
+}
+
+static void
+rib_destroy_cache(rib_hca_t *hca)
+{
+	if (hca->reg_cache_clean_up != NULL) {
+		ddi_taskq_destroy(hca->reg_cache_clean_up);
+		hca->reg_cache_clean_up = NULL;
+	}
+	if (!hca->avl_init) {
+		kmem_cache_destroy(hca->server_side_cache);
+		avl_destroy(&hca->avl_tree);
+		mutex_destroy(&hca->cache_allocation);
+		rw_destroy(&hca->avl_rw_lock);
+	}
+	hca->avl_init = FALSE;
+}
+
+static void
+rib_force_cleanup(void *hca)
+{
+	if (((rib_hca_t *)hca)->reg_cache_clean_up != NULL)
+		(void) ddi_taskq_dispatch(
+		    ((rib_hca_t *)hca)->reg_cache_clean_up,
+		    rib_server_side_cache_cleanup,
+		    (void *)hca, DDI_NOSLEEP);
+}
+
+static rib_lrc_entry_t *
+rib_get_cache_buf(CONN *conn, uint32_t len)
+{
+	cache_avl_struct_t	cas, *rcas;
+	rib_hca_t	*hca = (ctoqp(conn))->hca;
+	rib_lrc_entry_t *reply_buf;
+	avl_index_t where = NULL;
+	uint64_t c_alloc = 0;
+
+	if (!hca->avl_init)
+		goto  error_alloc;
+
+	cas.len = len;
+
+	rw_enter(&hca->avl_rw_lock, RW_READER);
+
+	mutex_enter(&hca->cache_allocation);
+	c_alloc = cache_allocation;
+	mutex_exit(&hca->cache_allocation);
+
+	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
+	    &where)) == NULL) {
+		/* Am I above the cache limit */
+		if ((c_alloc + len) >= cache_limit) {
+			rib_force_cleanup((void *)hca);
+			rw_exit(&hca->avl_rw_lock);
+			cache_misses_above_the_limit ++;
+
+			/* Allocate and register the buffer directly */
+			goto error_alloc;
+		}
+
+		rw_exit(&hca->avl_rw_lock);
+		rw_enter(&hca->avl_rw_lock, RW_WRITER);
+
+		/* Recheck to make sure no other thread added the entry in */
+		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
+		    &cas, &where)) == NULL) {
+			/* Allocate an avl tree entry */
+			rcas = (cache_avl_struct_t *)
+			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
+
+			bzero(rcas, sizeof (cache_avl_struct_t));
+			rcas->elements = 0;
+			rcas->r.forw = &rcas->r;
+			rcas->r.back = &rcas->r;
+			rcas->len = len;
+			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
+			avl_insert(&hca->avl_tree, rcas, where);
+		}
+	}
+
+	mutex_enter(&rcas->node_lock);
+
+	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
+		rib_total_buffers--;
+		cache_hits++;
+		reply_buf = rcas->r.forw;
+		remque(reply_buf);
+		rcas->elements--;
+		mutex_exit(&rcas->node_lock);
+		rw_exit(&hca->avl_rw_lock);
+		mutex_enter(&hca->cache_allocation);
+		cache_allocation -= len;
+		mutex_exit(&hca->cache_allocation);
+	} else {
+		/* Am I above the cache limit */
+		mutex_exit(&rcas->node_lock);
+		if ((c_alloc + len) >= cache_limit) {
+			rib_force_cleanup((void *)hca);
+			rw_exit(&hca->avl_rw_lock);
+			cache_misses_above_the_limit ++;
+			/* Allocate and register the buffer directly */
+			goto error_alloc;
+		}
+		rw_exit(&hca->avl_rw_lock);
+		cache_misses ++;
+		/* Allocate a reply_buf entry */
+		reply_buf = (rib_lrc_entry_t *)
+		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
+		bzero(reply_buf, sizeof (rib_lrc_entry_t));
+		reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
+		reply_buf->lrc_len  = len;
+		reply_buf->registered = FALSE;
+		reply_buf->avl_node = (void *)rcas;
+	}
+
+	return (reply_buf);
+
+error_alloc:
+	reply_buf = (rib_lrc_entry_t *)
+	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
+	bzero(reply_buf, sizeof (rib_lrc_entry_t));
+	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
+	reply_buf->lrc_len = len;
+	reply_buf->registered = FALSE;
+	reply_buf->avl_node = NULL;
+
+	return (reply_buf);
+}
+
+/*
+ * Return a pre-registered buffer back to the cache
+ * (without unregistering it).
+ */
+
+static void
+rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
+{
+	cache_avl_struct_t    cas, *rcas;
+	avl_index_t where = NULL;
+	rib_hca_t	*hca = (ctoqp(conn))->hca;
+
+	if (!hca->avl_init)
+		goto  error_free;
+
+	cas.len = reg_buf->lrc_len;
+	rw_enter(&hca->avl_rw_lock, RW_READER);
+	if ((rcas = (cache_avl_struct_t *)
+	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
+		rw_exit(&hca->avl_rw_lock);
+		goto error_free;
+	} else {
+		rib_total_buffers ++;
+		cas.len = reg_buf->lrc_len;
+		mutex_enter(&rcas->node_lock);
+		insque(reg_buf, &rcas->r);
+		rcas->elements ++;
+		mutex_exit(&rcas->node_lock);
+		rw_exit(&hca->avl_rw_lock);
+		mutex_enter(&hca->cache_allocation);
+		cache_allocation += cas.len;
+		mutex_exit(&hca->cache_allocation);
+	}
+
+	return;
+
+error_free:
+
+	if (reg_buf->registered)
+		(void) rib_deregistermem_via_hca(hca,
+		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
+	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
+	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
+}
+
+/*
+ * Register the buffer [buf, buf + buflen) with the given HCA and
+ * return the registration handle and local/remote keys through
+ * *buf_handle.  On failure all handle fields are zeroed.
+ */
+static rdma_stat
+rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
+	uint_t buflen, struct mrc *buf_handle)
+{
+	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
+	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
+	rdma_stat	status;
+
+
+	/*
+	 * Note: ALL buffer pools use the same memory type RDMARW.
+	 */
+	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
+	if (status == RDMA_SUCCESS) {
+		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
+		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
+		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
+	} else {
+		/* mrc_linfo is a uint64_t, so clear it with 0, not NULL */
+		buf_handle->mrc_linfo = 0;
+		buf_handle->mrc_lmr = 0;
+		buf_handle->mrc_rmr = 0;
+	}
+	return (status);
+}
+
+/*
+ * Deregister a buffer previously registered via this HCA; the sync
+ * handle is unused (no separate sync state to tear down).
+ */
+/* ARGSUSED */
+static rdma_stat
+rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
+    struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
+{
+
+	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
+	return (RDMA_SUCCESS);
+}
+
+/*
+ * Deregister the memory region named by buf_handle.mrc_linfo with the
+ * HCA.  The ibt_deregister_mr() result is ignored; always reports
+ * RDMA_SUCCESS to the caller.
+ */
+/* ARGSUSED */
+static rdma_stat
+rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
+{
+
+	(void) ibt_deregister_mr(hca->hca_hdl,
+	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
+	return (RDMA_SUCCESS);
+}
+
+
+/*
+ * Return 0 if the interface is IB.
+ * Return error (>0) if any error is encountered during processing.
+ * Return -1 if the interface is not IB and no error.
+ */
+/*
+ * NOTE(review): this local isalpha() is ASCII-only and shadows any
+ * ctype definition -- confirm no conflicting definition is in scope.
+ */
+#define	isalpha(ch)	(((ch) >= 'a' && (ch) <= 'z') || \
+			((ch) >= 'A' && (ch) <= 'Z'))
+static int
+rpcib_is_ib_interface(char *name)
+{
+
+	char	dev_path[MAXPATHLEN];
+	char	devname[MAXNAMELEN];
+	ldi_handle_t	lh;
+	dl_info_ack_t	info;
+	int	ret = 0;
+	int	i;
+
+	/*
+	 * ibd devices are only style 2 devices
+	 * so we will open only style 2 devices
+	 * by ignoring the ppa
+	 */
+
+	/* Strip the trailing PPA digits from the interface name. */
+	i = strlen(name) - 1;
+	while ((i >= 0) && (!isalpha(name[i]))) i--;
+
+	if (i < 0) {
+		/* Invalid interface name, no alphabetic characters */
+		return (-1);
+	}
+
+	/*
+	 * NOTE(review): assumes strlen(name) < MAXNAMELEN (true for
+	 * ifr_name callers) -- a longer name would overflow devname.
+	 */
+	(void) strncpy(devname, name, i + 1);
+	devname[i + 1] = '\0';
+
+	if (strcmp("lo", devname) == 0) {
+		/*
+		 * The loopback interface is not rpc/rdma capable.
+		 */
+		return (-1);
+	}
+
+	/* Build "/dev/<driver>" and open it to query its DLPI MAC type. */
+	(void) strncpy(dev_path, "/dev/", MAXPATHLEN);
+	if (strlcat(dev_path, devname, MAXPATHLEN) >= MAXPATHLEN) {
+		/* string overflow */
+		return (-1);
+	}
+
+	ret = ldi_open_by_name(dev_path, FREAD|FWRITE, kcred, &lh, rpcib_li);
+	if (ret != 0) {
+		return (ret);
+	}
+	ret = rpcib_dl_info(lh, &info);
+	(void) ldi_close(lh, FREAD|FWRITE, kcred);
+	if (ret != 0) {
+		return (ret);
+	}
+
+	/* Only DL_IB (InfiniBand) links qualify as rpc/rdma capable. */
+	if (info.dl_mac_type != DL_IB) {
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Issue a DLPI DL_INFO_REQ over the LDI handle and copy the returned
+ * DL_INFO_ACK into *info.  Returns 0 on success, an errno from the
+ * stream operations, or -1 on a short or unexpected DLPI reply.
+ */
+static int
+rpcib_dl_info(ldi_handle_t lh, dl_info_ack_t *info)
+{
+	dl_info_req_t *info_req;
+	union DL_primitives *dl_prim;
+	mblk_t *mp;
+	k_sigset_t smask;
+	int error;
+
+	if ((mp = allocb(sizeof (dl_info_req_t), BPRI_MED)) == NULL) {
+		return (ENOMEM);
+	}
+
+	/* Build the DL_INFO_REQ message. */
+	mp->b_datap->db_type = M_PROTO;
+
+	info_req = (dl_info_req_t *)(uintptr_t)mp->b_wptr;
+	mp->b_wptr += sizeof (dl_info_req_t);
+	info_req->dl_primitive = DL_INFO_REQ;
+
+	/* Block signals around the putmsg/getmsg exchange. */
+	sigintr(&smask, 0);
+	if ((error = ldi_putmsg(lh, mp)) != 0) {
+		sigunintr(&smask);
+		return (error);
+	}
+	if ((error = ldi_getmsg(lh, &mp, (timestruc_t *)NULL)) != 0) {
+		sigunintr(&smask);
+		return (error);
+	}
+	sigunintr(&smask);
+
+	/* Validate and copy out the DL_INFO_ACK reply. */
+	dl_prim = (union DL_primitives *)(uintptr_t)mp->b_rptr;
+	switch (dl_prim->dl_primitive) {
+		case DL_INFO_ACK:
+			if (((uintptr_t)mp->b_wptr - (uintptr_t)mp->b_rptr) <
+			    sizeof (dl_info_ack_t)) {
+			error = -1;
+			} else {
+				*info = *(dl_info_ack_t *)(uintptr_t)mp->b_rptr;
+				error = 0;
+			}
+			break;
+		default:
+			error = -1;
+			break;
+	}
+
+	freemsg(mp);
+	return (error);
+}
+/*
+ * Perform an I_STR ioctl (cmd/len/arg) against /dev/udp on behalf of
+ * the caller.  Returns 0 on success, EPROTO if the endpoint cannot be
+ * opened, or the error from kstr_ioctl().
+ */
+static int
+rpcib_do_ip_ioctl(int cmd, int len, caddr_t arg)
+{
+	vnode_t *kvp, *vp;
+	TIUSER  *tiptr;
+	struct  strioctl iocb;
+	k_sigset_t smask;
+	int	err = 0;
+
+	/* Open a kernel TLI endpoint on /dev/udp to carry the ioctl. */
+	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP,
+	    &kvp) == 0) {
+		if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
+		    &tiptr, CRED()) == 0) {
+		vp = tiptr->fp->f_vnode;
+	} else {
+		VN_RELE(kvp);
+		return (EPROTO);
+		}
+	} else {
+			return (EPROTO);
+	}
+
+	iocb.ic_cmd = cmd;
+	iocb.ic_timout = 0;
+	iocb.ic_len = len;
+	iocb.ic_dp = arg;
+	/* Block signals for the duration of the stream ioctl. */
+	sigintr(&smask, 0);
+	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
+	sigunintr(&smask);
+	(void) t_kclose(tiptr, 0);
+	VN_RELE(kvp);
+	return (err);
+}
+
+/*
+ * Return the number of configured network interfaces, or 0 if the
+ * SIOCGIFNUM ioctl fails.
+ */
+static uint_t
+rpcib_get_number_interfaces(void)
+{
+	uint_t	numifs;
+
+	if (rpcib_do_ip_ioctl(SIOCGIFNUM, sizeof (uint_t), (caddr_t)&numifs)) {
+		return (0);
+	}
+	return (numifs);
+}
+
+/*
+ * Walk the system's network interfaces and collect the IPv4 and IPv6
+ * addresses of those that are InfiniBand (per rpcib_is_ib_interface).
+ * Addresses are stored into saddr4/saddr6 and their counts returned
+ * through number4/number6.  Returns B_TRUE on success, B_FALSE if an
+ * ioctl fails (counts are then 0).
+ *
+ * NOTE(review): assumes saddr4 and saddr6 are each sized to hold one
+ * entry per configured interface -- confirm against callers.
+ */
+static boolean_t
+rpcib_get_ib_addresses(
+	struct sockaddr_in *saddr4,
+	struct sockaddr_in6 *saddr6,
+	uint_t *number4,
+	uint_t *number6)
+{
+	int	numifs;
+	struct	ifconf	kifc;
+	struct  ifreq *ifr;
+	boolean_t ret = B_FALSE;
+
+	*number4 = 0;
+	*number6 = 0;
+
+	if (rpcib_do_ip_ioctl(SIOCGIFNUM, sizeof (int), (caddr_t)&numifs)) {
+		return (ret);
+	}
+
+	/* Fetch the full interface configuration list. */
+	kifc.ifc_len = numifs * sizeof (struct ifreq);
+	kifc.ifc_buf = kmem_zalloc(kifc.ifc_len, KM_SLEEP);
+
+	if (rpcib_do_ip_ioctl(SIOCGIFCONF, sizeof (struct ifconf),
+	    (caddr_t)&kifc)) {
+		goto done;
+	}
+
+	ifr = kifc.ifc_req;
+	for (numifs = kifc.ifc_len / sizeof (struct ifreq);
+	    numifs > 0; numifs--, ifr++) {
+		struct sockaddr_in *sin4;
+		struct sockaddr_in6 *sin6;
+
+		/* Keep only addresses belonging to IB interfaces. */
+		if ((rpcib_is_ib_interface(ifr->ifr_name) == 0)) {
+			sin4 = (struct sockaddr_in *)(uintptr_t)&ifr->ifr_addr;
+			sin6 = (struct sockaddr_in6 *)(uintptr_t)&ifr->ifr_addr;
+			if (sin4->sin_family == AF_INET) {
+				saddr4[*number4] = *(struct sockaddr_in *)
+				    (uintptr_t)&ifr->ifr_addr;
+				*number4 = *number4 + 1;
+			} else if (sin6->sin6_family == AF_INET6) {
+				saddr6[*number6] = *(struct sockaddr_in6 *)
+				    (uintptr_t)&ifr->ifr_addr;
+				*number6 = *number6 + 1;
+			}
+		}
+	}
+	ret = B_TRUE;
+done:
+	kmem_free(kifc.ifc_buf, kifc.ifc_len);
+	return (ret);
+}
+
+/*
+ * kstat update callback for the rpcib buffer-cache statistics.
+ * Snapshots the cache counters into rpcib_kstat; the statistics are
+ * read-only, so KSTAT_WRITE is refused with EACCES.
+ *
+ * NOTE(review): the counters are read without a lock -- confirm that
+ * approximate snapshots are acceptable here.
+ */
+/* ARGSUSED */
+static int
+rpcib_cache_kstat_update(kstat_t *ksp, int rw)
+{
+	if (KSTAT_WRITE == rw) {
+		return (EACCES);
+	}
+	rpcib_kstat.cache_limit.value.ui64 =
+	    (uint64_t)cache_limit;
+	rpcib_kstat.cache_allocation.value.ui64 =
+	    (uint64_t)cache_allocation;
+	rpcib_kstat.cache_hits.value.ui64 =
+	    (uint64_t)cache_hits;
+	rpcib_kstat.cache_misses.value.ui64 =
+	    (uint64_t)cache_misses;
+	rpcib_kstat.cache_misses_above_the_limit.value.ui64 =
+	    (uint64_t)cache_misses_above_the_limit;
+	return (0);
+}
--- a/usr/src/uts/common/rpc/rpcsec_gss.h	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/rpc/rpcsec_gss.h	Thu Aug 21 18:01:07 2008 -0500
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -35,8 +34,6 @@
 #ifndef	_RPCSEC_GSS_H
 #define	_RPCSEC_GSS_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -201,6 +198,7 @@
 enum auth_stat __svcrpcsec_gss(struct svc_req *,
 			struct rpc_msg *, bool_t *);
 bool_t rpc_gss_set_defaults(AUTH *, rpc_gss_service_t, uint_t);
+rpc_gss_service_t rpc_gss_get_service_type(AUTH *);
 
 
 #else
--- a/usr/src/uts/common/rpc/sec_gss/rpcsec_gss.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/rpc/sec_gss/rpcsec_gss.c	Thu Aug 21 18:01:07 2008 -0500
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Copyright 1993 OpenVision Technologies, Inc., All Rights Reserved.
  *
@@ -1572,3 +1570,11 @@
 {
 	return (0);
 }
+
+/*
+ * Return the RPCSEC_GSS service type currently recorded in the
+ * private data of the given AUTH handle.
+ */
+rpc_gss_service_t
+rpc_gss_get_service_type(AUTH *auth)
+{
+	rpc_gss_data		*ap = AUTH_PRIVATE(auth);
+
+	return (ap->service);
+}
--- a/usr/src/uts/common/rpc/svc.h	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/rpc/svc.h	Thu Aug 21 18:01:07 2008 -0500
@@ -37,8 +37,6 @@
 #ifndef	_RPC_SVC_H
 #define	_RPC_SVC_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <rpc/rpc_com.h>
 #include <rpc/rpc_msg.h>
 #include <sys/tihdr.h>
@@ -437,7 +435,7 @@
  * transport type.
  */
 
-#define	SVC_P2LEN   64
+#define	SVC_P2LEN   128
 
 struct __svcxprt {
 	__SVCXPRT_COMMON xp_xpc;
--- a/usr/src/uts/common/rpc/svc_rdma.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/rpc/svc_rdma.c	Thu Aug 21 18:01:07 2008 -0500
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -31,8 +30,6 @@
  * California.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Server side of RPC over RDMA in the kernel.
  */
@@ -66,6 +63,20 @@
 #include <inet/ip.h>
 #include <inet/ip6.h>
 
+#include <nfs/nfs.h>
+#include <sys/sdt.h>
+
+#define	SVC_RDMA_SUCCESS 0
+#define	SVC_RDMA_FAIL -1
+
+#define	SVC_CREDIT_FACTOR (0.5)
+
+#define	MSG_IS_RPCSEC_GSS(msg)		\
+	((msg)->rm_reply.rp_acpt.ar_verf.oa_flavor == RPCSEC_GSS)
+
+
+uint32_t rdma_bufs_granted = RDMA_BUFS_GRANT;
+
 /*
  * RDMA transport specific data associated with SVCMASTERXPRT
  */
@@ -81,12 +92,10 @@
 struct clone_rdma_data {
 	CONN		*conn;		/* RDMA connection */
 	rdma_buf_t	rpcbuf;		/* RPC req/resp buffer */
+	struct clist	*cl_reply;	/* reply chunk buffer info */
+	struct clist	*cl_wlist;		/* write list clist */
 };
 
-#ifdef DEBUG
-int rdma_svc_debug = 0;
-#endif
-
 #define	MAXADDRLEN	128	/* max length for address mask */
 
 /*
@@ -107,6 +116,17 @@
 static void		svc_rdma_kstart(SVCMASTERXPRT *);
 void			svc_rdma_kstop(SVCMASTERXPRT *);
 
+static int	svc_process_long_reply(SVCXPRT *, xdrproc_t,
+			caddr_t, struct rpc_msg *, bool_t, int *,
+			int *, int *, unsigned int *);
+
+static int	svc_compose_rpcmsg(SVCXPRT *, CONN *, xdrproc_t,
+			caddr_t, rdma_buf_t *, XDR **, struct rpc_msg *,
+			bool_t, uint_t *);
+static bool_t rpcmsg_length(xdrproc_t,
+		caddr_t,
+		struct rpc_msg *, bool_t, int);
+
 /*
  * Server transport operations vector.
  */
@@ -137,6 +157,9 @@
 	kstat_named_t	rsdupchecks;
 	kstat_named_t	rsdupreqs;
 	kstat_named_t	rslongrpcs;
+	kstat_named_t	rstotalreplies;
+	kstat_named_t	rstotallongreplies;
+	kstat_named_t	rstotalinlinereplies;
 } rdmarsstat = {
 	{ "calls",	KSTAT_DATA_UINT64 },
 	{ "badcalls",	KSTAT_DATA_UINT64 },
@@ -145,14 +168,16 @@
 	{ "xdrcall",	KSTAT_DATA_UINT64 },
 	{ "dupchecks",	KSTAT_DATA_UINT64 },
 	{ "dupreqs",	KSTAT_DATA_UINT64 },
-	{ "longrpcs",	KSTAT_DATA_UINT64 }
+	{ "longrpcs",	KSTAT_DATA_UINT64 },
+	{ "totalreplies",	KSTAT_DATA_UINT64 },
+	{ "totallongreplies",	KSTAT_DATA_UINT64 },
+	{ "totalinlinereplies",	KSTAT_DATA_UINT64 },
 };
 
 kstat_named_t *rdmarsstat_ptr = (kstat_named_t *)&rdmarsstat;
 uint_t rdmarsstat_ndata = sizeof (rdmarsstat) / sizeof (kstat_named_t);
 
-#define	RSSTAT_INCR(x)	rdmarsstat.x.value.ui64++
-
+#define	RSSTAT_INCR(x)	atomic_add_64(&rdmarsstat.x.value.ui64, 1)
 /*
  * Create a transport record.
  * The transport record, output buffer, and private data structure
@@ -163,7 +188,7 @@
 /* ARGSUSED */
 int
 svc_rdma_kcreate(char *netid, SVC_CALLOUT_TABLE *sct, int id,
-	rdma_xprt_group_t *started_xprts)
+    rdma_xprt_group_t *started_xprts)
 {
 	int error;
 	SVCMASTERXPRT *xprt;
@@ -171,11 +196,13 @@
 	rdma_registry_t *rmod;
 	rdma_xprt_record_t *xprt_rec;
 	queue_t	*q;
-
 	/*
 	 * modload the RDMA plugins is not already done.
 	 */
 	if (!rdma_modloaded) {
+		/*CONSTANTCONDITION*/
+		ASSERT(sizeof (struct clone_rdma_data) <= SVC_P2LEN);
+
 		mutex_enter(&rdma_modload_lock);
 		if (!rdma_modloaded) {
 			error = rdma_modload();
@@ -239,7 +266,7 @@
 
 		if (netid != NULL) {
 			xprt->xp_netid = kmem_alloc(strlen(netid) + 1,
-						KM_SLEEP);
+			    KM_SLEEP);
 			(void) strcpy(xprt->xp_netid, netid);
 		}
 
@@ -260,8 +287,7 @@
 		rd->rd_data.svcid = id;
 		error = svc_xprt_register(xprt, id);
 		if (error) {
-			cmn_err(CE_WARN, "svc_rdma_kcreate: svc_xprt_register"
-				"failed");
+			DTRACE_PROBE(krpc__e__svcrdma__xprt__reg);
 			goto cleanup;
 		}
 
@@ -351,8 +377,7 @@
 	 */
 	(*rmod->rdma_ops->rdma_svc_stop)(svcdata);
 	if (svcdata->active)
-		cmn_err(CE_WARN, "rdma_stop: Failed to shutdown RDMA based kRPC"
-			"  listener");
+		DTRACE_PROBE(krpc__e__svcrdma__kstop);
 }
 
 /* ARGSUSED */
@@ -364,128 +389,103 @@
 static bool_t
 svc_rdma_krecv(SVCXPRT *clone_xprt, mblk_t *mp, struct rpc_msg *msg)
 {
-	XDR *xdrs;
-	rdma_stat status;
-	struct recv_data *rdp = (struct recv_data *)mp->b_rptr;
-	CONN *conn;
-	struct clone_rdma_data *vd;
-	struct clist *cl;
-	uint_t vers, op, pos;
-	uint32_t xid;
+	XDR	*xdrs;
+	CONN	*conn;
 
-	vd = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
+	rdma_recv_data_t	*rdp = (rdma_recv_data_t *)mp->b_rptr;
+	struct clone_rdma_data *crdp;
+	struct clist	*cl = NULL;
+	struct clist	*wcl = NULL;
+	struct clist	*cllong = NULL;
+
+	rdma_stat	status;
+	uint32_t vers, op, pos, xid;
+	uint32_t rdma_credit;
+	uint32_t wcl_total_length = 0;
+	bool_t	wwl = FALSE;
+
+	crdp = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
 	RSSTAT_INCR(rscalls);
 	conn = rdp->conn;
 
-	/*
-	 * Post a receive descriptor on this
-	 * endpoint to ensure all packets are received.
-	 */
 	status = rdma_svc_postrecv(conn);
 	if (status != RDMA_SUCCESS) {
-		cmn_err(CE_NOTE,
-		    "svc_rdma_krecv: rdma_svc_postrecv failed %d", status);
+		DTRACE_PROBE(krpc__e__svcrdma__krecv__postrecv);
+		goto badrpc_call;
 	}
 
-	if (rdp->status != 0) {
-		RDMA_BUF_FREE(conn, &rdp->rpcmsg);
-		RDMA_REL_CONN(conn);
-		RSSTAT_INCR(rsbadcalls);
-		freeb(mp);
-		return (FALSE);
-	}
-
-	/*
-	 * Decode rpc message
-	 */
 	xdrs = &clone_xprt->xp_xdrin;
 	xdrmem_create(xdrs, rdp->rpcmsg.addr, rdp->rpcmsg.len, XDR_DECODE);
+	xid = *(uint32_t *)rdp->rpcmsg.addr;
+	XDR_SETPOS(xdrs, sizeof (uint32_t));
 
-	/*
-	 * Get the XID
-	 */
-	/*
-	 * Treat xid as opaque (xid is the first entity
-	 * in the rpc rdma message).
-	 */
-	xid = *(uint32_t *)rdp->rpcmsg.addr;
-	/* Skip xid and set the xdr position accordingly. */
-	XDR_SETPOS(xdrs, sizeof (uint32_t));
 	if (! xdr_u_int(xdrs, &vers) ||
+	    ! xdr_u_int(xdrs, &rdma_credit) ||
 	    ! xdr_u_int(xdrs, &op)) {
-		cmn_err(CE_WARN, "svc_rdma_krecv: xdr_u_int failed");
-		XDR_DESTROY(xdrs);
-		RDMA_BUF_FREE(conn, &rdp->rpcmsg);
-		RDMA_REL_CONN(conn);
-		freeb(mp);
-		RSSTAT_INCR(rsbadcalls);
-		return (FALSE);
-	}
-	if (op == RDMA_DONE) {
-		/*
-		 * Should not get RDMA_DONE
-		 */
-		freeb(mp);
-		XDR_DESTROY(xdrs);
-		RDMA_BUF_FREE(conn, &rdp->rpcmsg);
-		RDMA_REL_CONN(conn);
-		RSSTAT_INCR(rsbadcalls);
-		return (FALSE); /* no response */
+		DTRACE_PROBE(krpc__e__svcrdma__krecv__uint);
+		goto xdr_err;
 	}
 
-#ifdef DEBUG
-	if (rdma_svc_debug)
-		printf("svc_rdma_krecv: recv'd call xid %u\n", xid);
-#endif
-	/*
-	 * Now decode the chunk list
-	 */
-	cl = NULL;
+	/* Checking if the status of the recv operation was normal */
+	if (rdp->status != 0) {
+		DTRACE_PROBE1(krpc__e__svcrdma__krecv__invalid__status,
+		    int, rdp->status);
+		goto badrpc_call;
+	}
+
 	if (! xdr_do_clist(xdrs, &cl)) {
-		cmn_err(CE_WARN, "svc_rdma_krecv: xdr_do_clist failed");
+		DTRACE_PROBE(krpc__e__svcrdma__krecv__do__clist);
+		goto xdr_err;
 	}
 
+	if (!xdr_decode_wlist_svc(xdrs, &wcl, &wwl, &wcl_total_length, conn)) {
+		DTRACE_PROBE(krpc__e__svcrdma__krecv__decode__wlist);
+		if (cl)
+			clist_free(cl);
+		goto xdr_err;
+	}
+	crdp->cl_wlist = wcl;
+
+	crdp->cl_reply = NULL;
+	(void) xdr_decode_reply_wchunk(xdrs, &crdp->cl_reply);
+
 	/*
 	 * A chunk at 0 offset indicates that the RPC call message
 	 * is in a chunk. Get the RPC call message chunk.
 	 */
 	if (cl != NULL && op == RDMA_NOMSG) {
-		struct clist *cllong;	/* Long RPC chunk */
 
 		/* Remove RPC call message chunk from chunklist */
 		cllong = cl;
 		cl = cl->c_next;
 		cllong->c_next = NULL;
 
+
 		/* Allocate and register memory for the RPC call msg chunk */
-		cllong->c_daddr = (uint64)(uintptr_t)
-		    kmem_alloc(cllong->c_len, KM_SLEEP);
-		if (cllong->c_daddr == NULL) {
-			cmn_err(CE_WARN,
-				"svc_rdma_krecv: no memory for rpc call");
-			XDR_DESTROY(xdrs);
-			RDMA_BUF_FREE(conn, &rdp->rpcmsg);
-			RDMA_REL_CONN(conn);
-			freeb(mp);
-			RSSTAT_INCR(rsbadcalls);
-			clist_free(cl);
+		cllong->rb_longbuf.type = RDMA_LONG_BUFFER;
+		cllong->rb_longbuf.len = cllong->c_len > LONG_REPLY_LEN ?
+		    cllong->c_len : LONG_REPLY_LEN;
+
+		if (rdma_buf_alloc(conn, &cllong->rb_longbuf)) {
 			clist_free(cllong);
-			return (FALSE);
+			goto cll_malloc_err;
 		}
-		status = clist_register(conn, cllong, 0);
+
+		cllong->u.c_daddr3 = cllong->rb_longbuf.addr;
+
+		if (cllong->u.c_daddr == NULL) {
+			DTRACE_PROBE(krpc__e__svcrdma__krecv__nomem);
+			rdma_buf_free(conn, &cllong->rb_longbuf);
+			clist_free(cllong);
+			goto cll_malloc_err;
+		}
+
+		status = clist_register(conn, cllong, CLIST_REG_DST);
 		if (status) {
-			cmn_err(CE_WARN,
-				"svc_rdma_krecv: clist_register failed");
-			kmem_free((void *)(uintptr_t)cllong->c_daddr,
-			    cllong->c_len);
-			XDR_DESTROY(xdrs);
-			RDMA_BUF_FREE(conn, &rdp->rpcmsg);
-			RDMA_REL_CONN(conn);
-			freeb(mp);
-			RSSTAT_INCR(rsbadcalls);
-			clist_free(cl);
+			DTRACE_PROBE(krpc__e__svcrdma__krecv__clist__reg);
+			rdma_buf_free(conn, &cllong->rb_longbuf);
 			clist_free(cllong);
-			return (FALSE);
+			goto cll_malloc_err;
 		}
 
 		/*
@@ -493,67 +493,49 @@
 		 */
 		status = RDMA_READ(conn, cllong, WAIT);
 		if (status) {
-			cmn_err(CE_WARN,
-			    "svc_rdma_krecv: rdma_read failed %d", status);
-			(void) clist_deregister(conn, cllong, 0);
-			kmem_free((void *)(uintptr_t)cllong->c_daddr,
-			    cllong->c_len);
-			XDR_DESTROY(xdrs);
-			RDMA_BUF_FREE(conn, &rdp->rpcmsg);
-			RDMA_REL_CONN(conn);
-			freeb(mp);
-			RSSTAT_INCR(rsbadcalls);
-			clist_free(cl);
+			DTRACE_PROBE(krpc__e__svcrdma__krecv__read);
+			(void) clist_deregister(conn, cllong, CLIST_REG_DST);
+			rdma_buf_free(conn, &cllong->rb_longbuf);
 			clist_free(cllong);
-			return (FALSE);
+			goto cll_malloc_err;
 		}
-		/*
-		 * Sync memory for CPU after DMA
-		 */
-		status = clist_syncmem(conn, cllong, 0);
 
-		/*
-		 * Deregister the chunk
-		 */
-		(void) clist_deregister(conn, cllong, 0);
+		status = clist_syncmem(conn, cllong, CLIST_REG_DST);
+		(void) clist_deregister(conn, cllong, CLIST_REG_DST);
 
-		/*
-		 * Setup the XDR for the RPC call message
-		 */
-		xdrrdma_create(xdrs, (caddr_t)(uintptr_t)cllong->c_daddr,
+		xdrrdma_create(xdrs, (caddr_t)(uintptr_t)cllong->u.c_daddr3,
 		    cllong->c_len, 0, cl, XDR_DECODE, conn);
-		vd->rpcbuf.type = CHUNK_BUFFER;
-		vd->rpcbuf.addr = (caddr_t)(uintptr_t)cllong->c_daddr;
-		vd->rpcbuf.len = cllong->c_len;
-		vd->rpcbuf.handle.mrc_rmr = 0;
 
-		/*
-		 * Free the chunk element with the Long RPC details and
-		 * the message received.
-		 */
+		crdp->rpcbuf = cllong->rb_longbuf;
+		crdp->rpcbuf.len = cllong->c_len;
 		clist_free(cllong);
 		RDMA_BUF_FREE(conn, &rdp->rpcmsg);
 	} else {
 		pos = XDR_GETPOS(xdrs);
+		xdrrdma_create(xdrs, rdp->rpcmsg.addr + pos,
+		    rdp->rpcmsg.len - pos, 0, cl, XDR_DECODE, conn);
+		crdp->rpcbuf = rdp->rpcmsg;
 
-		/*
-		 * Now the RPC call message header
-		 */
-		xdrrdma_create(xdrs, rdp->rpcmsg.addr + pos,
-			rdp->rpcmsg.len - pos, 0, cl, XDR_DECODE, conn);
-		vd->rpcbuf = rdp->rpcmsg;
+		/* Use xdrrdmablk_ops to indicate there is a read chunk list */
+		if (cl != NULL) {
+			int32_t flg = XDR_RDMA_RLIST_REG;
+
+			XDR_CONTROL(xdrs, XDR_RDMA_SET_FLAGS, &flg);
+			xdrs->x_ops = &xdrrdmablk_ops;
+		}
 	}
+
+	if (crdp->cl_wlist) {
+		int32_t flg = XDR_RDMA_WLIST_REG;
+
+		XDR_CONTROL(xdrs, XDR_RDMA_SET_WLIST, crdp->cl_wlist);
+		XDR_CONTROL(xdrs, XDR_RDMA_SET_FLAGS, &flg);
+	}
+
 	if (! xdr_callmsg(xdrs, msg)) {
-		cmn_err(CE_WARN, "svc_rdma_krecv: xdr_callmsg failed");
-		if (cl != NULL)
-			clist_free(cl);
-		XDR_DESTROY(xdrs);
-		rdma_buf_free(conn, &vd->rpcbuf);
-		RDMA_REL_CONN(conn);
-		freeb(mp);
+		DTRACE_PROBE(krpc__e__svcrdma__krecv__callmsg);
 		RSSTAT_INCR(rsxdrcall);
-		RSSTAT_INCR(rsbadcalls);
-		return (FALSE);
+		goto callmsg_err;
 	}
 
 	/*
@@ -563,48 +545,224 @@
 	clone_xprt->xp_rtaddr.buf = conn->c_raddr.buf;
 	clone_xprt->xp_rtaddr.len = conn->c_raddr.len;
 	clone_xprt->xp_rtaddr.maxlen = conn->c_raddr.len;
+	clone_xprt->xp_xid = xid;
+	crdp->conn = conn;
 
-#ifdef DEBUG
-	if (rdma_svc_debug) {
-		struct sockaddr_in *sin4;
-		char print_addr[INET_ADDRSTRLEN];
+	freeb(mp);
+
+	return (TRUE);
+
+callmsg_err:
+	rdma_buf_free(conn, &crdp->rpcbuf);
+
+cll_malloc_err:
+	if (cl)
+		clist_free(cl);
+xdr_err:
+	XDR_DESTROY(xdrs);
+
+badrpc_call:
+	RDMA_BUF_FREE(conn, &rdp->rpcmsg);
+	RDMA_REL_CONN(conn);
+	freeb(mp);
+	RSSTAT_INCR(rsbadcalls);
+	return (FALSE);
+}
+
+static int
+svc_process_long_reply(SVCXPRT * clone_xprt,
+    xdrproc_t xdr_results, caddr_t xdr_location,
+    struct rpc_msg *msg, bool_t has_args, int *msglen,
+    int *freelen, int *numchunks, unsigned int *final_len)
+{
+	int status;
+	XDR xdrslong;
+	struct clist *wcl = NULL;
+	int count = 0;
+	int alloc_len;
+	char  *memp;
+	rdma_buf_t long_rpc = {0};
+	struct clone_rdma_data *crdp;
+
+	crdp = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
+
+	bzero(&xdrslong, sizeof (xdrslong));
+
+	/* Choose a size for the long rpc response */
+	if (MSG_IS_RPCSEC_GSS(msg)) {
+		alloc_len = RNDUP(MAX_AUTH_BYTES + *msglen);
+	} else {
+		alloc_len = RNDUP(*msglen);
+	}
+
+	if (alloc_len <= 64 * 1024) {
+		if (alloc_len > 32 * 1024) {
+			alloc_len = 64 * 1024;
+		} else {
+			if (alloc_len > 16 * 1024) {
+				alloc_len = 32 * 1024;
+			} else {
+				alloc_len = 16 * 1024;
+			}
+		}
+	}
+
+	long_rpc.type = RDMA_LONG_BUFFER;
+	long_rpc.len = alloc_len;
+	if (rdma_buf_alloc(crdp->conn, &long_rpc)) {
+		return (SVC_RDMA_FAIL);
+	}
+
+	memp = long_rpc.addr;
+	xdrmem_create(&xdrslong, memp, alloc_len, XDR_ENCODE);
+
+	msg->rm_xid = clone_xprt->xp_xid;
+
+	if (!(xdr_replymsg(&xdrslong, msg) &&
+	    (!has_args || SVCAUTH_WRAP(&clone_xprt->xp_auth, &xdrslong,
+	    xdr_results, xdr_location)))) {
+		rdma_buf_free(crdp->conn, &long_rpc);
+		DTRACE_PROBE(krpc__e__svcrdma__longrep__authwrap);
+		return (SVC_RDMA_FAIL);
+	}
+
+	*final_len = XDR_GETPOS(&xdrslong);
+
+	*numchunks = 0;
+	*freelen = 0;
+
+	wcl = crdp->cl_reply;
+	wcl->rb_longbuf = long_rpc;
+
+	count = *final_len;
+	while (wcl != NULL) {
+		if (wcl->c_dmemhandle.mrc_rmr == 0)
+			break;
 
-		sin4 = (struct sockaddr_in *)clone_xprt->xp_rtaddr.buf;
-		bzero(print_addr, INET_ADDRSTRLEN);
-		(void) inet_ntop(AF_INET,
-		    &sin4->sin_addr, print_addr, INET_ADDRSTRLEN);
-		cmn_err(CE_NOTE,
-		    "svc_rdma_krecv: remote clnt_addr: %s", print_addr);
+		if (wcl->c_len > count) {
+			wcl->c_len = count;
+		}
+		wcl->w.c_saddr3 = (caddr_t)memp;
+
+		count -= wcl->c_len;
+		*numchunks +=  1;
+		if (count == 0)
+			break;
+		memp += wcl->c_len;
+		wcl = wcl->c_next;
+	}
+
+	wcl = crdp->cl_reply;
+
+	/*
+	 * MUST fail if there is still more data to place
+	 */
+	if (count > 0) {
+		rdma_buf_free(crdp->conn, &long_rpc);
+		DTRACE_PROBE(krpc__e__svcrdma__longrep__dlen__clist);
+		return (SVC_RDMA_FAIL);
+	}
+
+	if (clist_register(crdp->conn, wcl, CLIST_REG_SOURCE) != RDMA_SUCCESS) {
+		rdma_buf_free(crdp->conn, &long_rpc);
+		DTRACE_PROBE(krpc__e__svcrdma__longrep__clistreg);
+		return (SVC_RDMA_FAIL);
+	}
+
+	status = clist_syncmem(crdp->conn, wcl, CLIST_REG_SOURCE);
+
+	if (status) {
+		(void) clist_deregister(crdp->conn, wcl, CLIST_REG_SOURCE);
+		rdma_buf_free(crdp->conn, &long_rpc);
+		DTRACE_PROBE(krpc__e__svcrdma__longrep__syncmem);
+		return (SVC_RDMA_FAIL);
 	}
-#endif
+
+	status = RDMA_WRITE(crdp->conn, wcl, WAIT);
+
+	(void) clist_deregister(crdp->conn, wcl, CLIST_REG_SOURCE);
+	rdma_buf_free(crdp->conn, &wcl->rb_longbuf);
+
+	if (status != RDMA_SUCCESS) {
+		DTRACE_PROBE(krpc__e__svcrdma__longrep__write);
+		return (SVC_RDMA_FAIL);
+	}
+
+	return (SVC_RDMA_SUCCESS);
+}
+
 
-	clone_xprt->xp_xid = xid;
-	vd->conn = conn;
-	freeb(mp);
-	return (TRUE);
+/*
+ * Encode an inline RPC reply: allocate a SEND_BUFFER from the
+ * connection, XDR-encode the reply header (and, when has_args, the
+ * wrapped results) into it, and return the encoded length via *len.
+ * Returns SVC_RDMA_SUCCESS or SVC_RDMA_FAIL (buffer freed on failure).
+ */
+static int
+svc_compose_rpcmsg(SVCXPRT * clone_xprt, CONN * conn, xdrproc_t xdr_results,
+    caddr_t xdr_location, rdma_buf_t *rpcreply, XDR ** xdrs,
+    struct rpc_msg *msg, bool_t has_args, uint_t *len)
+{
+	/*
+	 * Get a pre-allocated buffer for rpc reply
+	 */
+	rpcreply->type = SEND_BUFFER;
+	if (rdma_buf_alloc(conn, rpcreply)) {
+		DTRACE_PROBE(krpc__e__svcrdma__rpcmsg__reply__nofreebufs);
+		return (SVC_RDMA_FAIL);
+	}
+
+	xdrrdma_create(*xdrs, rpcreply->addr, rpcreply->len,
+	    0, NULL, XDR_ENCODE, conn);
+
+	msg->rm_xid = clone_xprt->xp_xid;
+
+	if (has_args) {
+		/*
+		 * NOTE(review): the "!has_args ||" below is always FALSE
+		 * inside this branch and thus redundant.
+		 */
+		if (!(xdr_replymsg(*xdrs, msg) &&
+		    (!has_args ||
+		    SVCAUTH_WRAP(&clone_xprt->xp_auth, *xdrs,
+		    xdr_results, xdr_location)))) {
+			rdma_buf_free(conn, rpcreply);
+			DTRACE_PROBE(
+			    krpc__e__svcrdma__rpcmsg__reply__authwrap1);
+			return (SVC_RDMA_FAIL);
+		}
+	} else {
+		if (!xdr_replymsg(*xdrs, msg)) {
+			rdma_buf_free(conn, rpcreply);
+			DTRACE_PROBE(
+			    krpc__e__svcrdma__rpcmsg__reply__authwrap2);
+			return (SVC_RDMA_FAIL);
+		}
+	}
+
+	*len = XDR_GETPOS(*xdrs);
+
+	return (SVC_RDMA_SUCCESS);
 }
 
 /*
  * Send rpc reply.
  */
 static bool_t
-svc_rdma_ksend(SVCXPRT *clone_xprt, struct rpc_msg *msg)
+svc_rdma_ksend(SVCXPRT * clone_xprt, struct rpc_msg *msg)
 {
-	struct clone_rdma_data *vd;
-	XDR *xdrs = &(clone_xprt->xp_xdrout), rxdrs;
+	XDR *xdrs_rpc = &(clone_xprt->xp_xdrout);
+	XDR xdrs_rhdr;
+	CONN *conn = NULL;
+	rdma_buf_t rbuf_resp = {0}, rbuf_rpc_resp = {0};
+
+	struct clone_rdma_data *crdp;
+	struct clist *cl_read = NULL;
+	struct clist *cl_send = NULL;
+	struct clist *cl_write = NULL;
+	xdrproc_t xdr_results;		/* results XDR encoding function */
+	caddr_t xdr_location;		/* response results pointer */
+
 	int retval = FALSE;
-	xdrproc_t xdr_results;
-	caddr_t xdr_location;
-	bool_t has_args, reg = FALSE;
-	uint_t len, op;
-	uint_t vers;
-	struct clist *cl = NULL, *cle = NULL;
-	struct clist *sendlist = NULL;
-	int status;
-	int msglen;
-	rdma_buf_t clmsg, longreply, rpcreply;
+	int status, msglen, num_wreply_segments = 0;
+	uint32_t rdma_credit = 0;
+	int freelen = 0;
+	bool_t has_args;
+	uint_t  final_resp_len, rdma_response_op, vers;
 
-	vd = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
+	bzero(&xdrs_rhdr, sizeof (XDR));
+	crdp = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
+	conn = crdp->conn;
 
 	/*
 	 * If there is a result procedure specified in the reply message,
@@ -624,343 +782,189 @@
 	}
 
 	/*
-	 * Get the size of the rpc reply message. Need this
-	 * to determine if the rpc reply message will fit in
-	 * the pre-allocated RDMA buffers. If the rpc reply
-	 * message length is greater that the pre-allocated
-	 * buffers then, a one time use buffer is allocated
-	 * and registered for this rpc reply.
+	 * Given the limit on the inline response size (RPC_MSG_SZ),
+	 * there is a need to make a guess as to the overall size of
+	 * the response.  If the resultant size is beyond the inline
+	 * size, then the server needs to use the "reply chunk list"
+	 * provided by the client (if the client provided one).  An
+	 * example of this type of response would be a READDIR
+	 * response (e.g. a small directory read would fit in RPC_MSG_SZ
+	 * and that is the preference but it may not fit)
+	 *
+	 * Combine the encoded size and the size of the true results
+	 * and then make the decision about where to encode and send results.
+	 *
+	 * One important note, this calculation is ignoring the size
+	 * of the encoding of the authentication overhead.  The reason
+	 * for this is rooted in the complexities of access to the
+	 * encoded size of RPCSEC_GSS related authentication,
+	 * integrity, and privacy.
+	 *
+	 * If it turns out that the encoded authentication bumps the
+	 * response over the RPC_MSG_SZ limit, then it may need to
+	 * attempt to encode for the reply chunk list.
+	 */
+
+	/*
+	 * Calculating the "sizeof" the RPC response header and the
+	 * encoded results.
 	 */
 	msglen = xdr_sizeof(xdr_replymsg, msg);
-	if (has_args && msg->rm_reply.rp_acpt.ar_verf.oa_flavor != RPCSEC_GSS) {
+
+	if (msglen > 0) {
+		RSSTAT_INCR(rstotalreplies);
+	}
+	if (has_args)
 		msglen += xdrrdma_sizeof(xdr_results, xdr_location,
-				rdma_minchunk);
-		if (msglen > RPC_MSG_SZ) {
+		    rdma_minchunk, NULL, NULL);
+
+	DTRACE_PROBE1(krpc__i__svcrdma__ksend__msglen, int, msglen);
 
-			/*
-			 * Allocate chunk buffer for rpc reply
-			 */
-			rpcreply.type = CHUNK_BUFFER;
-			rpcreply.addr = kmem_zalloc(msglen, KM_SLEEP);
-			cle = kmem_zalloc(sizeof (*cle), KM_SLEEP);
-			cle->c_xdroff = 0;
-			cle->c_len  = rpcreply.len = msglen;
-			cle->c_saddr = (uint64)(uintptr_t)rpcreply.addr;
-			cle->c_next = NULL;
-			xdrrdma_create(xdrs, rpcreply.addr, msglen,
-			    rdma_minchunk, cle, XDR_ENCODE, NULL);
-			op = RDMA_NOMSG;
-		} else {
-			/*
-			 * Get a pre-allocated buffer for rpc reply
-			 */
-			rpcreply.type = SEND_BUFFER;
-			if (RDMA_BUF_ALLOC(vd->conn, &rpcreply)) {
-				cmn_err(CE_WARN,
-				    "svc_rdma_ksend: no free buffers!");
-				return (retval);
-			}
-			xdrrdma_create(xdrs, rpcreply.addr, rpcreply.len,
-			    rdma_minchunk, NULL, XDR_ENCODE, NULL);
-			op = RDMA_MSG;
-		}
+	status = SVC_RDMA_SUCCESS;
 
+	if (msglen < RPC_MSG_SZ) {
 		/*
-		 * Initialize the XDR encode stream.
+		 * Looks like the response will fit in the inline
+		 * response; let's try
 		 */
-		msg->rm_xid = clone_xprt->xp_xid;
+		RSSTAT_INCR(rstotalinlinereplies);
+
+		rdma_response_op = RDMA_MSG;
 
-		if (!(xdr_replymsg(xdrs, msg) &&
-		    (!has_args || SVCAUTH_WRAP(&clone_xprt->xp_auth, xdrs,
-		    xdr_results, xdr_location)))) {
-			rdma_buf_free(vd->conn, &rpcreply);
-			if (cle)
-				clist_free(cle);
-			cmn_err(CE_WARN,
-			    "svc_rdma_ksend: xdr_replymsg/SVCAUTH_WRAP "
-			    "failed");
-			goto out;
+		status = svc_compose_rpcmsg(clone_xprt, conn, xdr_results,
+		    xdr_location, &rbuf_rpc_resp, &xdrs_rpc, msg,
+		    has_args, &final_resp_len);
+
+		DTRACE_PROBE1(krpc__i__srdma__ksend__compose_status,
+		    int, status);
+		DTRACE_PROBE1(krpc__i__srdma__ksend__compose_len,
+		    int, final_resp_len);
+
+		if (status == SVC_RDMA_SUCCESS && crdp->cl_reply) {
+			clist_free(crdp->cl_reply);
+			crdp->cl_reply = NULL;
 		}
-		len = XDR_GETPOS(xdrs);
 	}
-	if (has_args && msg->rm_reply.rp_acpt.ar_verf.oa_flavor == RPCSEC_GSS) {
 
+	/*
+	 * If the encode failed (size?) or the message really is
+	 * larger than what is allowed, try the response chunk list.
+	 */
+	if (status != SVC_RDMA_SUCCESS || msglen >= RPC_MSG_SZ) {
 		/*
-		 * For RPCSEC_GSS since we cannot accurately presize the
-		 * buffer required for encoding, we assume that its going
-		 * to be a Long RPC to start with. We also create the
-		 * the XDR stream with min_chunk set to 0 which instructs
-		 * the XDR layer to not chunk the incoming byte stream.
-		 */
-		msglen += 2 * MAX_AUTH_BYTES + 2 * sizeof (struct opaque_auth);
-		msglen += xdr_sizeof(xdr_results, xdr_location);
-
-		/*
-		 * Long RPC. Allocate one time use custom buffer.
+		 * attempting to use a reply chunk list when there
+		 * isn't one won't get very far...
 		 */
-		longreply.type = CHUNK_BUFFER;
-		longreply.addr = kmem_zalloc(msglen, KM_SLEEP);
-		cle = kmem_zalloc(sizeof (*cle), KM_SLEEP);
-		cle->c_xdroff = 0;
-		cle->c_len  = longreply.len = msglen;
-		cle->c_saddr = (uint64)(uintptr_t)longreply.addr;
-		cle->c_next = NULL;
-		xdrrdma_create(xdrs, longreply.addr, msglen, 0, cle,
-		    XDR_ENCODE, NULL);
-		op = RDMA_NOMSG;
-		/*
-		 * Initialize the XDR encode stream.
-		 */
-		msg->rm_xid = clone_xprt->xp_xid;
-
-		if (!(xdr_replymsg(xdrs, msg) &&
-		    (!has_args || SVCAUTH_WRAP(&clone_xprt->xp_auth, xdrs,
-		    xdr_results, xdr_location)))) {
-			if (longreply.addr != xdrs->x_base) {
-				longreply.addr = xdrs->x_base;
-				longreply.len = xdr_getbufsize(xdrs);
-			}
-			rdma_buf_free(vd->conn, &longreply);
-			if (cle)
-				clist_free(cle);
-			cmn_err(CE_WARN,
-			    "svc_rdma_ksend: xdr_replymsg/SVCAUTH_WRAP "
-			    "failed");
+		if (crdp->cl_reply == NULL) {
+			DTRACE_PROBE(krpc__e__svcrdma__ksend__noreplycl);
 			goto out;
 		}
 
-		/*
-		 * If we had to allocate a new buffer while encoding
-		 * then update the addr and len.
-		 */
-		if (longreply.addr != xdrs->x_base) {
-			longreply.addr = xdrs->x_base;
-			longreply.len = xdr_getbufsize(xdrs);
+		RSSTAT_INCR(rstotallongreplies);
+
+		msglen = xdr_sizeof(xdr_replymsg, msg);
+		msglen += xdrrdma_sizeof(xdr_results, xdr_location, 0,
+		    NULL, NULL);
+
+		status = svc_process_long_reply(clone_xprt, xdr_results,
+		    xdr_location, msg, has_args, &msglen, &freelen,
+		    &num_wreply_segments, &final_resp_len);
+
+		DTRACE_PROBE1(krpc__i__svcrdma__ksend__longreplen,
+		    int, final_resp_len);
+
+		if (status != SVC_RDMA_SUCCESS) {
+			DTRACE_PROBE(krpc__e__svcrdma__ksend__compose__failed);
+			goto out;
 		}
 
-		len = XDR_GETPOS(xdrs);
-
-		/*
-		 * If it so happens that the encoded message is after all
-		 * not long enough to be a Long RPC then allocate a
-		 * SEND_BUFFER and copy the encoded message into it.
-		 */
-		if (len > RPC_MSG_SZ) {
-			rpcreply.type = CHUNK_BUFFER;
-			rpcreply.addr = longreply.addr;
-			rpcreply.len = longreply.len;
-		} else {
-			clist_free(cle);
-			XDR_DESTROY(xdrs);
-			/*
-			 * Get a pre-allocated buffer for rpc reply
-			 */
-			rpcreply.type = SEND_BUFFER;
-			if (RDMA_BUF_ALLOC(vd->conn, &rpcreply)) {
-				cmn_err(CE_WARN,
-				    "svc_rdma_ksend: no free buffers!");
-				rdma_buf_free(vd->conn, &longreply);
-				return (retval);
-			}
-			bcopy(longreply.addr, rpcreply.addr, len);
-			xdrrdma_create(xdrs, rpcreply.addr, len, 0, NULL,
-			    XDR_ENCODE, NULL);
-			rdma_buf_free(vd->conn, &longreply);
-			op = RDMA_MSG;
-		}
+		rdma_response_op = RDMA_NOMSG;
 	}
 
-	if (has_args == FALSE) {
-
-		if (msglen > RPC_MSG_SZ) {
+	DTRACE_PROBE1(krpc__i__svcrdma__ksend__rdmamsg__len,
+	    int, final_resp_len);
 
-			/*
-			 * Allocate chunk buffer for rpc reply
-			 */
-			rpcreply.type = CHUNK_BUFFER;
-			rpcreply.addr = kmem_zalloc(msglen, KM_SLEEP);
-			cle = kmem_zalloc(sizeof (*cle), KM_SLEEP);
-			cle->c_xdroff = 0;
-			cle->c_len  = rpcreply.len = msglen;
-			cle->c_saddr = (uint64)(uintptr_t)rpcreply.addr;
-			cle->c_next = NULL;
-			xdrrdma_create(xdrs, rpcreply.addr, msglen,
-			    rdma_minchunk, cle, XDR_ENCODE, NULL);
-			op = RDMA_NOMSG;
-		} else {
-			/*
-			 * Get a pre-allocated buffer for rpc reply
-			 */
-			rpcreply.type = SEND_BUFFER;
-			if (RDMA_BUF_ALLOC(vd->conn, &rpcreply)) {
-				cmn_err(CE_WARN,
-				    "svc_rdma_ksend: no free buffers!");
-				return (retval);
-			}
-			xdrrdma_create(xdrs, rpcreply.addr, rpcreply.len,
-			    rdma_minchunk, NULL, XDR_ENCODE, NULL);
-			op = RDMA_MSG;
-		}
-
-		/*
-		 * Initialize the XDR encode stream.
-		 */
-		msg->rm_xid = clone_xprt->xp_xid;
-
-		if (!xdr_replymsg(xdrs, msg)) {
-			rdma_buf_free(vd->conn, &rpcreply);
-			if (cle)
-				clist_free(cle);
-			cmn_err(CE_WARN,
-			    "svc_rdma_ksend: xdr_replymsg/SVCAUTH_WRAP "
-			    "failed");
-			goto out;
-		}
-		len = XDR_GETPOS(xdrs);
+	rbuf_resp.type = SEND_BUFFER;
+	if (rdma_buf_alloc(conn, &rbuf_resp)) {
+		rdma_buf_free(conn, &rbuf_rpc_resp);
+		DTRACE_PROBE(krpc__e__svcrdma__ksend__nofreebufs);
+		goto out;
 	}
 
-	/*
-	 * Get clist and a buffer for sending it across
-	 */
-	cl = xdrrdma_clist(xdrs);
-	clmsg.type = SEND_BUFFER;
-	if (RDMA_BUF_ALLOC(vd->conn, &clmsg)) {
-		rdma_buf_free(vd->conn, &rpcreply);
-		cmn_err(CE_WARN, "svc_rdma_ksend: no free buffers!!");
+	rdma_credit = rdma_bufs_granted;
+
+	vers = RPCRDMA_VERS;
+	xdrmem_create(&xdrs_rhdr, rbuf_resp.addr, rbuf_resp.len, XDR_ENCODE);
+	(*(uint32_t *)rbuf_resp.addr) = msg->rm_xid;
+	/* Skip xid and set the xdr position accordingly. */
+	XDR_SETPOS(&xdrs_rhdr, sizeof (uint32_t));
+	if (!xdr_u_int(&xdrs_rhdr, &vers) ||
+	    !xdr_u_int(&xdrs_rhdr, &rdma_credit) ||
+	    !xdr_u_int(&xdrs_rhdr, &rdma_response_op)) {
+		rdma_buf_free(conn, &rbuf_rpc_resp);
+		rdma_buf_free(conn, &rbuf_resp);
+		DTRACE_PROBE(krpc__e__svcrdma__ksend__uint);
 		goto out;
 	}
 
 	/*
-	 * Now register the chunks in the list
+	 * Now XDR the read chunk list, actually always NULL
 	 */
-	if (cl != NULL) {
-		status = clist_register(vd->conn, cl, 1);
-		if (status != RDMA_SUCCESS) {
-			rdma_buf_free(vd->conn, &clmsg);
-			cmn_err(CE_WARN,
-				"svc_rdma_ksend: clist register failed");
-			goto out;
-		}
-		reg = TRUE;
-	}
+	(void) xdr_encode_rlist_svc(&xdrs_rhdr, cl_read);
 
 	/*
-	 * XDR the XID, vers, and op
-	 */
-	/*
-	 * Treat xid as opaque (xid is the first entity
-	 * in the rpc rdma message).
+	 * encode write list -- we already drove RDMA_WRITEs
 	 */
-	vers = RPCRDMA_VERS;
-	xdrs = &rxdrs;
-	xdrmem_create(xdrs, clmsg.addr, clmsg.len, XDR_ENCODE);
-	(*(uint32_t *)clmsg.addr) = msg->rm_xid;
-	/* Skip xid and set the xdr position accordingly. */
-	XDR_SETPOS(xdrs, sizeof (uint32_t));
-	if (! xdr_u_int(xdrs, &vers) ||
-	    ! xdr_u_int(xdrs, &op)) {
-		rdma_buf_free(vd->conn, &rpcreply);
-		rdma_buf_free(vd->conn, &clmsg);
-		cmn_err(CE_WARN, "svc_rdma_ksend: xdr_u_int failed");
+	cl_write = crdp->cl_wlist;
+	if (!xdr_encode_wlist(&xdrs_rhdr, cl_write)) {
+		DTRACE_PROBE(krpc__e__svcrdma__ksend__enc__wlist);
+		rdma_buf_free(conn, &rbuf_rpc_resp);
+		rdma_buf_free(conn, &rbuf_resp);
 		goto out;
 	}
 
 	/*
-	 * Now XDR the chunk list
+	 * XDR encode the RDMA_REPLY write chunk
 	 */
-	(void) xdr_do_clist(xdrs, &cl);
-
-	clist_add(&sendlist, 0, XDR_GETPOS(xdrs), &clmsg.handle, clmsg.addr,
-		NULL, NULL);
-
-	if (op == RDMA_MSG) {
-		clist_add(&sendlist, 0, len, &rpcreply.handle, rpcreply.addr,
-			NULL, NULL);
-	} else {
-		cl->c_len = len;
-		RSSTAT_INCR(rslongrpcs);
+	if (!xdr_encode_reply_wchunk(&xdrs_rhdr, crdp->cl_reply,
+	    num_wreply_segments)) {
+		rdma_buf_free(conn, &rbuf_rpc_resp);
+		rdma_buf_free(conn, &rbuf_resp);
+		goto out;
 	}
 
-	/*
-	 * Send the reply message to the client
-	 */
-	if (cl != NULL) {
-		status = clist_syncmem(vd->conn, cl, 1);
-		if (status != RDMA_SUCCESS) {
-			rdma_buf_free(vd->conn, &rpcreply);
-			rdma_buf_free(vd->conn, &clmsg);
-			goto out;
-		}
-#ifdef DEBUG
-	if (rdma_svc_debug)
-		printf("svc_rdma_ksend: chunk response len %d xid %u\n",
-			cl->c_len, msg->rm_xid);
-#endif
-		/*
-		 * Post a receive buffer because we expect a RDMA_DONE
-		 * message.
-		 */
-		status = rdma_svc_postrecv(vd->conn);
+	clist_add(&cl_send, 0, XDR_GETPOS(&xdrs_rhdr), &rbuf_resp.handle,
+	    rbuf_resp.addr, NULL, NULL);
 
-		/*
-		 * Send the RPC reply message and wait for RDMA_DONE
-		 */
-		status = RDMA_SEND_RESP(vd->conn, sendlist, msg->rm_xid);
-		if (status != RDMA_SUCCESS) {
-#ifdef DEBUG
-			if (rdma_svc_debug)
-				cmn_err(CE_NOTE, "svc_rdma_ksend: "
-					"rdma_send_resp failed %d", status);
-#endif
-			goto out;
-		}
-#ifdef DEBUG
-	if (rdma_svc_debug)
-		printf("svc_rdma_ksend: got RDMA_DONE xid %u\n", msg->rm_xid);
-#endif
-	} else {
-#ifdef DEBUG
-	if (rdma_svc_debug)
-		printf("svc_rdma_ksend: msg response xid %u\n", msg->rm_xid);
-#endif
-		status = RDMA_SEND(vd->conn, sendlist, msg->rm_xid);
-		if (status != RDMA_SUCCESS) {
-#ifdef DEBUG
-			if (rdma_svc_debug)
-				cmn_err(CE_NOTE, "svc_rdma_ksend: "
-					"rdma_send failed %d", status);
-#endif
-			goto out;
-		}
+	if (rdma_response_op == RDMA_MSG) {
+		clist_add(&cl_send, 0, final_resp_len, &rbuf_rpc_resp.handle,
+		    rbuf_rpc_resp.addr, NULL, NULL);
 	}
 
-	retval = TRUE;
-out:
-	/*
-	 * Deregister the chunks
-	 */
-	if (cl != NULL) {
-		if (reg)
-			(void) clist_deregister(vd->conn, cl, 1);
-		if (op == RDMA_NOMSG) {
-			/*
-			 * Long RPC reply in chunk. Free it up.
-			 */
-			rdma_buf_free(vd->conn, &rpcreply);
-		}
-		clist_free(cl);
+	status = RDMA_SEND(conn, cl_send, msg->rm_xid);
+
+	if (status == RDMA_SUCCESS) {
+		retval = TRUE;
 	}
 
+out:
 	/*
 	 * Free up sendlist chunks
 	 */
-	if (sendlist != NULL)
-		clist_free(sendlist);
+	if (cl_send != NULL)
+		clist_free(cl_send);
 
 	/*
 	 * Destroy private data for xdr rdma
 	 */
-	XDR_DESTROY(&(clone_xprt->xp_xdrout));
+	if (clone_xprt->xp_xdrout.x_ops != NULL) {
+		XDR_DESTROY(&(clone_xprt->xp_xdrout));
+	}
+
+	if (crdp->cl_reply) {
+		clist_free(crdp->cl_reply);
+		crdp->cl_reply = NULL;
+	}
 
 	/*
 	 * This is completely disgusting.  If public is set it is
@@ -968,9 +972,13 @@
 	 * of the function to free that structure and any related
 	 * stuff.  (see rrokfree in nfs_xdr.c).
 	 */
-	if (xdrs->x_public) {
+	if (xdrs_rpc->x_public) {
 		/* LINTED pointer alignment */
-		(**((int (**)())xdrs->x_public))(xdrs->x_public);
+		(**((int (**)()) xdrs_rpc->x_public)) (xdrs_rpc->x_public);
+	}
+
+	if (xdrs_rhdr.x_ops != NULL) {
+		XDR_DESTROY(&xdrs_rhdr);
 	}
 
 	return (retval);
@@ -992,24 +1000,29 @@
 svc_rdma_kfreeargs(SVCXPRT *clone_xprt, xdrproc_t xdr_args,
     caddr_t args_ptr)
 {
-	struct clone_rdma_data *vd;
+	struct clone_rdma_data *crdp;
 	bool_t retval;
 
-	vd = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
+	crdp = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
+
+	/*
+	 * Free the args if needed then XDR_DESTROY
+	 */
 	if (args_ptr) {
 		XDR	*xdrs = &clone_xprt->xp_xdrin;
-		struct clist *cl;
-
-		cl = xdrrdma_clist(xdrs);
-		if (cl != NULL)
-			clist_free(cl);
 
 		xdrs->x_op = XDR_FREE;
 		retval = (*xdr_args)(xdrs, args_ptr);
 	}
+
 	XDR_DESTROY(&(clone_xprt->xp_xdrin));
-	rdma_buf_free(vd->conn, &vd->rpcbuf);
-	RDMA_REL_CONN(vd->conn);
+	rdma_buf_free(crdp->conn, &crdp->rpcbuf);
+	if (crdp->cl_reply) {
+		clist_free(crdp->cl_reply);
+		crdp->cl_reply = NULL;
+	}
+	RDMA_REL_CONN(crdp->conn);
+
 	return (retval);
 }
 
@@ -1139,7 +1152,6 @@
 		while (dr->dr_status == DUP_INPROGRESS) {
 			dr = dr->dr_next;
 			if (dr == rdmadrmru->dr_next) {
-				cmn_err(CE_WARN, "svc_rdma_kdup no slots free");
 				mutex_exit(&rdmadupreq_lock);
 				return (DUP_ERROR);
 			}
@@ -1237,3 +1249,32 @@
 		drt = drt->dr_chain;
 	}
 }
+
+bool_t
+rdma_get_wchunk(struct svc_req *req, iovec_t *iov, struct clist *wlist)
+{
+	struct clist	*clist;
+	uint32_t	tlen;
+
+	if (req->rq_xprt->xp_type != T_RDMA) {
+		return (FALSE);
+	}
+
+	tlen = 0;
+	clist = wlist;
+	while (clist) {
+		tlen += clist->c_len;
+		clist = clist->c_next;
+	}
+
+	/*
+	 * set iov to addr+len of first segment of first wchunk of
+	 * wlist sent by client.  krecv() already malloc'd a buffer
+	 * large enough, but registration is deferred until we write
+	 * the buffer back to (NFS) client using RDMA_WRITE.
+	 */
+	iov->iov_base = (caddr_t)(uintptr_t)wlist->w.c_saddr;
+	iov->iov_len = tlen;
+
+	return (TRUE);
+}
--- a/usr/src/uts/common/rpc/xdr.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/rpc/xdr.c	Thu Aug 21 18:01:07 2008 -0500
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -32,8 +31,6 @@
  * under license from the Regents of the University of California.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * xdr.c, generic XDR routines implementation.
  * These are the "generic" xdr routines used to serialize and de-serialize
@@ -115,9 +112,6 @@
 	if (xdrs->x_op == XDR_FREE)
 		return (TRUE);
 
-#ifdef DEBUG
-	printf("xdr_int: FAILED\n");
-#endif
 	return (FALSE);
 }
 
@@ -141,9 +135,6 @@
 	if (xdrs->x_op == XDR_FREE)
 		return (TRUE);
 
-#ifdef DEBUG
-	printf("xdr_int: FAILED\n");
-#endif
 	return (FALSE);
 }
 
@@ -287,9 +278,6 @@
 
 	case XDR_DECODE:
 		if (!XDR_GETINT32(xdrs, (int32_t *)&l)) {
-#ifdef DEBUG
-			printf("xdr_u_short: decode FAILED\n");
-#endif
 			return (FALSE);
 		}
 		*usp = (ushort_t)l;
@@ -298,9 +286,6 @@
 	case XDR_FREE:
 		return (TRUE);
 	}
-#ifdef DEBUG
-	printf("xdr_u_short: bad op FAILED\n");
-#endif
 	return (FALSE);
 }
 
@@ -342,9 +327,6 @@
 
 	case XDR_DECODE:
 		if (!XDR_GETINT32(xdrs, &i32b)) {
-#ifdef DEBUG
-			printf("xdr_bool: decode FAILED\n");
-#endif
 			return (FALSE);
 		}
 		*bp = (i32b == XDR_FALSE) ? FALSE : TRUE;
@@ -353,9 +335,6 @@
 	case XDR_FREE:
 		return (TRUE);
 	}
-#ifdef DEBUG
-	printf("xdr_bool: bad op FAILED\n");
-#endif
 	return (FALSE);
 }
 
@@ -422,9 +401,6 @@
 
 	if (xdrs->x_op == XDR_DECODE) {
 		if (!XDR_GETBYTES(xdrs, cp, cnt)) {
-#ifdef DEBUG
-			printf("xdr_opaque: decode FAILED\n");
-#endif
 			return (FALSE);
 		}
 		if (rndup == 0)
@@ -434,9 +410,6 @@
 
 	if (xdrs->x_op == XDR_ENCODE) {
 		if (!XDR_PUTBYTES(xdrs, cp, cnt)) {
-#ifdef DEBUG
-			printf("xdr_opaque: encode FAILED\n");
-#endif
 			return (FALSE);
 		}
 		if (rndup == 0)
@@ -447,9 +420,6 @@
 	if (xdrs->x_op == XDR_FREE)
 		return (TRUE);
 
-#ifdef DEBUG
-	printf("xdr_opaque: bad op FAILED\n");
-#endif
 	return (FALSE);
 }
 
@@ -473,17 +443,10 @@
 	 * first deal with the length since xdr bytes are counted
 	 */
 	if (!xdr_u_int(xdrs, sizep)) {
-#ifdef DEBUG
-		printf("xdr_bytes: size FAILED\n");
-#endif
 		return (FALSE);
 	}
 	nodesize = *sizep;
 	if ((nodesize > maxsize) && (xdrs->x_op != XDR_FREE)) {
-#ifdef DEBUG
-		printf("xdr_bytes: bad size (%d) FAILED (%d max)\n",
-		    nodesize, maxsize);
-#endif
 		return (FALSE);
 	}
 
@@ -508,9 +471,6 @@
 		}
 		return (TRUE);
 	}
-#ifdef DEBUG
-	printf("xdr_bytes: bad op FAILED\n");
-#endif
 	return (FALSE);
 }
 
@@ -544,9 +504,6 @@
 	 * we deal with the discriminator;  it's an enum
 	 */
 	if (!xdr_enum(xdrs, dscmp)) {
-#ifdef DEBUG
-		printf("xdr_enum: dscmp FAILED\n");
-#endif
 		return (FALSE);
 	}
 	dscm = *dscmp;
@@ -604,15 +561,9 @@
 		break;
 	}
 	if (!xdr_u_int(xdrs, &size)) {
-#ifdef DEBUG
-		printf("xdr_string: size FAILED\n");
-#endif
 		return (FALSE);
 	}
 	if (size > maxsize) {
-#ifdef DEBUG
-		printf("xdr_string: bad size FAILED\n");
-#endif
 		return (FALSE);
 	}
 	nodesize = size + 1;
@@ -653,9 +604,6 @@
 		*cpp = NULL;
 		return (TRUE);
 	}
-#ifdef DEBUG
-	printf("xdr_string: bad op FAILED\n");
-#endif
 	return (FALSE);
 }
 
--- a/usr/src/uts/common/rpc/xdr.h	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/rpc/xdr.h	Thu Aug 21 18:01:07 2008 -0500
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,7 +18,7 @@
  *
  * CDDL HEADER END
  *
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -38,8 +37,6 @@
 #ifndef _RPC_XDR_H
 #define	_RPC_XDR_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/byteorder.h>	/* For all ntoh* and hton*() kind of macros */
 #include <rpc/types.h>	/* For all ntoh* and hton*() kind of macros */
 #ifndef _KERNEL
@@ -546,8 +543,14 @@
 #ifdef _KERNEL
 #define	XDR_PEEK		2
 #define	XDR_SKIPBYTES		3
-#define	XDR_RDMAGET		4
-#define	XDR_RDMASET		5
+#define	XDR_RDMA_GET_FLAGS	4
+#define	XDR_RDMA_SET_FLAGS	5
+#define	XDR_RDMA_ADD_CHUNK	6
+#define	XDR_RDMA_GET_CHUNK_LEN	7
+#define	XDR_RDMA_SET_WLIST	8
+#define	XDR_RDMA_GET_WLIST	9
+#define	XDR_RDMA_GET_WCINFO	10
+#define	XDR_RDMA_GET_RLIST	11
 #endif
 
 /*
@@ -586,8 +589,9 @@
 extern void	xdrmblk_init(XDR *, mblk_t *, enum xdr_op, int);
 extern bool_t	xdrmblk_getmblk(XDR *, mblk_t **, uint_t *);
 extern bool_t	xdrmblk_putmblk(XDR *, mblk_t *, uint_t);
-
 extern struct xdr_ops xdrmblk_ops;
+extern struct xdr_ops xdrrdmablk_ops;
+extern struct xdr_ops xdrrdma_ops;
 
 struct rpc_msg;
 extern bool_t	xdr_callmsg(XDR *, struct rpc_msg *);
--- a/usr/src/uts/common/rpc/xdr_array.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/rpc/xdr_array.c	Thu Aug 21 18:01:07 2008 -0500
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -32,8 +31,6 @@
  * under license from the Regents of the University of California.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * xdr_array.c, Generic XDR routines impelmentation.
  * These are the "non-trivial" xdr primitives used to serialize and de-serialize
@@ -69,17 +66,11 @@
 
 	/* like strings, arrays are really counted arrays */
 	if (!xdr_u_int(xdrs, sizep)) {
-#ifdef DEBUG
-		printf("xdr_array: size FAILED\n");
-#endif
 		return (FALSE);
 	}
 	c = *sizep;
 	if ((c > maxsize || LASTUNSIGNED / elsize < c) &&
 	    xdrs->x_op != XDR_FREE) {
-#ifdef DEBUG
-		printf("xdr_array: bad size FAILED\n");
-#endif
 		return (FALSE);
 	}
 	nodesize = c * elsize;
--- a/usr/src/uts/common/rpc/xdr_mblk.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/rpc/xdr_mblk.c	Thu Aug 21 18:01:07 2008 -0500
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -31,8 +31,6 @@
  * under license from the Regents of the University of California.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * xdr_mblk.c, XDR implementation on kernel streams mblks.
  */
@@ -116,7 +114,7 @@
 	 * align the mblk.
 	 */
 	if (!IS_P2ALIGNED(m->b_rptr, sizeof (int32_t)) ||
-			xdrs->x_handy < sizeof (int32_t)) {
+	    xdrs->x_handy < sizeof (int32_t)) {
 		while (!pullupmsg(m, sizeof (int32_t))) {
 			/*
 			 * Could have failed due to not
@@ -167,7 +165,7 @@
 			return (FALSE);
 		}
 		xdrs->x_handy = (int)(m->b_datap->db_lim - m->b_rptr -
-			    sizeof (int32_t));
+		    sizeof (int32_t));
 		ASSERT(m->b_rptr == m->b_wptr);
 		ASSERT(m->b_rptr >= m->b_datap->db_base);
 		ASSERT(m->b_rptr < m->b_datap->db_lim);
@@ -267,8 +265,6 @@
 	}
 	if (len < llen) {
 		if (m == NULL) {
-			/* not enough data in XDR stream */
-			printf("xdrmblk_getmblk failed\n");
 			return (FALSE);
 		} else {
 			int tail_bytes = llen - len;
@@ -530,7 +526,7 @@
 		int32p = (int32_t *)info;
 		len = RNDUP((int)(*int32p));
 		if (len < 0)
-		    return (FALSE);
+			return (FALSE);
 		while ((xdrs->x_handy -= len) < 0) {
 			if ((xdrs->x_handy += len) > 0) {
 				m->b_rptr += xdrs->x_handy;
--- a/usr/src/uts/common/rpc/xdr_rdma.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/rpc/xdr_rdma.c	Thu Aug 21 18:01:07 2008 -0500
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,11 +19,23 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
+/*
+ * Copyright (c) 2007, The Ohio State University. All rights reserved.
+ *
+ * Portions of this source code is developed by the team members of
+ * The Ohio State University's Network-Based Computing Laboratory (NBCL),
+ * headed by Professor Dhabaleswar K. (DK) Panda.
+ *
+ * Acknowledgements to contributions from developers:
+ *   Ranjit Noronha: [email protected]
+ *   Lei Chai      : [email protected]
+ *   Weikuan Yu    : [email protected]
+ *
+ */
 
 /*
  * xdr_rdma.c, XDR implementation using RDMA to move large chunks
@@ -34,85 +45,271 @@
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/kmem.h>
+#include <sys/sdt.h>
+#include <sys/debug.h>
 
 #include <rpc/types.h>
 #include <rpc/xdr.h>
 #include <sys/cmn_err.h>
 #include <rpc/rpc_sztypes.h>
 #include <rpc/rpc_rdma.h>
+#include <sys/sysmacros.h>
 
-static struct xdr_ops *xdrrdma_ops(void);
+static bool_t   xdrrdma_getint32(XDR *, int32_t *);
+static bool_t   xdrrdma_putint32(XDR *, int32_t *);
+static bool_t   xdrrdma_getbytes(XDR *, caddr_t, int);
+static bool_t   xdrrdma_putbytes(XDR *, caddr_t, int);
+uint_t		xdrrdma_getpos(XDR *);
+bool_t		xdrrdma_setpos(XDR *, uint_t);
+static rpc_inline_t *xdrrdma_inline(XDR *, int);
+void		xdrrdma_destroy(XDR *);
+static bool_t   xdrrdma_control(XDR *, int, void *);
+
+struct xdr_ops  xdrrdmablk_ops = {
+	xdrrdma_getbytes,
+	xdrrdma_putbytes,
+	xdrrdma_getpos,
+	xdrrdma_setpos,
+	xdrrdma_inline,
+	xdrrdma_destroy,
+	xdrrdma_control,
+	xdrrdma_getint32,
+	xdrrdma_putint32
+};
+
+struct xdr_ops  xdrrdma_ops = {
+	xdrrdma_getbytes,
+	xdrrdma_putbytes,
+	xdrrdma_getpos,
+	xdrrdma_setpos,
+	xdrrdma_inline,
+	xdrrdma_destroy,
+	xdrrdma_control,
+	xdrrdma_getint32,
+	xdrrdma_putint32
+};
 
 /*
- * A chunk list entry identifies a chunk
- * of opaque data to be moved separately
- * from the rest of the RPC message.
- * xp_min_chunk = 0, is a special case for ENCODING, which means
- * do not chunk the incoming stream of data.
+ * A chunk list entry identifies a chunk of opaque data to be moved
+ * separately from the rest of the RPC message. xp_min_chunk = 0, is a
+ * special case for ENCODING, which means do not chunk the incoming stream of
+ * data.
  */
 
-struct private {
+typedef struct {
 	caddr_t		xp_offp;
 	int		xp_min_chunk;
 	uint_t		xp_flags;	/* Controls setting for rdma xdr */
-	int		xp_buf_size;		/* size of xdr buffer */
-	struct clist	*xp_cl;			/* head of chunk list */
-	struct clist	**xp_cl_next;	/* location to place/find next chunk */
+	int		xp_buf_size;	/* size of xdr buffer */
+	struct clist	*xp_rcl;		/* head of chunk list */
+	struct clist	**xp_rcl_next;	/* location to place/find next chunk */
+	struct clist	*xp_wcl;	/* head of write chunk list */
 	CONN		*xp_conn;	/* connection for chunk data xfer */
-};
+	uint_t		xp_reply_chunk_len;
+	/* used to track length for security modes: integrity/privacy */
+	uint_t		xp_reply_chunk_len_alt;
+} xrdma_private_t;
+
+extern kmem_cache_t *clist_cache;
+
+bool_t
+xdrrdma_getrdmablk(XDR *xdrs, struct clist **rlist, uint_t *sizep,
+    CONN **conn, const uint_t maxsize)
+{
+	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
+	struct clist	*cle = *(xdrp->xp_rcl_next);
+	struct clist	*cls = *(xdrp->xp_rcl_next);
+	struct clist	*rdclist = NULL, *prev = NULL;
+	bool_t		retval = TRUE;
+	uint32_t	cur_offset = 0;
+	uint32_t	total_segments = 0;
+	uint32_t	actual_segments = 0;
+	uint32_t	alen;
+	uint_t		total_len;
+
+	ASSERT(xdrs->x_op != XDR_FREE);
+
+	/*
+	 * first deal with the length since xdr bytes are counted
+	 */
+	if (!xdr_u_int(xdrs, sizep)) {
+		DTRACE_PROBE(xdr__e__getrdmablk_sizep_fail);
+		return (FALSE);
+	}
+	total_len = *sizep;
+	if (total_len > maxsize) {
+		DTRACE_PROBE2(xdr__e__getrdmablk_bad_size,
+		    int, total_len, int, maxsize);
+		return (FALSE);
+	}
+	(*conn) = xdrp->xp_conn;
+
+	/*
+	 * if no data we are done
+	 */
+	if (total_len == 0)
+		return (TRUE);
+
+	while (cle) {
+		total_segments++;
+		cle = cle->c_next;
+	}
+
+	cle = *(xdrp->xp_rcl_next);
+
+	/*
+	 * If there was a chunk at the current offset, then setup a read
+	 * chunk list which records the destination address and length
+	 * and will RDMA READ the data in later.
+	 */
+	if (cle == NULL)
+		return (FALSE);
 
+	if (cle->c_xdroff != (xdrp->xp_offp - xdrs->x_base))
+		return (FALSE);
+
+	/*
+	 * Setup the chunk list with appropriate
+	 * address (offset) and length
+	 */
+	for (actual_segments = 0;
+	    actual_segments < total_segments; actual_segments++) {
+		if (total_len <= 0)
+			break;
+		cle->u.c_daddr = (uint64) cur_offset;
+		alen = 0;
+		if (cle->c_len > total_len) {
+			alen = cle->c_len;
+			cle->c_len = total_len;
+		}
+		if (!alen)
+			xdrp->xp_rcl_next = &cle->c_next;
+
+		cur_offset += cle->c_len;
+		total_len -= cle->c_len;
+
+		if ((total_segments - actual_segments - 1) == 0 &&
+		    total_len > 0) {
+			DTRACE_PROBE(krpc__e__xdrrdma_getblk_chunktooshort);
+			retval = FALSE;
+		}
+
+		if ((total_segments - actual_segments - 1) > 0 &&
+		    total_len == 0) {
+			DTRACE_PROBE2(krpc__e__xdrrdma_getblk_toobig,
+			    int, total_segments, int, actual_segments);
+		}
+
+		rdclist = clist_alloc();
+		(*rdclist) = (*cle);
+		if ((*rlist) == NULL)
+			(*rlist) = rdclist;
+		if (prev == NULL)
+			prev = rdclist;
+		else {
+			prev->c_next = rdclist;
+			prev = rdclist;
+		}
+
+		cle = cle->c_next;
+	}
+
+out:
+	if (prev != NULL)
+		prev->c_next = NULL;
+
+	cle = cls;
+	if (alen) {
+		cle->w.c_saddr =
+		    (uint64)(uintptr_t)cle->w.c_saddr + cle->c_len;
+		cle->c_len = alen - cle->c_len;
+	}
+
+	return (retval);
+}
 
 /*
- * The procedure xdrrdma_create initializes a stream descriptor for a
- * memory buffer.
+ * The procedure xdrrdma_create initializes a stream descriptor for a memory
+ * buffer.
  */
 void
 xdrrdma_create(XDR *xdrs, caddr_t addr, uint_t size,
-	int min_chunk, struct clist *cl, enum xdr_op op, CONN *conn)
+    int min_chunk, struct clist *cl, enum xdr_op op, CONN *conn)
 {
-	struct private *xdrp;
-	struct clist *cle;
+	xrdma_private_t *xdrp;
+	struct clist   *cle;
 
 	xdrs->x_op = op;
-	xdrs->x_ops = xdrrdma_ops();
+	xdrs->x_ops = &xdrrdma_ops;
 	xdrs->x_base = addr;
 	xdrs->x_handy = size;
 	xdrs->x_public = NULL;
 
-	xdrp = (struct private *)kmem_zalloc(sizeof (struct private), KM_SLEEP);
+	xdrp = (xrdma_private_t *)kmem_zalloc(sizeof (xrdma_private_t),
+	    KM_SLEEP);
 	xdrs->x_private = (caddr_t)xdrp;
 	xdrp->xp_offp = addr;
 	xdrp->xp_min_chunk = min_chunk;
 	xdrp->xp_flags = 0;
 	xdrp->xp_buf_size = size;
-	xdrp->xp_cl = cl;
+	xdrp->xp_rcl = cl;
+	xdrp->xp_reply_chunk_len = 0;
+	xdrp->xp_reply_chunk_len_alt = 0;
+
 	if (op == XDR_ENCODE && cl != NULL) {
-		/* Find last element in chunk list and set xp_cl_next */
-		for (cle = cl; cle->c_next != NULL; cle = cle->c_next);
-		xdrp->xp_cl_next = &(cle->c_next);
-	} else
-		xdrp->xp_cl_next = &(xdrp->xp_cl);
+		/* Find last element in chunk list and set xp_rcl_next */
+		for (cle = cl; cle->c_next != NULL; cle = cle->c_next)
+			continue;
+
+		xdrp->xp_rcl_next = &(cle->c_next);
+	} else {
+		xdrp->xp_rcl_next = &(xdrp->xp_rcl);
+	}
+
+	xdrp->xp_wcl = NULL;
+
 	xdrp->xp_conn = conn;
-	if (xdrp->xp_min_chunk == 0)
-		xdrp->xp_flags |= RDMA_NOCHUNK;
+	if (xdrp->xp_min_chunk != 0)
+		xdrp->xp_flags |= XDR_RDMA_CHUNK;
 }
 
 /* ARGSUSED */
 void
-xdrrdma_destroy(XDR *xdrs)
+xdrrdma_destroy(XDR * xdrs)
 {
-	(void) kmem_free(xdrs->x_private, sizeof (struct private));
+	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
+
+	if (xdrp == NULL)
+		return;
+
+	if (xdrp->xp_wcl) {
+		if (xdrp->xp_flags & XDR_RDMA_WLIST_REG) {
+			(void) clist_deregister(xdrp->xp_conn,
+			    xdrp->xp_wcl, CLIST_REG_DST);
+			rdma_buf_free(xdrp->xp_conn,
+			    &xdrp->xp_wcl->rb_longbuf);
+		}
+		clist_free(xdrp->xp_wcl);
+	}
+
+	if (xdrp->xp_rcl) {
+		if (xdrp->xp_flags & XDR_RDMA_RLIST_REG) {
+			(void) clist_deregister(xdrp->xp_conn,
+			    xdrp->xp_rcl, CLIST_REG_SOURCE);
+			rdma_buf_free(xdrp->xp_conn,
+			    &xdrp->xp_rcl->rb_longbuf);
+		}
+		clist_free(xdrp->xp_rcl);
+	}
+
+	(void) kmem_free(xdrs->x_private, sizeof (xrdma_private_t));
+	xdrs->x_private = NULL;
 }
 
-struct clist *
-xdrrdma_clist(XDR *xdrs) {
-	return (((struct private *)(xdrs->x_private))->xp_cl);
-}
-
-static bool_t
+static	bool_t
 xdrrdma_getint32(XDR *xdrs, int32_t *int32p)
 {
-	struct private *xdrp = (struct private *)(xdrs->x_private);
+	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
 
 	if ((xdrs->x_handy -= (int)sizeof (int32_t)) < 0)
 		return (FALSE);
@@ -124,10 +321,10 @@
 	return (TRUE);
 }
 
-static bool_t
+static	bool_t
 xdrrdma_putint32(XDR *xdrs, int32_t *int32p)
 {
-	struct private *xdrp = (struct private *)(xdrs->x_private);
+	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
 
 	if ((xdrs->x_handy -= (int)sizeof (int32_t)) < 0)
 		return (FALSE);
@@ -140,72 +337,131 @@
 }
 
 /*
- * DECODE some bytes from an XDR stream
+ * DECODE bytes from XDR stream for rdma.
+ * If the XDR stream contains a read chunk list,
+ * it will go through xdrrdma_getrdmablk instead.
  */
-static bool_t
+static	bool_t
 xdrrdma_getbytes(XDR *xdrs, caddr_t addr, int len)
 {
-	struct private *xdrp = (struct private *)(xdrs->x_private);
-	struct clist *cle = *(xdrp->xp_cl_next);
-	struct clist cl;
-	bool_t  retval = TRUE;
+	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
+	struct clist	*cle = *(xdrp->xp_rcl_next);
+	struct clist	*cls = *(xdrp->xp_rcl_next);
+	struct clist	cl;
+	bool_t		retval = TRUE;
+	uint32_t	total_len = len;
+	uint32_t	cur_offset = 0;
+	uint32_t	total_segments = 0;
+	uint32_t	actual_segments = 0;
+	uint32_t	status;
+	uint32_t	alen;
 
+	while (cle) {
+		total_segments++;
+		cle = cle->c_next;
+	}
+
+	cle = *(xdrp->xp_rcl_next);
 	/*
-	 * If there was a chunk at the current offset
-	 * first record the destination address and length
-	 * in the chunk list that came with the message, then
-	 * RDMA READ the chunk data.
+	 * If there was a chunk at the current offset, then setup a read
+	 * chunk list which records the destination address and length
+	 * and will RDMA READ the data in later.
 	 */
+
 	if (cle != NULL &&
-		cle->c_xdroff == (xdrp->xp_offp - xdrs->x_base)) {
-		cle->c_daddr = (uint64)(uintptr_t)addr;
-		cle->c_len  = len;
-		xdrp->xp_cl_next = &cle->c_next;
+	    cle->c_xdroff == (xdrp->xp_offp - xdrs->x_base)) {
+		for (actual_segments = 0;
+		    actual_segments < total_segments; actual_segments++) {
+			if (total_len <= 0)
+				break;
+			cle->u.c_daddr = (uint64)(uintptr_t)addr + cur_offset;
+			alen = 0;
+			if (cle->c_len > total_len) {
+				alen = cle->c_len;
+				cle->c_len = total_len;
+			}
+			if (!alen)
+				xdrp->xp_rcl_next = &cle->c_next;
+
+			cur_offset += cle->c_len;
+			total_len -= cle->c_len;
+
+			if ((total_segments - actual_segments - 1) == 0 &&
+			    total_len > 0) {
+				DTRACE_PROBE(
+				    krpc__e__xdrrdma_getbytes_chunktooshort);
+				retval = FALSE;
+			}
+
+			if ((total_segments - actual_segments - 1) > 0 &&
+			    total_len == 0) {
+				DTRACE_PROBE2(krpc__e__xdrrdma_getbytes_toobig,
+				    int, total_segments, int, actual_segments);
+			}
 
-		/*
-		 * RDMA READ the chunk data from the remote end.
-		 * First prep the destination buffer by registering
-		 * it, then RDMA READ the chunk data. Since we are
-		 * doing streaming memory, sync the destination buffer
-		 * to CPU and deregister the buffer.
-		 */
-		if (xdrp->xp_conn == NULL) {
-			return (FALSE);
-		}
+			/*
+			 * RDMA READ the chunk data from the remote end.
+			 * First prep the destination buffer by registering
+			 * it, then RDMA READ the chunk data. Since we are
+			 * doing streaming memory, sync the destination
+			 * buffer to CPU and deregister the buffer.
+			 */
+			if (xdrp->xp_conn == NULL) {
+				return (FALSE);
+			}
+			cl = *cle;
+			cl.c_next = NULL;
+			if (clist_register(xdrp->xp_conn, &cl, CLIST_REG_DST)
+			    != RDMA_SUCCESS) {
+				return (FALSE);
+			}
+			cle->c_dmemhandle = cl.c_dmemhandle;
+			cle->c_dsynchandle = cl.c_dsynchandle;
 
-		cl = *cle;
-		cl.c_next = NULL;
-		if (clist_register(xdrp->xp_conn, &cl, 0) != RDMA_SUCCESS) {
-			return (FALSE);
+			/*
+			 * Now read the chunk in
+			 */
+			if ((total_segments - actual_segments - 1) == 0 ||
+			    total_len == 0) {
+				status = RDMA_READ(xdrp->xp_conn, &cl, WAIT);
+			} else {
+				status = RDMA_READ(xdrp->xp_conn, &cl, NOWAIT);
+			}
+			if (status != RDMA_SUCCESS) {
+				DTRACE_PROBE1(
+				    krpc__i__xdrrdma_getblk_readfailed,
+				    int, status);
+				retval = FALSE;
+				goto out;
+			}
+			cle = cle->c_next;
 		}
 
 		/*
-		 * Now read the chunk in
-		 */
-		if (RDMA_READ(xdrp->xp_conn, &cl, WAIT) != RDMA_SUCCESS) {
-#ifdef DEBUG
-			cmn_err(CE_WARN,
-				"xdrrdma_getbytes: RDMA_READ failed\n");
-#endif
-			retval = FALSE;
-			goto out;
-		}
-		/*
 		 * sync the memory for cpu
 		 */
+		cl = *cls;
+		cl.c_next = NULL;
+		cl.c_len = cur_offset;
 		if (clist_syncmem(xdrp->xp_conn, &cl, 0) != RDMA_SUCCESS) {
 			retval = FALSE;
-			goto out;
 		}
-
 out:
 		/*
 		 * Deregister the chunks
 		 */
-		(void) clist_deregister(xdrp->xp_conn, &cl, 0);
+		cle = cls;
+		cl = *cle;
+		cl.c_next = NULL;
+		cl.c_len = cur_offset;
+		(void) clist_deregister(xdrp->xp_conn, &cl, CLIST_REG_DST);
+		if (alen) {
+			cle->w.c_saddr =
+			    (uint64)(uintptr_t)cle->w.c_saddr + cle->c_len;
+			cle->c_len = alen - cle->c_len;
+		}
 		return (retval);
 	}
-
 	if ((xdrs->x_handy -= len) < 0)
 		return (FALSE);
 
@@ -216,63 +472,46 @@
 }
 
 /*
- * ENCODE some bytes into an XDR stream
- * xp_min_chunk = 0, means the stream of bytes contain no chunks
- * to seperate out, and if the bytes do not fit in the supplied
- * buffer, grow the buffer and free the old buffer.
+ * ENCODE some bytes into an XDR stream xp_min_chunk = 0, means the stream of
+ * bytes contain no chunks to separate out, and if the bytes do not fit in
+ * the supplied buffer, grow the buffer and free the old buffer.
  */
-static bool_t
+static	bool_t
 xdrrdma_putbytes(XDR *xdrs, caddr_t addr, int len)
 {
-	struct private *xdrp = (struct private *)(xdrs->x_private);
-	struct clist *clzero = xdrp->xp_cl;
-
+	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
 	/*
-	 * If this chunk meets the minimum chunk size
-	 * then don't encode it.  Just record its address
-	 * and length in a chunk list entry so that it
-	 * can be moved separately via RDMA.
+	 * Is this stream accepting chunks?
+	 * If so, do either of the two following conditions exist?
+	 * - length of bytes to encode is greater than the min chunk size?
+	 * - remaining space in this stream is shorter than length of
+	 *   bytes to encode?
+	 *
+	 * If the above exists, then create a chunk for this encoding
+	 * and save the addresses, etc.
 	 */
-	if (!(xdrp->xp_flags & RDMA_NOCHUNK) && xdrp->xp_min_chunk != 0 &&
-	    len >= xdrp->xp_min_chunk) {
-		struct clist *cle;
-		int offset = xdrp->xp_offp - xdrs->x_base;
+	if (xdrp->xp_flags & XDR_RDMA_CHUNK &&
+	    ((xdrp->xp_min_chunk != 0 &&
+	    len >= xdrp->xp_min_chunk) ||
+	    (xdrs->x_handy - len  < 0))) {
+		struct clist	*cle;
+		int		offset = xdrp->xp_offp - xdrs->x_base;
 
-		cle = (struct clist *)kmem_zalloc(sizeof (struct clist),
-				KM_SLEEP);
+		cle = clist_alloc();
 		cle->c_xdroff = offset;
-		cle->c_len  = len;
-		cle->c_saddr = (uint64)(uintptr_t)addr;
+		cle->c_len = len;
+		cle->w.c_saddr = (uint64)(uintptr_t)addr;
 		cle->c_next = NULL;
 
-		*(xdrp->xp_cl_next) = cle;
-		xdrp->xp_cl_next = &(cle->c_next);
+		*(xdrp->xp_rcl_next) = cle;
+		xdrp->xp_rcl_next = &(cle->c_next);
 
 		return (TRUE);
 	}
-
+	/* Is there enough space to encode what is left? */
 	if ((xdrs->x_handy -= len) < 0) {
-		if (xdrp->xp_min_chunk == 0) {
-			int  newbuflen, encodelen;
-			caddr_t newbuf;
-
-			xdrs->x_handy += len;
-			encodelen = xdrp->xp_offp - xdrs->x_base;
-			newbuflen = xdrp->xp_buf_size + len;
-			newbuf = kmem_zalloc(newbuflen, KM_SLEEP);
-			bcopy(xdrs->x_base, newbuf, encodelen);
-			(void) kmem_free(xdrs->x_base, xdrp->xp_buf_size);
-			xdrs->x_base = newbuf;
-			xdrp->xp_offp = newbuf + encodelen;
-			xdrp->xp_buf_size = newbuflen;
-			if (xdrp->xp_min_chunk == 0 && clzero->c_xdroff == 0) {
-				clzero->c_len = newbuflen;
-				clzero->c_saddr = (uint64)(uintptr_t)newbuf;
-			}
-		} else
-			return (FALSE);
+		return (FALSE);
 	}
-
 	bcopy(addr, xdrp->xp_offp, len);
 	xdrp->xp_offp += len;
 
@@ -282,7 +521,7 @@
 uint_t
 xdrrdma_getpos(XDR *xdrs)
 {
-	struct private *xdrp = (struct private *)(xdrs->x_private);
+	xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
 
 	return ((uint_t)((uintptr_t)xdrp->xp_offp - (uintptr_t)xdrs->x_base));
 }
@@ -290,11 +529,11 @@
 bool_t
 xdrrdma_setpos(XDR *xdrs, uint_t pos)
 {
-	struct private *xdrp = (struct private *)(xdrs->x_private);
+	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
 
-	caddr_t newaddr = xdrs->x_base + pos;
-	caddr_t lastaddr = xdrp->xp_offp + xdrs->x_handy;
-	ptrdiff_t diff;
+	caddr_t		newaddr = xdrs->x_base + pos;
+	caddr_t		lastaddr = xdrp->xp_offp + xdrs->x_handy;
+	ptrdiff_t	diff;
 
 	if (newaddr > lastaddr)
 		return (FALSE);
@@ -310,39 +549,48 @@
 static rpc_inline_t *
 xdrrdma_inline(XDR *xdrs, int len)
 {
-	rpc_inline_t *buf = NULL;
-	struct private *xdrp = (struct private *)(xdrs->x_private);
-	struct clist *cle = *(xdrp->xp_cl_next);
+	rpc_inline_t	*buf = NULL;
+	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
+	struct clist	*cle = *(xdrp->xp_rcl_next);
 
 	if (xdrs->x_op == XDR_DECODE) {
 		/*
-		 * Since chunks aren't in-line, check to see whether
-		 * there is a chunk in the inline range.
+		 * Since chunks aren't in-line, check to see whether there is
+		 * a chunk in the inline range.
 		 */
 		if (cle != NULL &&
-			cle->c_xdroff <= (xdrp->xp_offp - xdrs->x_base + len))
+		    cle->c_xdroff <= (xdrp->xp_offp - xdrs->x_base + len))
+			return (NULL);
+	}
+
+	/* LINTED pointer alignment */
+	buf = (rpc_inline_t *)xdrp->xp_offp;
+	if (!IS_P2ALIGNED(buf, sizeof (int32_t)))
 		return (NULL);
-	}
 
 	if ((xdrs->x_handy < len) || (xdrp->xp_min_chunk != 0 &&
 	    len >= xdrp->xp_min_chunk)) {
 		return (NULL);
 	} else {
 		xdrs->x_handy -= len;
-		/* LINTED pointer alignment */
-		buf = (rpc_inline_t *)xdrp->xp_offp;
 		xdrp->xp_offp += len;
 		return (buf);
 	}
 }
 
-static bool_t
+static	bool_t
 xdrrdma_control(XDR *xdrs, int request, void *info)
 {
-	int32_t *int32p;
-	int len;
-	uint_t in_flags;
-	struct private *xdrp = (struct private *)(xdrs->x_private);
+	int32_t		*int32p;
+	int		len, i;
+	uint_t		in_flags;
+	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
+	rdma_chunkinfo_t *rcip = NULL;
+	rdma_wlist_conn_info_t *rwcip = NULL;
+	rdma_chunkinfo_lengths_t *rcilp = NULL;
+	struct uio *uiop;
+	struct clist	*rwl = NULL;
+	struct clist	*prev = NULL;
 
 	switch (request) {
 	case XDR_PEEK:
@@ -370,10 +618,10 @@
 
 		return (TRUE);
 
-	case XDR_RDMASET:
+	case XDR_RDMA_SET_FLAGS:
 		/*
-		 * Set the flags provided in the *info in xp_flags for rdma xdr
-		 * stream control.
+		 * Set the flags provided in the *info in xp_flags for rdma
+		 * xdr stream control.
 		 */
 		int32p = (int32_t *)info;
 		in_flags = (uint_t)(*int32p);
@@ -381,7 +629,7 @@
 		xdrp->xp_flags |= in_flags;
 		return (TRUE);
 
-	case XDR_RDMAGET:
+	case XDR_RDMA_GET_FLAGS:
 		/*
 		 * Get the flags provided in xp_flags return through *info
 		 */
@@ -390,63 +638,597 @@
 		*int32p = (int32_t)xdrp->xp_flags;
 		return (TRUE);
 
+	case XDR_RDMA_GET_CHUNK_LEN:
+		rcilp = (rdma_chunkinfo_lengths_t *)info;
+		rcilp->rcil_len = xdrp->xp_reply_chunk_len;
+		rcilp->rcil_len_alt = xdrp->xp_reply_chunk_len_alt;
+
+		return (TRUE);
+
+	case XDR_RDMA_ADD_CHUNK:
+		/*
+		 * Store wlist information
+		 */
+
+		rcip = (rdma_chunkinfo_t *)info;
+
+		switch (rcip->rci_type) {
+		case RCI_WRITE_UIO_CHUNK:
+			xdrp->xp_reply_chunk_len_alt += rcip->rci_len;
+
+			if (rcip->rci_len < xdrp->xp_min_chunk) {
+				xdrp->xp_wcl = NULL;
+				*(rcip->rci_clpp) = NULL;
+				return (TRUE);
+			}
+			uiop = rcip->rci_a.rci_uiop;
+
+			for (i = 0; i < uiop->uio_iovcnt; i++) {
+				rwl = clist_alloc();
+				rwl->c_len = uiop->uio_iov[i].iov_len;
+				rwl->u.c_daddr =
+				    (uint64)(uintptr_t)
+				    (uiop->uio_iov[i].iov_base);
+				/*
+				 * if userspace address, put adspace ptr in
+				 * clist. If not, then do nothing since it's
+				 * already set to NULL (from kmem_zalloc)
+				 */
+				if (uiop->uio_segflg == UIO_USERSPACE) {
+					rwl->c_adspc = ttoproc(curthread)->p_as;
+				}
+
+				if (prev == NULL)
+					prev = rwl;
+				else {
+					prev->c_next = rwl;
+					prev = rwl;
+				}
+			}
+
+			rwl->c_next = NULL;
+			xdrp->xp_wcl = rwl;
+			*(rcip->rci_clpp) = rwl;
+
+			break;
+
+		case RCI_WRITE_ADDR_CHUNK:
+			rwl = clist_alloc();
+
+			rwl->c_len = rcip->rci_len;
+			rwl->u.c_daddr3 = rcip->rci_a.rci_addr;
+			rwl->c_next = NULL;
+			xdrp->xp_reply_chunk_len_alt += rcip->rci_len;
+
+			xdrp->xp_wcl = rwl;
+			*(rcip->rci_clpp) = rwl;
+
+			break;
+
+		case RCI_REPLY_CHUNK:
+			xdrp->xp_reply_chunk_len += rcip->rci_len;
+			break;
+		}
+		return (TRUE);
+
+	case XDR_RDMA_GET_WLIST:
+		*((struct clist **)info) = xdrp->xp_wcl;
+		return (TRUE);
+
+	case XDR_RDMA_SET_WLIST:
+		xdrp->xp_wcl = (struct clist *)info;
+		return (TRUE);
+
+	case XDR_RDMA_GET_RLIST:
+		*((struct clist **)info) = xdrp->xp_rcl;
+		return (TRUE);
+
+	case XDR_RDMA_GET_WCINFO:
+		rwcip = (rdma_wlist_conn_info_t *)info;
+
+		rwcip->rwci_wlist = xdrp->xp_wcl;
+		rwcip->rwci_conn = xdrp->xp_conn;
+
+		return (TRUE);
+
 	default:
 		return (FALSE);
 	}
 }
 
-static struct xdr_ops *
-xdrrdma_ops(void)
-{
-	static struct xdr_ops ops;
-
-	if (ops.x_getint32 == NULL) {
-		ops.x_getbytes = xdrrdma_getbytes;
-		ops.x_putbytes = xdrrdma_putbytes;
-		ops.x_getpostn = xdrrdma_getpos;
-		ops.x_setpostn = xdrrdma_setpos;
-		ops.x_inline = xdrrdma_inline;
-		ops.x_destroy = xdrrdma_destroy;
-		ops.x_control = xdrrdma_control;
-		ops.x_getint32 = xdrrdma_getint32;
-		ops.x_putint32 = xdrrdma_putint32;
-	}
-	return (&ops);
-}
+bool_t xdr_do_clist(XDR *, clist **);
 
 /*
- * Not all fields in struct clist are interesting to the
- * RPC over RDMA protocol. Only XDR the interesting fields.
+ * Not all fields in struct clist are interesting to the RPC over RDMA
+ * protocol. Only XDR the interesting fields.
  */
 bool_t
 xdr_clist(XDR *xdrs, clist *objp)
 {
-
 	if (!xdr_uint32(xdrs, &objp->c_xdroff))
 		return (FALSE);
+	if (!xdr_uint32(xdrs, &objp->c_smemhandle.mrc_rmr))
+		return (FALSE);
 	if (!xdr_uint32(xdrs, &objp->c_len))
 		return (FALSE);
-	if (!xdr_uint32(xdrs, &objp->c_smemhandle.mrc_rmr))
+	if (!xdr_uint64(xdrs, &objp->w.c_saddr))
 		return (FALSE);
-	if (!xdr_uint64(xdrs, &objp->c_saddr))
-		return (FALSE);
-	if (!xdr_pointer(xdrs, (char **)&objp->c_next, sizeof (clist),
-		(xdrproc_t)xdr_clist))
+	if (!xdr_do_clist(xdrs, &objp->c_next))
 		return (FALSE);
 	return (TRUE);
 }
 
+/*
+ * The following two functions are forms of xdr_pointer()
+ * and xdr_reference(). Since the generic versions just
+ * kmem_alloc() a new clist, we actually want to use the
+ * rdma_clist kmem_cache.
+ */
+
+/*
+ * Generate or free a clist structure from the
+ * kmem_cache "rdma_clist"
+ */
 bool_t
-xdr_do_clist(XDR *xdrs, clist **clp)
+xdr_ref_clist(XDR *xdrs, caddr_t *pp)
 {
-	return (xdr_pointer(xdrs, (char **)clp,
-		sizeof (clist), (xdrproc_t)xdr_clist));
+	caddr_t loc = *pp;
+	bool_t stat;
+
+	if (loc == NULL) {
+		switch (xdrs->x_op) {
+		case XDR_FREE:
+			return (TRUE);
+
+		case XDR_DECODE:
+			*pp = loc = (caddr_t)clist_alloc();
+			break;
+
+		case XDR_ENCODE:
+			ASSERT(loc);
+			break;
+		}
+	}
+
+	stat = xdr_clist(xdrs, (struct clist *)loc);
+
+	if (xdrs->x_op == XDR_FREE) {
+		kmem_cache_free(clist_cache, loc);
+		*pp = NULL;
+	}
+	return (stat);
+}
+
+/*
+ * XDR a pointer to a possibly recursive clist. This differs
+ * with xdr_reference in that it can serialize/deserialiaze
+ * trees correctly.
+ *
+ *  What is sent is actually a union:
+ *
+ *  union object_pointer switch (boolean b) {
+ *  case TRUE: object_data data;
+ *  case FALSE: void nothing;
+ *  }
+ *
+ * > objpp: Pointer to the pointer to the object.
+ *
+ */
+
+bool_t
+xdr_do_clist(XDR *xdrs, clist **objpp)
+{
+	bool_t more_data;
+
+	more_data = (*objpp != NULL);
+	if (!xdr_bool(xdrs, &more_data))
+		return (FALSE);
+	if (!more_data) {
+		*objpp = NULL;
+		return (TRUE);
+	}
+	return (xdr_ref_clist(xdrs, (caddr_t *)objpp));
 }
 
 uint_t
 xdr_getbufsize(XDR *xdrs)
 {
-	struct private *xdrp = (struct private *)(xdrs->x_private);
+	xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
 
 	return ((uint_t)xdrp->xp_buf_size);
 }
+
+/* ARGSUSED */
+bool_t
+xdr_encode_rlist_svc(XDR *xdrs, clist *rlist)
+{
+	bool_t	vfalse = FALSE;
+
+	ASSERT(rlist == NULL);
+	return (xdr_bool(xdrs, &vfalse));
+}
+
+bool_t
+xdr_encode_wlist(XDR *xdrs, clist *w)
+{
+	bool_t		vfalse = FALSE, vtrue = TRUE;
+	int		i;
+	uint_t		num_segment = 0;
+	struct clist	*cl;
+
+	/* does a wlist exist? */
+	if (w == NULL) {
+		return (xdr_bool(xdrs, &vfalse));
+	}
+	/* Encode N consecutive segments, 1, N, HLOO, ..., HLOO, 0 */
+	if (!xdr_bool(xdrs, &vtrue))
+		return (FALSE);
+
+	for (cl = w; cl != NULL; cl = cl->c_next) {
+		num_segment++;
+	}
+
+	if (!xdr_uint32(xdrs, &num_segment))
+		return (FALSE);
+	for (i = 0; i < num_segment; i++) {
+		if (!xdr_uint32(xdrs, &w->c_dmemhandle.mrc_rmr))
+			return (FALSE);
+
+		if (!xdr_uint32(xdrs, &w->c_len))
+			return (FALSE);
+
+		if (!xdr_uint64(xdrs, &w->u.c_daddr))
+			return (FALSE);
+
+		w = w->c_next;
+	}
+
+	if (!xdr_bool(xdrs, &vfalse))
+		return (FALSE);
+
+	return (TRUE);
+}
+
+
+/*
+ * Conditionally decode a RDMA WRITE chunk list from XDR stream.
+ *
+ * If the next boolean in the XDR stream is false there is no
+ * RDMA WRITE chunk list present. Otherwise iterate over the
+ * array and for each entry: allocate a struct clist and decode.
+ * Pass back an indication via wlist_exists if we have seen a
+ * RDMA WRITE chunk list.
+ */
+bool_t
+xdr_decode_wlist(XDR *xdrs, struct clist **w, bool_t *wlist_exists)
+{
+	struct clist	*tmp;
+	bool_t		more = FALSE;
+	uint32_t	seg_array_len;
+	uint32_t	i;
+
+	if (!xdr_bool(xdrs, &more))
+		return (FALSE);
+
+	/* is there a wlist? */
+	if (more == FALSE) {
+		*wlist_exists = FALSE;
+		return (TRUE);
+	}
+	*wlist_exists = TRUE;
+
+	if (!xdr_uint32(xdrs, &seg_array_len))
+		return (FALSE);
+
+	tmp = *w = clist_alloc();
+	for (i = 0; i < seg_array_len; i++) {
+		if (!xdr_uint32(xdrs, &tmp->c_dmemhandle.mrc_rmr))
+			return (FALSE);
+		if (!xdr_uint32(xdrs, &tmp->c_len))
+			return (FALSE);
+		if (!xdr_uint64(xdrs, &tmp->u.c_daddr))
+			return (FALSE);
+		if (i < seg_array_len - 1) {
+			tmp->c_next = clist_alloc();
+			tmp = tmp->c_next;
+		} else {
+			tmp->c_next = NULL;
+		}
+	}
+
+	more = FALSE;
+	if (!xdr_bool(xdrs, &more))
+		return (FALSE);
+
+	return (TRUE);
+}
+
+/*
+ * Server side RDMA WRITE list decode.
+ * XDR context is memory ops
+ */
+bool_t
+xdr_decode_wlist_svc(XDR *xdrs, struct clist **wclp, bool_t *wwl,
+    uint32_t *total_length, CONN *conn)
+{
+	struct clist	*first, *ncl;
+	char		*memp;
+	uint32_t	num_wclist;
+	uint32_t	wcl_length = 0;
+	uint32_t	i;
+	bool_t		more = FALSE;
+
+	*wclp = NULL;
+	*wwl = FALSE;
+	*total_length = 0;
+
+	if (!xdr_bool(xdrs, &more)) {
+		return (FALSE);
+	}
+
+	if (more == FALSE) {
+		return (TRUE);
+	}
+
+	*wwl = TRUE;
+
+	if (!xdr_uint32(xdrs, &num_wclist)) {
+		DTRACE_PROBE(krpc__e__xdrrdma__wlistsvc__listlength);
+		return (FALSE);
+	}
+
+	first = ncl = clist_alloc();
+
+	for (i = 0; i < num_wclist; i++) {
+		if (!xdr_uint32(xdrs, &ncl->c_dmemhandle.mrc_rmr))
+			goto err_out;
+		if (!xdr_uint32(xdrs, &ncl->c_len))
+			goto err_out;
+		if (!xdr_uint64(xdrs, &ncl->u.c_daddr))
+			goto err_out;
+
+		if (ncl->c_len > MAX_SVC_XFER_SIZE) {
+			DTRACE_PROBE(
+			    krpc__e__xdrrdma__wlistsvc__chunklist_toobig);
+			ncl->c_len = MAX_SVC_XFER_SIZE;
+		}
+
+		wcl_length += ncl->c_len;
+
+		if (i < num_wclist - 1) {
+			ncl->c_next = clist_alloc();
+			ncl = ncl->c_next;
+		}
+	}
+
+	if (!xdr_bool(xdrs, &more))
+		goto err_out;
+
+	first->rb_longbuf.type = RDMA_LONG_BUFFER;
+	first->rb_longbuf.len =
+	    wcl_length > WCL_BUF_LEN ? wcl_length : WCL_BUF_LEN;
+
+	if (rdma_buf_alloc(conn, &first->rb_longbuf)) {
+		clist_free(first);
+		return (FALSE);
+	}
+
+	memp = first->rb_longbuf.addr;
+
+	ncl = first;
+	for (i = 0; i < num_wclist; i++) {
+		ncl->w.c_saddr3 = (caddr_t)memp;
+		memp += ncl->c_len;
+		ncl = ncl->c_next;
+	}
+
+	*wclp = first;
+	*total_length = wcl_length;
+	return (TRUE);
+
+err_out:
+	clist_free(first);
+	return (FALSE);
+}
+
+/*
+ * XDR decode the long reply write chunk.
+ */
+bool_t
+xdr_decode_reply_wchunk(XDR *xdrs, struct clist **clist)
+{
+	bool_t		have_rchunk = FALSE;
+	struct clist	*first = NULL, *ncl = NULL;
+	uint32_t	num_wclist;
+	uint32_t	i;
+
+	if (!xdr_bool(xdrs, &have_rchunk))
+		return (FALSE);
+
+	if (have_rchunk == FALSE)
+		return (TRUE);
+
+	if (!xdr_uint32(xdrs, &num_wclist)) {
+		DTRACE_PROBE(krpc__e__xdrrdma__replywchunk__listlength);
+		return (FALSE);
+	}
+
+	if (num_wclist == 0) {
+		return (FALSE);
+	}
+
+	first = ncl = clist_alloc();
+
+	for (i = 0; i < num_wclist; i++) {
+		if (!xdr_uint32(xdrs, &ncl->c_dmemhandle.mrc_rmr))
+			goto err_out;
+		if (!xdr_uint32(xdrs, &ncl->c_len))
+			goto err_out;
+		if (!xdr_uint64(xdrs, &ncl->u.c_daddr))
+			goto err_out;
+
+		if (ncl->c_len > MAX_SVC_XFER_SIZE) {
+			DTRACE_PROBE(
+			    krpc__e__xdrrdma__replywchunk__chunklist_toobig);
+			ncl->c_len = MAX_SVC_XFER_SIZE;
+		}
+		if (!(ncl->c_dmemhandle.mrc_rmr &&
+		    (ncl->c_len > 0) && ncl->u.c_daddr))
+			DTRACE_PROBE(
+			    krpc__e__xdrrdma__replywchunk__invalid_segaddr);
+
+		if (i > 0) {
+			ncl->c_next = clist_alloc();
+			ncl = ncl->c_next;
+		}
+	}
+	*clist = first;
+	return (TRUE);
+
+err_out:
+	clist_free(first);
+	return (FALSE);
+}
+
+
+bool_t
+xdr_encode_reply_wchunk(XDR *xdrs,
+    struct clist *cl_longreply, uint32_t seg_array_len)
+{
+	int		i;
+	bool_t		long_reply_exists = TRUE;
+	uint32_t	length;
+	uint64		offset;
+
+	if (seg_array_len > 0) {
+		if (!xdr_bool(xdrs, &long_reply_exists))
+			return (FALSE);
+		if (!xdr_uint32(xdrs, &seg_array_len))
+			return (FALSE);
+
+		for (i = 0; i < seg_array_len; i++) {
+			if (!cl_longreply)
+				return (FALSE);
+			length = cl_longreply->c_len;
+			offset = (uint64) cl_longreply->u.c_daddr;
+
+			if (!xdr_uint32(xdrs,
+			    &cl_longreply->c_dmemhandle.mrc_rmr))
+				return (FALSE);
+			if (!xdr_uint32(xdrs, &length))
+				return (FALSE);
+			if (!xdr_uint64(xdrs, &offset))
+				return (FALSE);
+			cl_longreply = cl_longreply->c_next;
+		}
+	} else {
+		long_reply_exists = FALSE;
+		if (!xdr_bool(xdrs, &long_reply_exists))
+			return (FALSE);
+	}
+	return (TRUE);
+}
+bool_t
+xdrrdma_read_from_client(struct clist **rlist, CONN **conn, uint_t count)
+{
+	struct clist	*rdclist;
+	struct clist	cl;
+	uint_t		total_len = 0;
+	uint32_t	status;
+	bool_t		retval = TRUE;
+
+	(*rlist)->rb_longbuf.type = RDMA_LONG_BUFFER;
+	(*rlist)->rb_longbuf.len =
+	    count > RCL_BUF_LEN ? count : RCL_BUF_LEN;
+
+	if (rdma_buf_alloc(*conn, &(*rlist)->rb_longbuf)) {
+		return (FALSE);
+	}
+
+	for (rdclist = *rlist;
+	    rdclist != NULL; rdclist = rdclist->c_next) {
+		total_len += rdclist->c_len;
+#if (defined(OBJ32)||defined(DEBUG32))
+		rdclist->u.c_daddr3 =
+		    (caddr_t)((char *)(*rlist)->rb_longbuf.addr +
+		    (uint32) rdclist->u.c_daddr3);
+#else
+		rdclist->u.c_daddr3 =
+		    (caddr_t)((char *)(*rlist)->rb_longbuf.addr +
+		    (uint64) rdclist->u.c_daddr);
+
+#endif
+		cl = (*rdclist);
+		cl.c_next = NULL;
+
+		if (clist_register(*conn, &cl, CLIST_REG_DST) != RDMA_SUCCESS) {
+			rdma_buf_free(*conn, &(*rlist)->rb_longbuf);
+			DTRACE_PROBE(
+			    krpc__e__xdrrdma__readfromclient__clist__reg);
+			return (FALSE);
+		}
+
+		DTRACE_PROBE1(krpc__i__xdrrdma__readfromclient__buflen,
+		    int, rdclist->c_len);
+
+		/*
+		 * Now read the chunk in
+		 */
+		if (rdclist->c_next == NULL) {
+			status = RDMA_READ(*conn, &cl, WAIT);
+		} else {
+			status = RDMA_READ(*conn, &cl, NOWAIT);
+		}
+		if (status != RDMA_SUCCESS) {
+			DTRACE_PROBE(
+			    krpc__e__xdrrdma__readfromclient__readfailed);
+			rdma_buf_free(*conn, &(*rlist)->rb_longbuf);
+			return (FALSE);
+		}
+	}
+
+	cl = (*(*rlist));
+	cl.c_next = NULL;
+	cl.c_len = total_len;
+	if (clist_syncmem(*conn, &cl, 0) != RDMA_SUCCESS) {
+		retval = FALSE;
+	}
+	return (retval);
+}
+
+bool_t
+xdrrdma_free_clist(CONN *conn, struct clist *clp)
+{
+	rdma_buf_free(conn, &clp->rb_longbuf);
+	clist_free(clp);
+	return (TRUE);
+}
+
+bool_t
+xdrrdma_send_read_data(XDR *xdrs, struct clist *wcl)
+{
+	int status;
+	xrdma_private_t	*xdrp = (xrdma_private_t *)(xdrs->x_private);
+	struct xdr_ops *xops = xdrrdma_xops();
+
+	/* caller is doing a sizeof */
+	if (xdrs->x_ops != &xdrrdma_ops || xdrs->x_ops == xops)
+		return (TRUE);
+
+	status = clist_register(xdrp->xp_conn, wcl, CLIST_REG_SOURCE);
+	if (status != RDMA_SUCCESS) {
+		return (FALSE);
+	}
+
+	status = clist_syncmem(xdrp->xp_conn, wcl, CLIST_REG_SOURCE);
+	if (status != RDMA_SUCCESS) {
+		return (FALSE);
+	}
+
+	status = RDMA_WRITE(xdrp->xp_conn, wcl, WAIT);
+	if (status != RDMA_SUCCESS) {
+		return (FALSE);
+	}
+
+	return (TRUE);
+}
--- a/usr/src/uts/common/rpc/xdrrdma_sizeof.c	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/common/rpc/xdrrdma_sizeof.c	Thu Aug 21 18:01:07 2008 -0500
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,26 +19,25 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <rpc/types.h>
 #include <rpc/xdr.h>
 #include <sys/types.h>
+#include <sys/sdt.h>
 #include <rpc/auth.h>
 #include <rpc/rpc_rdma.h>
 
-static struct xdr_ops *xdrrdma_xops(void);
-
 struct private {
 	int	min_chunk;
 	uint_t	flags;			/* controls setting for rdma xdr */
 	int	num_chunk;
 	caddr_t	inline_buf;		/* temporary buffer for xdr inlining */
 	int	inline_len;		/* inline buffer length */
+	uint_t	xp_reply_chunk_len;
+	uint_t	xp_reply_chunk_len_alt;
 };
 
 /* ARGSUSED */
@@ -60,7 +58,7 @@
 	 * min_chunk = 0, means that the stream of bytes, to estimate size of,
 	 * contains no chunks to seperate out. See xdrrdma_putbytes()
 	 */
-	if (len < xdrp->min_chunk || (xdrp->flags & RDMA_NOCHUNK)) {
+	if (len < xdrp->min_chunk || !(xdrp->flags & XDR_RDMA_CHUNK)) {
 		xdrs->x_handy += len;
 		return (TRUE);
 	}
@@ -68,6 +66,7 @@
 	 * Chunk item. No impact on xdr size.
 	 */
 	xdrp->num_chunk++;
+
 	return (TRUE);
 }
 
@@ -91,10 +90,12 @@
 {
 	int32_t *int32p;
 	uint_t in_flags;
+	rdma_chunkinfo_t *rcip = NULL;
+	rdma_chunkinfo_lengths_t *rcilp = NULL;
 	struct private *xdrp = (struct private *)xdrs->x_private;
 
 	switch (request) {
-	case XDR_RDMASET:
+	case XDR_RDMA_SET_FLAGS:
 		/*
 		 * Set the flags provided in the *info in xp_flags for rdma xdr
 		 * stream control.
@@ -105,7 +106,7 @@
 		xdrp->flags = in_flags;
 		return (TRUE);
 
-	case XDR_RDMAGET:
+	case XDR_RDMA_GET_FLAGS:
 		/*
 		 * Get the flags provided in xp_flags return through *info
 		 */
@@ -114,6 +115,31 @@
 		*int32p = (int32_t)xdrp->flags;
 		return (TRUE);
 
+	case XDR_RDMA_GET_CHUNK_LEN:
+		rcilp = (rdma_chunkinfo_lengths_t *)info;
+		rcilp->rcil_len = xdrp->xp_reply_chunk_len;
+		rcilp->rcil_len_alt = xdrp->xp_reply_chunk_len_alt;
+
+		return (TRUE);
+
+	case XDR_RDMA_ADD_CHUNK:
+		rcip = (rdma_chunkinfo_t *)info;
+
+		switch (rcip->rci_type) {
+		case RCI_WRITE_UIO_CHUNK:
+			xdrp->xp_reply_chunk_len_alt += rcip->rci_len;
+			break;
+
+		case RCI_WRITE_ADDR_CHUNK:
+			xdrp->xp_reply_chunk_len_alt += rcip->rci_len;
+			break;
+
+		case RCI_REPLY_CHUNK:
+			xdrp->xp_reply_chunk_len += rcip->rci_len;
+			break;
+		}
+		return (TRUE);
+
 	default:
 		return (FALSE);
 	}
@@ -187,14 +213,18 @@
 	xdrp = (struct private *)xdrs->x_private;
 	xdrp->min_chunk = min_chunk;
 	xdrp->flags = 0;
-	if (xdrp->min_chunk == 0)
-		xdrp->flags |= RDMA_NOCHUNK;
+	if (xdrp->min_chunk != 0)
+		xdrp->flags |= XDR_RDMA_CHUNK;
+
+	xdrp->xp_reply_chunk_len = 0;
+	xdrp->xp_reply_chunk_len_alt = 0;
 
 	return (TRUE);
 }
 
 unsigned int
-xdrrdma_sizeof(xdrproc_t func, void *data, int min_chunk)
+xdrrdma_sizeof(xdrproc_t func, void *data, int min_chunk,
+    uint_t *reply_size, uint_t *reply_size_alt)
 {
 	XDR x;
 	struct xdr_ops ops;
@@ -207,6 +237,10 @@
 	stat = func(&x, data);
 	xdrp = (struct private *)x.x_private;
 	if (xdrp) {
+		if (reply_size != NULL)
+			*reply_size = xdrp->xp_reply_chunk_len;
+		if (reply_size_alt != NULL)
+			*reply_size_alt = xdrp->xp_reply_chunk_len_alt;
 		if (xdrp->inline_buf)
 			mem_free(xdrp->inline_buf, xdrp->inline_len);
 		mem_free(xdrp, sizeof (struct private));
@@ -235,7 +269,7 @@
 	return (stat == TRUE ? (unsigned int)x.x_handy: 0);
 }
 
-static struct xdr_ops *
+struct xdr_ops *
 xdrrdma_xops(void)
 {
 	static struct xdr_ops ops;
--- a/usr/src/uts/intel/ia32/ml/modstubs.s	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/intel/ia32/ml/modstubs.s	Thu Aug 21 18:01:07 2008 -0500
@@ -24,8 +24,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/asm_linkage.h>
 
 #if defined(__lint)
@@ -1014,6 +1012,7 @@
 	NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_get_versions,	nomod_zero);
 	NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_max_data_length,	nomod_zero);
 	NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_svc_max_data_length,	nomod_zero);
+	NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_get_service_type,	nomod_zero);
 	END_MODULE(rpcsec_gss);
 #endif
 
--- a/usr/src/uts/sparc/ml/modstubs.s	Thu Aug 21 14:16:05 2008 -0700
+++ b/usr/src/uts/sparc/ml/modstubs.s	Thu Aug 21 18:01:07 2008 -0500
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #if !defined(lint)
 #include "assym.h"
 #endif /* !lint */
@@ -946,6 +944,7 @@
 	NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_get_versions,	nomod_zero);
 	NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_max_data_length,	nomod_zero);
 	NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_svc_max_data_length,	nomod_zero);
+	NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_get_service_type,	nomod_zero);
 	END_MODULE(rpcsec_gss);
 #endif