6873106 Need a mechanism to share buffers between fs modules
author	chunli zhang - Sun Microsystems - Irvine United States <Chunli.Zhang@Sun.COM>
Mon, 18 Jan 2010 10:34:16 -0800
changeset 11539 10d35fc3d7fd
parent 11538 887ee1196411
child 11540 5e26146fd282
6873106 Need a mechanism to share buffers between fs modules
usr/src/cmd/stat/fsstat/fsstat.c
usr/src/uts/common/fs/fem.c
usr/src/uts/common/fs/nfs/nfs3_srv.c
usr/src/uts/common/fs/nfs/nfs3_vfsops.c
usr/src/uts/common/fs/nfs/nfs3_xdr.c
usr/src/uts/common/fs/nfs/nfs4_srv.c
usr/src/uts/common/fs/nfs/nfs4_vfsops.c
usr/src/uts/common/fs/nfs/nfs4_xdr.c
usr/src/uts/common/fs/nfs/nfs_server.c
usr/src/uts/common/fs/vnode.c
usr/src/uts/common/fs/zfs/arc.c
usr/src/uts/common/fs/zfs/dbuf.c
usr/src/uts/common/fs/zfs/dmu.c
usr/src/uts/common/fs/zfs/sys/arc.h
usr/src/uts/common/fs/zfs/sys/dbuf.h
usr/src/uts/common/fs/zfs/sys/dmu.h
usr/src/uts/common/fs/zfs/sys/dmu_impl.h
usr/src/uts/common/fs/zfs/zfs_vfsops.c
usr/src/uts/common/fs/zfs/zfs_vnops.c
usr/src/uts/common/nfs/nfs.h
usr/src/uts/common/rpc/rpcmod.c
usr/src/uts/common/rpc/xdr.h
usr/src/uts/common/rpc/xdr_mblk.c
usr/src/uts/common/sys/fem.h
usr/src/uts/common/sys/uio.h
usr/src/uts/common/sys/vfs.h
usr/src/uts/common/sys/vnode.h
--- a/usr/src/cmd/stat/fsstat/fsstat.c	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/cmd/stat/fsstat/fsstat.c	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -428,6 +428,8 @@
 	PRINT_VOPSTAT(niceflag, setsecattr);
 	PRINT_VOPSTAT(niceflag, shrlock);
 	PRINT_VOPSTAT(niceflag, vnevent);
+	PRINT_VOPSTAT(niceflag, reqzcbuf);
+	PRINT_VOPSTAT(niceflag, retzcbuf);
 
 	if (niceflag) {
 		/* Make it easier on the eyes */
--- a/usr/src/uts/common/fs/fem.c	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/fem.c	Mon Jan 18 10:34:16 2010 -0800
@@ -19,10 +19,9 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/types.h>
 #include <sys/atomic.h>
@@ -124,6 +123,8 @@
 	_FEMOPDEF(GETSECATTR,	getsecattr),
 	_FEMOPDEF(SHRLOCK,	shrlock),
 	_FEMOPDEF(VNEVENT,	vnevent),
+	_FEMOPDEF(REQZCBUF,	reqzcbuf),
+	_FEMOPDEF(RETZCBUF,	retzcbuf),
 	{ NULL, 0, NULL, NULL }
 };
 
@@ -176,6 +177,8 @@
 	_FEMGUARD(GETSECATTR,	getsecattr),
 	_FEMGUARD(SHRLOCK,	shrlock),
 	_FEMGUARD(VNEVENT,	vnevent),
+	_FEMGUARD(REQZCBUF,	reqzcbuf),
+	_FEMGUARD(RETZCBUF,	retzcbuf),
 	{ NULL, NULL }
 };
 
@@ -1645,6 +1648,61 @@
 }
 
 static int
+vhead_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr,
+    caller_context_t *ct)
+{
+	femarg_t	farg;
+	struct fem_list	*femsp;
+	int		(*func)();
+	void		*arg0;
+	int		errc;
+
+	if ((femsp = fem_lock(vp->v_femhead)) == NULL) {
+		func = (int (*)()) (vp->v_op->vop_reqzcbuf);
+		arg0 = vp;
+		fem_unlock(vp->v_femhead);
+		errc = (*func)(arg0, ioflag, xuiop, cr, ct);
+	} else {
+		fem_addref(femsp);
+		fem_unlock(vp->v_femhead);
+		farg.fa_vnode.vp = vp;
+		farg.fa_fnode = femsp->feml_nodes + femsp->feml_tos;
+		vsop_find(&farg, &func, int, &arg0, vop_reqzcbuf,
+		    femop_reqzcbuf);
+		errc = (*func)(arg0, ioflag, xuiop, cr, ct);
+		fem_release(femsp);
+	}
+	return (errc);
+}
+
+static int
+vhead_retzcbuf(vnode_t *vp, xuio_t *xuiop, cred_t *cr, caller_context_t *ct)
+{
+	femarg_t	farg;
+	struct fem_list	*femsp;
+	int		(*func)();
+	void		*arg0;
+	int		errc;
+
+	if ((femsp = fem_lock(vp->v_femhead)) == NULL) {
+		func = (int (*)()) (vp->v_op->vop_retzcbuf);
+		arg0 = vp;
+		fem_unlock(vp->v_femhead);
+		errc = (*func)(arg0, xuiop, cr, ct);
+	} else {
+		fem_addref(femsp);
+		fem_unlock(vp->v_femhead);
+		farg.fa_vnode.vp = vp;
+		farg.fa_fnode = femsp->feml_nodes + femsp->feml_tos;
+		vsop_find(&farg, &func, int, &arg0, vop_retzcbuf,
+		    femop_retzcbuf);
+		errc = (*func)(arg0, xuiop, cr, ct);
+		fem_release(femsp);
+	}
+	return (errc);
+}
+
+static int
 fshead_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 {
 	fsemarg_t	farg;
@@ -1942,6 +2000,8 @@
 	{ VOPNAME_GETSECATTR, (femop_t *)vhead_getsecattr },
 	{ VOPNAME_SHRLOCK, (femop_t *)vhead_shrlock },
 	{ VOPNAME_VNEVENT, (femop_t *)vhead_vnevent },
+	{ VOPNAME_REQZCBUF, (femop_t *)vhead_reqzcbuf },
+	{ VOPNAME_RETZCBUF, (femop_t *)vhead_retzcbuf },
 	{	NULL,	NULL	}
 };
 
@@ -2642,6 +2702,35 @@
 }
 
 int
+vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr,
+    caller_context_t *ct)
+{
+	int (*func)() = NULL;
+	void *arg0 = NULL;
+
+	ASSERT(vf != NULL);
+	vf->fa_fnode--;
+	vsop_find(vf, &func, int, &arg0, vop_reqzcbuf, femop_reqzcbuf);
+	ASSERT(func != NULL);
+	ASSERT(arg0 != NULL);
+	return ((*func)(arg0, ioflag, xuiop, cr, ct));
+}
+
+int
+vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, caller_context_t *ct)
+{
+	int (*func)() = NULL;
+	void *arg0 = NULL;
+
+	ASSERT(vf != NULL);
+	vf->fa_fnode--;
+	vsop_find(vf, &func, int, &arg0, vop_retzcbuf, femop_retzcbuf);
+	ASSERT(func != NULL);
+	ASSERT(arg0 != NULL);
+	return ((*func)(arg0, xuiop, cr, ct));
+}
+
+int
 vfsnext_mount(fsemarg_t *vf, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 {
 	int (*func)() = NULL;
--- a/usr/src/uts/common/fs/nfs/nfs3_srv.c	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/nfs/nfs3_srv.c	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -87,6 +87,8 @@
 static void	vattr_to_wcc_data(struct vattr *, struct vattr *, wcc_data *);
 static int	rdma_setup_read_data3(READ3args *, READ3resok *);
 
+extern int nfs_loaned_buffers;
+
 u_longlong_t nfs3_srv_caller_id;
 
 /* ARGSUSED */
@@ -994,6 +996,9 @@
 	int in_crit = 0;
 	int need_rwunlock = 0;
 	caller_context_t ct;
+	int rdma_used = 0;
+	int loaned_buffers;
+	struct uio *uiop;
 
 	vap = NULL;
 
@@ -1007,6 +1012,12 @@
 		goto out;
 	}
 
+	if (args->wlist)
+		rdma_used = 1;
+
+	/* use loaned buffers for TCP */
+	loaned_buffers = (nfs_loaned_buffers && !rdma_used) ? 1 : 0;
+
 	if (is_system_labeled()) {
 		bslabel_t *clabel = req->rq_label;
 
@@ -1136,12 +1147,38 @@
 	if (args->count > rfs3_tsize(req))
 		args->count = rfs3_tsize(req);
 
+	if (loaned_buffers) {
+		uiop = (uio_t *)rfs_setup_xuio(vp);
+		ASSERT(uiop != NULL);
+		uiop->uio_segflg = UIO_SYSSPACE;
+		uiop->uio_loffset = args->offset;
+		uiop->uio_resid = args->count;
+
+		/* Jump to do the read if successful */
+		if (VOP_REQZCBUF(vp, UIO_READ, (xuio_t *)uiop, cr, &ct) == 0) {
+			/*
+			 * Need to hold the vnode until after VOP_RETZCBUF()
+			 * is called.
+			 */
+			VN_HOLD(vp);
+			goto doio_read;
+		}
+
+		DTRACE_PROBE2(nfss__i__reqzcbuf_failed, int,
+		    uiop->uio_loffset, int, uiop->uio_resid);
+
+		uiop->uio_extflg = 0;
+		/* failure to setup for zero copy */
+		rfs_free_xuio((void *)uiop);
+		loaned_buffers = 0;
+	}
+
 	/*
 	 * If returning data via RDMA Write, then grab the chunk list.
 	 * If we aren't returning READ data w/RDMA_WRITE, then grab
 	 * a mblk.
 	 */
-	if (args->wlist) {
+	if (rdma_used) {
 		mp = NULL;
 		(void) rdma_get_wchunk(req, &iov, args->wlist);
 	} else {
@@ -1167,11 +1204,14 @@
 	uio.uio_extflg = UIO_COPY_CACHED;
 	uio.uio_loffset = args->offset;
 	uio.uio_resid = args->count;
-
-	error = VOP_READ(vp, &uio, 0, cr, &ct);
+	uiop = &uio;
+
+doio_read:
+	error = VOP_READ(vp, uiop, 0, cr, &ct);
 
 	if (error) {
-		freeb(mp);
+		if (mp)
+			freemsg(mp);
 		/* check if a monitor detected a delegation conflict */
 		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 			resp->status = NFS3ERR_JUKEBOX;
@@ -1180,6 +1220,12 @@
 		goto out;
 	}
 
+	/* make mblk using zc buffers */
+	if (loaned_buffers) {
+		mp = uio_to_mblk(uiop);
+		ASSERT(mp != NULL);
+	}
+
 	va.va_mask = AT_ALL;
 	error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 
@@ -1205,16 +1251,20 @@
 
 	resp->status = NFS3_OK;
 	vattr_to_post_op_attr(vap, &resp->resok.file_attributes);
-	resp->resok.count = args->count - uio.uio_resid;
+	resp->resok.count = args->count - uiop->uio_resid;
 	if (!error && offset + resp->resok.count == va.va_size)
 		resp->resok.eof = TRUE;
 	else
 		resp->resok.eof = FALSE;
 	resp->resok.data.data_len = resp->resok.count;
+
+	if (mp)
+		rfs_rndup_mblks(mp, resp->resok.count, loaned_buffers);
+
 	resp->resok.data.mp = mp;
 	resp->resok.size = (uint_t)args->count;
 
-	if (args->wlist) {
+	if (rdma_used) {
 		resp->resok.data.data_val = (caddr_t)iov.iov_base;
 		if (!rdma_setup_read_data3(args, &(resp->resok))) {
 			resp->status = NFS3ERR_INVAL;
@@ -1260,7 +1310,7 @@
 	if (resp->status == NFS3_OK) {
 		mp = resp->resok.data.mp;
 		if (mp != NULL)
-			freeb(mp);
+			freemsg(mp);
 	}
 }
 
--- a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c	Mon Jan 18 10:34:16 2010 -0800
@@ -1003,7 +1003,7 @@
 
 static int nfs3_dynamic = 0;	/* global variable to enable dynamic retrans. */
 static ushort_t nfs3_max_threads = 8;	/* max number of active async threads */
-static uint_t nfs3_bsize = 32 * 1024;	/* client `block' size */
+uint_t nfs3_bsize = 32 * 1024;	/* client `block' size */
 static uint_t nfs3_async_clusters = 1;	/* # of reqs from each async queue */
 static uint_t nfs3_cots_timeo = NFS_COTS_TIMEO;
 
--- a/usr/src/uts/common/fs/nfs/nfs3_xdr.c	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/nfs/nfs3_xdr.c	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -1320,16 +1320,9 @@
 	}
 
 	if (xdrs->x_op == XDR_ENCODE) {
-		int i, rndup;
 
 		mp = resokp->data.mp;
 		if (mp != NULL && xdrs->x_ops == &xdrmblk_ops) {
-			mp->b_wptr += resokp->count;
-			rndup = BYTES_PER_XDR_UNIT -
-			    (resokp->data.data_len % BYTES_PER_XDR_UNIT);
-			if (rndup != BYTES_PER_XDR_UNIT)
-				for (i = 0; i < rndup; i++)
-					*mp->b_wptr++ = '\0';
 			if (xdrmblk_putmblk(xdrs, mp, resokp->count) == TRUE) {
 				resokp->data.mp = NULL;
 				return (TRUE);
--- a/usr/src/uts/common/fs/nfs/nfs4_srv.c	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv.c	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -84,6 +84,8 @@
 #define	RFS4_LOCK_DELAY 10	/* Milliseconds */
 static clock_t  rfs4_lock_delay = RFS4_LOCK_DELAY;
 extern struct svc_ops rdma_svc_ops;
+extern int nfs_loaned_buffers;
+/* End of Tunables */
 
 static int rdma_setup_read_data4(READ4args *, READ4res *);
 
@@ -3140,9 +3142,12 @@
 	bool_t *deleg = &cs->deleg;
 	nfsstat4 stat;
 	int in_crit = 0;
-	mblk_t *mp;
+	mblk_t *mp = NULL;
 	int alloc_err = 0;
+	int rdma_used = 0;
+	int loaned_buffers;
 	caller_context_t ct;
+	struct uio *uiop;
 
 	DTRACE_NFSV4_2(op__read__start, struct compound_state *, cs,
 	    READ4args, args);
@@ -3183,6 +3188,12 @@
 		goto out;
 	}
 
+	if (args->wlist)
+		rdma_used = 1;
+
+	/* use loaned buffers for TCP */
+	loaned_buffers = (nfs_loaned_buffers && !rdma_used) ? 1 : 0;
+
 	va.va_mask = AT_MODE|AT_SIZE|AT_UID;
 	verror = VOP_GETATTR(vp, &va, 0, cs->cr, &ct);
 
@@ -3250,11 +3261,38 @@
 	if (args->count > rfs4_tsize(req))
 		args->count = rfs4_tsize(req);
 
+	if (loaned_buffers) {
+		uiop = (uio_t *)rfs_setup_xuio(vp);
+		ASSERT(uiop != NULL);
+		uiop->uio_segflg = UIO_SYSSPACE;
+		uiop->uio_loffset = args->offset;
+		uiop->uio_resid = args->count;
+
+		/* Jump to do the read if successful */
+		if (!VOP_REQZCBUF(vp, UIO_READ, (xuio_t *)uiop, cs->cr, &ct)) {
+			/*
+			 * Need to hold the vnode until after VOP_RETZCBUF()
+			 * is called.
+			 */
+			VN_HOLD(vp);
+			goto doio_read;
+		}
+
+		DTRACE_PROBE2(nfss__i__reqzcbuf_failed, int,
+		    uiop->uio_loffset, int, uiop->uio_resid);
+
+		uiop->uio_extflg = 0;
+
+		/* failure to setup for zero copy */
+		rfs_free_xuio((void *)uiop);
+		loaned_buffers = 0;
+	}
+
 	/*
 	 * If returning data via RDMA Write, then grab the chunk list. If we
 	 * aren't returning READ data w/RDMA_WRITE, then grab a mblk.
 	 */
-	if (args->wlist) {
+	if (rdma_used) {
 		mp = NULL;
 		(void) rdma_get_wchunk(req, &iov, args->wlist);
 	} else {
@@ -3287,27 +3325,38 @@
 	uio.uio_extflg = UIO_COPY_CACHED;
 	uio.uio_loffset = args->offset;
 	uio.uio_resid = args->count;
-
-	error = do_io(FREAD, vp, &uio, 0, cs->cr, &ct);
+	uiop = &uio;
+
+doio_read:
+	error = do_io(FREAD, vp, uiop, 0, cs->cr, &ct);
 
 	va.va_mask = AT_SIZE;
 	verror = VOP_GETATTR(vp, &va, 0, cs->cr, &ct);
 
 	if (error) {
-		freeb(mp);
+		if (mp)
+			freemsg(mp);
 		*cs->statusp = resp->status = puterrno4(error);
 		goto out;
 	}
 
+	/* make mblk using zc buffers */
+	if (loaned_buffers) {
+		mp = uio_to_mblk(uiop);
+		ASSERT(mp != NULL);
+	}
+
 	*cs->statusp = resp->status = NFS4_OK;
 
-	ASSERT(uio.uio_resid >= 0);
-	resp->data_len = args->count - uio.uio_resid;
+	ASSERT(uiop->uio_resid >= 0);
+	resp->data_len = args->count - uiop->uio_resid;
 	if (mp) {
 		resp->data_val = (char *)mp->b_datap->db_base;
+		rfs_rndup_mblks(mp, resp->data_len, loaned_buffers);
 	} else {
 		resp->data_val = (caddr_t)iov.iov_base;
 	}
+
 	resp->mblk = mp;
 
 	if (!verror && offset + resp->data_len == va.va_size)
@@ -3315,7 +3364,7 @@
 	else
 		resp->eof = FALSE;
 
-	if (args->wlist) {
+	if (rdma_used) {
 		if (!rdma_setup_read_data4(args, resp)) {
 			*cs->statusp = resp->status = NFS4ERR_INVAL;
 		}
@@ -3337,7 +3386,7 @@
 	READ4res	*resp = &resop->nfs_resop4_u.opread;
 
 	if (resp->status == NFS4_OK && resp->mblk != NULL) {
-		freeb(resp->mblk);
+		freemsg(resp->mblk);
 		resp->mblk = NULL;
 		resp->data_val = NULL;
 		resp->data_len = 0;
--- a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c	Mon Jan 18 10:34:16 2010 -0800
@@ -2159,7 +2159,7 @@
 }
 
 static ushort_t nfs4_max_threads = 8;	/* max number of active async threads */
-static uint_t nfs4_bsize = 32 * 1024;	/* client `block' size */
+uint_t nfs4_bsize = 32 * 1024;	/* client `block' size */
 static uint_t nfs4_async_clusters = 1;	/* # of reqs from each async queue */
 static uint_t nfs4_cots_timeo = NFS_COTS_TIMEO;
 
--- a/usr/src/uts/common/fs/nfs/nfs4_xdr.c	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/nfs/nfs4_xdr.c	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -3350,7 +3350,6 @@
 static bool_t
 xdr_READ4res(XDR *xdrs, READ4res *objp)
 {
-	int i, rndup;
 	mblk_t *mp;
 
 	if (xdrs->x_op == XDR_DECODE)
@@ -3378,12 +3377,6 @@
 
 	mp = objp->mblk;
 	if (mp != NULL && xdrs->x_ops == &xdrmblk_ops) {
-		mp->b_wptr += objp->data_len;
-		rndup = BYTES_PER_XDR_UNIT -
-		    (objp->data_len % BYTES_PER_XDR_UNIT);
-		if (rndup != BYTES_PER_XDR_UNIT)
-			for (i = 0; i < rndup; i++)
-				*mp->b_wptr++ = '\0';
 		if (xdrmblk_putmblk(xdrs, mp, objp->data_len) == TRUE) {
 			objp->mblk = NULL;
 			return (TRUE);
--- a/usr/src/uts/common/fs/nfs/nfs_server.c	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/nfs/nfs_server.c	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -106,6 +106,9 @@
 
 char _depends_on[] = "misc/klmmod";
 
+kmem_cache_t *nfs_xuio_cache;
+int nfs_loaned_buffers = 0;
+
 int
 _init(void)
 {
@@ -139,6 +142,11 @@
 	/* setup DSS paths here; must be done before initial server startup */
 	rfs4_dss_paths = rfs4_dss_oldpaths = NULL;
 
+	/* initialize the copy reduction caches */
+
+	nfs_xuio_cache = kmem_cache_create("nfs_xuio_cache",
+	    sizeof (nfs_xuio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
 	return (status);
 }
 
@@ -3215,3 +3223,140 @@
 	label_rele(tslabel);
 	return (result);
 }
+
+/*
+ * Callback function to return the loaned buffers.
+ * Calls VOP_RETZCBUF() only after all uio_iov[]
+ * buffers are returned. nu_ref maintains the count.
+ */
+void
+rfs_free_xuio(void *free_arg)
+{
+	uint_t ref;
+	nfs_xuio_t *nfsuiop = (nfs_xuio_t *)free_arg;
+
+	ref = atomic_dec_uint_nv(&nfsuiop->nu_ref);
+
+	/*
+	 * Call VOP_RETZCBUF() only when all the iov buffers
+	 * are sent OTW.
+	 */
+	if (ref != 0)
+		return;
+
+	if (((uio_t *)nfsuiop)->uio_extflg & UIO_XUIO) {
+		(void) VOP_RETZCBUF(nfsuiop->nu_vp, (xuio_t *)free_arg, NULL,
+		    NULL);
+		VN_RELE(nfsuiop->nu_vp);
+	}
+
+	kmem_cache_free(nfs_xuio_cache, free_arg);
+}
+
+xuio_t *
+rfs_setup_xuio(vnode_t *vp)
+{
+	nfs_xuio_t *nfsuiop;
+
+	nfsuiop = kmem_cache_alloc(nfs_xuio_cache, KM_SLEEP);
+
+	bzero(nfsuiop, sizeof (nfs_xuio_t));
+	nfsuiop->nu_vp = vp;
+
+	/*
+	 * ref count set to 1. more may be added
+	 * if multiple mblks refer to multiple iov's.
+	 * This is done in uio_to_mblk().
+	 */
+
+	nfsuiop->nu_ref = 1;
+
+	nfsuiop->nu_frtn.free_func = rfs_free_xuio;
+	nfsuiop->nu_frtn.free_arg = (char *)nfsuiop;
+
+	nfsuiop->nu_uio.xu_type = UIOTYPE_ZEROCOPY;
+
+	return (&nfsuiop->nu_uio);
+}
+
+mblk_t *
+uio_to_mblk(uio_t *uiop)
+{
+	struct iovec *iovp;
+	int i;
+	mblk_t *mp, *mp1;
+	nfs_xuio_t *nfsuiop = (nfs_xuio_t *)uiop;
+
+	if (uiop->uio_iovcnt == 0)
+		return (NULL);
+
+	iovp = uiop->uio_iov;
+	mp = mp1 = esballoca((uchar_t *)iovp->iov_base, iovp->iov_len,
+	    BPRI_MED, &nfsuiop->nu_frtn);
+	ASSERT(mp != NULL);
+
+	mp->b_wptr += iovp->iov_len;
+	mp->b_datap->db_type = M_DATA;
+
+	for (i = 1; i < uiop->uio_iovcnt; i++) {
+		iovp = (uiop->uio_iov + i);
+
+		mp1->b_cont = esballoca(
+		    (uchar_t *)iovp->iov_base, iovp->iov_len, BPRI_MED,
+		    &nfsuiop->nu_frtn);
+
+		mp1 = mp1->b_cont;
+		ASSERT(mp1 != NULL);
+		mp1->b_wptr += iovp->iov_len;
+		mp1->b_datap->db_type = M_DATA;
+	}
+
+	nfsuiop->nu_ref = uiop->uio_iovcnt;
+
+	return (mp);
+}
+
+void
+rfs_rndup_mblks(mblk_t *mp, uint_t len, int buf_loaned)
+{
+	int i, rndup;
+	int alloc_err = 0;
+	mblk_t *rmp;
+
+	rndup = BYTES_PER_XDR_UNIT - (len % BYTES_PER_XDR_UNIT);
+
+	/* single mblk_t non copy-reduction case */
+	if (!buf_loaned) {
+		mp->b_wptr += len;
+		if (rndup != BYTES_PER_XDR_UNIT) {
+			for (i = 0; i < rndup; i++)
+				*mp->b_wptr++ = '\0';
+		}
+		return;
+	}
+
+	/* no need for extra rndup */
+	if (rndup == BYTES_PER_XDR_UNIT)
+		return;
+
+	while (mp->b_cont)
+		mp = mp->b_cont;
+
+	/*
+	 * In case of copy-reduction mblks, the size of the mblks
+	 * are fixed and are of the size of the loaned buffers.
+	 * Allocate a roundup mblk and chain it to the data
+	 * buffers. This is sub-optimal, but not expected to
+	 * happen in regular common workloads.
+	 */
+
+	rmp = allocb_wait(rndup, BPRI_MED, STR_NOSIG, &alloc_err);
+	ASSERT(rmp != NULL);
+	ASSERT(alloc_err == 0);
+
+	for (i = 0; i < rndup; i++)
+		*rmp->b_wptr++ = '\0';
+
+	rmp->b_datap->db_type = M_DATA;
+	mp->b_cont = rmp;
+}
--- a/usr/src/uts/common/fs/vnode.c	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/vnode.c	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -362,6 +362,12 @@
 	    (fs_generic_func_p) fs_vnevent_nosupport,
 	    (fs_generic_func_p) fs_vnevent_nosupport,
 
+	VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
+	    fs_nosys, fs_nosys,
+
+	VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
+	    fs_nosys, fs_nosys,
+
 	NULL, 0, NULL, NULL
 };
 
@@ -522,6 +528,10 @@
 	kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
 	/* VOP_VNEVENT */
 	kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
+	/* VOP_REQZCBUF */
+	kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
+	/* VOP_RETZCBUF */
+	kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
 
 	return (vsp);
 }
@@ -4151,6 +4161,31 @@
 	return (err);
 }
 
+int
+fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
+    caller_context_t *ct)
+{
+	int err;
+
+	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
+		return (ENOTSUP);
+	err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
+	VOPSTATS_UPDATE(vp, reqzcbuf);
+	return (err);
+}
+
+int
+fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
+{
+	int err;
+
+	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
+		return (ENOTSUP);
+	err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
+	VOPSTATS_UPDATE(vp, retzcbuf);
+	return (err);
+}
+
 /*
  * Default destructor
  *	Needed because NULL destructor means that the key is unused
--- a/usr/src/uts/common/fs/zfs/arc.c	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/arc.c	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -1241,14 +1241,31 @@
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
-	ASSERT(hdr->b_state == arc_anon);
 	ASSERT(buf->b_data != NULL);
-	VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0);
-	VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1);
+	(void) refcount_add(&hdr->b_refcnt, tag);
+	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
 
 	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
 }
 
+/* Detach an arc_buf from a dbuf (tag) */
+void
+arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
+{
+	arc_buf_hdr_t *hdr;
+
+	rw_enter(&buf->b_lock, RW_WRITER);
+	ASSERT(buf->b_data != NULL);
+	hdr = buf->b_hdr;
+	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
+	(void) refcount_remove(&hdr->b_refcnt, tag);
+	buf->b_efunc = NULL;
+	buf->b_private = NULL;
+
+	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
+	rw_exit(&buf->b_lock);
+}
+
 static arc_buf_t *
 arc_buf_clone(arc_buf_t *from)
 {
--- a/usr/src/uts/common/fs/zfs/dbuf.c	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/dbuf.c	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -406,6 +406,29 @@
 	}
 }
 
+/*
+ * Loan out an arc_buf for read.  Return the loaned arc_buf.
+ */
+arc_buf_t *
+dbuf_loan_arcbuf(dmu_buf_impl_t *db)
+{
+	arc_buf_t *abuf;
+
+	mutex_enter(&db->db_mtx);
+	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
+		int blksz = db->db.db_size;
+		mutex_exit(&db->db_mtx);
+		abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz);
+		bcopy(db->db.db_data, abuf->b_data, blksz);
+	} else {
+		abuf = db->db_buf;
+		arc_loan_inuse_buf(abuf, db);
+		dbuf_set_data(db, NULL);
+		mutex_exit(&db->db_mtx);
+	}
+	return (abuf);
+}
+
 uint64_t
 dbuf_whichblock(dnode_t *dn, uint64_t offset)
 {
@@ -1162,7 +1185,6 @@
 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
 
 	mutex_enter(&db->db_mtx);
-
 	/*
 	 * If this buffer is not dirty, we're done.
 	 */
@@ -1341,9 +1363,11 @@
 		(void) dbuf_dirty(db, tx);
 		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
 		VERIFY(arc_buf_remove_ref(buf, db) == 1);
+		xuio_stat_wbuf_copied();
 		return;
 	}
 
+	xuio_stat_wbuf_nocopy();
 	if (db->db_state == DB_CACHED) {
 		dbuf_dirty_record_t *dr = db->db_last_dirty;
 
--- a/usr/src/uts/common/fs/zfs/dmu.c	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu.c	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -661,12 +661,136 @@
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
+/*
+ * DMU support for xuio
+ */
+kstat_t *xuio_ksp = NULL;
+
+int
+dmu_xuio_init(xuio_t *xuio, int nblk)
+{
+	dmu_xuio_t *priv;
+	uio_t *uio = &xuio->xu_uio;
+
+	uio->uio_iovcnt = nblk;
+	uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
+
+	priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
+	priv->cnt = nblk;
+	priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
+	priv->iovp = uio->uio_iov;
+	XUIO_XUZC_PRIV(xuio) = priv;
+
+	if (XUIO_XUZC_RW(xuio) == UIO_READ)
+		XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
+	else
+		XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
+
+	return (0);
+}
+
+void
+dmu_xuio_fini(xuio_t *xuio)
+{
+	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+	int nblk = priv->cnt;
+
+	kmem_free(priv->iovp, nblk * sizeof (iovec_t));
+	kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
+	kmem_free(priv, sizeof (dmu_xuio_t));
+
+	if (XUIO_XUZC_RW(xuio) == UIO_READ)
+		XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
+	else
+		XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
+}
+
+/*
+ * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
+ * and increase priv->next by 1.
+ */
+int
+dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
+{
+	struct iovec *iov;
+	uio_t *uio = &xuio->xu_uio;
+	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+	int i = priv->next++;
+
+	ASSERT(i < priv->cnt);
+	ASSERT(off + n <= arc_buf_size(abuf));
+	iov = uio->uio_iov + i;
+	iov->iov_base = (char *)abuf->b_data + off;
+	iov->iov_len = n;
+	priv->bufs[i] = abuf;
+	return (0);
+}
+
+int
+dmu_xuio_cnt(xuio_t *xuio)
+{
+	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+	return (priv->cnt);
+}
+
+arc_buf_t *
+dmu_xuio_arcbuf(xuio_t *xuio, int i)
+{
+	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+
+	ASSERT(i < priv->cnt);
+	return (priv->bufs[i]);
+}
+
+void
+dmu_xuio_clear(xuio_t *xuio, int i)
+{
+	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+
+	ASSERT(i < priv->cnt);
+	priv->bufs[i] = NULL;
+}
+
+static void
+xuio_stat_init(void)
+{
+	xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
+	    KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+	if (xuio_ksp != NULL) {
+		xuio_ksp->ks_data = &xuio_stats;
+		kstat_install(xuio_ksp);
+	}
+}
+
+static void
+xuio_stat_fini(void)
+{
+	if (xuio_ksp != NULL) {
+		kstat_delete(xuio_ksp);
+		xuio_ksp = NULL;
+	}
+}
+
+void
+xuio_stat_wbuf_copied()
+{
+	XUIOSTAT_BUMP(xuiostat_wbuf_copied);
+}
+
+void
+xuio_stat_wbuf_nocopy()
+{
+	XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
+}
+
 #ifdef _KERNEL
 int
 dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
 {
 	dmu_buf_t **dbp;
 	int numbufs, i, err;
+	xuio_t *xuio = NULL;
 
 	/*
 	 * NB: we could do this block-at-a-time, but it's nice
@@ -677,6 +801,9 @@
 	if (err)
 		return (err);
 
+	if (uio->uio_extflg == UIO_XUIO)
+		xuio = (xuio_t *)uio;
+
 	for (i = 0; i < numbufs; i++) {
 		int tocpy;
 		int bufoff;
@@ -687,8 +814,24 @@
 		bufoff = uio->uio_loffset - db->db_offset;
 		tocpy = (int)MIN(db->db_size - bufoff, size);
 
-		err = uiomove((char *)db->db_data + bufoff, tocpy,
-		    UIO_READ, uio);
+		if (xuio) {
+			dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+			arc_buf_t *dbuf_abuf = dbi->db_buf;
+			arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
+			err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
+			if (!err) {
+				uio->uio_resid -= tocpy;
+				uio->uio_loffset += tocpy;
+			}
+
+			if (abuf == dbuf_abuf)
+				XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
+			else
+				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
+		} else {
+			err = uiomove((char *)db->db_data + bufoff, tocpy,
+			    UIO_READ, uio);
+		}
 		if (err)
 			break;
 
@@ -857,6 +1000,7 @@
 		dmu_write(dn->dn_objset, dn->dn_object, offset, blksz,
 		    buf->b_data, tx);
 		dmu_return_arcbuf(buf);
+		XUIOSTAT_BUMP(xuiostat_wbuf_copied);
 	}
 }
 
@@ -1369,6 +1513,7 @@
 	zfetch_init();
 	arc_init();
 	l2arc_init();
+	xuio_stat_init();
 }
 
 void
@@ -1379,4 +1524,5 @@
 	dnode_fini();
 	dbuf_fini();
 	l2arc_fini();
+	xuio_stat_fini();
 }
--- a/usr/src/uts/common/fs/zfs/sys/arc.h	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -87,6 +87,7 @@
     arc_buf_contents_t type);
 arc_buf_t *arc_loan_buf(spa_t *spa, int size);
 void arc_return_buf(arc_buf_t *buf, void *tag);
+void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
 void arc_buf_add_ref(arc_buf_t *buf, void *tag);
 int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
 int arc_buf_size(arc_buf_t *buf);
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -267,6 +267,7 @@
 void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
 void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
 
 void dbuf_clear(dmu_buf_impl_t *db);
 void dbuf_evict(dmu_buf_impl_t *db);
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -45,6 +45,7 @@
 #endif
 
 struct uio;
+struct xuio;
 struct page;
 struct vnode;
 struct spa;
@@ -500,6 +501,15 @@
 void dmu_return_arcbuf(struct arc_buf *buf);
 void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
     dmu_tx_t *tx);
+int dmu_xuio_init(struct xuio *uio, int niov);
+void dmu_xuio_fini(struct xuio *uio);
+int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
+    size_t n);
+int dmu_xuio_cnt(struct xuio *uio);
+struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
+void dmu_xuio_clear(struct xuio *uio, int i);
+void xuio_stat_wbuf_copied();
+void xuio_stat_wbuf_nocopy();
 
 extern int zfs_prefetch_disable;
 
--- a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -232,6 +232,39 @@
 struct objset;
 struct dmu_pool;
 
+typedef struct dmu_xuio {
+	int next;
+	int cnt;
+	struct arc_buf **bufs;
+	iovec_t *iovp;
+} dmu_xuio_t;
+
+typedef struct xuio_stats {
+	/* loaned yet not returned arc_buf */
+	kstat_named_t xuiostat_onloan_rbuf;
+	kstat_named_t xuiostat_onloan_wbuf;
+	/* whether a copy is made when loaning out a read buffer */
+	kstat_named_t xuiostat_rbuf_copied;
+	kstat_named_t xuiostat_rbuf_nocopy;
+	/* whether a copy is made when assigning a write buffer */
+	kstat_named_t xuiostat_wbuf_copied;
+	kstat_named_t xuiostat_wbuf_nocopy;
+} xuio_stats_t;
+
+static xuio_stats_t xuio_stats = {
+	{ "onloan_read_buf",	KSTAT_DATA_UINT64 },
+	{ "onloan_write_buf",	KSTAT_DATA_UINT64 },
+	{ "read_buf_copied",	KSTAT_DATA_UINT64 },
+	{ "read_buf_nocopy",	KSTAT_DATA_UINT64 },
+	{ "write_buf_copied",	KSTAT_DATA_UINT64 },
+	{ "write_buf_nocopy",	KSTAT_DATA_UINT64 }
+};
+
+#define	XUIOSTAT_INCR(stat, val)	\
+	atomic_add_64(&xuio_stats.stat.value.ui64, (val))
+#define	XUIOSTAT_BUMP(stat)	XUIOSTAT_INCR(stat, 1)
+
+
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -1115,6 +1115,7 @@
 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
 	}
+	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
 
 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
 		uint64_t pval;
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -447,6 +447,7 @@
 	ssize_t		n, nbytes;
 	int		error;
 	rl_t		*rl;
+	xuio_t		*xuio = NULL;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
@@ -507,6 +508,35 @@
 	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
 	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
 
+	if ((uio->uio_extflg == UIO_XUIO) &&
+	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
+		int nblk;
+		int blksz = zp->z_blksz;
+		uint64_t offset = uio->uio_loffset;
+
+		xuio = (xuio_t *)uio;
+		if ((ISP2(blksz))) {
+			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
+			    blksz)) / blksz;
+		} else {
+			ASSERT(offset + n <= blksz);
+			nblk = 1;
+		}
+		dmu_xuio_init(xuio, nblk);
+
+		if (vn_has_cached_data(vp)) {
+			/*
+			 * For simplicity, we always allocate a full buffer
+			 * even if we only expect to read a portion of a block.
+			 */
+			while (--nblk >= 0) {
+				dmu_xuio_add(xuio,
+				    dmu_request_arcbuf(zp->z_dbuf, blksz),
+				    0, blksz);
+			}
+		}
+	}
+
 	while (n > 0) {
 		nbytes = MIN(n, zfs_read_chunk_size -
 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
@@ -524,7 +554,6 @@
 
 		n -= nbytes;
 	}
-
 out:
 	zfs_range_unlock(rl);
 
@@ -570,6 +599,12 @@
 	uint64_t	pflags;
 	int		error;
 	arc_buf_t	*abuf;
+	iovec_t		*aiov;
+	xuio_t		*xuio = NULL;
+	int		i_iov = 0;
+	int		iovcnt = uio->uio_iovcnt;
+	iovec_t		*iovp = uio->uio_iov;
+	int		write_eof;
 
 	/*
 	 * Fasttrack empty write
@@ -619,8 +654,13 @@
 	/*
 	 * Pre-fault the pages to ensure slow (eg NFS) pages
 	 * don't hold up txg.
+	 * Skip this if the uio contains loaned arc_bufs.
 	 */
-	uio_prefaultpages(n, uio);
+	if ((uio->uio_extflg == UIO_XUIO) &&
+	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
+		xuio = (xuio_t *)uio;
+	else
+		uio_prefaultpages(n, uio);
 
 	/*
 	 * If in append mode, set the io offset pointer to eof.
@@ -659,6 +699,9 @@
 	if ((woff + n) > limit || woff > (limit - n))
 		n = limit - woff;
 
+	/* Will this write extend the file length? */
+	write_eof = (woff + n > zp->z_phys->zp_size);
+
 	end_size = MAX(zp->z_phys->zp_size, woff + n);
 
 	/*
@@ -669,7 +712,6 @@
 	while (n > 0) {
 		abuf = NULL;
 		woff = uio->uio_loffset;
-
 again:
 		if (zfs_usergroup_overquota(zfsvfs,
 		    B_FALSE, zp->z_phys->zp_uid) ||
@@ -681,16 +723,28 @@
 			break;
 		}
 
-		/*
-		 * If dmu_assign_arcbuf() is expected to execute with minimum
-		 * overhead loan an arc buffer and copy user data to it before
-		 * we enter a txg.  This avoids holding a txg forever while we
-		 * pagefault on a hanging NFS server mapping.
-		 */
-		if (abuf == NULL && n >= max_blksz &&
+		if (xuio && abuf == NULL) {
+			ASSERT(i_iov < iovcnt);
+			aiov = &iovp[i_iov];
+			abuf = dmu_xuio_arcbuf(xuio, i_iov);
+			dmu_xuio_clear(xuio, i_iov);
+			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
+			    iovec_t *, aiov, arc_buf_t *, abuf);
+			ASSERT((aiov->iov_base == abuf->b_data) ||
+			    ((char *)aiov->iov_base - (char *)abuf->b_data +
+			    aiov->iov_len == arc_buf_size(abuf)));
+			i_iov++;
+		} else if (abuf == NULL && n >= max_blksz &&
 		    woff >= zp->z_phys->zp_size &&
 		    P2PHASE(woff, max_blksz) == 0 &&
 		    zp->z_blksz == max_blksz) {
+			/*
+			 * This write covers a full block.  "Borrow" a buffer
+			 * from the dmu so that we can fill it before we enter
+			 * a transaction.  This avoids the possibility of
+			 * holding up the transaction if the data copy hangs
+			 * up on a pagefault (e.g., from an NFS server mapping).
+			 */
 			size_t cbytes;
 
 			abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz);
@@ -755,8 +809,24 @@
 			tx_bytes -= uio->uio_resid;
 		} else {
 			tx_bytes = nbytes;
-			ASSERT(tx_bytes == max_blksz);
-			dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
+			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
+			/*
+			 * If this is not a full block write, but we are
+			 * extending the file past EOF and this data starts
+			 * block-aligned, use assign_arcbuf().  Otherwise,
+			 * write via dmu_write().
+			 */
+			if (tx_bytes < max_blksz && (!write_eof ||
+			    aiov->iov_base != abuf->b_data)) {
+				ASSERT(xuio);
+				dmu_write(zfsvfs->z_os, zp->z_id, woff,
+				    aiov->iov_len, aiov->iov_base, tx);
+				dmu_return_arcbuf(abuf);
+				xuio_stat_wbuf_copied();
+			} else {
+				ASSERT(xuio || tx_bytes == max_blksz);
+				dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
+			}
 			ASSERT(tx_bytes <= uio->uio_resid);
 			uioskip(uio, tx_bytes);
 		}
@@ -4571,6 +4641,160 @@
 }
 
 /*
+ * Tunable, both must be a power of 2.
+ *
+ * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
+ * zcr_blksz_max: if set to less than the file block size, allow loaning out of
+ *                an arcbuf for a partial block read
+ */
+int zcr_blksz_min = (1 << 10);	/* 1K */
+int zcr_blksz_max = (1 << 17);	/* 128K */
+
+/*ARGSUSED*/
+static int
+zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
+    caller_context_t *ct)
+{
+	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int max_blksz = zfsvfs->z_max_blksz;
+	uio_t *uio = &xuio->xu_uio;
+	ssize_t size = uio->uio_resid;
+	offset_t offset = uio->uio_loffset;
+	int blksz;
+	int fullblk, i;
+	arc_buf_t *abuf;
+	ssize_t maxsize;
+	int preamble, postamble;
+
+	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
+		return (EINVAL);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+	switch (ioflag) {
+	case UIO_WRITE:
+		/*
+		 * Loan out an arc_buf for write if write size is bigger than
+		 * max_blksz, and the file's block size is also max_blksz.
+		 */
+		blksz = max_blksz;
+		if (size < blksz || zp->z_blksz != blksz) {
+			ZFS_EXIT(zfsvfs);
+			return (EINVAL);
+		}
+		/*
+		 * Caller requests buffers for write before knowing where the
+		 * write offset might be (e.g. NFS TCP write).
+		 */
+		if (offset == -1) {
+			preamble = 0;
+		} else {
+			preamble = P2PHASE(offset, blksz);
+			if (preamble) {
+				preamble = blksz - preamble;
+				size -= preamble;
+			}
+		}
+
+		postamble = P2PHASE(size, blksz);
+		size -= postamble;
+
+		fullblk = size / blksz;
+		dmu_xuio_init(xuio,
+		    (preamble != 0) + fullblk + (postamble != 0));
+		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
+		    int, postamble, int,
+		    (preamble != 0) + fullblk + (postamble != 0));
+
+		/*
+		 * Have to fix iov base/len for partial buffers.  They
+		 * currently represent full arc_buf's.
+		 */
+		if (preamble) {
+			/* data begins in the middle of the arc_buf */
+			abuf = dmu_request_arcbuf(zp->z_dbuf, blksz);
+			ASSERT(abuf);
+			dmu_xuio_add(xuio, abuf, blksz - preamble, preamble);
+		}
+
+		for (i = 0; i < fullblk; i++) {
+			abuf = dmu_request_arcbuf(zp->z_dbuf, blksz);
+			ASSERT(abuf);
+			dmu_xuio_add(xuio, abuf, 0, blksz);
+		}
+
+		if (postamble) {
+			/* data ends in the middle of the arc_buf */
+			abuf = dmu_request_arcbuf(zp->z_dbuf, blksz);
+			ASSERT(abuf);
+			dmu_xuio_add(xuio, abuf, 0, postamble);
+		}
+		break;
+	case UIO_READ:
+		/*
+		 * Loan out an arc_buf for read if the read size is larger than
+		 * the current file block size.  Block alignment is not
+		 * considered.  Partial arc_buf will be loaned out for read.
+		 */
+		blksz = zp->z_blksz;
+		if (blksz < zcr_blksz_min)
+			blksz = zcr_blksz_min;
+		if (blksz > zcr_blksz_max)
+			blksz = zcr_blksz_max;
+		/* refuse blocks larger than max_blksz to avoid extra complexity */
+		if (blksz > max_blksz) {
+			ZFS_EXIT(zfsvfs);
+			return (EINVAL);
+		}
+
+		maxsize = zp->z_phys->zp_size - uio->uio_loffset;
+		if (size > maxsize)
+			size = maxsize;
+
+		if (size < blksz || vn_has_cached_data(vp)) {
+			ZFS_EXIT(zfsvfs);
+			return (EINVAL);
+		}
+		break;
+	default:
+		ZFS_EXIT(zfsvfs);
+		return (EINVAL);
+	}
+
+	uio->uio_extflg = UIO_XUIO;
+	XUIO_XUZC_RW(xuio) = ioflag;
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*ARGSUSED*/
+static int
+zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
+{
+	int i;
+	arc_buf_t *abuf;
+	int ioflag = XUIO_XUZC_RW(xuio);
+
+	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
+
+	i = dmu_xuio_cnt(xuio);
+	while (i-- > 0) {
+		abuf = dmu_xuio_arcbuf(xuio, i);
+		/*
+		 * if abuf == NULL, it must be a write buffer
+		 * that has been returned in zfs_write().
+		 */
+		if (abuf)
+			dmu_return_arcbuf(abuf);
+		ASSERT(abuf || ioflag == UIO_WRITE);
+	}
+
+	dmu_xuio_fini(xuio);
+	return (0);
+}
+
+/*
  * Predeclare these here so that the compiler assumes that
  * this is an "old style" function declaration that does
  * not include arguments => we won't get type mismatch errors
@@ -4653,6 +4877,8 @@
 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
+	VOPNAME_REQZCBUF, 	{ .vop_reqzcbuf = zfs_reqzcbuf },
+	VOPNAME_RETZCBUF, 	{ .vop_retzcbuf = zfs_retzcbuf },
 	NULL,			NULL
 };
 
--- a/usr/src/uts/common/nfs/nfs.h	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/nfs/nfs.h	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -1447,6 +1447,7 @@
 #ifdef _KERNEL
 	uint_t wlist_len;
 	struct clist *wlist;
+	frtn_t zcopy;
 #endif
 };
 typedef struct READ3resok READ3resok;
@@ -2322,6 +2323,24 @@
 extern ts_label_t	*nfs_getflabel(vnode_t *, struct exportinfo *);
 extern boolean_t	do_rfs_label_check(bslabel_t *, vnode_t *, int,
 			    struct exportinfo *);
+
+/*
+ * Copy Reduction support.
+ * xuio_t wrapper with additional private data.
+ */
+
+typedef struct nfs_xuio {
+	xuio_t nu_uio;
+	vnode_t *nu_vp;
+	uint_t nu_ref;
+	frtn_t nu_frtn;
+} nfs_xuio_t;
+
+xuio_t *rfs_setup_xuio(vnode_t *);
+mblk_t *uio_to_mblk(uio_t *);
+void rfs_rndup_mblks(mblk_t *, uint_t, int);
+void rfs_free_xuio(void *);
+
 #endif	/* _KERNEL */
 
 #ifdef	__cplusplus
--- a/usr/src/uts/common/rpc/rpcmod.c	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/rpc/rpcmod.c	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -1059,8 +1059,6 @@
 #define	MIR_SVC_ORDREL_TIMEOUT	(10 * (60 * 1000L))	/* 10 minutes */
 #define	MIR_LASTFRAG	0x80000000	/* Record marker */
 
-#define	DLEN(mp) (mp->b_cont ? msgdsize(mp) : (mp->b_wptr - mp->b_rptr))
-
 #define	MIR_SVC_QUIESCED(mir)	\
 	(mir->mir_ref_cnt == 0 && mir->mir_inrservice == 0)
 
--- a/usr/src/uts/common/rpc/xdr.h	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/rpc/xdr.h	Mon Jan 18 10:34:16 2010 -0800
@@ -18,7 +18,7 @@
  *
  * CDDL HEADER END
  *
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -585,6 +585,8 @@
 #endif
 #else
 
+#define	DLEN(mp) (mp->b_cont ? msgdsize(mp) : (mp->b_wptr - mp->b_rptr))
+
 extern void	xdrmem_create(XDR *, caddr_t, uint_t, enum xdr_op);
 extern void	xdrmblk_init(XDR *, mblk_t *, enum xdr_op, int);
 extern bool_t	xdrmblk_getmblk(XDR *, mblk_t **, uint_t *);
--- a/usr/src/uts/common/rpc/xdr_mblk.c	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/rpc/xdr_mblk.c	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -361,20 +361,24 @@
  * not a multiple of BYTES_PER_XDR_UNIT, the caller has the option
  * of making the data a BYTES_PER_XDR_UNIT multiple (b_wptr - b_rptr is
  * a BYTES_PER_XDR_UNIT multiple), but in this case the caller has to ensure
- * that the filler bytes are initialized to zero. Note: Doesn't to work for
- * chained mblks.
+ * that the filler bytes are initialized to zero.
  */
 bool_t
 xdrmblk_putmblk(XDR *xdrs, mblk_t *m, uint_t len)
 {
 	int32_t llen = (int32_t)len;
 
-	if (((m->b_wptr - m->b_rptr) % BYTES_PER_XDR_UNIT) != 0)
+	if ((DLEN(m) % BYTES_PER_XDR_UNIT) != 0)
 		return (FALSE);
 	if (!xdrmblk_putint32(xdrs, &llen))
 		return (FALSE);
+
 	/* LINTED pointer alignment */
 	((mblk_t *)xdrs->x_base)->b_cont = m;
+
+	/* base points to the last mblk */
+	while (m->b_cont)
+		m = m->b_cont;
 	xdrs->x_base = (caddr_t)m;
 	xdrs->x_handy = 0;
 	return (TRUE);
--- a/usr/src/uts/common/sys/fem.h	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/sys/fem.h	Mon Jan 18 10:34:16 2010 -0800
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_FEM_H
 #define	_SYS_FEM_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/mutex.h>
 #include <sys/pathname.h>
@@ -260,7 +258,13 @@
 			struct shrlock *shr, int flag, cred_t *cr,	\
 			caller_context_t *ct);				\
 	int (*femop_vnevent)(femarg_t *vf, vnevent_t vnevent,		\
-			vnode_t *dvp, char *cname, caller_context_t *ct)
+			vnode_t *dvp, char *cname, 			\
+			caller_context_t *ct);				\
+	int (*femop_reqzcbuf)(femarg_t *vf, enum uio_rw ioflag,		\
+			xuio_t *xuio, cred_t *cr,			\
+			caller_context_t *ct);				\
+	int (*femop_retzcbuf)(femarg_t *vf, xuio_t *xuio, cred_t *cr,	\
+			caller_context_t *ct)
 	/* NB: No ";" */
 
 struct fem {
@@ -392,6 +396,10 @@
 			int flag, cred_t *cr, caller_context_t *ct);
 extern int vnext_vnevent(femarg_t *vf, vnevent_t vevent, vnode_t *dvp,
 			char *cname, caller_context_t *ct);
+extern int vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop,
+			cred_t *cr, caller_context_t *ct);
+extern int vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr,
+			caller_context_t *ct);
 
 extern int vfsnext_mount(fsemarg_t *vf, vnode_t *mvp, struct mounta *uap,
 			cred_t *cr);
--- a/usr/src/uts/common/sys/uio.h	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/sys/uio.h	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -133,6 +133,49 @@
 	uioa_page_t	uioa_locked[UIOA_IOV_MAX]; /* Per iov locked pages */
 } uioa_t;
 
+/*
+ * uio extensions
+ *
+ * PSARC 2009/478: Copy Reduction Interfaces
+ */
+typedef enum xuio_type {
+	UIOTYPE_ASYNCIO,
+	UIOTYPE_ZEROCOPY
+} xuio_type_t;
+
+typedef struct xuio {
+	uio_t xu_uio;		/* Embedded UIO structure */
+
+	/* Extended uio fields */
+	enum xuio_type xu_type;	/* What kind of uio structure? */
+	union {
+		/* Async I/O Support, intend to replace uioa_t. */
+		struct {
+			uint32_t xu_a_state;	/* state of async i/o */
+			/* bytes that have been uioamove()ed */
+			ssize_t xu_a_mbytes;
+			uioa_page_t *xu_a_lcur;	/* pointer into uioa_locked[] */
+			/* pointer into lcur->uioa_ppp[] */
+			void **xu_a_lppp;
+			void *xu_a_hwst[4];	/* opaque hardware state */
+			/* Per iov locked pages */
+			uioa_page_t xu_a_locked[UIOA_IOV_MAX];
+		} xu_aio;
+
+		/*
+		 * Copy Reduction Support -- facilitate loaning/returning of
+		 * filesystem cache buffers.
+		 */
+		struct {
+			int xu_zc_rw;	/* read or write buffer */
+			void *xu_zc_priv;	/* fs specific */
+		} xu_zc;
+	} xu_ext;
+} xuio_t;
+
+#define	XUIO_XUZC_PRIV(xuio)    xuio->xu_ext.xu_zc.xu_zc_priv
+#define	XUIO_XUZC_RW(xuio)	xuio->xu_ext.xu_zc.xu_zc_rw
+
 #define	UIOA_ALLOC	0x0001		/* allocated but not yet initialized */
 #define	UIOA_INIT	0x0002		/* initialized but not yet enabled */
 #define	UIOA_ENABLED	0x0004		/* enabled, asynch i/o active */
@@ -177,6 +220,7 @@
 #define	UIO_COPY_CACHED		0x0001	/* copy should not bypass caches */
 
 #define	UIO_ASYNC		0x0002	/* uio_t is really a uioa_t */
+#define	UIO_XUIO		0x0004	/* Structure is xuio_t */
 
 /*
  * Global uioasync capability shadow state.
--- a/usr/src/uts/common/sys/vfs.h	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/sys/vfs.h	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -302,7 +302,8 @@
 #define	VFSFT_SYSATTR_VIEWS	0x100000040	/* Supports sysattr view i/f */
 #define	VFSFT_ACCESS_FILTER	0x100000080	/* dirents filtered by access */
 #define	VFSFT_REPARSE		0x100000100	/* Supports reparse point */
-
+#define	VFSFT_ZEROCOPY_SUPPORTED	0x100000200
+				/* Supports loaning/returning cached buffers */
 /*
  * Argument structure for mount(2).
  *
--- a/usr/src/uts/common/sys/vnode.h	Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/sys/vnode.h	Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -118,6 +118,8 @@
 	kstat_named_t	ngetsecattr;	/* VOP_GETSECATTR */
 	kstat_named_t	nshrlock;	/* VOP_SHRLOCK */
 	kstat_named_t	nvnevent;	/* VOP_VNEVENT */
+	kstat_named_t	nreqzcbuf;	/* VOP_REQZCBUF */
+	kstat_named_t	nretzcbuf;	/* VOP_RETZCBUF */
 } vopstats_t;
 
 /*
@@ -900,7 +902,11 @@
 	int	(*vop_shrlock)(vnode_t *, int, struct shrlock *,	\
 				int, cred_t *, caller_context_t *);	\
 	int	(*vop_vnevent)(vnode_t *, vnevent_t, vnode_t *,		\
-				char *, caller_context_t *)
+				char *, caller_context_t *);		\
+	int	(*vop_reqzcbuf)(vnode_t *, enum uio_rw, xuio_t *,	\
+				cred_t *, caller_context_t *);		\
+	int	(*vop_retzcbuf)(vnode_t *, xuio_t *, cred_t *,		\
+				caller_context_t *)
 	/* NB: No ";" */
 
 /*
@@ -997,6 +1003,9 @@
 				caller_context_t *);
 extern int	fop_vnevent(vnode_t *, vnevent_t, vnode_t *, char *,
 				caller_context_t *);
+extern int	fop_reqzcbuf(vnode_t *, enum uio_rw, xuio_t *, cred_t *,
+				caller_context_t *);
+extern int	fop_retzcbuf(vnode_t *, xuio_t *, cred_t *, caller_context_t *);
 
 #endif	/* _KERNEL */
 
@@ -1088,6 +1097,10 @@
 	fop_shrlock(vp, cmd, shr, f, cr, ct)
 #define	VOP_VNEVENT(vp, vnevent, dvp, fnm, ct) \
 	fop_vnevent(vp, vnevent, dvp, fnm, ct)
+#define	VOP_REQZCBUF(vp, rwflag, xuiop, cr, ct) \
+	fop_reqzcbuf(vp, rwflag, xuiop, cr, ct)
+#define	VOP_RETZCBUF(vp, xuiop, cr, ct) \
+	fop_retzcbuf(vp, xuiop, cr, ct)
 
 #define	VOPNAME_OPEN		"open"
 #define	VOPNAME_CLOSE		"close"
@@ -1133,6 +1146,8 @@
 #define	VOPNAME_SETSECATTR	"setsecattr"
 #define	VOPNAME_SHRLOCK		"shrlock"
 #define	VOPNAME_VNEVENT		"vnevent"
+#define	VOPNAME_REQZCBUF	"reqzcbuf"
+#define	VOPNAME_RETZCBUF	"retzcbuf"
 
 /*
  * Flags for VOP_LOOKUP