--- a/usr/src/cmd/stat/fsstat/fsstat.c Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/cmd/stat/fsstat/fsstat.c Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -428,6 +428,8 @@
PRINT_VOPSTAT(niceflag, setsecattr);
PRINT_VOPSTAT(niceflag, shrlock);
PRINT_VOPSTAT(niceflag, vnevent);
+ PRINT_VOPSTAT(niceflag, reqzcbuf);
+ PRINT_VOPSTAT(niceflag, retzcbuf);
if (niceflag) {
/* Make it easier on the eyes */
--- a/usr/src/uts/common/fs/fem.c Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/fem.c Mon Jan 18 10:34:16 2010 -0800
@@ -19,10 +19,9 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/types.h>
#include <sys/atomic.h>
@@ -124,6 +123,8 @@
_FEMOPDEF(GETSECATTR, getsecattr),
_FEMOPDEF(SHRLOCK, shrlock),
_FEMOPDEF(VNEVENT, vnevent),
+ _FEMOPDEF(REQZCBUF, reqzcbuf),
+ _FEMOPDEF(RETZCBUF, retzcbuf),
{ NULL, 0, NULL, NULL }
};
@@ -176,6 +177,8 @@
_FEMGUARD(GETSECATTR, getsecattr),
_FEMGUARD(SHRLOCK, shrlock),
_FEMGUARD(VNEVENT, vnevent),
+ _FEMGUARD(REQZCBUF, reqzcbuf),
+ _FEMGUARD(RETZCBUF, retzcbuf),
{ NULL, NULL }
};
@@ -1645,6 +1648,61 @@
}
static int
+vhead_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr,
+ caller_context_t *ct)
+{
+ femarg_t farg;
+ struct fem_list *femsp;
+ int (*func)();
+ void *arg0;
+ int errc;
+
+ if ((femsp = fem_lock(vp->v_femhead)) == NULL) {
+ func = (int (*)()) (vp->v_op->vop_reqzcbuf);
+ arg0 = vp;
+ fem_unlock(vp->v_femhead);
+ errc = (*func)(arg0, ioflag, xuiop, cr, ct);
+ } else {
+ fem_addref(femsp);
+ fem_unlock(vp->v_femhead);
+ farg.fa_vnode.vp = vp;
+ farg.fa_fnode = femsp->feml_nodes + femsp->feml_tos;
+ vsop_find(&farg, &func, int, &arg0, vop_reqzcbuf,
+ femop_reqzcbuf);
+ errc = (*func)(arg0, ioflag, xuiop, cr, ct);
+ fem_release(femsp);
+ }
+ return (errc);
+}
+
+static int
+vhead_retzcbuf(vnode_t *vp, xuio_t *xuiop, cred_t *cr, caller_context_t *ct)
+{
+ femarg_t farg;
+ struct fem_list *femsp;
+ int (*func)();
+ void *arg0;
+ int errc;
+
+ if ((femsp = fem_lock(vp->v_femhead)) == NULL) {
+ func = (int (*)()) (vp->v_op->vop_retzcbuf);
+ arg0 = vp;
+ fem_unlock(vp->v_femhead);
+ errc = (*func)(arg0, xuiop, cr, ct);
+ } else {
+ fem_addref(femsp);
+ fem_unlock(vp->v_femhead);
+ farg.fa_vnode.vp = vp;
+ farg.fa_fnode = femsp->feml_nodes + femsp->feml_tos;
+ vsop_find(&farg, &func, int, &arg0, vop_retzcbuf,
+ femop_retzcbuf);
+ errc = (*func)(arg0, xuiop, cr, ct);
+ fem_release(femsp);
+ }
+ return (errc);
+}
+
+static int
fshead_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
{
fsemarg_t farg;
@@ -1942,6 +2000,8 @@
{ VOPNAME_GETSECATTR, (femop_t *)vhead_getsecattr },
{ VOPNAME_SHRLOCK, (femop_t *)vhead_shrlock },
{ VOPNAME_VNEVENT, (femop_t *)vhead_vnevent },
+ { VOPNAME_REQZCBUF, (femop_t *)vhead_reqzcbuf },
+ { VOPNAME_RETZCBUF, (femop_t *)vhead_retzcbuf },
{ NULL, NULL }
};
@@ -2642,6 +2702,35 @@
}
int
+vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr,
+ caller_context_t *ct)
+{
+ int (*func)() = NULL;
+ void *arg0 = NULL;
+
+ ASSERT(vf != NULL);
+ vf->fa_fnode--;
+ vsop_find(vf, &func, int, &arg0, vop_reqzcbuf, femop_reqzcbuf);
+ ASSERT(func != NULL);
+ ASSERT(arg0 != NULL);
+ return ((*func)(arg0, ioflag, xuiop, cr, ct));
+}
+
+int
+vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, caller_context_t *ct)
+{
+ int (*func)() = NULL;
+ void *arg0 = NULL;
+
+ ASSERT(vf != NULL);
+ vf->fa_fnode--;
+ vsop_find(vf, &func, int, &arg0, vop_retzcbuf, femop_retzcbuf);
+ ASSERT(func != NULL);
+ ASSERT(arg0 != NULL);
+ return ((*func)(arg0, xuiop, cr, ct));
+}
+
+int
vfsnext_mount(fsemarg_t *vf, vnode_t *mvp, struct mounta *uap, cred_t *cr)
{
int (*func)() = NULL;
--- a/usr/src/uts/common/fs/nfs/nfs3_srv.c Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/nfs/nfs3_srv.c Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -87,6 +87,8 @@
static void vattr_to_wcc_data(struct vattr *, struct vattr *, wcc_data *);
static int rdma_setup_read_data3(READ3args *, READ3resok *);
+extern int nfs_loaned_buffers;
+
u_longlong_t nfs3_srv_caller_id;
/* ARGSUSED */
@@ -994,6 +996,9 @@
int in_crit = 0;
int need_rwunlock = 0;
caller_context_t ct;
+ int rdma_used = 0;
+ int loaned_buffers;
+ struct uio *uiop;
vap = NULL;
@@ -1007,6 +1012,12 @@
goto out;
}
+ if (args->wlist)
+ rdma_used = 1;
+
+ /* use loaned buffers for TCP */
+ loaned_buffers = (nfs_loaned_buffers && !rdma_used) ? 1 : 0;
+
if (is_system_labeled()) {
bslabel_t *clabel = req->rq_label;
@@ -1136,12 +1147,38 @@
if (args->count > rfs3_tsize(req))
args->count = rfs3_tsize(req);
+ if (loaned_buffers) {
+ uiop = (uio_t *)rfs_setup_xuio(vp);
+ ASSERT(uiop != NULL);
+ uiop->uio_segflg = UIO_SYSSPACE;
+ uiop->uio_loffset = args->offset;
+ uiop->uio_resid = args->count;
+
+ /* Jump to do the read if successful */
+ if (VOP_REQZCBUF(vp, UIO_READ, (xuio_t *)uiop, cr, &ct) == 0) {
+ /*
+ * Need to hold the vnode until after VOP_RETZCBUF()
+ * is called.
+ */
+ VN_HOLD(vp);
+ goto doio_read;
+ }
+
+ DTRACE_PROBE2(nfss__i__reqzcbuf_failed, int,
+ uiop->uio_loffset, int, uiop->uio_resid);
+
+ uiop->uio_extflg = 0;
+ /* failure to setup for zero copy */
+ rfs_free_xuio((void *)uiop);
+ loaned_buffers = 0;
+ }
+
/*
* If returning data via RDMA Write, then grab the chunk list.
* If we aren't returning READ data w/RDMA_WRITE, then grab
* a mblk.
*/
- if (args->wlist) {
+ if (rdma_used) {
mp = NULL;
(void) rdma_get_wchunk(req, &iov, args->wlist);
} else {
@@ -1167,11 +1204,14 @@
uio.uio_extflg = UIO_COPY_CACHED;
uio.uio_loffset = args->offset;
uio.uio_resid = args->count;
-
- error = VOP_READ(vp, &uio, 0, cr, &ct);
+ uiop = &uio;
+
+doio_read:
+ error = VOP_READ(vp, uiop, 0, cr, &ct);
if (error) {
- freeb(mp);
+ if (mp)
+ freemsg(mp);
/* check if a monitor detected a delegation conflict */
if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
resp->status = NFS3ERR_JUKEBOX;
@@ -1180,6 +1220,12 @@
goto out;
}
+ /* make mblk using zc buffers */
+ if (loaned_buffers) {
+ mp = uio_to_mblk(uiop);
+ ASSERT(mp != NULL);
+ }
+
va.va_mask = AT_ALL;
error = VOP_GETATTR(vp, &va, 0, cr, &ct);
@@ -1205,16 +1251,20 @@
resp->status = NFS3_OK;
vattr_to_post_op_attr(vap, &resp->resok.file_attributes);
- resp->resok.count = args->count - uio.uio_resid;
+ resp->resok.count = args->count - uiop->uio_resid;
if (!error && offset + resp->resok.count == va.va_size)
resp->resok.eof = TRUE;
else
resp->resok.eof = FALSE;
resp->resok.data.data_len = resp->resok.count;
+
+ if (mp)
+ rfs_rndup_mblks(mp, resp->resok.count, loaned_buffers);
+
resp->resok.data.mp = mp;
resp->resok.size = (uint_t)args->count;
- if (args->wlist) {
+ if (rdma_used) {
resp->resok.data.data_val = (caddr_t)iov.iov_base;
if (!rdma_setup_read_data3(args, &(resp->resok))) {
resp->status = NFS3ERR_INVAL;
@@ -1260,7 +1310,7 @@
if (resp->status == NFS3_OK) {
mp = resp->resok.data.mp;
if (mp != NULL)
- freeb(mp);
+ freemsg(mp);
}
}
--- a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c Mon Jan 18 10:34:16 2010 -0800
@@ -1003,7 +1003,7 @@
static int nfs3_dynamic = 0; /* global variable to enable dynamic retrans. */
static ushort_t nfs3_max_threads = 8; /* max number of active async threads */
-static uint_t nfs3_bsize = 32 * 1024; /* client `block' size */
+uint_t nfs3_bsize = 32 * 1024; /* client `block' size */
static uint_t nfs3_async_clusters = 1; /* # of reqs from each async queue */
static uint_t nfs3_cots_timeo = NFS_COTS_TIMEO;
--- a/usr/src/uts/common/fs/nfs/nfs3_xdr.c Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/nfs/nfs3_xdr.c Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -1320,16 +1320,9 @@
}
if (xdrs->x_op == XDR_ENCODE) {
- int i, rndup;
mp = resokp->data.mp;
if (mp != NULL && xdrs->x_ops == &xdrmblk_ops) {
- mp->b_wptr += resokp->count;
- rndup = BYTES_PER_XDR_UNIT -
- (resokp->data.data_len % BYTES_PER_XDR_UNIT);
- if (rndup != BYTES_PER_XDR_UNIT)
- for (i = 0; i < rndup; i++)
- *mp->b_wptr++ = '\0';
if (xdrmblk_putmblk(xdrs, mp, resokp->count) == TRUE) {
resokp->data.mp = NULL;
return (TRUE);
--- a/usr/src/uts/common/fs/nfs/nfs4_srv.c Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv.c Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -84,6 +84,8 @@
#define RFS4_LOCK_DELAY 10 /* Milliseconds */
static clock_t rfs4_lock_delay = RFS4_LOCK_DELAY;
extern struct svc_ops rdma_svc_ops;
+extern int nfs_loaned_buffers;
+/* End of Tunables */
static int rdma_setup_read_data4(READ4args *, READ4res *);
@@ -3140,9 +3142,12 @@
bool_t *deleg = &cs->deleg;
nfsstat4 stat;
int in_crit = 0;
- mblk_t *mp;
+ mblk_t *mp = NULL;
int alloc_err = 0;
+ int rdma_used = 0;
+ int loaned_buffers;
caller_context_t ct;
+ struct uio *uiop;
DTRACE_NFSV4_2(op__read__start, struct compound_state *, cs,
READ4args, args);
@@ -3183,6 +3188,12 @@
goto out;
}
+ if (args->wlist)
+ rdma_used = 1;
+
+ /* use loaned buffers for TCP */
+ loaned_buffers = (nfs_loaned_buffers && !rdma_used) ? 1 : 0;
+
va.va_mask = AT_MODE|AT_SIZE|AT_UID;
verror = VOP_GETATTR(vp, &va, 0, cs->cr, &ct);
@@ -3250,11 +3261,38 @@
if (args->count > rfs4_tsize(req))
args->count = rfs4_tsize(req);
+ if (loaned_buffers) {
+ uiop = (uio_t *)rfs_setup_xuio(vp);
+ ASSERT(uiop != NULL);
+ uiop->uio_segflg = UIO_SYSSPACE;
+ uiop->uio_loffset = args->offset;
+ uiop->uio_resid = args->count;
+
+ /* Jump to do the read if successful */
+ if (!VOP_REQZCBUF(vp, UIO_READ, (xuio_t *)uiop, cs->cr, &ct)) {
+ /*
+ * Need to hold the vnode until after VOP_RETZCBUF()
+ * is called.
+ */
+ VN_HOLD(vp);
+ goto doio_read;
+ }
+
+ DTRACE_PROBE2(nfss__i__reqzcbuf_failed, int,
+ uiop->uio_loffset, int, uiop->uio_resid);
+
+ uiop->uio_extflg = 0;
+
+ /* failure to setup for zero copy */
+ rfs_free_xuio((void *)uiop);
+ loaned_buffers = 0;
+ }
+
/*
* If returning data via RDMA Write, then grab the chunk list. If we
* aren't returning READ data w/RDMA_WRITE, then grab a mblk.
*/
- if (args->wlist) {
+ if (rdma_used) {
mp = NULL;
(void) rdma_get_wchunk(req, &iov, args->wlist);
} else {
@@ -3287,27 +3325,38 @@
uio.uio_extflg = UIO_COPY_CACHED;
uio.uio_loffset = args->offset;
uio.uio_resid = args->count;
-
- error = do_io(FREAD, vp, &uio, 0, cs->cr, &ct);
+ uiop = &uio;
+
+doio_read:
+ error = do_io(FREAD, vp, uiop, 0, cs->cr, &ct);
va.va_mask = AT_SIZE;
verror = VOP_GETATTR(vp, &va, 0, cs->cr, &ct);
if (error) {
- freeb(mp);
+ if (mp)
+ freemsg(mp);
*cs->statusp = resp->status = puterrno4(error);
goto out;
}
+ /* make mblk using zc buffers */
+ if (loaned_buffers) {
+ mp = uio_to_mblk(uiop);
+ ASSERT(mp != NULL);
+ }
+
*cs->statusp = resp->status = NFS4_OK;
- ASSERT(uio.uio_resid >= 0);
- resp->data_len = args->count - uio.uio_resid;
+ ASSERT(uiop->uio_resid >= 0);
+ resp->data_len = args->count - uiop->uio_resid;
if (mp) {
resp->data_val = (char *)mp->b_datap->db_base;
+ rfs_rndup_mblks(mp, resp->data_len, loaned_buffers);
} else {
resp->data_val = (caddr_t)iov.iov_base;
}
+
resp->mblk = mp;
if (!verror && offset + resp->data_len == va.va_size)
@@ -3315,7 +3364,7 @@
else
resp->eof = FALSE;
- if (args->wlist) {
+ if (rdma_used) {
if (!rdma_setup_read_data4(args, resp)) {
*cs->statusp = resp->status = NFS4ERR_INVAL;
}
@@ -3337,7 +3386,7 @@
READ4res *resp = &resop->nfs_resop4_u.opread;
if (resp->status == NFS4_OK && resp->mblk != NULL) {
- freeb(resp->mblk);
+ freemsg(resp->mblk);
resp->mblk = NULL;
resp->data_val = NULL;
resp->data_len = 0;
--- a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c Mon Jan 18 10:34:16 2010 -0800
@@ -2159,7 +2159,7 @@
}
static ushort_t nfs4_max_threads = 8; /* max number of active async threads */
-static uint_t nfs4_bsize = 32 * 1024; /* client `block' size */
+uint_t nfs4_bsize = 32 * 1024; /* client `block' size */
static uint_t nfs4_async_clusters = 1; /* # of reqs from each async queue */
static uint_t nfs4_cots_timeo = NFS_COTS_TIMEO;
--- a/usr/src/uts/common/fs/nfs/nfs4_xdr.c Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/nfs/nfs4_xdr.c Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -3350,7 +3350,6 @@
static bool_t
xdr_READ4res(XDR *xdrs, READ4res *objp)
{
- int i, rndup;
mblk_t *mp;
if (xdrs->x_op == XDR_DECODE)
@@ -3378,12 +3377,6 @@
mp = objp->mblk;
if (mp != NULL && xdrs->x_ops == &xdrmblk_ops) {
- mp->b_wptr += objp->data_len;
- rndup = BYTES_PER_XDR_UNIT -
- (objp->data_len % BYTES_PER_XDR_UNIT);
- if (rndup != BYTES_PER_XDR_UNIT)
- for (i = 0; i < rndup; i++)
- *mp->b_wptr++ = '\0';
if (xdrmblk_putmblk(xdrs, mp, objp->data_len) == TRUE) {
objp->mblk = NULL;
return (TRUE);
--- a/usr/src/uts/common/fs/nfs/nfs_server.c Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/nfs/nfs_server.c Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -106,6 +106,9 @@
char _depends_on[] = "misc/klmmod";
+kmem_cache_t *nfs_xuio_cache;
+int nfs_loaned_buffers = 0;
+
int
_init(void)
{
@@ -139,6 +142,11 @@
/* setup DSS paths here; must be done before initial server startup */
rfs4_dss_paths = rfs4_dss_oldpaths = NULL;
+ /* initialize the copy reduction caches */
+
+ nfs_xuio_cache = kmem_cache_create("nfs_xuio_cache",
+ sizeof (nfs_xuio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
return (status);
}
@@ -3215,3 +3223,140 @@
label_rele(tslabel);
return (result);
}
+
+/*
+ * Callback function to return the loaned buffers.
+ * Calls VOP_RETZCBUF() only after all uio_iov[]
+ * buffers are returned. nu_ref maintains the count.
+ */
+void
+rfs_free_xuio(void *free_arg)
+{
+ uint_t ref;
+ nfs_xuio_t *nfsuiop = (nfs_xuio_t *)free_arg;
+
+ ref = atomic_dec_uint_nv(&nfsuiop->nu_ref);
+
+ /*
+ * Call VOP_RETZCBUF() only when all the iov buffers
+ * are sent OTW.
+ */
+ if (ref != 0)
+ return;
+
+ if (((uio_t *)nfsuiop)->uio_extflg & UIO_XUIO) {
+ (void) VOP_RETZCBUF(nfsuiop->nu_vp, (xuio_t *)free_arg, NULL,
+ NULL);
+ VN_RELE(nfsuiop->nu_vp);
+ }
+
+ kmem_cache_free(nfs_xuio_cache, free_arg);
+}
+
+xuio_t *
+rfs_setup_xuio(vnode_t *vp)
+{
+ nfs_xuio_t *nfsuiop;
+
+ nfsuiop = kmem_cache_alloc(nfs_xuio_cache, KM_SLEEP);
+
+ bzero(nfsuiop, sizeof (nfs_xuio_t));
+ nfsuiop->nu_vp = vp;
+
+ /*
+ * The reference count starts at 1; more references may be
+ * added if multiple mblks refer to multiple iovs.
+ * This is done in uio_to_mblk().
+ */
+
+ nfsuiop->nu_ref = 1;
+
+ nfsuiop->nu_frtn.free_func = rfs_free_xuio;
+ nfsuiop->nu_frtn.free_arg = (char *)nfsuiop;
+
+ nfsuiop->nu_uio.xu_type = UIOTYPE_ZEROCOPY;
+
+ return (&nfsuiop->nu_uio);
+}
+
+mblk_t *
+uio_to_mblk(uio_t *uiop)
+{
+ struct iovec *iovp;
+ int i;
+ mblk_t *mp, *mp1;
+ nfs_xuio_t *nfsuiop = (nfs_xuio_t *)uiop;
+
+ if (uiop->uio_iovcnt == 0)
+ return (NULL);
+
+ iovp = uiop->uio_iov;
+ mp = mp1 = esballoca((uchar_t *)iovp->iov_base, iovp->iov_len,
+ BPRI_MED, &nfsuiop->nu_frtn);
+ ASSERT(mp != NULL);
+
+ mp->b_wptr += iovp->iov_len;
+ mp->b_datap->db_type = M_DATA;
+
+ for (i = 1; i < uiop->uio_iovcnt; i++) {
+ iovp = (uiop->uio_iov + i);
+
+ mp1->b_cont = esballoca(
+ (uchar_t *)iovp->iov_base, iovp->iov_len, BPRI_MED,
+ &nfsuiop->nu_frtn);
+
+ mp1 = mp1->b_cont;
+ ASSERT(mp1 != NULL);
+ mp1->b_wptr += iovp->iov_len;
+ mp1->b_datap->db_type = M_DATA;
+ }
+
+ nfsuiop->nu_ref = uiop->uio_iovcnt;
+
+ return (mp);
+}
+
+void
+rfs_rndup_mblks(mblk_t *mp, uint_t len, int buf_loaned)
+{
+ int i, rndup;
+ int alloc_err = 0;
+ mblk_t *rmp;
+
+ rndup = BYTES_PER_XDR_UNIT - (len % BYTES_PER_XDR_UNIT);
+
+ /* single mblk_t non copy-reduction case */
+ if (!buf_loaned) {
+ mp->b_wptr += len;
+ if (rndup != BYTES_PER_XDR_UNIT) {
+ for (i = 0; i < rndup; i++)
+ *mp->b_wptr++ = '\0';
+ }
+ return;
+ }
+
+ /* no need for extra rndup */
+ if (rndup == BYTES_PER_XDR_UNIT)
+ return;
+
+ while (mp->b_cont)
+ mp = mp->b_cont;
+
+ /*
+ * In the copy-reduction case, the mblk sizes are fixed
+ * and equal to the sizes of the loaned buffers.
+ * Allocate a roundup mblk and chain it to the data
+ * buffers. This is sub-optimal, but not expected to
+ * happen in regular common workloads.
+ */
+
+ rmp = allocb_wait(rndup, BPRI_MED, STR_NOSIG, &alloc_err);
+ ASSERT(rmp != NULL);
+ ASSERT(alloc_err == 0);
+
+ for (i = 0; i < rndup; i++)
+ *rmp->b_wptr++ = '\0';
+
+ rmp->b_datap->db_type = M_DATA;
+ mp->b_cont = rmp;
+}
--- a/usr/src/uts/common/fs/vnode.c Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/vnode.c Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -362,6 +362,12 @@
(fs_generic_func_p) fs_vnevent_nosupport,
(fs_generic_func_p) fs_vnevent_nosupport,
+ VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
+ fs_nosys, fs_nosys,
+
+ VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
+ fs_nosys, fs_nosys,
+
NULL, 0, NULL, NULL
};
@@ -522,6 +528,10 @@
kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
/* VOP_VNEVENT */
kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
+ /* VOP_REQZCBUF */
+ kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
+ /* VOP_RETZCBUF */
+ kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
return (vsp);
}
@@ -4151,6 +4161,31 @@
return (err);
}
+int
+fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+
+ if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
+ return (ENOTSUP);
+ err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
+ VOPSTATS_UPDATE(vp, reqzcbuf);
+ return (err);
+}
+
+int
+fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
+{
+ int err;
+
+ if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
+ return (ENOTSUP);
+ err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
+ VOPSTATS_UPDATE(vp, retzcbuf);
+ return (err);
+}
+
/*
* Default destructor
* Needed because NULL destructor means that the key is unused
--- a/usr/src/uts/common/fs/zfs/arc.c Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/arc.c Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -1241,14 +1241,31 @@
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- ASSERT(hdr->b_state == arc_anon);
ASSERT(buf->b_data != NULL);
- VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0);
- VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1);
+ (void) refcount_add(&hdr->b_refcnt, tag);
+ (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
}
+/* Detach an arc_buf from a dbuf (tag) */
+void
+arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr;
+
+ rw_enter(&buf->b_lock, RW_WRITER);
+ ASSERT(buf->b_data != NULL);
+ hdr = buf->b_hdr;
+ (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
+ (void) refcount_remove(&hdr->b_refcnt, tag);
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+
+ atomic_add_64(&arc_loaned_bytes, hdr->b_size);
+ rw_exit(&buf->b_lock);
+}
+
static arc_buf_t *
arc_buf_clone(arc_buf_t *from)
{
--- a/usr/src/uts/common/fs/zfs/dbuf.c Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/dbuf.c Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -406,6 +406,29 @@
}
}
+/*
+ * Loan out an arc_buf for read. Return the loaned arc_buf.
+ */
+arc_buf_t *
+dbuf_loan_arcbuf(dmu_buf_impl_t *db)
+{
+ arc_buf_t *abuf;
+
+ mutex_enter(&db->db_mtx);
+ if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
+ int blksz = db->db.db_size;
+ mutex_exit(&db->db_mtx);
+ abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz);
+ bcopy(db->db.db_data, abuf->b_data, blksz);
+ } else {
+ abuf = db->db_buf;
+ arc_loan_inuse_buf(abuf, db);
+ dbuf_set_data(db, NULL);
+ mutex_exit(&db->db_mtx);
+ }
+ return (abuf);
+}
+
uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset)
{
@@ -1162,7 +1185,6 @@
ASSERT(db->db_blkid != DB_BONUS_BLKID);
mutex_enter(&db->db_mtx);
-
/*
* If this buffer is not dirty, we're done.
*/
@@ -1341,9 +1363,11 @@
(void) dbuf_dirty(db, tx);
bcopy(buf->b_data, db->db.db_data, db->db.db_size);
VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ xuio_stat_wbuf_copied();
return;
}
+ xuio_stat_wbuf_nocopy();
if (db->db_state == DB_CACHED) {
dbuf_dirty_record_t *dr = db->db_last_dirty;
--- a/usr/src/uts/common/fs/zfs/dmu.c Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu.c Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -661,12 +661,136 @@
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
+/*
+ * DMU support for xuio
+ */
+kstat_t *xuio_ksp = NULL;
+
+int
+dmu_xuio_init(xuio_t *xuio, int nblk)
+{
+ dmu_xuio_t *priv;
+ uio_t *uio = &xuio->xu_uio;
+
+ uio->uio_iovcnt = nblk;
+ uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
+
+ priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
+ priv->cnt = nblk;
+ priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
+ priv->iovp = uio->uio_iov;
+ XUIO_XUZC_PRIV(xuio) = priv;
+
+ if (XUIO_XUZC_RW(xuio) == UIO_READ)
+ XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
+ else
+ XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
+
+ return (0);
+}
+
+void
+dmu_xuio_fini(xuio_t *xuio)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+ int nblk = priv->cnt;
+
+ kmem_free(priv->iovp, nblk * sizeof (iovec_t));
+ kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
+ kmem_free(priv, sizeof (dmu_xuio_t));
+
+ if (XUIO_XUZC_RW(xuio) == UIO_READ)
+ XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
+ else
+ XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
+}
+
+/*
+ * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
+ * and increase priv->next by 1.
+ */
+int
+dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
+{
+ struct iovec *iov;
+ uio_t *uio = &xuio->xu_uio;
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+ int i = priv->next++;
+
+ ASSERT(i < priv->cnt);
+ ASSERT(off + n <= arc_buf_size(abuf));
+ iov = uio->uio_iov + i;
+ iov->iov_base = (char *)abuf->b_data + off;
+ iov->iov_len = n;
+ priv->bufs[i] = abuf;
+ return (0);
+}
+
+int
+dmu_xuio_cnt(xuio_t *xuio)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+ return (priv->cnt);
+}
+
+arc_buf_t *
+dmu_xuio_arcbuf(xuio_t *xuio, int i)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+
+ ASSERT(i < priv->cnt);
+ return (priv->bufs[i]);
+}
+
+void
+dmu_xuio_clear(xuio_t *xuio, int i)
+{
+ dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
+
+ ASSERT(i < priv->cnt);
+ priv->bufs[i] = NULL;
+}
+
+static void
+xuio_stat_init(void)
+{
+ xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (xuio_ksp != NULL) {
+ xuio_ksp->ks_data = &xuio_stats;
+ kstat_install(xuio_ksp);
+ }
+}
+
+static void
+xuio_stat_fini(void)
+{
+ if (xuio_ksp != NULL) {
+ kstat_delete(xuio_ksp);
+ xuio_ksp = NULL;
+ }
+}
+
+void
+xuio_stat_wbuf_copied()
+{
+ XUIOSTAT_BUMP(xuiostat_wbuf_copied);
+}
+
+void
+xuio_stat_wbuf_nocopy()
+{
+ XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
+}
+
#ifdef _KERNEL
int
dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
{
dmu_buf_t **dbp;
int numbufs, i, err;
+ xuio_t *xuio = NULL;
/*
* NB: we could do this block-at-a-time, but it's nice
@@ -677,6 +801,9 @@
if (err)
return (err);
+ if (uio->uio_extflg == UIO_XUIO)
+ xuio = (xuio_t *)uio;
+
for (i = 0; i < numbufs; i++) {
int tocpy;
int bufoff;
@@ -687,8 +814,24 @@
bufoff = uio->uio_loffset - db->db_offset;
tocpy = (int)MIN(db->db_size - bufoff, size);
- err = uiomove((char *)db->db_data + bufoff, tocpy,
- UIO_READ, uio);
+ if (xuio) {
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ arc_buf_t *dbuf_abuf = dbi->db_buf;
+ arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
+ err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
+ if (!err) {
+ uio->uio_resid -= tocpy;
+ uio->uio_loffset += tocpy;
+ }
+
+ if (abuf == dbuf_abuf)
+ XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
+ else
+ XUIOSTAT_BUMP(xuiostat_rbuf_copied);
+ } else {
+ err = uiomove((char *)db->db_data + bufoff, tocpy,
+ UIO_READ, uio);
+ }
if (err)
break;
@@ -857,6 +1000,7 @@
dmu_write(dn->dn_objset, dn->dn_object, offset, blksz,
buf->b_data, tx);
dmu_return_arcbuf(buf);
+ XUIOSTAT_BUMP(xuiostat_wbuf_copied);
}
}
@@ -1369,6 +1513,7 @@
zfetch_init();
arc_init();
l2arc_init();
+ xuio_stat_init();
}
void
@@ -1379,4 +1524,5 @@
dnode_fini();
dbuf_fini();
l2arc_fini();
+ xuio_stat_fini();
}
--- a/usr/src/uts/common/fs/zfs/sys/arc.h Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -87,6 +87,7 @@
arc_buf_contents_t type);
arc_buf_t *arc_loan_buf(spa_t *spa, int size);
void arc_return_buf(arc_buf_t *buf, void *tag);
+void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
void arc_buf_add_ref(arc_buf_t *buf, void *tag);
int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
int arc_buf_size(arc_buf_t *buf);
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -267,6 +267,7 @@
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
void dbuf_clear(dmu_buf_impl_t *db);
void dbuf_evict(dmu_buf_impl_t *db);
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -45,6 +45,7 @@
#endif
struct uio;
+struct xuio;
struct page;
struct vnode;
struct spa;
@@ -500,6 +501,15 @@
void dmu_return_arcbuf(struct arc_buf *buf);
void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
dmu_tx_t *tx);
+int dmu_xuio_init(struct xuio *uio, int niov);
+void dmu_xuio_fini(struct xuio *uio);
+int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
+ size_t n);
+int dmu_xuio_cnt(struct xuio *uio);
+struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
+void dmu_xuio_clear(struct xuio *uio, int i);
+void xuio_stat_wbuf_copied();
+void xuio_stat_wbuf_nocopy();
extern int zfs_prefetch_disable;
--- a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -232,6 +232,39 @@
struct objset;
struct dmu_pool;
+typedef struct dmu_xuio {
+ int next;
+ int cnt;
+ struct arc_buf **bufs;
+ iovec_t *iovp;
+} dmu_xuio_t;
+
+typedef struct xuio_stats {
+ /* loaned yet not returned arc_buf */
+ kstat_named_t xuiostat_onloan_rbuf;
+ kstat_named_t xuiostat_onloan_wbuf;
+ /* whether a copy is made when loaning out a read buffer */
+ kstat_named_t xuiostat_rbuf_copied;
+ kstat_named_t xuiostat_rbuf_nocopy;
+ /* whether a copy is made when assigning a write buffer */
+ kstat_named_t xuiostat_wbuf_copied;
+ kstat_named_t xuiostat_wbuf_nocopy;
+} xuio_stats_t;
+
+static xuio_stats_t xuio_stats = {
+ { "onloan_read_buf", KSTAT_DATA_UINT64 },
+ { "onloan_write_buf", KSTAT_DATA_UINT64 },
+ { "read_buf_copied", KSTAT_DATA_UINT64 },
+ { "read_buf_nocopy", KSTAT_DATA_UINT64 },
+ { "write_buf_copied", KSTAT_DATA_UINT64 },
+ { "write_buf_nocopy", KSTAT_DATA_UINT64 }
+};
+
+#define XUIOSTAT_INCR(stat, val) \
+ atomic_add_64(&xuio_stats.stat.value.ui64, (val))
+#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1)
+
+
#ifdef __cplusplus
}
#endif
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -1115,6 +1115,7 @@
vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
}
+ vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
uint64_t pval;
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -447,6 +447,7 @@
ssize_t n, nbytes;
int error;
rl_t *rl;
+ xuio_t *xuio = NULL;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
@@ -507,6 +508,35 @@
ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
+ if ((uio->uio_extflg == UIO_XUIO) &&
+ (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
+ int nblk;
+ int blksz = zp->z_blksz;
+ uint64_t offset = uio->uio_loffset;
+
+ xuio = (xuio_t *)uio;
+ if ((ISP2(blksz))) {
+ nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
+ blksz)) / blksz;
+ } else {
+ ASSERT(offset + n <= blksz);
+ nblk = 1;
+ }
+ dmu_xuio_init(xuio, nblk);
+
+ if (vn_has_cached_data(vp)) {
+ /*
+ * For simplicity, we always allocate a full buffer
+ * even if we only expect to read a portion of a block.
+ */
+ while (--nblk >= 0) {
+ dmu_xuio_add(xuio,
+ dmu_request_arcbuf(zp->z_dbuf, blksz),
+ 0, blksz);
+ }
+ }
+ }
+
while (n > 0) {
nbytes = MIN(n, zfs_read_chunk_size -
P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
@@ -524,7 +554,6 @@
n -= nbytes;
}
-
out:
zfs_range_unlock(rl);
@@ -570,6 +599,12 @@
uint64_t pflags;
int error;
arc_buf_t *abuf;
+ iovec_t *aiov;
+ xuio_t *xuio = NULL;
+ int i_iov = 0;
+ int iovcnt = uio->uio_iovcnt;
+ iovec_t *iovp = uio->uio_iov;
+ int write_eof;
/*
* Fasttrack empty write
@@ -619,8 +654,13 @@
/*
* Pre-fault the pages to ensure slow (eg NFS) pages
* don't hold up txg.
+ * Skip this if uio contains loaned arc_buf.
*/
- uio_prefaultpages(n, uio);
+ if ((uio->uio_extflg == UIO_XUIO) &&
+ (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
+ xuio = (xuio_t *)uio;
+ else
+ uio_prefaultpages(n, uio);
/*
* If in append mode, set the io offset pointer to eof.
@@ -659,6 +699,9 @@
if ((woff + n) > limit || woff > (limit - n))
n = limit - woff;
+ /* Will this write extend the file length? */
+ write_eof = (woff + n > zp->z_phys->zp_size);
+
end_size = MAX(zp->z_phys->zp_size, woff + n);
/*
@@ -669,7 +712,6 @@
while (n > 0) {
abuf = NULL;
woff = uio->uio_loffset;
-
again:
if (zfs_usergroup_overquota(zfsvfs,
B_FALSE, zp->z_phys->zp_uid) ||
@@ -681,16 +723,28 @@
break;
}
- /*
- * If dmu_assign_arcbuf() is expected to execute with minimum
- * overhead loan an arc buffer and copy user data to it before
- * we enter a txg. This avoids holding a txg forever while we
- * pagefault on a hanging NFS server mapping.
- */
- if (abuf == NULL && n >= max_blksz &&
+ if (xuio && abuf == NULL) {
+ ASSERT(i_iov < iovcnt);
+ aiov = &iovp[i_iov];
+ abuf = dmu_xuio_arcbuf(xuio, i_iov);
+ dmu_xuio_clear(xuio, i_iov);
+ DTRACE_PROBE3(zfs_cp_write, int, i_iov,
+ iovec_t *, aiov, arc_buf_t *, abuf);
+ ASSERT((aiov->iov_base == abuf->b_data) ||
+ ((char *)aiov->iov_base - (char *)abuf->b_data +
+ aiov->iov_len == arc_buf_size(abuf)));
+ i_iov++;
+ } else if (abuf == NULL && n >= max_blksz &&
woff >= zp->z_phys->zp_size &&
P2PHASE(woff, max_blksz) == 0 &&
zp->z_blksz == max_blksz) {
+ /*
+ * This write covers a full block. "Borrow" a buffer
+ * from the dmu so that we can fill it before we enter
+ * a transaction. This avoids the possibility of
+ * holding up the transaction if the data copy hangs
+ * up on a pagefault (e.g., from an NFS server mapping).
+ */
size_t cbytes;
abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz);
@@ -755,8 +809,24 @@
tx_bytes -= uio->uio_resid;
} else {
tx_bytes = nbytes;
- ASSERT(tx_bytes == max_blksz);
- dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
+ ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
+ /*
+ * If this is not a full block write, but we are
+ * extending the file past EOF and this data starts
+ * block-aligned, use assign_arcbuf(). Otherwise,
+ * write via dmu_write().
+ */
+ if (tx_bytes < max_blksz && (!write_eof ||
+ aiov->iov_base != abuf->b_data)) {
+ ASSERT(xuio);
+ dmu_write(zfsvfs->z_os, zp->z_id, woff,
+ aiov->iov_len, aiov->iov_base, tx);
+ dmu_return_arcbuf(abuf);
+ xuio_stat_wbuf_copied();
+ } else {
+ ASSERT(xuio || tx_bytes == max_blksz);
+ dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
+ }
ASSERT(tx_bytes <= uio->uio_resid);
uioskip(uio, tx_bytes);
}
@@ -4571,6 +4641,160 @@
}
/*
+ * Tunable, both must be a power of 2.
+ *
+ * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
+ * zcr_blksz_max: if set to less than the file block size, allow loaning out of
+ * an arcbuf for a partial block read
+ */
+int zcr_blksz_min = (1 << 10); /* 1K */
+int zcr_blksz_max = (1 << 17); /* 128K */
+
+/*ARGSUSED*/
+static int
+zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int max_blksz = zfsvfs->z_max_blksz;
+ uio_t *uio = &xuio->xu_uio;
+ ssize_t size = uio->uio_resid;
+ offset_t offset = uio->uio_loffset;
+ int blksz;
+ int fullblk, i;
+ arc_buf_t *abuf;
+ ssize_t maxsize;
+ int preamble, postamble;
+
+ if (xuio->xu_type != UIOTYPE_ZEROCOPY)
+ return (EINVAL);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ switch (ioflag) {
+ case UIO_WRITE:
+ /*
+ * Loan out an arc_buf for write if write size is bigger than
+ * max_blksz, and the file's block size is also max_blksz.
+ */
+ blksz = max_blksz;
+ if (size < blksz || zp->z_blksz != blksz) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+ /*
+ * Caller requests buffers for write before knowing where the
+ * write offset might be (e.g. NFS TCP write).
+ */
+ if (offset == -1) {
+ preamble = 0;
+ } else {
+ preamble = P2PHASE(offset, blksz);
+ if (preamble) {
+ preamble = blksz - preamble;
+ size -= preamble;
+ }
+ }
+
+ postamble = P2PHASE(size, blksz);
+ size -= postamble;
+
+ fullblk = size / blksz;
+ dmu_xuio_init(xuio,
+ (preamble != 0) + fullblk + (postamble != 0));
+ DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
+ int, postamble, int,
+ (preamble != 0) + fullblk + (postamble != 0));
+
+ /*
+ * Have to fix iov base/len for partial buffers. They
+ * currently represent full arc_buf's.
+ */
+ if (preamble) {
+ /* data begins in the middle of the arc_buf */
+ abuf = dmu_request_arcbuf(zp->z_dbuf, blksz);
+ ASSERT(abuf);
+ dmu_xuio_add(xuio, abuf, blksz - preamble, preamble);
+ }
+
+ for (i = 0; i < fullblk; i++) {
+ abuf = dmu_request_arcbuf(zp->z_dbuf, blksz);
+ ASSERT(abuf);
+ dmu_xuio_add(xuio, abuf, 0, blksz);
+ }
+
+ if (postamble) {
+ /* data ends in the middle of the arc_buf */
+ abuf = dmu_request_arcbuf(zp->z_dbuf, blksz);
+ ASSERT(abuf);
+ dmu_xuio_add(xuio, abuf, 0, postamble);
+ }
+ break;
+ case UIO_READ:
+ /*
+ * Loan out an arc_buf for read if the read size is larger than
+ * the current file block size. Block alignment is not
+ * considered. Partial arc_buf will be loaned out for read.
+ */
+ blksz = zp->z_blksz;
+ if (blksz < zcr_blksz_min)
+ blksz = zcr_blksz_min;
+ if (blksz > zcr_blksz_max)
+ blksz = zcr_blksz_max;
+ /* avoid potential complexity of dealing with it */
+ if (blksz > max_blksz) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ maxsize = zp->z_phys->zp_size - uio->uio_loffset;
+ if (size > maxsize)
+ size = maxsize;
+
+ if (size < blksz || vn_has_cached_data(vp)) {
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+ break;
+ default:
+ ZFS_EXIT(zfsvfs);
+ return (EINVAL);
+ }
+
+ uio->uio_extflg = UIO_XUIO;
+ XUIO_XUZC_RW(xuio) = ioflag;
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
+{
+ int i;
+ arc_buf_t *abuf;
+ int ioflag = XUIO_XUZC_RW(xuio);
+
+ ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
+
+ i = dmu_xuio_cnt(xuio);
+ while (i-- > 0) {
+ abuf = dmu_xuio_arcbuf(xuio, i);
+ /*
+ * if abuf == NULL, it must be a write buffer
+ * that has been returned in zfs_write().
+ */
+ if (abuf)
+ dmu_return_arcbuf(abuf);
+ ASSERT(abuf || ioflag == UIO_WRITE);
+ }
+
+ dmu_xuio_fini(xuio);
+ return (0);
+}
+
+/*
* Predeclare these here so that the compiler assumes that
* this is an "old style" function declaration that does
* not include arguments => we won't get type mismatch errors
@@ -4653,6 +4877,8 @@
VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
+ VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf },
+ VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf },
NULL, NULL
};
--- a/usr/src/uts/common/nfs/nfs.h Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/nfs/nfs.h Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -1447,6 +1447,7 @@
#ifdef _KERNEL
uint_t wlist_len;
struct clist *wlist;
+ frtn_t zcopy;
#endif
};
typedef struct READ3resok READ3resok;
@@ -2322,6 +2323,24 @@
extern ts_label_t *nfs_getflabel(vnode_t *, struct exportinfo *);
extern boolean_t do_rfs_label_check(bslabel_t *, vnode_t *, int,
struct exportinfo *);
+
+/*
+ * Copy Reduction support.
+ * xuio_t wrapper with additional private data.
+ */
+
+typedef struct nfs_xuio {
+ xuio_t nu_uio;
+ vnode_t *nu_vp;
+ uint_t nu_ref;
+ frtn_t nu_frtn;
+} nfs_xuio_t;
+
+xuio_t *rfs_setup_xuio(vnode_t *);
+mblk_t *uio_to_mblk(uio_t *);
+void rfs_rndup_mblks(mblk_t *, uint_t, int);
+void rfs_free_xuio(void *);
+
#endif /* _KERNEL */
#ifdef __cplusplus
--- a/usr/src/uts/common/rpc/rpcmod.c Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/rpc/rpcmod.c Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -1059,8 +1059,6 @@
#define MIR_SVC_ORDREL_TIMEOUT (10 * (60 * 1000L)) /* 10 minutes */
#define MIR_LASTFRAG 0x80000000 /* Record marker */
-#define DLEN(mp) (mp->b_cont ? msgdsize(mp) : (mp->b_wptr - mp->b_rptr))
-
#define MIR_SVC_QUIESCED(mir) \
(mir->mir_ref_cnt == 0 && mir->mir_inrservice == 0)
--- a/usr/src/uts/common/rpc/xdr.h Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/rpc/xdr.h Mon Jan 18 10:34:16 2010 -0800
@@ -18,7 +18,7 @@
*
* CDDL HEADER END
*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -585,6 +585,8 @@
#endif
#else
+#define DLEN(mp) (mp->b_cont ? msgdsize(mp) : (mp->b_wptr - mp->b_rptr))
+
extern void xdrmem_create(XDR *, caddr_t, uint_t, enum xdr_op);
extern void xdrmblk_init(XDR *, mblk_t *, enum xdr_op, int);
extern bool_t xdrmblk_getmblk(XDR *, mblk_t **, uint_t *);
--- a/usr/src/uts/common/rpc/xdr_mblk.c Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/rpc/xdr_mblk.c Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -361,20 +361,24 @@
* not a multiple of BYTES_PER_XDR_UNIT, the caller has the option
* of making the data a BYTES_PER_XDR_UNIT multiple (b_wptr - b_rptr is
* a BYTES_PER_XDR_UNIT multiple), but in this case the caller has to ensure
- * that the filler bytes are initialized to zero. Note: Doesn't to work for
- * chained mblks.
+ * that the filler bytes are initialized to zero.
*/
bool_t
xdrmblk_putmblk(XDR *xdrs, mblk_t *m, uint_t len)
{
int32_t llen = (int32_t)len;
- if (((m->b_wptr - m->b_rptr) % BYTES_PER_XDR_UNIT) != 0)
+ if ((DLEN(m) % BYTES_PER_XDR_UNIT) != 0)
return (FALSE);
if (!xdrmblk_putint32(xdrs, &llen))
return (FALSE);
+
/* LINTED pointer alignment */
((mblk_t *)xdrs->x_base)->b_cont = m;
+
+ /* base points to the last mblk */
+ while (m->b_cont)
+ m = m->b_cont;
xdrs->x_base = (caddr_t)m;
xdrs->x_handy = 0;
return (TRUE);
--- a/usr/src/uts/common/sys/fem.h Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/sys/fem.h Mon Jan 18 10:34:16 2010 -0800
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_FEM_H
#define _SYS_FEM_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/pathname.h>
@@ -260,7 +258,13 @@
struct shrlock *shr, int flag, cred_t *cr, \
caller_context_t *ct); \
int (*femop_vnevent)(femarg_t *vf, vnevent_t vnevent, \
- vnode_t *dvp, char *cname, caller_context_t *ct)
+ vnode_t *dvp, char *cname, \
+ caller_context_t *ct); \
+ int (*femop_reqzcbuf)(femarg_t *vf, enum uio_rw ioflag, \
+ xuio_t *xuio, cred_t *cr, \
+ caller_context_t *ct); \
+ int (*femop_retzcbuf)(femarg_t *vf, xuio_t *xuio, cred_t *cr, \
+ caller_context_t *ct)
/* NB: No ";" */
struct fem {
@@ -392,6 +396,10 @@
int flag, cred_t *cr, caller_context_t *ct);
extern int vnext_vnevent(femarg_t *vf, vnevent_t vevent, vnode_t *dvp,
char *cname, caller_context_t *ct);
+extern int vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop,
+ cred_t *cr, caller_context_t *ct);
+extern int vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr,
+ caller_context_t *ct);
extern int vfsnext_mount(fsemarg_t *vf, vnode_t *mvp, struct mounta *uap,
cred_t *cr);
--- a/usr/src/uts/common/sys/uio.h Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/sys/uio.h Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -133,6 +133,49 @@
uioa_page_t uioa_locked[UIOA_IOV_MAX]; /* Per iov locked pages */
} uioa_t;
+/*
+ * uio extensions
+ *
+ * PSARC 2009/478: Copy Reduction Interfaces
+ */
+typedef enum xuio_type {
+ UIOTYPE_ASYNCIO,
+ UIOTYPE_ZEROCOPY
+} xuio_type_t;
+
+typedef struct xuio {
+ uio_t xu_uio; /* Embedded UIO structure */
+
+ /* Extended uio fields */
+ enum xuio_type xu_type; /* What kind of uio structure? */
+ union {
+	/* Async I/O Support, intended to replace uioa_t. */
+ struct {
+ uint32_t xu_a_state; /* state of async i/o */
+ /* bytes that have been uioamove()ed */
+ ssize_t xu_a_mbytes;
+ uioa_page_t *xu_a_lcur; /* pointer into uioa_locked[] */
+ /* pointer into lcur->uioa_ppp[] */
+ void **xu_a_lppp;
+ void *xu_a_hwst[4]; /* opaque hardware state */
+ /* Per iov locked pages */
+ uioa_page_t xu_a_locked[UIOA_IOV_MAX];
+ } xu_aio;
+
+ /*
+	 * Copy Reduction Support -- facilitate loaning / returning of
+ * filesystem cache buffers.
+ */
+ struct {
+ int xu_zc_rw; /* read or write buffer */
+ void *xu_zc_priv; /* fs specific */
+ } xu_zc;
+ } xu_ext;
+} xuio_t;
+
+#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv
+#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw
+
#define UIOA_ALLOC 0x0001 /* allocated but not yet initialized */
#define UIOA_INIT 0x0002 /* initialized but not yet enabled */
#define UIOA_ENABLED 0x0004 /* enabled, asynch i/o active */
@@ -177,6 +220,7 @@
#define UIO_COPY_CACHED 0x0001 /* copy should not bypass caches */
#define UIO_ASYNC 0x0002 /* uio_t is really a uioa_t */
+#define UIO_XUIO 0x0004 /* Structure is xuio_t */
/*
* Global uioasync capability shadow state.
--- a/usr/src/uts/common/sys/vfs.h Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/sys/vfs.h Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -302,7 +302,8 @@
#define VFSFT_SYSATTR_VIEWS 0x100000040 /* Supports sysattr view i/f */
#define VFSFT_ACCESS_FILTER 0x100000080 /* dirents filtered by access */
#define VFSFT_REPARSE 0x100000100 /* Supports reparse point */
-
+#define VFSFT_ZEROCOPY_SUPPORTED 0x100000200
+				/* Supports loaning / returning cache buffers */
/*
* Argument structure for mount(2).
*
--- a/usr/src/uts/common/sys/vnode.h Mon Jan 18 09:39:57 2010 -0800
+++ b/usr/src/uts/common/sys/vnode.h Mon Jan 18 10:34:16 2010 -0800
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -118,6 +118,8 @@
kstat_named_t ngetsecattr; /* VOP_GETSECATTR */
kstat_named_t nshrlock; /* VOP_SHRLOCK */
kstat_named_t nvnevent; /* VOP_VNEVENT */
+ kstat_named_t nreqzcbuf; /* VOP_REQZCBUF */
+ kstat_named_t nretzcbuf; /* VOP_RETZCBUF */
} vopstats_t;
/*
@@ -900,7 +902,11 @@
int (*vop_shrlock)(vnode_t *, int, struct shrlock *, \
int, cred_t *, caller_context_t *); \
int (*vop_vnevent)(vnode_t *, vnevent_t, vnode_t *, \
- char *, caller_context_t *)
+ char *, caller_context_t *); \
+ int (*vop_reqzcbuf)(vnode_t *, enum uio_rw, xuio_t *, \
+ cred_t *, caller_context_t *); \
+ int (*vop_retzcbuf)(vnode_t *, xuio_t *, cred_t *, \
+ caller_context_t *)
/* NB: No ";" */
/*
@@ -997,6 +1003,9 @@
caller_context_t *);
extern int fop_vnevent(vnode_t *, vnevent_t, vnode_t *, char *,
caller_context_t *);
+extern int fop_reqzcbuf(vnode_t *, enum uio_rw, xuio_t *, cred_t *,
+ caller_context_t *);
+extern int fop_retzcbuf(vnode_t *, xuio_t *, cred_t *, caller_context_t *);
#endif /* _KERNEL */
@@ -1088,6 +1097,10 @@
fop_shrlock(vp, cmd, shr, f, cr, ct)
#define VOP_VNEVENT(vp, vnevent, dvp, fnm, ct) \
fop_vnevent(vp, vnevent, dvp, fnm, ct)
+#define VOP_REQZCBUF(vp, rwflag, xuiop, cr, ct) \
+ fop_reqzcbuf(vp, rwflag, xuiop, cr, ct)
+#define VOP_RETZCBUF(vp, xuiop, cr, ct) \
+ fop_retzcbuf(vp, xuiop, cr, ct)
#define VOPNAME_OPEN "open"
#define VOPNAME_CLOSE "close"
@@ -1133,6 +1146,8 @@
#define VOPNAME_SETSECATTR "setsecattr"
#define VOPNAME_SHRLOCK "shrlock"
#define VOPNAME_VNEVENT "vnevent"
+#define VOPNAME_REQZCBUF "reqzcbuf"
+#define VOPNAME_RETZCBUF "retzcbuf"
/*
* Flags for VOP_LOOKUP