6213799 VMODSORT support for NFS. AKA "close()/fsync() slow on clients with lots of memory and cached pages"
--- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c Wed Mar 10 20:09:03 2010 -0500
+++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c Thu Mar 11 07:11:09 2010 +0000
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -120,11 +120,6 @@
static void nfs3_set_mod(vnode_t *);
static void nfs3_get_commit(vnode_t *);
static void nfs3_get_commit_range(vnode_t *, u_offset_t, size_t);
-#if 0 /* unused */
-#ifdef DEBUG
-static int nfs3_no_uncommitted_pages(vnode_t *);
-#endif
-#endif /* unused */
static int nfs3_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
static int nfs3_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *);
static int nfs3_sync_commit(vnode_t *, page_t *, offset3, count3,
@@ -6252,24 +6247,11 @@
static void
nfs3_set_mod(vnode_t *vp)
{
- page_t *pp;
- kmutex_t *vphm;
-
ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
- vphm = page_vnode_mutex(vp);
- mutex_enter(vphm);
- if ((pp = vp->v_pages) != NULL) {
- do {
- if (pp->p_fsdata != C_NOCOMMIT) {
- hat_setmod(pp);
- pp->p_fsdata = C_NOCOMMIT;
- }
- } while ((pp = pp->p_vpnext) != vp->v_pages);
- }
- mutex_exit(vphm);
+
+ pvn_vplist_setdirty(vp, nfs_setmod_check);
}
-
/*
* This routine is used to gather together a page list of the pages
* which are to be committed on the server. This routine must not
@@ -6308,6 +6290,10 @@
* looking for pages which need to be committed.
*/
do {
+ /* Skip marker pages. */
+ if (pp->p_hash == PVN_VPLIST_HASH_TAG)
+ continue;
+
/*
* If this page does not need to be committed or is
* modified, then just skip it.
@@ -6433,31 +6419,6 @@
}
}
-#if 0 /* unused */
-#ifdef DEBUG
-static int
-nfs3_no_uncommitted_pages(vnode_t *vp)
-{
- page_t *pp;
- kmutex_t *vphm;
-
- vphm = page_vnode_mutex(vp);
- mutex_enter(vphm);
- if ((pp = vp->v_pages) != NULL) {
- do {
- if (pp->p_fsdata != C_NOCOMMIT) {
- mutex_exit(vphm);
- return (0);
- }
- } while ((pp = pp->p_vpnext) != vp->v_pages);
- }
- mutex_exit(vphm);
-
- return (1);
-}
-#endif
-#endif
-
static int
nfs3_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
{
--- a/usr/src/uts/common/fs/nfs/nfs4_rnode.c Wed Mar 10 20:09:03 2010 -0500
+++ b/usr/src/uts/common/fs/nfs/nfs4_rnode.c Thu Mar 11 07:11:09 2010 +0000
@@ -61,6 +61,8 @@
#include <sys/callb.h>
#include <sys/sdt.h>
+#include <vm/pvn.h>
+
#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
@@ -173,7 +175,8 @@
mutex_enter(vphm);
if ((pp = vp->v_pages) != NULL) {
do {
- if (pp->p_fsdata != C_NOCOMMIT) {
+ if (pp->p_hash != PVN_VPLIST_HASH_TAG &&
+ pp->p_fsdata != C_NOCOMMIT) {
mutex_exit(vphm);
return (1);
}
@@ -679,6 +682,7 @@
vp->v_vfsp = vfsp;
VFS_HOLD(vfsp);
vp->v_type = VNON;
if (isrootfh(fh, rp))
vp->v_flag = VROOT;
+ vp->v_flag |= VMODSORT;	/* VROOT store above overwrites v_flag */
vn_exists(vp);
--- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c Wed Mar 10 20:09:03 2010 -0500
+++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c Thu Mar 11 07:11:09 2010 +0000
@@ -11859,36 +11859,10 @@
static void
nfs4_set_mod(vnode_t *vp)
{
- page_t *pp;
- kmutex_t *vphm;
- rnode4_t *rp;
-
ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
/* make sure we're looking at the master vnode, not a shadow */
-
- rp = VTOR4(vp);
- if (IS_SHADOW(vp, rp))
- vp = RTOV4(rp);
-
- vphm = page_vnode_mutex(vp);
- mutex_enter(vphm);
- /*
- * If there are no pages associated with this vnode, then
- * just return.
- */
- if ((pp = vp->v_pages) == NULL) {
- mutex_exit(vphm);
- return;
- }
-
- do {
- if (pp->p_fsdata != C_NOCOMMIT) {
- hat_setmod(pp);
- pp->p_fsdata = C_NOCOMMIT;
- }
- } while ((pp = pp->p_vpnext) != vp->v_pages);
- mutex_exit(vphm);
+ pvn_vplist_setdirty(RTOV4(VTOR4(vp)), nfs_setmod_check);
}
/*
@@ -11939,6 +11913,10 @@
* looking for pages which need to be committed.
*/
do {
+ /* Skip marker pages. */
+ if (pp->p_hash == PVN_VPLIST_HASH_TAG)
+ continue;
+
/*
* First short-cut everything (without the page_lock)
* and see if this page does not need to be committed
--- a/usr/src/uts/common/fs/nfs/nfs_subr.c Wed Mar 10 20:09:03 2010 -0500
+++ b/usr/src/uts/common/fs/nfs/nfs_subr.c Thu Mar 11 07:11:09 2010 +0000
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -2590,6 +2590,7 @@
vp->v_data = (caddr_t)rp;
vp->v_vfsp = vfsp;
vp->v_type = VNON;
+ vp->v_flag |= VMODSORT;
nfs_set_vroot(vp);
/*
@@ -2612,6 +2613,20 @@
return (vp);
}
+/*
+ * Callback for pvn_vplist_setdirty(): decide whether the page has to be
+ * marked as modified. If so, its p_fsdata is reset to C_NOCOMMIT.
+ */
+int
+nfs_setmod_check(page_t *pp)
+{
+	if (pp->p_fsdata == C_NOCOMMIT)
+		return (0);
+
+	pp->p_fsdata = C_NOCOMMIT;
+	return (1);
+}
+
static void
nfs_set_vroot(vnode_t *vp)
{
--- a/usr/src/uts/common/nfs/nfs.h Wed Mar 10 20:09:03 2010 -0500
+++ b/usr/src/uts/common/nfs/nfs.h Thu Mar 11 07:11:09 2010 +0000
@@ -954,6 +954,7 @@
extern void (*nfs_srv_quiesce_func)(void);
extern int rfs4_dss_setpaths(char *, size_t);
extern int (*nfs_srv_dss_func)(char *, size_t);
+extern int nfs_setmod_check(page_t *pp);
extern time_t rfs4_lease_time;
extern time_t rfs4_grace_period;
extern nvlist_t *rfs4_dss_paths, *rfs4_dss_oldpaths;
--- a/usr/src/uts/common/nfs/rnode.h Wed Mar 10 20:09:03 2010 -0500
+++ b/usr/src/uts/common/nfs/rnode.h Thu Mar 11 07:11:09 2010 +0000
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -110,6 +110,16 @@
/*
* The various values for the commit states. These are stored in
* the p_fsdata byte in the page struct.
+ * NFSv3 and NFSv4 can use asynchronous writes: the NFS server may send a
+ * response before storing the data on stable storage (disk). The response
+ * indicates whether or not the data is already on disk. The NFS client marks
+ * pages which are already on stable storage as C_NOCOMMIT. Pages which were
+ * sent but are not yet on stable storage are only partially 'safe' and are
+ * marked as C_DELAYCOMMIT, which can later be changed to C_COMMIT if the
+ * commit operation is in progress. If the NFS server is, for example,
+ * rebooted, the client needs to resend all the uncommitted data. The client
+ * walks all of vp->v_pages and, if C_DELAYCOMMIT or C_COMMIT is set, marks
+ * the page as dirty so that it will be written to the server again.
*/
#define C_NOCOMMIT 0 /* no commit is required */
#define C_COMMIT 1 /* a commit is required so do it now */
--- a/usr/src/uts/common/vm/pvn.h Wed Mar 10 20:09:03 2010 -0500
+++ b/usr/src/uts/common/vm/pvn.h Thu Mar 11 07:11:09 2010 +0000
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2002 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -40,8 +39,6 @@
#ifndef _VM_PVN_H
#define _VM_PVN_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/buf.h>
#include <vm/seg.h>
@@ -73,6 +70,7 @@
int (*putapage)(vnode_t *, struct page *, u_offset_t *,
size_t *, int, cred_t *),
int flags, struct cred *cred);
+void pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *));
int pvn_getdirty(struct page *pp, int flags);
void pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes);
int pvn_getpages(
@@ -88,6 +86,12 @@
void pvn_init(void);
/*
+ * This value is stored in p_hash to identify marker pages. p_hash may be
+ * compared against PVN_VPLIST_HASH_TAG even without holding p_selock.
+ */
+#define PVN_VPLIST_HASH_TAG ((page_t *)-1)
+
+/*
* When requesting pages from the getpage routines, pvn_getpages will
* allocate space to return PVN_GETPAGE_NUM pages which map PVN_GETPAGE_SZ
* worth of bytes. These numbers are chosen to be the minimum of the max's
--- a/usr/src/uts/common/vm/vm_pvn.c Wed Mar 10 20:09:03 2010 -0500
+++ b/usr/src/uts/common/vm/vm_pvn.c Thu Mar 11 07:11:09 2010 +0000
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -217,8 +217,8 @@
}
}
TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
- "pvn_read_kluster:seg %p addr %x isra %x",
- seg, addr, isra);
+ "pvn_read_kluster:seg %p addr %x isra %x",
+ seg, addr, isra);
return (plist);
}
@@ -460,7 +460,7 @@
pgout = 1;
pgpgout++;
TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
- "page_ws_out:pp %p", pp);
+ "page_ws_out:pp %p", pp);
/*
* The page_struct_lock need not be acquired to
@@ -483,7 +483,7 @@
* to avoid having to flush the cache.
*/
ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
- HAT_SYNC_STOPON_MOD);
+ HAT_SYNC_STOPON_MOD);
ck_refmod:
if (!(ppattr & (P_REF | P_MOD))) {
if (hat_page_is_mapped(pp)) {
@@ -502,9 +502,9 @@
* lock on the page.
*/
(void) hat_pageunload(pp,
- HAT_FORCE_PGUNLOAD);
+ HAT_FORCE_PGUNLOAD);
ppattr = hat_page_getattr(pp,
- P_REF | P_MOD);
+ P_REF | P_MOD);
goto ck_refmod;
}
/*
@@ -525,7 +525,7 @@
}
/*LINTED: constant in conditional ctx*/
VN_DISPOSE(pp, B_FREE,
- (flags & B_DONTNEED), kcred);
+ (flags & B_DONTNEED), kcred);
dfree++;
} else {
page_unlock(pp);
@@ -567,10 +567,10 @@
/* Kernel probe */
TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
- tnf_opaque, vnode, vp,
- tnf_ulong, pages_pageout, pgpgout,
- tnf_ulong, pages_freed, dfree,
- tnf_ulong, pages_reclaimed, pgrec);
+ tnf_opaque, vnode, vp,
+ tnf_ulong, pages_pageout, pgpgout,
+ tnf_ulong, pages_freed, dfree,
+ tnf_ulong, pages_reclaimed, pgrec);
}
/*
@@ -699,6 +699,7 @@
{
page_t *mark = buf;
bzero(mark, sizeof (page_t));
+ mark->p_hash = PVN_VPLIST_HASH_TAG;
return (0);
}
@@ -993,6 +994,58 @@
}
/*
+ * Walk the vp->v_pages list and call the page_check callback for every page
+ * that is not a marker page. If page_check returns non-zero, mark the page
+ * as modified and, if VMODSORT is set, move it to the end of the v_pages
+ * list. The walk ends after the page that was last when it started. Moving
+ * makes sense only with at least two pages; this also avoids v_pages being
+ * temporarily NULL after page_vpsub() if there was just one page.
+ */
+void
+pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
+{
+ page_t *pp, *next, *end;
+ kmutex_t *vphm;
+ int shuffle;
+
+ vphm = page_vnode_mutex(vp);
+ mutex_enter(vphm);
+
+ if (vp->v_pages == NULL) {
+ mutex_exit(vphm);
+ return;
+ }
+
+ end = vp->v_pages->p_vpprev;
+ shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
+ pp = vp->v_pages;
+
+ for (;;) {
+ next = pp->p_vpnext;
+ if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
+ /*
+ * Unlike hat_setmod(), hat_setmod_only() neither shuffles
+ * the pages nor grabs page_vnode_mutex, which is already
+ * held here; the pages are moved below instead.
+ */
+ hat_setmod_only(pp);
+ if (shuffle) {
+ page_vpsub(&vp->v_pages, pp);
+ ASSERT(vp->v_pages != NULL);
+ page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
+ pp);
+ }
+ }
+ /* Stop after the page that was the tail when we started. */
+ if (pp == end)
+ break;
+ pp = next;
+ }
+
+ mutex_exit(vphm);
+}
+
+/*
* Zero out zbytes worth of data. Caller should be aware that this
* routine may enter back into the fs layer (xxx_getpage). Locks
* that the xxx_getpage routine may need should not be held while