6213799 VMODSORT support for NFS. AKA "close()/fsync() slow on clients with lots of memory and cached pages"
author Pavel Filipensky <Pavel.Filipensky@Sun.COM>
Thu, 11 Mar 2010 07:11:09 +0000
changeset 11888 542e7ffc22d6
parent 11887 ceff5abc0d48
child 11889 109dcf0079d2
6213799 VMODSORT support for NFS. AKA "close()/fsync() slow on clients with lots of memory and cached pages"
usr/src/uts/common/fs/nfs/nfs3_vnops.c
usr/src/uts/common/fs/nfs/nfs4_rnode.c
usr/src/uts/common/fs/nfs/nfs4_vnops.c
usr/src/uts/common/fs/nfs/nfs_subr.c
usr/src/uts/common/nfs/nfs.h
usr/src/uts/common/nfs/rnode.h
usr/src/uts/common/vm/pvn.h
usr/src/uts/common/vm/vm_pvn.c
--- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c	Wed Mar 10 20:09:03 2010 -0500
+++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c	Thu Mar 11 07:11:09 2010 +0000
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -120,11 +120,6 @@
 static void	nfs3_set_mod(vnode_t *);
 static void	nfs3_get_commit(vnode_t *);
 static void	nfs3_get_commit_range(vnode_t *, u_offset_t, size_t);
-#if 0 /* unused */
-#ifdef DEBUG
-static int	nfs3_no_uncommitted_pages(vnode_t *);
-#endif
-#endif /* unused */
 static int	nfs3_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
 static int	nfs3_commit_vp(vnode_t *, u_offset_t, size_t,  cred_t *);
 static int	nfs3_sync_commit(vnode_t *, page_t *, offset3, count3,
@@ -6252,24 +6247,11 @@
 static void
 nfs3_set_mod(vnode_t *vp)
 {
-	page_t *pp;
-	kmutex_t *vphm;
-
 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
-	vphm = page_vnode_mutex(vp);
-	mutex_enter(vphm);
-	if ((pp = vp->v_pages) != NULL) {
-		do {
-			if (pp->p_fsdata != C_NOCOMMIT) {
-				hat_setmod(pp);
-				pp->p_fsdata = C_NOCOMMIT;
-			}
-		} while ((pp = pp->p_vpnext) != vp->v_pages);
-	}
-	mutex_exit(vphm);
+	/* Mark pages not in C_NOCOMMIT state modified and reset the state. */
+	pvn_vplist_setdirty(vp, nfs_setmod_check);
 }
 
-
 /*
  * This routine is used to gather together a page list of the pages
  * which are to be committed on the server.  This routine must not
@@ -6308,6 +6290,10 @@
 	 * looking for pages which need to be committed.
 	 */
 	do {
+		/* Skip marker pages. */
+		if (pp->p_hash == PVN_VPLIST_HASH_TAG)
+			continue;
+
 		/*
 		 * If this page does not need to be committed or is
 		 * modified, then just skip it.
@@ -6433,31 +6419,6 @@
 	}
 }
 
-#if 0	/* unused */
-#ifdef DEBUG
-static int
-nfs3_no_uncommitted_pages(vnode_t *vp)
-{
-	page_t *pp;
-	kmutex_t *vphm;
-
-	vphm = page_vnode_mutex(vp);
-	mutex_enter(vphm);
-	if ((pp = vp->v_pages) != NULL) {
-		do {
-			if (pp->p_fsdata != C_NOCOMMIT) {
-				mutex_exit(vphm);
-				return (0);
-			}
-		} while ((pp = pp->p_vpnext) != vp->v_pages);
-	}
-	mutex_exit(vphm);
-
-	return (1);
-}
-#endif
-#endif
-
 static int
 nfs3_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
 {
--- a/usr/src/uts/common/fs/nfs/nfs4_rnode.c	Wed Mar 10 20:09:03 2010 -0500
+++ b/usr/src/uts/common/fs/nfs/nfs4_rnode.c	Thu Mar 11 07:11:09 2010 +0000
@@ -61,6 +61,8 @@
 #include <sys/callb.h>
 #include <sys/sdt.h>
 
+#include <vm/pvn.h>
+
 #include <rpc/types.h>
 #include <rpc/xdr.h>
 #include <rpc/auth.h>
@@ -173,7 +175,8 @@
 	mutex_enter(vphm);
 	if ((pp = vp->v_pages) != NULL) {
 		do {
-			if (pp->p_fsdata != C_NOCOMMIT) {
+			if (pp->p_hash != PVN_VPLIST_HASH_TAG &&
+			    pp->p_fsdata != C_NOCOMMIT) {
 				mutex_exit(vphm);
 				return (1);
 			}
@@ -679,6 +682,7 @@
 	vp->v_vfsp = vfsp;
 	VFS_HOLD(vfsp);
 	vp->v_type = VNON;
 	if (isrootfh(fh, rp))
 		vp->v_flag = VROOT;
+	vp->v_flag |= VMODSORT;	/* "= VROOT" above overwrites v_flag */
 	vn_exists(vp);
--- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c	Wed Mar 10 20:09:03 2010 -0500
+++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c	Thu Mar 11 07:11:09 2010 +0000
@@ -11859,36 +11859,10 @@
 static void
 nfs4_set_mod(vnode_t *vp)
 {
-	page_t *pp;
-	kmutex_t *vphm;
-	rnode4_t *rp;
-
 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
 
 	/* make sure we're looking at the master vnode, not a shadow */
-
-	rp = VTOR4(vp);
-	if (IS_SHADOW(vp, rp))
-		vp = RTOV4(rp);
-
-	vphm = page_vnode_mutex(vp);
-	mutex_enter(vphm);
-	/*
-	 * If there are no pages associated with this vnode, then
-	 * just return.
-	 */
-	if ((pp = vp->v_pages) == NULL) {
-		mutex_exit(vphm);
-		return;
-	}
-
-	do {
-		if (pp->p_fsdata != C_NOCOMMIT) {
-			hat_setmod(pp);
-			pp->p_fsdata = C_NOCOMMIT;
-		}
-	} while ((pp = pp->p_vpnext) != vp->v_pages);
-	mutex_exit(vphm);
+	pvn_vplist_setdirty(RTOV4(VTOR4(vp)), nfs_setmod_check);
 }
 
 /*
@@ -11939,6 +11913,10 @@
 	 * looking for pages which need to be committed.
 	 */
 	do {
+		/* Skip marker pages. */
+		if (pp->p_hash == PVN_VPLIST_HASH_TAG)
+			continue;
+
 		/*
 		 * First short-cut everything (without the page_lock)
 		 * and see if this page does not need to be committed
--- a/usr/src/uts/common/fs/nfs/nfs_subr.c	Wed Mar 10 20:09:03 2010 -0500
+++ b/usr/src/uts/common/fs/nfs/nfs_subr.c	Thu Mar 11 07:11:09 2010 +0000
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -2590,6 +2590,7 @@
 	vp->v_data = (caddr_t)rp;
 	vp->v_vfsp = vfsp;
 	vp->v_type = VNON;
+	vp->v_flag |= VMODSORT;
 	nfs_set_vroot(vp);
 
 	/*
@@ -2612,6 +2613,20 @@
 	return (vp);
 }
 
+/*
+ * Page callback for pvn_vplist_setdirty(): returns non-zero if the page
+ * must be marked modified, after resetting its p_fsdata to C_NOCOMMIT.
+ */
+int
+nfs_setmod_check(page_t *pp)
+{
+	if (pp->p_fsdata == C_NOCOMMIT)
+		return (0);
+
+	pp->p_fsdata = C_NOCOMMIT;
+	return (1);
+}
+
 static void
 nfs_set_vroot(vnode_t *vp)
 {
--- a/usr/src/uts/common/nfs/nfs.h	Wed Mar 10 20:09:03 2010 -0500
+++ b/usr/src/uts/common/nfs/nfs.h	Thu Mar 11 07:11:09 2010 +0000
@@ -954,6 +954,7 @@
 extern void	(*nfs_srv_quiesce_func)(void);
 extern int	rfs4_dss_setpaths(char *, size_t);
 extern int	(*nfs_srv_dss_func)(char *, size_t);
+extern int	nfs_setmod_check(page_t *pp);
 extern time_t	rfs4_lease_time;
 extern time_t	rfs4_grace_period;
 extern nvlist_t	*rfs4_dss_paths, *rfs4_dss_oldpaths;
--- a/usr/src/uts/common/nfs/rnode.h	Wed Mar 10 20:09:03 2010 -0500
+++ b/usr/src/uts/common/nfs/rnode.h	Thu Mar 11 07:11:09 2010 +0000
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -110,6 +110,16 @@
 /*
  * The various values for the commit states.  These are stored in
  * the p_fsdata byte in the page struct.
+ * NFSv3 and NFSv4 can use asynchronous writes: the NFS server may send a
+ * response before storing the data on stable storage (disk). The response
+ * states whether the data is on stable storage or not. The NFS client marks
+ * pages that are already on stable storage as C_NOCOMMIT. Pages that were
+ * sent but are not yet on stable storage are only partially 'safe' and are
+ * marked as C_DELAYCOMMIT, which can later be changed to C_COMMIT if a
+ * commit operation is in progress. If the NFS server is, e.g., rebooted,
+ * the client needs to resend all the uncommitted data. The client walks
+ * vp->v_pages and, if C_DELAYCOMMIT or C_COMMIT is set, marks the page as
+ * dirty so that it will be written to the server again.
  */
 #define	C_NOCOMMIT	0	/* no commit is required */
 #define	C_COMMIT	1	/* a commit is required so do it now */
--- a/usr/src/uts/common/vm/pvn.h	Wed Mar 10 20:09:03 2010 -0500
+++ b/usr/src/uts/common/vm/pvn.h	Thu Mar 11 07:11:09 2010 +0000
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2002 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -40,8 +39,6 @@
 #ifndef	_VM_PVN_H
 #define	_VM_PVN_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/buf.h>
 #include <vm/seg.h>
 
@@ -73,6 +70,7 @@
 			int (*putapage)(vnode_t *, struct page *, u_offset_t *,
 				size_t *, int, cred_t *),
 			int flags, struct cred *cred);
+void		pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *));
 int		pvn_getdirty(struct page *pp, int flags);
 void		pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes);
 int		pvn_getpages(
@@ -88,6 +86,12 @@
 void		pvn_init(void);
 
 /*
+ * This value is stored in p_hash to identify marker pages. It is safe to
+ * compare p_hash with PVN_VPLIST_HASH_TAG even without holding p_selock.
+ */
+#define	PVN_VPLIST_HASH_TAG	((page_t *)-1)
+
+/*
  * When requesting pages from the getpage routines, pvn_getpages will
  * allocate space to return PVN_GETPAGE_NUM pages which map PVN_GETPAGE_SZ
  * worth of bytes.  These numbers are chosen to be the minimum of the max's
--- a/usr/src/uts/common/vm/vm_pvn.c	Wed Mar 10 20:09:03 2010 -0500
+++ b/usr/src/uts/common/vm/vm_pvn.c	Thu Mar 11 07:11:09 2010 +0000
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -217,8 +217,8 @@
 		}
 	}
 	TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
-		"pvn_read_kluster:seg %p addr %x isra %x",
-		seg, addr, isra);
+	    "pvn_read_kluster:seg %p addr %x isra %x",
+	    seg, addr, isra);
 	return (plist);
 }
 
@@ -460,7 +460,7 @@
 			pgout = 1;
 			pgpgout++;
 			TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
-				"page_ws_out:pp %p", pp);
+			    "page_ws_out:pp %p", pp);
 
 			/*
 			 * The page_struct_lock need not be acquired to
@@ -483,7 +483,7 @@
 				 * to avoid having to flush the cache.
 				 */
 				ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
-					HAT_SYNC_STOPON_MOD);
+				    HAT_SYNC_STOPON_MOD);
 			ck_refmod:
 				if (!(ppattr & (P_REF | P_MOD))) {
 					if (hat_page_is_mapped(pp)) {
@@ -502,9 +502,9 @@
 						 * lock on the page.
 						 */
 						(void) hat_pageunload(pp,
-							HAT_FORCE_PGUNLOAD);
+						    HAT_FORCE_PGUNLOAD);
 						ppattr = hat_page_getattr(pp,
-							P_REF | P_MOD);
+						    P_REF | P_MOD);
 						goto ck_refmod;
 					}
 					/*
@@ -525,7 +525,7 @@
 					}
 					/*LINTED: constant in conditional ctx*/
 					VN_DISPOSE(pp, B_FREE,
-						(flags & B_DONTNEED), kcred);
+					    (flags & B_DONTNEED), kcred);
 					dfree++;
 				} else {
 					page_unlock(pp);
@@ -567,10 +567,10 @@
 
 	/* Kernel probe */
 	TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
-		tnf_opaque,	vnode,			vp,
-		tnf_ulong,	pages_pageout,		pgpgout,
-		tnf_ulong,	pages_freed,		dfree,
-		tnf_ulong,	pages_reclaimed,	pgrec);
+	    tnf_opaque,	vnode,			vp,
+	    tnf_ulong,	pages_pageout,		pgpgout,
+	    tnf_ulong,	pages_freed,		dfree,
+	    tnf_ulong,	pages_reclaimed,	pgrec);
 }
 
 /*
@@ -699,6 +699,7 @@
 {
 	page_t *mark = buf;
 	bzero(mark, sizeof (page_t));
+	mark->p_hash = PVN_VPLIST_HASH_TAG;
 	return (0);
 }
 
@@ -993,6 +994,58 @@
 }
 
 /*
+ * Walk vp->v_pages under page_vnode_mutex(vp) and call page_check() on
+ * every page that is not a marker (p_hash != PVN_VPLIST_HASH_TAG). When
+ * page_check() returns non-zero the page is marked modified and, if the
+ * vnode is VMODSORT, moved to the tail of v_pages. Pages are moved only
+ * when the list holds at least two of them; this also avoids v_pages
+ * temporarily being NULL after page_vpsub() removes the only page.
+ */
+void
+pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
+{
+	page_t	*pp, *next, *end;
+	kmutex_t	*vphm;
+	int	shuffle;
+
+	vphm = page_vnode_mutex(vp);
+	mutex_enter(vphm);
+
+	if (vp->v_pages == NULL) {
+		mutex_exit(vphm);
+		return;
+	}
+
+	end = vp->v_pages->p_vpprev;
+	shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
+	pp = vp->v_pages;
+
+	for (;;) {
+		next = pp->p_vpnext;	/* saved: pp may be moved below */
+		if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
+			/*
+			 * Use hat_setmod_only(): unlike hat_setmod() it does
+			 * not reorder the pages or take page_vnode_mutex,
+			 * which this function already holds.
+			 */
+			hat_setmod_only(pp);
+			if (shuffle) {
+				page_vpsub(&vp->v_pages, pp);
+				ASSERT(vp->v_pages != NULL);
+				page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
+				    pp);
+			}
+		}
+		/* Stop after the page that was last when the walk began. */
+		if (pp == end)
+			break;
+		pp = next;
+	}
+
+	mutex_exit(vphm);
+}
+
+/*
  * Zero out zbytes worth of data. Caller should be aware that this
  * routine may enter back into the fs layer (xxx_getpage). Locks
  * that the xxx_getpage routine may need should not be held while