PSARC/2006/313 NFSv4: nfsd "-s" distributed stable storage
author calum
Mon, 22 May 2006 15:43:31 -0700
changeset 2035 a29bc457bcb9
parent 2034 495fa2236cf1
child 2036 6b118cfdcfda
PSARC/2006/313 NFSv4: nfsd "-s" distributed stable storage 6244819 NFSv4 needs distributed stable storage to work on Cluster HA-NFS
usr/src/cmd/fs.d/nfs/nfsd/Makefile
usr/src/cmd/fs.d/nfs/nfsd/nfsd.c
usr/src/uts/common/fs/nfs/nfs4_srv.c
usr/src/uts/common/fs/nfs/nfs4_state.c
usr/src/uts/common/fs/nfs/nfs_server.c
usr/src/uts/common/fs/nfs/nfs_sys.c
usr/src/uts/common/nfs/nfs.h
usr/src/uts/common/nfs/nfs4.h
usr/src/uts/common/nfs/nfssys.h
--- a/usr/src/cmd/fs.d/nfs/nfsd/Makefile	Mon May 22 15:34:31 2006 -0700
+++ b/usr/src/cmd/fs.d/nfs/nfsd/Makefile	Mon May 22 15:43:31 2006 -0700
@@ -2,9 +2,8 @@
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License").  You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 1989,2001-2003 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # ident	"%Z%%M%	%I%	%E% SMI"
@@ -32,7 +31,7 @@
 
 include		../../Makefile.fstype
 
-LDLIBS +=	-lnsl -lcmd
+LDLIBS +=	-lnsl -lcmd -lnvpair
 
 LOCAL=		nfsd.o
 OBJS=		$(LOCAL) nfs_tbind.o thrpool.o
--- a/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c	Mon May 22 15:34:31 2006 -0700
+++ b/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c	Mon May 22 15:43:31 2006 -0700
@@ -45,6 +45,7 @@
 
 #include <sys/param.h>
 #include <sys/types.h>
+#include <sys/stat.h>
 #include <syslog.h>
 #include <tiuser.h>
 #include <rpc/rpc.h>
@@ -73,15 +74,22 @@
 #include <deflt.h>
 #include <rpcsvc/daemon_utils.h>
 #include <rpcsvc/nfs4_prot.h>
+#include <libnvpair.h>
 #include "nfs_tbind.h"
 #include "thrpool.h"
 
 /* quiesce requests will be ignored if nfs_server_vers_max < QUIESCE_VERSMIN */
 #define	QUIESCE_VERSMIN	4
+/* DSS: distributed stable storage */
+#define	DSS_VERSMIN	4
 
 static	int	nfssvc(int, struct netbuf, struct netconfig *);
-static int nfssvcpool(int maxservers);
+static	int	nfssvcpool(int maxservers);
+static	int	dss_init(uint_t npaths, char **pathnames);
+static	void	dss_mkleafdirs(uint_t npaths, char **pathnames);
+static	void	dss_mkleafdir(char *dir, char *leaf, char *path);
 static	void	usage(void);
+int		qstrcmp(const void *s1, const void *s2);
 
 extern	int	_nfssys(int, void *);
 
@@ -138,6 +146,8 @@
 	NETSELPDECL(providerp);
 	char *defval;
 	boolean_t can_do_mlp;
+	uint_t dss_npaths = 0;
+	char **dss_pathnames = NULL;
 
 	MyName = *av;
 
@@ -239,7 +249,7 @@
 	}
 	opt_cnt = 0;
 
-	while ((i = getopt(ac, av, "ac:p:t:l:")) != EOF) {
+	while ((i = getopt(ac, av, "ac:p:s:t:l:")) != EOF) {
 		switch (i) {
 		case 'a':
 			free(df_proto);
@@ -261,6 +271,39 @@
 			opt_cnt++;
 			break;
 
+		/*
+		 * DSS: NFSv4 distributed stable storage.
+		 *
+		 * This is a Contracted Project Private interface, for
+		 * the sole use of Sun Cluster HA-NFS. See PSARC/2006/313.
+		 */
+		case 's':
+			if (strlen(optarg) < MAXPATHLEN) {
+				/* first "-s" option encountered? */
+				if (dss_pathnames == NULL) {
+					/*
+					 * Allocate maximum possible space
+					 * required given cmdline arg count;
+					 * "-s <path>" consumes two args.
+					 */
+					size_t sz = (ac / 2) * sizeof (char *);
+					dss_pathnames = (char **)malloc(sz);
+					if (dss_pathnames == NULL) {
+						(void) fprintf(stderr, "%s: "
+						    "dss paths malloc failed\n",
+						    av[0]);
+						exit(1);
+					}
+					(void) memset(dss_pathnames, 0, sz);
+				}
+				dss_pathnames[dss_npaths] = optarg;
+				dss_npaths++;
+			} else {
+				(void) fprintf(stderr,
+				    "%s: -s pathname too long.\n", av[0]);
+			}
+			break;
+
 		case 't':
 			provider = optarg;
 			df_allflag = 0;
@@ -410,6 +453,18 @@
 		exit(0);
 	}
 
+	/*
+	 * If we've been given a list of paths to be used for distributed
+	 * stable storage, and provided we're going to run a version
+	 * that supports it, setup the DSS paths.
+	 */
+	if (dss_pathnames != NULL && nfs_server_vers_max >= DSS_VERSMIN) {
+		if (dss_init(dss_npaths, dss_pathnames) != 0) {
+			syslog(LOG_ERR, "dss_init failed. Exiting.");
+			exit(1);
+		}
+	}
+
 	sigset(SIGTERM, sigflush);
 	sigset(SIGUSR1, quiesce);
 
@@ -520,7 +575,7 @@
 
 	if (num_fds == 0) {
 		(void) syslog(LOG_ERR,
-		"Could not start NFS service for any protocol. Exiting.");
+		"Could not start NFS service for any protocol. Exiting");
 		exit(1);
 	}
 
@@ -643,7 +698,12 @@
 
 /*
  * SIGUSR1 handler.
- * Request server quiesce, then exit. For subsequent warm start.
+ *
+ * Request that server quiesce, then (nfsd) exit. For subsequent warm start.
+ *
+ * This is a Contracted Project Private interface, for the sole use
+ * of Sun Cluster HA-NFS. See PSARC/2004/497.
+ *
  * Equivalent to SIGTERM handler if nfs_server_vers_max < QUIESCE_VERSMIN.
  */
 static void
@@ -654,10 +714,10 @@
 
 	if (nfs_server_vers_max >= QUIESCE_VERSMIN) {
 		/* Request server quiesce at next shutdown */
-		error = _nfssys(NFS_SVC_REQUEST_QUIESCE, &id);
+		error = _nfssys(NFS4_SVC_REQUEST_QUIESCE, &id);
 		if (error) {
 			syslog(LOG_ERR,
-			    "_nfssys(NFS_SVC_REQUEST_QUIESCE) failed: %s\n",
+			    "_nfssys(NFS4_SVC_REQUEST_QUIESCE) failed: %s",
 			    strerror(errno));
 			return;
 		}
@@ -668,3 +728,214 @@
 
 	exit(0);
 }
+
+/*
+ * DSS: distributed stable storage.
+ * For every supplied pathname, make sure the state and oldstate leaf
+ * directories exist beneath it, keeping an eye on path lengths.
+ * Any failure is fatal: exit(1) is called.
+ * Each pathname must already exist, and must be writeable by nfsd.
+ * Note: the leaf directories under NFS4_VAR_DIR are created at pkg
+ * install, not here.
+ */
+static void
+dss_mkleafdirs(uint_t npaths, char **pathnames)
+{
+	int idx = 0;
+	char *scratch;
+
+	/*
+	 * dss_mkleafdir() needs a MAXPATHLEN-sized work buffer, which is
+	 * too big to put on its stack. Allocate it once here and hand it
+	 * to every call, rather than allocating afresh on each call.
+	 */
+	if ((scratch = (char *)malloc(MAXPATHLEN)) == NULL) {
+		syslog(LOG_ERR, "tmppath malloc failed. Exiting");
+		exit(1);
+	}
+
+	while (idx < npaths) {
+		char *base = pathnames[idx++];
+
+		dss_mkleafdir(base, NFS4_DSS_STATE_LEAF, scratch);
+		dss_mkleafdir(base, NFS4_DSS_OLDSTATE_LEAF, scratch);
+	}
+
+	/* the work buffer is no longer needed */
+	free(scratch);
+}
+
+/*
+ * Create "leaf" in "dir" (which must already exist); exit(1) on failure.
+ * A '/' is inserted between dir and leaf; tmppath is a MAXPATHLEN buffer.
+ */
+static void
+dss_mkleafdir(char *dir, char *leaf, char *tmppath)
+{
+	/* dir + '/' + leaf must fit in MAXPATHLEN, which includes the NUL */
+	if (strlen(dir) + 1 + strlen(leaf) > MAXPATHLEN - 1) {
+		syslog(LOG_ERR, "stable storage path too long: %s/%s. Exiting",
+		    dir, leaf);
+		exit(1);
+	}
+
+	(void) snprintf(tmppath, MAXPATHLEN, "%s/%s", dir, leaf);
+
+	/* the directory may already exist: that's OK */
+	if (mkdir(tmppath, NFS4_DSS_DIR_MODE) == -1 && errno != EEXIST) {
+		syslog(LOG_ERR, "error creating stable storage directory: "
+		    "%s: %s. Exiting", strerror(errno), tmppath);
+		exit(1);
+	}
+}
+
+/*
+ * Sort and de-duplicate the paths, create the storage dirs, and pass the
+ * path list to the kernel, packed with libnvpair(3LIB). This requires the
+ * nfssrv module to be loaded; the _nfssys() syscall will fail ENOTSUP if
+ * it is not. Returns 0 on success, 1 on failure; some errors exit(1) instead.
+ */
+static int
+dss_init(uint_t npaths, char **pathnames)
+{
+	int i, j, nskipped, error;
+	char *bufp;
+	uint32_t bufsize;
+	size_t buflen;
+	nvlist_t *nvl;
+
+	if (npaths > 1) {
+		/*
+		 * We need to remove duplicate paths; this might be user error
+		 * in the general case, but HA-NFSv4 can also cause this.
+		 * Sort the pathnames array, and NULL out duplicates,
+		 * then write the non-NULL entries to a new array.
+		 * Sorting will also allow the kernel to optimise its searches.
+		 */
+
+		qsort(pathnames, npaths, sizeof (char *), qstrcmp);
+
+		/* now NULL out any duplicates */
+		i = 0; j = 1; nskipped = 0;
+		while (j < npaths) {
+			if (strcmp(pathnames[i], pathnames[j]) == 0) {
+				pathnames[j] = NULL;
+				j++;
+				nskipped++;
+				continue;
+			}
+
+			/* skip i over any of its NULLed duplicates */
+			i = j++;
+		}
+
+		/* finally, write the non-NULL entries to a new array */
+		if (nskipped > 0) {
+			int nreal;
+			size_t sz;
+			char **tmp_pathnames;
+
+			nreal = npaths - nskipped;
+
+			sz = nreal * sizeof (char *);
+			tmp_pathnames = (char **)malloc(sz);
+			if (tmp_pathnames == NULL) {
+				syslog(LOG_ERR, "tmp_pathnames malloc failed");
+				exit(1);
+			}
+
+			for (i = 0, j = 0; i < npaths; i++)
+				if (pathnames[i] != NULL)
+					tmp_pathnames[j++] = pathnames[i];
+			free(pathnames);
+			pathnames = tmp_pathnames;
+			npaths = nreal;
+		}
+
+	}
+
+	/* Create directories to store the distributed state files */
+	dss_mkleafdirs(npaths, pathnames);
+
+	/* Create the name-value pair list; nvlist_*() return the error code */
+	error = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
+	if (error) {
+		syslog(LOG_ERR, "nvlist_alloc failed: %s.", strerror(error));
+		return (1);
+	}
+
+	/* Add the pathnames array as a single name-value pair */
+	error = nvlist_add_string_array(nvl, NFS4_DSS_NVPAIR_NAME,
+	    pathnames, npaths);
+	if (error) {
+		syslog(LOG_ERR, "nvlist_add_string_array failed: %s.",
+		    strerror(error));
+		nvlist_free(nvl);
+		return (1);
+	}
+
+	/*
+	 * Pack list into contiguous memory, for passing to kernel.
+	 * nvlist_pack() will allocate the memory for the buffer,
+	 * which we should free() when no longer needed.
+	 * NV_ENCODE_XDR for safety across ILP32/LP64 kernel boundary.
+	 */
+	bufp = NULL;
+	error = nvlist_pack(nvl, &bufp, &buflen, NV_ENCODE_XDR, 0);
+	if (error) {
+		syslog(LOG_ERR, "nvlist_pack failed: %s.", strerror(error));
+		nvlist_free(nvl);
+		return (1);
+	}
+
+	/* Now we have the packed buffer, we no longer need the list */
+	nvlist_free(nvl);
+
+	/*
+	 * Let the kernel know in advance how big the buffer is.
+	 * NOTE: we cannot just pass buflen, since size_t is a long, and
+	 * thus a different size between ILP32 userland and LP64 kernel.
+	 * Use an int for the transfer, since that should be big enough;
+	 * this is a no-op at the moment, here, since nfsd is 32-bit, but
+	 * that could change.
+	 */
+	bufsize = (uint32_t)buflen;
+	error = _nfssys(NFS4_DSS_SETPATHS_SIZE, &bufsize);
+	if (error) {
+		syslog(LOG_ERR,
+		    "_nfssys(NFS4_DSS_SETPATHS_SIZE) failed: %s. ",
+		    strerror(errno));
+		free(bufp);
+		return (1);
+	}
+
+	/* Pass the packed buffer to the kernel */
+	error = _nfssys(NFS4_DSS_SETPATHS, bufp);
+	if (error) {
+		syslog(LOG_ERR,
+		    "_nfssys(NFS4_DSS_SETPATHS) failed: %s. ", strerror(errno));
+		free(bufp);
+		return (1);
+	}
+
+	/*
+	 * The kernel has now unpacked the buffer and extracted the
+	 * pathnames array, we no longer need the buffer.
+	 */
+	free(bufp);
+
+	return (0);
+}
+
+/*
+ * qsort(3C) comparison callback: each argument points at a (char *)
+ * array element; the strings they reference are ordered by strcmp().
+ */
+int
+qstrcmp(const void *p1, const void *p2)
+{
+	const char *const *lhs = p1;
+	const char *const *rhs = p2;
+
+	return (strcmp(*lhs, *rhs));
+}
--- a/usr/src/uts/common/fs/nfs/nfs4_srv.c	Mon May 22 15:34:31 2006 -0700
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv.c	Mon May 22 15:43:31 2006 -0700
@@ -54,6 +54,7 @@
 #include <sys/policy.h>
 #include <sys/fem.h>
 #include <sys/sdt.h>
+#include <sys/ddi.h>
 
 #include <rpc/types.h>
 #include <rpc/auth.h>
@@ -272,10 +273,6 @@
 kmutex_t	rfs4_servinst_lock;		/* protects linked list */
 int		rfs4_seen_first_compound;	/* set first time we see one */
 
-#ifdef DEBUG
-int	rfs4_servinst_debug = 0;
-#endif
-
 /*
  * NFS4 op dispatch table
  */
@@ -470,6 +467,8 @@
 
 void rfs4_ss_chkclid(rfs4_client_t *);
 
+extern size_t strlcpy(char *dst, const char *src, size_t dstsize);
+
 #ifdef	nextdp
 #undef nextdp
 #endif
@@ -601,9 +600,6 @@
 {
 	time_t now = gethrestime_sec();
 
-	NFS4_DEBUG(rfs4_servinst_debug, (CE_NOTE,
-	    "rfs4_grace_start: inst %p: 0x%lx", (void *)sip, now));
-
 	rw_enter(&sip->rwlock, RW_WRITER);
 	sip->start_time = now;
 	sip->grace_period = rfs4_grace_period;
@@ -655,24 +651,13 @@
 void
 rfs4_grace_reset_all(void)
 {
-#ifdef DEBUG
-	int n = 0;
-#endif
 	rfs4_servinst_t *sip;
 
 	mutex_enter(&rfs4_servinst_lock);
-	for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) {
-		if (rfs4_servinst_in_grace(sip)) {
+	for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev)
+		if (rfs4_servinst_in_grace(sip))
 			rfs4_grace_start(sip);
-#ifdef DEBUG
-			n++;
-#endif
-		}
-	}
 	mutex_exit(&rfs4_servinst_lock);
-
-	NFS4_DEBUG(rfs4_servinst_debug, (CE_NOTE,
-	    "rfs4_grace_reset_all: reset %d instances", n));
 }
 
 /*
@@ -681,23 +666,52 @@
 void
 rfs4_grace_start_new(void)
 {
-#ifdef DEBUG
-	int n = 0;
-#endif
 	rfs4_servinst_t *sip;
 
 	mutex_enter(&rfs4_servinst_lock);
-	for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) {
+	for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev)
 		if (rfs4_servinst_grace_new(sip))
 			rfs4_grace_start(sip);
-#ifdef DEBUG
-		n++;
-#endif
-	}
 	mutex_exit(&rfs4_servinst_lock);
-
-	NFS4_DEBUG(rfs4_servinst_debug, (CE_NOTE,
-	    "rfs4_grace_start_new: started %d new instances", n));
+}
+
+/*
+ * Allocate a DSS path entry holding a private copy of "path", tie it to
+ * server instance "sip" at slot "index", and link it onto the global
+ * rfs4_dss_pathlist. Returns the new entry.
+ */
+static rfs4_dss_path_t *
+rfs4_dss_newpath(rfs4_servinst_t *sip, char *path, unsigned index)
+{
+	rfs4_dss_path_t *entry;
+	size_t nbytes = strlen(path) + 1;	/* string plus its NUL */
+
+	entry = kmem_alloc(sizeof (rfs4_dss_path_t), KM_SLEEP);
+
+	/*
+	 * The caller's string may be overwritten later, so keep our own
+	 * copy (the kernel offers no strdup()).
+	 */
+	entry->path = kmem_alloc(nbytes, KM_SLEEP);
+	(void) strlcpy(entry->path, path, nbytes);
+
+	/* record the owning servinst and our position within it */
+	entry->index = index;
+	entry->sip = sip;
+
+	/*
+	 * Link onto the list of served paths. We are only ever called
+	 * at startup, so no locking is needed.
+	 */
+	if (rfs4_dss_pathlist != NULL) {
+		insque(entry, rfs4_dss_pathlist);
+	} else {
+		/* first entry: insque/remque need it self-linked */
+		entry->prev = entry;
+		entry->next = entry;
+		rfs4_dss_pathlist = entry;
+	}
+	return (entry);
 }
 
 /*
@@ -706,9 +720,11 @@
  * recovery window.
  */
 void
-rfs4_servinst_create(int start_grace)
+rfs4_servinst_create(int start_grace, int dss_npaths, char **dss_paths)
 {
+	unsigned i;
 	rfs4_servinst_t *sip;
+	rfs4_oldstate_t *oldstate;
 
 	sip = kmem_alloc(sizeof (rfs4_servinst_t), KM_SLEEP);
 	rw_init(&sip->rwlock, NULL, RW_DEFAULT, NULL);
@@ -718,11 +734,28 @@
 	sip->next = NULL;
 	sip->prev = NULL;
 
+	rw_init(&sip->oldstate_lock, NULL, RW_DEFAULT, NULL);
+	/*
+	 * This initial dummy entry is required to setup for insque/remque.
+	 * It must be skipped over whenever the list is traversed.
+	 */
+	oldstate = kmem_alloc(sizeof (rfs4_oldstate_t), KM_SLEEP);
+	/* insque/remque require initial list entry to be self-terminated */
+	oldstate->next = oldstate;
+	oldstate->prev = oldstate;
+	sip->oldstate = oldstate;
+
+
+	sip->dss_npaths = dss_npaths;
+	sip->dss_paths = kmem_alloc(dss_npaths *
+	    sizeof (rfs4_dss_path_t *), KM_SLEEP);
+
+	for (i = 0; i < dss_npaths; i++) {
+		sip->dss_paths[i] = rfs4_dss_newpath(sip, dss_paths[i], i);
+	}
+
 	mutex_enter(&rfs4_servinst_lock);
-	if (rfs4_cur_servinst == NULL) {
-		NFS4_DEBUG(rfs4_servinst_debug, (CE_NOTE,
-		    "rfs4_servinst_create: creating first instance"));
-	} else {
+	if (rfs4_cur_servinst != NULL) {
 		/* add to linked list */
 		sip->prev = rfs4_cur_servinst;
 		rfs4_cur_servinst->next = sip;
@@ -731,11 +764,8 @@
 		rfs4_grace_start(sip);
 	/* make the new instance "current" */
 	rfs4_cur_servinst = sip;
+
 	mutex_exit(&rfs4_servinst_lock);
-
-	NFS4_DEBUG(rfs4_servinst_debug, (CE_NOTE,
-	    "rfs4_servinst_create: new current instance: %p; start_grace: %d",
-	    (void *)sip, start_grace));
 }
 
 /*
@@ -757,15 +787,17 @@
 	for (sip = current; sip != NULL; sip = prev) {
 		prev = sip->prev;
 		rw_destroy(&sip->rwlock);
+		if (sip->oldstate)
+			kmem_free(sip->oldstate, sizeof (rfs4_oldstate_t));
+		if (sip->dss_paths)
+			kmem_free(sip->dss_paths,
+			    sip->dss_npaths * sizeof (rfs4_dss_path_t *));
 		kmem_free(sip, sizeof (rfs4_servinst_t));
 #ifdef DEBUG
 		n++;
 #endif
 	}
 	mutex_exit(&rfs4_servinst_lock);
-
-	NFS4_DEBUG(rfs4_servinst_debug, (CE_NOTE,
-	    "rfs4_servinst_destroy_all: destroyed %d instances", n));
 }
 
 /*
@@ -777,10 +809,6 @@
 {
 	ASSERT(rfs4_dbe_refcnt(cp->dbe) > 0);
 
-	NFS4_DEBUG(rfs4_servinst_debug, (CE_NOTE,
-	    "rfs4_servinst_assign: client: %p, old: %p, new: %p", (void *)cp,
-	    (void *)cp->server_instance, (void *)sip));
-
 	/*
 	 * The lock ensures that if the current instance is in the process
 	 * of changing, we will see the new one.
@@ -7486,7 +7514,15 @@
 	}
 
 	/*
-	 * Record clientid in stable storage
+	 * Update the client's associated server instance, if it's changed
+	 * since the client was created.
+	 */
+	if (rfs4_servinst(cp) != rfs4_cur_servinst)
+		rfs4_servinst_assign(cp, rfs4_cur_servinst);
+
+	/*
+	 * Record clientid in stable storage.
+	 * Must be done after server instance has been assigned.
 	 */
 	rfs4_ss_clid(cp, req);
 
@@ -7501,13 +7537,6 @@
 	rfs4_update_lease(cp);
 
 	/*
-	 * Update the client's associated server instance, if it's changed
-	 * since the client was created.
-	 */
-	if (rfs4_servinst(cp) != rfs4_cur_servinst)
-		rfs4_servinst_assign(cp, rfs4_cur_servinst);
-
-	/*
 	 * Check to see if client can perform reclaims
 	 */
 	rfs4_ss_chkclid(cp);
--- a/usr/src/uts/common/fs/nfs/nfs4_state.c	Mon May 22 15:34:31 2006 -0700
+++ b/usr/src/uts/common/fs/nfs/nfs4_state.c	Mon May 22 15:43:31 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -39,7 +38,7 @@
 #include <nfs/nfssys.h>
 #include <nfs/lm.h>
 #include <sys/pathname.h>
-
+#include <sys/nvpair.h>
 
 
 extern time_t rfs4_start_time;
@@ -72,6 +71,11 @@
 
 static uint32_t rfs4_database_debug = 0x00;
 
+static void rfs4_ss_clid_write(rfs4_client_t *cp, char *leaf);
+static void rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dir, char *leaf);
+static void rfs4_dss_clear_oldstate(rfs4_servinst_t *sip);
+static void rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip);
+
 /*
  * Couple of simple init/destroy functions for a general waiter
  */
@@ -333,6 +337,8 @@
 static time_t rfs4_deleg_state_cache_time = 0;
 
 static bool_t rfs4_client_create(rfs4_entry_t, void *);
+static void rfs4_dss_remove_cpleaf(rfs4_client_t *);
+static void rfs4_dss_remove_leaf(rfs4_servinst_t *, char *, char *);
 static void rfs4_client_destroy(rfs4_entry_t);
 static bool_t rfs4_client_expiry(rfs4_entry_t);
 static uint32_t clientid_hash(void *);
@@ -394,15 +400,8 @@
 
 static void rfs4_state_rele_nounlock(rfs4_state_t *);
 
-static rfs4_oldstate_t *rfs4_oldstate = NULL;
-static krwlock_t rfs4_oldstate_lock;
 static int rfs4_ss_enabled = 0;
 
-#define	NFS4_VAR_DIR		"/var/nfs"
-#define	NFS4_STATE_DIR 		NFS4_VAR_DIR"/v4_state"
-#define	NFS4_OLDSTATE_DIR 	NFS4_VAR_DIR"/v4_oldstate"
-#define	NFS4_SS_DIR_MODE	0755
-
 extern void (*rfs4_client_clrst)(struct nfs4clrst_args *);
 
 void
@@ -411,24 +410,6 @@
 	kmem_free(ss_pn, sizeof (rfs4_ss_pn_t));
 }
 
-/*
- * Free all malloced rsf4_oldstate_t memory
- */
-void
-rfs4_oldstate_free(rfs4_oldstate_t *ros)
-{
-	if (ros == NULL)
-		return;
-
-	if (ros->cl_id4.id_val)
-		kmem_free(ros->cl_id4.id_val, ros->cl_id4.id_len);
-
-	if (ros->ss_pn)
-		kmem_free(ros->ss_pn, sizeof (rfs4_ss_pn_t));
-
-	kmem_free(ros, sizeof (rfs4_oldstate_t));
-}
-
 static rfs4_ss_pn_t *
 rfs4_ss_pnalloc(char *dir, char *leaf)
 {
@@ -465,9 +446,8 @@
 {
 	rfs4_ss_pn_t *src, *dst;
 
-	if ((src = rfs4_ss_pnalloc(sdir, leaf)) == NULL) {
+	if ((src = rfs4_ss_pnalloc(sdir, leaf)) == NULL)
 		return (NULL);
-	}
 
 	if ((dst = rfs4_ss_pnalloc(ddir, leaf)) == NULL) {
 		rfs4_ss_pnfree(src);
@@ -500,9 +480,8 @@
 	uint_t id_len;
 	int err, kill_file, file_vers;
 
-	if (ss_pn == NULL) {
+	if (ss_pn == NULL)
 		return (NULL);
-	}
 
 	/*
 	 * open the state file.
@@ -554,7 +533,7 @@
 	 */
 	iov[0].iov_base = (caddr_t)&file_vers;
 	iov[0].iov_len = sizeof (int);
-	iov[1].iov_base = (caddr_t)cl_ss;
+	iov[1].iov_base = (caddr_t)&cl_ss->cl_id4.verifier;
 	iov[1].iov_len = NFS4_VERIFIER_SIZE;
 	iov[2].iov_base = (caddr_t)&id_len;
 	iov[2].iov_len = sizeof (uint_t);
@@ -626,9 +605,11 @@
 #define	nextdp(dp)	((struct dirent64 *)((char *)(dp) + (dp)->d_reclen))
 
 /*
+ * Add entries from statedir to supplied oldstate list.
+ * Optionally, move all entries from statedir -> destdir.
  */
 void
-rfs4_ss_oldstate(char *dir, int do_move)
+rfs4_ss_oldstate(rfs4_oldstate_t *oldstate, char *statedir, char *destdir)
 {
 	rfs4_ss_pn_t *ss_pn;
 	rfs4_oldstate_t *cl_ss = NULL;
@@ -643,24 +624,11 @@
 	/*
 	 * open the state directory
 	 */
-	if (err = vn_open(dir, UIO_SYSSPACE, FREAD, 0, &dvp, 0, 0)) {
+	if (vn_open(statedir, UIO_SYSSPACE, FREAD, 0, &dvp, 0, 0))
 		return;
-	}
-
-	/*
-	 * if this is not a directory return
-	 */
-	if (dvp->v_type != VDIR) {
-		(void) VOP_CLOSE(dvp, FREAD, 1, (offset_t)0, CRED());
-		VN_RELE(dvp);
-		return;
-	}
-
-	err = VOP_ACCESS(dvp, VREAD, 0, CRED());
-	if (err) {
-		/* Can't read the directory. So get the heck out. */
+
+	if (dvp->v_type != VDIR || VOP_ACCESS(dvp, VREAD, 0, CRED()))
 		goto out;
-	}
 
 	dirt = kmem_alloc(RFS4_SS_DIRSIZE, KM_SLEEP);
 
@@ -678,12 +646,9 @@
 		uio.uio_resid = RFS4_SS_DIRSIZE;
 
 		err = VOP_READDIR(dvp, &uio, CRED(), &dir_eof);
-
 		VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL);
-
-		if (err) {
+		if (err)
 			goto out;
-		}
 
 		size = RFS4_SS_DIRSIZE - uio.uio_resid;
 
@@ -700,96 +665,43 @@
 			/*
 			 * Skip '.' and '..'
 			 */
-			if (NFS_IS_DOTNAME(dep->d_name)) {
+			if (NFS_IS_DOTNAME(dep->d_name))
 				continue;
-			}
-
-			if ((ss_pn = rfs4_ss_pnalloc(dir, dep->d_name))
-							== NULL) {
+
+			ss_pn = rfs4_ss_pnalloc(statedir, dep->d_name);
+			if (ss_pn == NULL)
 				continue;
-			}
 
 			if (cl_ss = rfs4_ss_getstate(dvp, ss_pn)) {
-				if (do_move) {
+				if (destdir != NULL) {
 					rfs4_ss_pnfree(ss_pn);
 					cl_ss->ss_pn = rfs4_ss_movestate(
-						NFS4_STATE_DIR,
-						NFS4_OLDSTATE_DIR,
-						dep->d_name);
+						statedir, destdir, dep->d_name);
 				} else {
 					cl_ss->ss_pn = ss_pn;
 				}
-				insque(cl_ss, rfs4_oldstate);
+				insque(cl_ss, oldstate);
 			} else {
 				rfs4_ss_pnfree(ss_pn);
 			}
 		}
 	}
+
 out:
-
 	(void) VOP_CLOSE(dvp, FREAD, 1, (offset_t)0, CRED());
 	VN_RELE(dvp);
 	if (dirt)
 		kmem_free((caddr_t)dirt, RFS4_SS_DIRSIZE);
 }
 
-/*
- * Validates that the needed directories exist
- */
-bool_t
-rfs4_validate_var(void)
-{
-	vnode_t *vp;
-	int i;
-	char *dnp;
-	bool_t ret_val = TRUE;
-	char *dir_names[] = {
-			NFS4_VAR_DIR,
-			NFS4_STATE_DIR,
-			NFS4_OLDSTATE_DIR,
-			NULL
-	};
-
-	for (i = 0, dnp = dir_names[i]; dnp; i++) {
-		if (lookupname(dnp, UIO_SYSSPACE,
-					NO_FOLLOW, NULLVPP, &vp) != 0) {
-			cmn_err(CE_WARN, "!NFS4 stable storage directory "
-				"missing!: %s", dnp);
-			ret_val = FALSE;
-		} else {
-			VN_RELE(vp);
-		}
-		dnp = dir_names[i];
-	}
-	return (ret_val);
-}
-
-/*
- *
- */
 static void
 rfs4_ss_init(void)
 {
-	rw_init(&rfs4_oldstate_lock, NULL, RW_DEFAULT, NULL);
-
-	if (rfs4_validate_var() == FALSE) {
-		rfs4_oldstate = NULL;
-		return;
-	}
-
-	rfs4_oldstate = kmem_alloc(sizeof (rfs4_oldstate_t), KM_SLEEP);
-	rfs4_oldstate->next = rfs4_oldstate;
-	rfs4_oldstate->prev = rfs4_oldstate;
-
-	/*
-	 * load info from the OLD directory
-	 */
-	rfs4_ss_oldstate(NFS4_OLDSTATE_DIR, 0);
-
-	/*
-	 * Gather and move NFS4_STATE_DIR to NFS4_OLDSTATE_DIR
-	 */
-	rfs4_ss_oldstate(NFS4_STATE_DIR, 1);
+	int npaths = 1;
+	char *default_dss_path = NFS4_DSS_VAR_DIR;
+
+	/* read the default stable storage state */
+	rfs4_dss_readstate(npaths, &default_dss_path);
 
 	rfs4_ss_enabled = 1;
 }
@@ -797,34 +709,92 @@
 static void
 rfs4_ss_fini(void)
 {
-
-	rfs4_oldstate_t *ost, *osp, *os_head;
-
-	rw_destroy(&rfs4_oldstate_lock);
-
-	/*
-	 * short circuit everything if we have no
-	 * remaining oldstate!
-	 */
-	if (rfs4_oldstate == NULL) {
+	rfs4_servinst_t *sip;
+
+	/*
+	 * Older instances hang off "prev"; the current one has no "next".
+	 */
+	mutex_enter(&rfs4_servinst_lock);
+	for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev)
+		rfs4_dss_clear_oldstate(sip);
+	mutex_exit(&rfs4_servinst_lock);
+}
+
+/*
+ * Remove this servinst's oldstate files and free its oldstate list.
+ */
+static void
+rfs4_dss_clear_oldstate(rfs4_servinst_t *sip)
+{
+	rfs4_oldstate_t *os_head, *osp;
+
+	rw_enter(&sip->oldstate_lock, RW_WRITER);
+	if ((os_head = sip->oldstate) == NULL)
+		rw_exit(&sip->oldstate_lock);	/* don't leak the lock */
+	if (os_head == NULL)
 		return;
+
+	/* skip dummy entry */
+	osp = os_head->next;
+	while (osp != os_head) {
+		rfs4_oldstate_t *os_next = osp->next;
+
+		/* ss_pn may be NULL: rfs4_ss_movestate() can fail */
+		if (osp->ss_pn) {
+			rfs4_dss_remove_leaf(sip, NFS4_DSS_OLDSTATE_LEAF,
+			    osp->ss_pn->leaf);
+			kmem_free(osp->ss_pn, sizeof (rfs4_ss_pn_t));
+		}
+		if (osp->cl_id4.id_val)
+			kmem_free(osp->cl_id4.id_val, osp->cl_id4.id_len);
+
+		remque(osp);
+		kmem_free(osp, sizeof (rfs4_oldstate_t));
+		osp = os_next;
 	}
 
-	/*
-	 * It is possible to start and immediately stop the server
-	 * in which case we would not have cleaned up the oldstate
-	 * circular queue so we may do it here.
-	 */
-	os_head = rfs4_oldstate;
-	osp = os_head->next;
-
-	while (osp != os_head) {
-		ost = osp->next;
-		remque(osp);
-		rfs4_oldstate_free(osp);
-		osp = ost;
+	/* free dummy entry */
+	kmem_free(osp, sizeof (rfs4_oldstate_t));
+
+	sip->oldstate = NULL;
+
+	rw_exit(&sip->oldstate_lock);
+}
+
+/*
+ * Form the state and oldstate paths, and read in the stable storage files.
+ */
+void
+rfs4_dss_readstate(int npaths, char **paths)
+{
+	int i;
+	char *state, *oldstate;
+
+	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+	oldstate = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+	for (i = 0; i < npaths; i++) {
+		(void) snprintf(state, MAXPATHLEN, "%s/%s", paths[i],
+		    NFS4_DSS_STATE_LEAF);
+		(void) snprintf(oldstate, MAXPATHLEN, "%s/%s", paths[i],
+		    NFS4_DSS_OLDSTATE_LEAF);
+
+		/*
+		 * Populate the current server instance's oldstate list.
+		 *
+		 * 1. Read stable storage data from old state directory,
+		 *    leaving its contents alone.
+		 *
+		 * 2. Read stable storage data from state directory,
+		 *    and move the latter's contents to old state
+		 *    directory.
+		 */
+		rfs4_ss_oldstate(rfs4_cur_servinst->oldstate, oldstate, NULL);
+		rfs4_ss_oldstate(rfs4_cur_servinst->oldstate, state, oldstate);
 	}
-	kmem_free(os_head, sizeof (rfs4_oldstate_t));
+
+	kmem_free(state, MAXPATHLEN);
+	kmem_free(oldstate, MAXPATHLEN);
+}
 
 
@@ -835,63 +805,63 @@
 void
 rfs4_ss_chkclid(rfs4_client_t *cp)
 {
-	rfs4_oldstate_t *ost, *osp, *os_head;
-
-	/*
-	 * short circuit everything if we have no
-	 * oldstate!
-	 */
-	if (rfs4_oldstate == NULL) {
-		return;
-	}
+	rfs4_servinst_t *sip;
 
 	/*
-	 * if we are not in the grace_period then
-	 * we can destroy and mutilate all the old state.
+	 * It should be sufficient to check the oldstate data for just
+	 * this client's instance. However, since our per-instance
+	 * client grouping is solely temporal, HA-NFSv4 RG failover
+	 * might result in clients of the same RG being partitioned into
+	 * separate instances.
+	 *
+	 * Until the client grouping is improved, we must check the
+	 * oldstate data for all instances with an active grace period.
+	 *
+	 * This also serves as the mechanism to remove stale oldstate data.
+	 * The first time we check an instance after its grace period has
+	 * expired, the oldstate data should be cleared.
+	 *
+	 * Start at the current instance, and walk the list backwards
+	 * to the first.
 	 */
-	if (!rfs4_clnt_in_grace(cp)) {
-		rw_enter(&rfs4_oldstate_lock, RW_WRITER);
-		if (rfs4_oldstate == NULL) {
-			/*
-			 * some other thread is killing
-			 * the state so we get to just return.
-			 */
-			rw_exit(&rfs4_oldstate_lock);
-			return;
-		}
-
-		os_head = rfs4_oldstate;
-		rfs4_oldstate = NULL;
-		rw_exit(&rfs4_oldstate_lock);
-
-		/*
-		 * Now ditch the state files and structures
-		 * we've malloc()'d
-		 */
-		osp = os_head->next;
-
-		while (osp != os_head) {
-			if (osp->ss_pn != NULL) {
-				(void) vn_remove(osp->ss_pn->pn,
-						UIO_SYSSPACE, RMFILE);
-			}
-			ost = osp->next;
-			remque(osp);
-			rfs4_oldstate_free(osp);
-			osp = ost;
-		}
-		kmem_free(os_head, sizeof (rfs4_oldstate_t));
+	mutex_enter(&rfs4_servinst_lock);
+	for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) {
+		rfs4_ss_chkclid_sip(cp, sip);
+
+		/* if the above check found this client, we're done */
+		if (cp->can_reclaim)
+			break;
+	}
+	mutex_exit(&rfs4_servinst_lock);
+}
+
+static void
+rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip)
+{
+	rfs4_oldstate_t *osp, *os_head;
+
+	/* short circuit everything if this server instance has no oldstate */
+	rw_enter(&sip->oldstate_lock, RW_READER);
+	os_head = sip->oldstate;
+	rw_exit(&sip->oldstate_lock);
+	if (os_head == NULL)
 		return;
-	}
 
 	/*
-	 * we're still in grace, search for the clientid
+	 * If this server instance is no longer in a grace period then
+	 * the client won't be able to reclaim. No further need for this
+	 * instance's oldstate data, so it can be cleared.
 	 */
-	rw_enter(&rfs4_oldstate_lock, RW_READER);
-
-	os_head = rfs4_oldstate;
+	if (!rfs4_servinst_in_grace(sip))
+		return;
+
+	/* this instance is still in grace; search for the clientid */
+
+	rw_enter(&sip->oldstate_lock, RW_READER);
+
+	os_head = sip->oldstate;
+	/* skip dummy entry */
 	osp = os_head->next;
-
 	while (osp != os_head) {
 		if (osp->cl_id4.id_len == cp->nfs_client.id_len) {
 			if (bcmp(osp->cl_id4.id_val, cp->nfs_client.id_val,
@@ -903,25 +873,19 @@
 		osp = osp->next;
 	}
 
-	rw_exit(&rfs4_oldstate_lock);
+	rw_exit(&sip->oldstate_lock);
 }
 
 /*
- * Place client information into stable storage.
+ * Place client information into stable storage: 1/3.
+ * First, generate the leaf filename, from the client's IP address and
+ * the server-generated short-hand clientid.
  */
 void
 rfs4_ss_clid(rfs4_client_t *cp, struct svc_req *req)
 {
 	const char *kinet_ntop6(uchar_t *, char *, size_t);
-
-	nfs_client_id4		*cl_id4;
-	rfs4_ss_pn_t *ss_pn;
 	char leaf[MAXNAMELEN], buf[INET6_ADDRSTRLEN];
-	vnode_t *vp;
-	struct uio uio;
-	struct iovec iov[4];
-	int file_vers = NFS4_SS_VERSION;
-	int ioflag;
 	struct sockaddr *ca;
 	uchar_t *b;
 
@@ -959,10 +923,70 @@
 
 	(void) snprintf(leaf, MAXNAMELEN, "%s-%llx", buf,
 	    (longlong_t)cp->clientid);
-
-	if ((ss_pn = rfs4_ss_pnalloc(NFS4_STATE_DIR, leaf)) == NULL) {
+	rfs4_ss_clid_write(cp, leaf);
+}
+
+/*
+ * Place client information into stable storage: 2/3.
+ * DSS: distributed stable storage: the file may need to be written to
+ * multiple directories.
+ */
+static void
+rfs4_ss_clid_write(rfs4_client_t *cp, char *leaf)
+{
+	rfs4_servinst_t *sip;
+
+	/*
+	 * It should be sufficient to write the leaf file to (all) DSS paths
+	 * associated with just this client's instance. However, since our
+	 * per-instance client grouping is solely temporal, HA-NFSv4 RG
+	 * failover might result in us losing DSS data.
+	 *
+	 * Until the client grouping is improved, we must write the DSS data
+	 * to all instances' paths. Start at the current instance, and
+	 * walk the list backwards to the first.
+	 */
+	mutex_enter(&rfs4_servinst_lock);
+	for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) {
+		int i, npaths = sip->dss_npaths;
+
+		/* write the leaf file to all DSS paths */
+		for (i = 0; i < npaths; i++) {
+			rfs4_dss_path_t *dss_path = sip->dss_paths[i];
+
+			/* HA-NFSv4 path might have been failed-away from us */
+			if (dss_path == NULL)
+				continue;
+
+			rfs4_ss_clid_write_one(cp, dss_path->path, leaf);
+		}
+	}
+	mutex_exit(&rfs4_servinst_lock);
+}
+
+/*
+ * Place client information into stable storage: 3/3.
+ * Write the stable storage data to the requested file.
+ */
+static void
+rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dss_path, char *leaf)
+{
+	int ioflag;
+	int file_vers = NFS4_SS_VERSION;
+	struct uio uio;
+	struct iovec iov[4];
+	char *dir;
+	rfs4_ss_pn_t *ss_pn;
+	vnode_t *vp;
+	nfs_client_id4 *cl_id4 = &(cp->nfs_client);
+
+	/* allow 2 extra bytes for '/' & NUL */
+	dir = kmem_alloc(strlen(dss_path) + strlen(NFS4_DSS_STATE_LEAF) + 2,
+	    KM_SLEEP);
+	(void) sprintf(dir, "%s/%s", dss_path, NFS4_DSS_STATE_LEAF);
+
+	if ((ss_pn = rfs4_ss_pnalloc(dir, leaf)) == NULL)
 		return;
-	}
 
 	if (vn_open(ss_pn->pn, UIO_SYSSPACE, FCREAT|FWRITE, 0600, &vp,
 			    CRCREAT, 0)) {
@@ -970,19 +994,31 @@
 		return;
 	}
 
-	if (cp->ss_pn)
-		rfs4_ss_pnfree(cp->ss_pn);
-
-	cp->ss_pn = ss_pn;
-
-	cl_id4 = &(cp->nfs_client);
+	/*
+	 * We need to record leaf - i.e. the filename - so that we know
+	 * what to remove, in the future. However, the dir part of cp->ss_pn
+	 * should never be referenced directly, since it's potentially only
+	 * one of several paths with this leaf in it.
+	 */
+	if (cp->ss_pn != NULL) {
+		if (strcmp(cp->ss_pn->leaf, leaf) == 0) {
+			/* we've already recorded *this* leaf */
+			rfs4_ss_pnfree(ss_pn);
+		} else {
+			/* replace with this leaf */
+			rfs4_ss_pnfree(cp->ss_pn);
+			cp->ss_pn = ss_pn;
+		}
+	} else {
+		cp->ss_pn = ss_pn;
+	}
 
 	/*
 	 * Build a scatter list that points to the nfs_client_id4
 	 */
 	iov[0].iov_base = (caddr_t)&file_vers;
 	iov[0].iov_len = sizeof (int);
-	iov[1].iov_base = (caddr_t)cl_id4;
+	iov[1].iov_base = (caddr_t)&(cl_id4->verifier);
 	iov[1].iov_len = NFS4_VERIFIER_SIZE;
 	iov[2].iov_base = (caddr_t)&(cl_id4->id_len);
 	iov[2].iov_len = sizeof (uint_t);
@@ -1010,6 +1046,45 @@
 }
 
 /*
+ * DSS: distributed stable storage.
+ * Unpack the list of paths passed by nfsd.
+ * Use nvlist_alloc(9F) to manage the data.
+ * The caller is responsible for allocating and freeing the buffer.
+ */
+int
+rfs4_dss_setpaths(char *buf, size_t buflen)
+{
+	int error;
+
+	/*
+	 * Warm start (we previously had DSS paths): destroy the nvlist kept
+	 * from the warm start before this one, then hand the current list
+	 * over to rfs4_dss_oldpaths.  Clear rfs4_dss_paths afterwards so a
+	 * failed unpack below cannot leave both globals naming one nvlist,
+	 * which the paired nvlist_free() calls at teardown would free twice.
+	 */
+	if (rfs4_dss_paths != NULL) {
+		if (rfs4_dss_oldpaths)
+			nvlist_free(rfs4_dss_oldpaths);
+		rfs4_dss_oldpaths = rfs4_dss_paths;
+		rfs4_dss_paths = NULL;
+	}
+
+	/* unpack the buffer into a searchable nvlist */
+	error = nvlist_unpack(buf, buflen, &rfs4_dss_paths, KM_SLEEP);
+	if (error)
+		return (error);
+
+	/*
+	 * Search the nvlist for the pathnames nvpair (which is the only nvpair
+	 * in the list), and record its location.
+	 */
+	error = nvlist_lookup_string_array(rfs4_dss_paths, NFS4_DSS_NVPAIR_NAME,
+	    &rfs4_dss_newpaths, &rfs4_dss_numnewpaths);
+	return (error);
+}
+
+/*
  * Ultimately the nfssys() call NFS4_CLR_STATE endsup here
  * to find and mark the client for forced expire.
  */
@@ -1089,6 +1164,7 @@
 {
 	int start_grace;
 	extern boolean_t rfs4_cpr_callb(void *, int);
+	char *dss_path = NFS4_DSS_VAR_DIR;
 
 	mutex_enter(&rfs4_state_lock);
 
@@ -1114,6 +1190,9 @@
 	else
 		rfs4_start_time++;
 
+	/* DSS: distributed stable storage: initialise served paths list */
+	rfs4_dss_pathlist = NULL;
+
 	/*
 	 * Create the first server instance, or a new one if the server has
 	 * been restarted; see above comments on rfs4_start_time. Don't
@@ -1121,7 +1200,7 @@
 	 * clients' recovery window.
 	 */
 	start_grace = 0;
-	rfs4_servinst_create(start_grace);
+	rfs4_servinst_create(start_grace, 1, &dss_path);
 
 	/* reset the "first NFSv4 request" status */
 	rfs4_seen_first_compound = 0;
@@ -1355,6 +1434,13 @@
 
 	/* reset the "first NFSv4 request" status */
 	rfs4_seen_first_compound = 0;
+
+	/* DSS: distributed stable storage */
+	if (rfs4_dss_oldpaths)
+		nvlist_free(rfs4_dss_oldpaths);
+	if (rfs4_dss_paths)
+		nvlist_free(rfs4_dss_paths);
+	rfs4_dss_paths = rfs4_dss_oldpaths = NULL;
 }
 
 typedef union {
@@ -1455,11 +1541,50 @@
 	cp_expired = (cp->forced_expire ||
 		(gethrestime_sec() - cp->last_access
 			> rfs4_lease_time));
+
 	if (!cp->ss_remove && cp_expired)
 		cp->ss_remove = 1;
 	return (cp_expired);
 }
 
+/*
+ * Remove the leaf file from all distributed stable storage paths.
+ */
+static void
+rfs4_dss_remove_cpleaf(rfs4_client_t *cp)
+{
+	char *leaf = cp->ss_pn->leaf;
+
+	rfs4_dss_remove_leaf(cp->server_instance, NFS4_DSS_STATE_LEAF, leaf);
+}
+
+static void
+rfs4_dss_remove_leaf(rfs4_servinst_t *sip, char *dir_leaf, char *leaf)
+{
+	int i, npaths = sip->dss_npaths;
+
+	for (i = 0; i < npaths; i++) {
+		rfs4_dss_path_t *dss_path = sip->dss_paths[i];
+		char *path, *dir;
+		size_t pathlen;
+
+		/* the HA-NFSv4 path might have been failed-over away from us */
+		if (dss_path == NULL)
+			continue;
+
+		dir = dss_path->path;
+
+		/* allow 3 extra bytes for two '/' & a NUL */
+		pathlen = strlen(dir) + strlen(dir_leaf) + strlen(leaf) + 3;
+		path = kmem_alloc(pathlen, KM_SLEEP);
+		(void) sprintf(path, "%s/%s/%s", dir, dir_leaf, leaf);
+
+		(void) vn_remove(path, UIO_SYSSPACE, RMFILE);
+
+		kmem_free(path, pathlen);
+	}
+}
+
 static void
 rfs4_client_destroy(rfs4_entry_t u_entry)
 {
@@ -1476,12 +1601,9 @@
 		rfs4_client_rele(cp->cp_confirmed);
 
 	if (cp->ss_pn) {
-		/*
-		 * check if the stable storage file needs
-		 * to be removed
-		 */
+		/* check if the stable storage files need to be removed */
 		if (cp->ss_remove)
-			(void) vn_remove(cp->ss_pn->pn, UIO_SYSSPACE, RMFILE);
+			rfs4_dss_remove_cpleaf(cp);
 		rfs4_ss_pnfree(cp->ss_pn);
 	}
 
--- a/usr/src/uts/common/fs/nfs/nfs_server.c	Mon May 22 15:34:31 2006 -0700
+++ b/usr/src/uts/common/fs/nfs/nfs_server.c	Mon May 22 15:43:31 2006 -0700
@@ -106,6 +106,9 @@
 
 char _depends_on[] = "misc/klmmod";
 
+/* for testing RG failover code path on non-Cluster system */
+int hanfsv4_force = 0;
+
 int
 _init(void)
 {
@@ -125,7 +128,19 @@
 		nfs_srvfini();
 	}
 
+	/*
+	 * Initialise some placeholders for nfssys() calls. These have
+	 * to be declared by the nfs module, since that handles nfssys()
+	 * calls - also used by NFS clients - but are provided by this
+	 * nfssrv module. These also then serve as confirmation to the
+	 * relevant code in nfs that nfssrv has been loaded, as they're
+	 * initially NULL.
+	 */
 	nfs_srv_quiesce_func = nfs_srv_quiesce_all;
+	nfs_srv_dss_func = rfs4_dss_setpaths;
+
+	/* setup DSS paths here; must be done before initial server startup */
+	rfs4_dss_paths = rfs4_dss_oldpaths = NULL;
 
 	return (status);
 }
@@ -166,6 +181,7 @@
 static void	common_dispatch(struct svc_req *, SVCXPRT *,
 		rpcvers_t, rpcvers_t, char *,
 		struct rpc_disptable *);
+static void	hanfsv4_failover(void);
 static	int	checkauth(struct exportinfo *, struct svc_req *, cred_t *, int,
 			bool_t);
 static char	*client_name(struct svc_req *req);
@@ -241,6 +257,12 @@
 static kmutex_t nfs_server_upordown_lock;
 static	kcondvar_t nfs_server_upordown_cv;
 
+/*
+ * DSS: distributed stable storage
+ * lists of all DSS paths: current, and before last warmstart
+ */
+nvlist_t *rfs4_dss_paths, *rfs4_dss_oldpaths;
+
 int rfs4_dispatch(struct rpcdisp *, struct svc_req *, SVCXPRT *, char *);
 
 /*
@@ -298,6 +320,11 @@
 			nfs_server_upordown == NFS_SERVER_OFFLINE) {
 			nfs_server_upordown = NFS_SERVER_QUIESCED;
 			cv_signal(&nfs_server_upordown_cv);
+
+			/* reset DSS state, for subsequent warm restart */
+			rfs4_dss_numnewpaths = 0;
+			rfs4_dss_newpaths = NULL;
+
 			cmn_err(CE_NOTE, "nfs_server: server is now quiesced; "
 			    "NFSv4 state has been preserved");
 		}
@@ -458,7 +485,7 @@
 
 	releasef(STRUCT_FGET(uap, fd));
 
-	/* save the cluster nodeid */
+	/* HA-NFSv4: save the cluster nodeid */
 	if (cluster_bootflags & CLUSTER_BOOTED)
 		lm_global_nlmid = clconf_get_nodeid();
 
@@ -489,28 +516,20 @@
 
 			/* is this an nfsd warm start? */
 			if (nfs_server_upordown == NFS_SERVER_QUIESCED) {
-				int start_grace;
-
 				cmn_err(CE_NOTE, "nfs_server: "
 				    "server was previously quiesced; "
 				    "existing NFSv4 state will be re-used");
 
 				/*
-				 * Cluster: this is also the signal that
-				 * a failover has occurred, so create a new
-				 * server instance, and start its grace period.
-				 * We also need to reset all currently
-				 * active grace periods in case of multiple
-				 * failovers within the grace duration,
-				 * to avoid partitioning clients of the same
-				 * resource into different instances.
+				 * HA-NFSv4: this is also the signal
+				 * that a Resource Group failover has
+				 * occurred.
 				 */
-				if (cluster_bootflags & CLUSTER_BOOTED) {
-					rfs4_grace_reset_all();
-					start_grace = 1;
-					rfs4_servinst_create(start_grace);
-				}
+				if (cluster_bootflags & CLUSTER_BOOTED ||
+				    hanfsv4_force)
+					hanfsv4_failover();
 			} else {
+				/* cold start */
 				rfs4_state_init();
 				nfs4_drc = rfs4_init_drc(nfs4_drc_max,
 							nfs4_drc_hash,
@@ -2836,3 +2855,160 @@
 
 	return (error);
 }
+
+/*
+ * Do the main work of handling HA-NFSv4 Resource Group failover on
+ * Sun Cluster.
+ * We need to detect whether any RG admin paths have been added or removed,
+ * and adjust resources accordingly.
+ * Currently we're using a very inefficient algorithm, ~ 2 * O(n**2). In
+ * order to scale, the list and array of paths need to be held in more
+ * suitable data structures.
+ */
+static void
+hanfsv4_failover(void)
+{
+	int i, start_grace, numadded_paths = 0;
+	char **added_paths = NULL;
+	rfs4_dss_path_t *dss_path;
+
+	/*
+	 * First, look for removed paths: RGs that have been failed-over
+	 * away from this node.
+	 * Walk the "currently-serving" rfs4_dss_pathlist and, for each
+	 * path, check if it is on the "passed-in" rfs4_dss_newpaths array
+	 * from nfsd. If not, that RG path has been removed.
+	 *
+	 * Note that nfsd has sorted rfs4_dss_newpaths for us, and removed
+	 * any duplicates.
+	 */
+	dss_path = rfs4_dss_pathlist;
+	do {
+		int found = 0;
+		char *path = dss_path->path;
+
+		/* used only for non-HA so may not be removed */
+		if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) {
+			dss_path = dss_path->next;
+			continue;
+		}
+
+		for (i = 0; i < rfs4_dss_numnewpaths; i++) {
+			int cmpret;
+			size_t ncmp;
+			char *newpath = rfs4_dss_newpaths[i];
+
+			ncmp = MAX(strlen(path), strlen(newpath));
+			cmpret = strncmp(path, newpath, ncmp);
+
+			/*
+			 * Since nfsd has sorted rfs4_dss_newpaths for us,
+			 * once the return from strncmp is negative we know
+			 * we've passed the point where "path" should be,
+			 * and can stop searching: "path" has been removed.
+			 */
+			if (cmpret < 0)
+				break;
+
+			if (cmpret == 0) {
+				found = 1;
+				break;
+			}
+		}
+
+		if (found == 0) {
+			unsigned index = dss_path->index;
+			rfs4_servinst_t *sip = dss_path->sip;
+			rfs4_dss_path_t *path_next = dss_path->next;
+
+			/*
+			 * This path has been removed.
+			 * We must clear out the servinst reference to
+			 * it, since it's now owned by another
+			 * node: we should not attempt to touch it.
+			 */
+			ASSERT(dss_path == sip->dss_paths[index]);
+			sip->dss_paths[index] = NULL;
+
+			/* remove from "currently-serving" list, and destroy */
+			remque(dss_path);
+			kmem_free(dss_path, sizeof (rfs4_dss_path_t));
+
+			dss_path = path_next;
+		} else {
+			/* path was found; not removed */
+			dss_path = dss_path->next;
+		}
+	} while (dss_path != rfs4_dss_pathlist);
+
+	/*
+	 * Now, look for added paths: RGs that have been failed-over to us.
+	 * Walk the "passed-in" rfs4_dss_newpaths array from nfsd and,
+	 * for each path, check if it is on the "currently-serving"
+	 * rfs4_dss_pathlist. If not, that RG path has been added.
+	 * The match must be exact: a prefix test would wrongly treat
+	 * e.g. "/rg10" as already served whenever "/rg1" is listed.
+	 *
+	 * Note: we don't do duplicate detection here; nfsd does that for us.
+	 * Note: numadded_paths <= rfs4_dss_numnewpaths, which gives us
+	 * an upper bound for the size needed for added_paths[numadded_paths].
+	 */
+
+	/* probably more space than we need, but guaranteed to be enough */
+	if (rfs4_dss_numnewpaths > 0) {
+		size_t sz = rfs4_dss_numnewpaths * sizeof (char *);
+		added_paths = kmem_zalloc(sz, KM_SLEEP);
+	}
+
+	/* walk the "passed-in" rfs4_dss_newpaths array from nfsd */
+	for (i = 0; i < rfs4_dss_numnewpaths; i++) {
+		int found = 0;
+		char *newpath = rfs4_dss_newpaths[i];
+
+		dss_path = rfs4_dss_pathlist;
+		do {
+			char *path = dss_path->path;
+
+			/* used only for non-HA */
+			if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) {
+				dss_path = dss_path->next;
+				continue;
+			}
+
+			if (strcmp(path, newpath) == 0) {
+				found = 1;
+				break;
+			}
+
+			dss_path = dss_path->next;
+		} while (dss_path != rfs4_dss_pathlist);
+
+		if (found == 0) {
+			added_paths[numadded_paths] = newpath;
+			numadded_paths++;
+		}
+	}
+
+	/* did we find any added paths? */
+	if (numadded_paths > 0) {
+		/* create a new server instance, and start its grace period */
+		start_grace = 1;
+		rfs4_servinst_create(start_grace, numadded_paths, added_paths);
+
+		/* read in the stable storage state from these paths */
+		rfs4_dss_readstate(numadded_paths, added_paths);
+
+		/*
+		 * Multiple failovers during a grace period will cause
+		 * clients of the same resource group to be partitioned
+		 * into different server instances, with different
+		 * grace periods.  Since clients of the same resource
+		 * group must be subject to the same grace period,
+		 * we need to reset all currently active grace periods.
+		 */
+		rfs4_grace_reset_all();
+	}
+
+	if (rfs4_dss_numnewpaths > 0)
+		kmem_free(added_paths, rfs4_dss_numnewpaths * sizeof (char *));
+}
--- a/usr/src/uts/common/fs/nfs/nfs_sys.c	Mon May 22 15:34:31 2006 -0700
+++ b/usr/src/uts/common/fs/nfs/nfs_sys.c	Mon May 22 15:43:31 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  *
  * Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
@@ -38,6 +37,7 @@
 #include <sys/policy.h>
 #include <sys/siginfo.h>
 #include <sys/proc.h>		/* for exit() declaration */
+#include <sys/kmem.h>
 #include <nfs/nfs4.h>
 #include <nfs/nfssys.h>
 #include <sys/thread.h>
@@ -70,6 +70,12 @@
 time_t rfs4_lease_time = RFS4_LEASETIME;
 time_t rfs4_grace_period = RFS4_LEASETIME;
 
+/* DSS: distributed stable storage */
+size_t nfs4_dss_buflen = 0;
+/* This is filled in by nfssrv:_init() */
+int (*nfs_srv_dss_func)(char *, size_t) = NULL;
+
+
 int
 nfssys(enum nfssys_op opcode, void *arg)
 {
@@ -182,22 +188,6 @@
 		break;
 	}
 
-	/* Request that NFS server quiesce on next shutdown */
-	case NFS_SVC_REQUEST_QUIESCE: {
-		int id;
-
-		/* check that nfssrv module is loaded */
-		if (nfs_srv_quiesce_func == NULL)
-			return (set_errno(ENOTSUP));
-
-		if (copyin(arg, &id, sizeof (id)))
-			return (set_errno(EFAULT));
-
-		error = svc_pool_control(id, SVCPSET_SHUTDOWN_PROC,
-		    (void *)nfs_srv_quiesce_func);
-		break;
-	}
-
 	case EXPORTFS: { /* export a file system */
 		STRUCT_DECL(exportfs_args, ea);
 
@@ -295,6 +285,22 @@
 		break;
 	}
 
+	/* Request that NFSv4 server quiesce on next shutdown */
+	case NFS4_SVC_REQUEST_QUIESCE: {
+		int id;
+
+		/* check that nfssrv module is loaded */
+		if (nfs_srv_quiesce_func == NULL)
+			return (set_errno(ENOTSUP));
+
+		if (copyin(arg, &id, sizeof (id)))
+			return (set_errno(EFAULT));
+
+		error = svc_pool_control(id, SVCPSET_SHUTDOWN_PROC,
+		    (void *)nfs_srv_quiesce_func);
+		break;
+	}
+
 	case NFS_IDMAP: {
 		struct nfsidmap_args idm;
 
@@ -306,6 +312,47 @@
 		break;
 	}
 
+	case NFS4_DSS_SETPATHS_SIZE: {
+		/* crosses ILP32/LP64 boundary */
+		uint32_t nfs4_dss_bufsize = 0;
+
+		if (copyin(arg, &nfs4_dss_bufsize, sizeof (nfs4_dss_bufsize)))
+			return (set_errno(EFAULT));
+		nfs4_dss_buflen = (long)nfs4_dss_bufsize;
+		error = 0;
+		break;
+	}
+
+	case NFS4_DSS_SETPATHS: {
+		char *nfs4_dss_bufp;
+
+		/* check that nfssrv module is loaded */
+		if (nfs_srv_dss_func == NULL)
+			return (set_errno(ENOTSUP));
+
+		/*
+		 * NFS4_DSS_SETPATHS_SIZE must be called before
+		 * NFS4_DSS_SETPATHS, to tell us how big a buffer we need
+		 * to allocate.
+		 */
+		if (nfs4_dss_buflen == 0)
+			return (set_errno(EINVAL));
+		nfs4_dss_bufp = kmem_alloc(nfs4_dss_buflen, KM_SLEEP);
+		if (nfs4_dss_bufp == NULL)
+			return (set_errno(ENOMEM));
+
+		if (copyin(arg, nfs4_dss_bufp, nfs4_dss_buflen)) {
+			kmem_free(nfs4_dss_bufp, nfs4_dss_buflen);
+			return (set_errno(EFAULT));
+		}
+
+		/* unpack the buffer and extract the pathnames */
+		error = nfs_srv_dss_func(nfs4_dss_bufp, nfs4_dss_buflen);
+		kmem_free(nfs4_dss_bufp, nfs4_dss_buflen);
+
+		break;
+	}
+
 	default:
 		error = EINVAL;
 		break;
--- a/usr/src/uts/common/nfs/nfs.h	Mon May 22 15:34:31 2006 -0700
+++ b/usr/src/uts/common/nfs/nfs.h	Mon May 22 15:43:31 2006 -0700
@@ -44,6 +44,7 @@
 #include <sys/dirent.h>
 #include <sys/zone.h>
 #include <sys/tsol/label.h>
+#include <sys/nvpair.h>
 #include <nfs/mount.h>
 #endif
 #include <vm/page.h>
@@ -933,8 +934,12 @@
 extern void	nfs_srv_stop_all(void);
 extern void	nfs_srv_quiesce_all(void);
 extern void	(*nfs_srv_quiesce_func)(void);
+extern int	rfs4_dss_setpaths(char *, size_t);
+extern int	(*nfs_srv_dss_func)(char *, size_t);
 extern time_t	rfs4_lease_time;
 extern time_t	rfs4_grace_period;
+extern nvlist_t	*rfs4_dss_paths, *rfs4_dss_oldpaths;
+
 
 extern kstat_named_t	*global_svstat_ptr[];
 
--- a/usr/src/uts/common/nfs/nfs4.h	Mon May 22 15:34:31 2006 -0700
+++ b/usr/src/uts/common/nfs/nfs4.h	Mon May 22 15:43:31 2006 -0700
@@ -36,6 +36,7 @@
 
 #ifdef _KERNEL
 #include <nfs/nfs4_kprot.h>
+#include <sys/nvpair.h>
 #else
 #include <rpcsvc/nfs4_prot.h>
 #endif
@@ -324,17 +325,46 @@
  *
  * Currently used only for Sun Cluster HA-NFS support, to group clients
  * on NFS resource failover so each set of clients gets its own dedicated
- * grace period.
+ * grace period and distributed stable storage data.
  */
 typedef struct rfs4_servinst {
+	int			dss_npaths;
 	krwlock_t		rwlock;
+	krwlock_t		oldstate_lock;
 	time_t			start_time;
 	time_t			grace_period;
+	rfs4_oldstate_t		*oldstate;
+	struct rfs4_dss_path	**dss_paths;
 	struct rfs4_servinst	*next;
 	struct rfs4_servinst	*prev;
 } rfs4_servinst_t;
 
 /*
+ * DSS: distributed stable storage
+ */
+
+typedef struct rfs4_dss_path {
+	struct rfs4_dss_path	*next; /* for insque/remque */
+	struct rfs4_dss_path	*prev; /* for insque/remque */
+	char			*path;
+	struct rfs4_servinst	*sip;
+	unsigned		index; /* offset in servinst's array */
+} rfs4_dss_path_t;
+
+/* array of paths passed-in from nfsd command-line; stored in nvlist */
+char		**rfs4_dss_newpaths;
+uint_t		rfs4_dss_numnewpaths;
+
+/*
+ * Circular doubly-linked list of paths for currently-served RGs.
+ * No locking required: only changed on warmstart. Managed with insque/remque.
+ */
+rfs4_dss_path_t	*rfs4_dss_pathlist;
+
+/* nvlists of all DSS paths: current, and before last warmstart */
+nvlist_t *rfs4_dss_paths, *rfs4_dss_oldpaths;
+
+/*
  * List declarations (suitable for insque/remque) used to link the
  * various datastructs listed below.
  */
@@ -712,12 +742,11 @@
 	krwlock_t	file_rwlock;
 } rfs4_file_t;
 
-extern int	rfs4_servinst_debug;
 extern int	rfs4_seen_first_compound;	/* set first time we see one */
 
 extern rfs4_servinst_t	*rfs4_cur_servinst;	/* current server instance */
 extern kmutex_t		rfs4_servinst_lock;	/* protects linked list */
-extern void		rfs4_servinst_create(int);
+extern void		rfs4_servinst_create(int, int, char **);
 extern void		rfs4_servinst_destroy_all(void);
 extern void		rfs4_servinst_assign(rfs4_client_t *,
 			    rfs4_servinst_t *);
@@ -728,6 +757,8 @@
 extern void		rfs4_grace_start(rfs4_servinst_t *);
 extern void		rfs4_grace_start_new(void);
 extern void		rfs4_grace_reset_all(void);
+extern void		rfs4_ss_oldstate(rfs4_oldstate_t *, char *, char *);
+extern void		rfs4_dss_readstate(int, char **);
 
 /*
  * rfs4_deleg_policy is used to signify the server's global delegation
--- a/usr/src/uts/common/nfs/nfssys.h	Mon May 22 15:34:31 2006 -0700
+++ b/usr/src/uts/common/nfs/nfssys.h	Mon May 22 15:43:31 2006 -0700
@@ -50,7 +50,8 @@
     OLD_NFS_CNVT, NFS_REVAUTH, OLD_NFS_FH_TO_FID, OLD_LM_SVC, KILL_LOCKMGR,
     LOG_FLUSH, SVCPOOL_CREATE, NFS_SVC, LM_SVC, SVCPOOL_WAIT, SVCPOOL_RUN,
     NFS4_SVC, RDMA_SVC_INIT, NFS4_CLR_STATE, NFS_IDMAP,
-    NFS_SVC_REQUEST_QUIESCE, NFS_GETFH };
+    NFS4_SVC_REQUEST_QUIESCE, NFS_GETFH, NFS4_DSS_SETPATHS,
+    NFS4_DSS_SETPATHS_SIZE };
 
 struct nfs_svc_args {
 	int		fd;		/* Connection endpoint */
@@ -294,6 +295,14 @@
 #define	NFS4_SETPORT	2
 #define	NFS4_DQUERY	4
 
+/* DSS: distributed stable storage */
+#define	NFS4_DSS_STATE_LEAF	"v4_state"
+#define	NFS4_DSS_OLDSTATE_LEAF	"v4_oldstate"
+#define	NFS4_DSS_DIR_MODE	0755
+#define	NFS4_DSS_NVPAIR_NAME	"dss_pathname_array"
+/* default storage dir */
+#define	NFS4_DSS_VAR_DIR	"/var/nfs"
+
 #ifdef _KERNEL
 
 #include <sys/systm.h>		/* for rval_t typedef */