usr/src/uts/common/fs/nfs/nfs4_dispatch.c
changeset 74 524df0e4e452
child 76 c6ba53ffbc0e
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/nfs/nfs4_dispatch.c	Fri Jun 24 19:50:32 2005 -0700
@@ -0,0 +1,525 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+
+#include <rpc/types.h>
+#include <rpc/auth.h>
+#include <rpc/auth_unix.h>
+#include <rpc/auth_des.h>
+#include <rpc/svc.h>
+#include <rpc/xdr.h>
+#include <nfs/nfs4.h>
+#include <nfs/nfs_dispatch.h>
+#include <nfs/nfs4_drc.h>
+
+/*
+ * This is the duplicate request cache for NFSv4
+ */
+rfs4_drc_t *nfs4_drc = NULL;
+
+/*
+ * How long the entry can remain in the cache
+ * once it has been sent to the client and not
+ * used in a reply (in seconds)
+ */
+unsigned nfs4_drc_lifetime = 1;
+
+/*
+ * The default size of the duplicate request cache
+ */
+uint32_t nfs4_drc_max = 8 * 1024;
+
+/*
+ * The number of buckets we'd like to hash the
+ * replies into.. do not change this on the fly.
+ */
+uint32_t nfs4_drc_hash = 541;
+
+/*
+ * Initialize a duplicate request cache.
+ */
+rfs4_drc_t *
+rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size, unsigned ttl)
+{
+	rfs4_drc_t *drc;
+	uint32_t   bki;
+
+	ASSERT(drc_size);
+	ASSERT(drc_hash_size);
+
+	drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
+
+	drc->max_size = drc_size;
+	drc->in_use = 0;
+	drc->drc_ttl = ttl;
+
+	mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
+
+	drc->dr_hash = drc_hash_size;
+
+	drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
+
+	for (bki = 0; bki < drc_hash_size; bki++) {
+		list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
+		    offsetof(rfs4_dupreq_t, dr_bkt_next));
+	}
+
+	list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
+		    offsetof(rfs4_dupreq_t, dr_next));
+
+	return (drc);
+}
+
+/*
+ * Destroy a duplicate request cache.
+ */
+void
+rfs4_fini_drc(rfs4_drc_t *drc)
+{
+	rfs4_dupreq_t *drp, *drp_next;
+
+	ASSERT(drc);
+
+	/* iterate over the dr_cache and free the enties */
+	for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
+
+		if (drp->dr_state == NFS4_DUP_REPLAY)
+			rfs4_compound_free(&(drp->dr_res));
+
+		if (drp->dr_addr.buf != NULL)
+			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
+
+		drp_next = list_next(&(drc->dr_cache), drp);
+
+		kmem_free(drp, sizeof (rfs4_dupreq_t));
+	}
+
+	mutex_destroy(&drc->lock);
+	kmem_free(drc->dr_buckets,
+		sizeof (list_t)*drc->dr_hash);
+	kmem_free(drc, sizeof (rfs4_drc_t));
+}
+
+/*
+ * rfs4_dr_chstate:
+ *
+ * Change the state of a rfs4_dupreq. If it's not in transition
+ * to the FREE state, update the time used and return. If we
+ * are moving to the FREE state then we need to clean up the
+ * compound results and move the entry to the end of the list.
+ */
+void
+rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
+{
+	rfs4_drc_t *drc;
+
+	ASSERT(drp);
+	ASSERT(drp->drc);
+	ASSERT(drp->dr_bkt);
+	ASSERT(MUTEX_HELD(&drp->drc->lock));
+
+	drp->dr_state = new_state;
+
+	if (new_state != NFS4_DUP_FREE) {
+		gethrestime(&drp->dr_time_used);
+		return;
+	}
+
+	drc = drp->drc;
+
+	/*
+	 * Remove entry from the bucket and
+	 * dr_cache list, free compound results.
+	 */
+	list_remove(drp->dr_bkt, drp);
+	list_remove(&(drc->dr_cache), drp);
+	rfs4_compound_free(&(drp->dr_res));
+}
+
+/*
+ * rfs4_alloc_dr:
+ *
+ * Pick an entry off the tail -- Use if it is
+ * marked NFS4_DUP_FREE, or is an entry in the
+ * NFS4_DUP_REPLAY state that has timed-out...
+ * Otherwise malloc a new one if we have not reached
+ * our maximum cache limit.
+ *
+ * The list should be in time order, so no need
+ * to traverse backwards looking for a timed out
+ * entry, NFS4_DUP_FREE's are place on the tail.
+ */
+rfs4_dupreq_t *
+rfs4_alloc_dr(rfs4_drc_t *drc)
+{
+	rfs4_dupreq_t *drp_tail, *drp = NULL;
+
+	ASSERT(drc);
+	ASSERT(MUTEX_HELD(&drc->lock));
+
+	if ((drp_tail = list_tail(&drc->dr_cache)) != NULL) {
+
+		switch (drp_tail->dr_state) {
+
+		case NFS4_DUP_FREE:
+			list_remove(&(drc->dr_cache), drp_tail);
+			DTRACE_PROBE1(nfss__i__drc_freeclaim,
+					rfs4_dupreq_t *, drp_tail);
+			return (drp_tail);
+			/* NOTREACHED */
+
+		case NFS4_DUP_REPLAY:
+			if (gethrestime_sec() >
+			    drp_tail->dr_time_used.tv_sec+drc->drc_ttl) {
+				/* this entry has timedout so grab it. */
+				rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
+				DTRACE_PROBE1(nfss__i__drc_ttlclaim,
+					rfs4_dupreq_t *, drp_tail);
+				return (drp_tail);
+			}
+			break;
+		}
+	}
+
+	/*
+	 * Didn't find something to recycle have
+	 * we hit the cache limit ?
+	 */
+	if (drc->in_use >= drc->max_size) {
+		DTRACE_PROBE1(nfss__i__drc_full,
+			rfs4_drc_t *, drc);
+		return (NULL);
+	}
+
+
+	/* nope, so let's malloc a new one */
+	drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
+	drp->drc = drc;
+	drc->in_use++;
+	gethrestime(&drp->dr_time_created);
+	DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
+
+	return (drp);
+}
+
+/*
+ * rfs4_find_dr:
+ *
+ * Search for an entry in the duplicate request cache by
+ * calculating the hash index based on the XID, and examining
+ * the entries in the hash bucket. If we find a match stamp the
+ * time_used and return. If the entry does not match it could be
+ * ready to be freed. Once we have searched the bucket and we
+ * have not exhausted the maximum limit for the cache we will
+ * allocate a new entry.
+ */
+int
+rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
+{
+
+	uint32_t	the_xid;
+	list_t		*dr_bkt;
+	rfs4_dupreq_t	*drp;
+	int		bktdex;
+
+	/*
+	 * Get the XID, calculate the bucket and search to
+	 * see if we need to replay from the cache.
+	 */
+	the_xid = req->rq_xprt->xp_xid;
+	bktdex = the_xid % drc->dr_hash;
+
+	dr_bkt = (list_t *)
+		&(drc->dr_buckets[(the_xid % drc->dr_hash)]);
+
+	DTRACE_PROBE3(nfss__i__drc_bktdex,
+			int, bktdex,
+			uint32_t, the_xid,
+			list_t *, dr_bkt);
+
+	*dup = NULL;
+
+	mutex_enter(&drc->lock);
+	/*
+	 * Search the bucket for a matching xid and address.
+	 */
+	for (drp = list_head(dr_bkt); drp != NULL;
+		drp = list_next(dr_bkt, drp)) {
+
+		if (drp->dr_xid == the_xid &&
+		    drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
+		    bcmp((caddr_t)drp->dr_addr.buf,
+		    (caddr_t)req->rq_xprt->xp_rtaddr.buf,
+		    drp->dr_addr.len) == 0) {
+
+			/*
+			 * Found a match so REPLAY the Reply
+			 */
+			if (drp->dr_state == NFS4_DUP_REPLAY) {
+				gethrestime(&drp->dr_time_used);
+				mutex_exit(&drc->lock);
+				*dup = drp;
+				DTRACE_PROBE1(nfss__i__drc_replay,
+					rfs4_dupreq_t *, drp);
+				return (NFS4_DUP_REPLAY);
+			}
+
+			/*
+			 * This entry must be in transition, so return
+			 * the 'pending' status.
+			 */
+			mutex_exit(&drc->lock);
+			return (NFS4_DUP_PENDING);
+		}
+
+		/*
+		 * Not a match, but maybe this entry is ready
+		 * to be reused.
+		 */
+		if (drp->dr_state == NFS4_DUP_REPLAY &&
+			(gethrestime_sec() >
+			drp->dr_time_used.tv_sec+drc->drc_ttl)) {
+			rfs4_dr_chstate(drp, NFS4_DUP_FREE);
+			list_insert_tail(&(drp->drc->dr_cache), drp);
+		}
+	}
+
+	drp = rfs4_alloc_dr(drc);
+	mutex_exit(&drc->lock);
+
+	if (drp == NULL) {
+		return (NFS4_DUP_ERROR);
+	}
+
+	/*
+	 * Place at the head of the list, init the state
+	 * to NEW and clear the time used field.
+	 */
+
+	drp->dr_state = NFS4_DUP_NEW;
+	drp->dr_time_used.tv_sec = drp->dr_time_used.tv_nsec = 0;
+
+	/*
+	 * If needed, resize the address buffer
+	 */
+	if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
+		if (drp->dr_addr.buf != NULL)
+			kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
+		drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
+		drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
+		if (drp->dr_addr.buf == NULL) {
+			/*
+			 * If the malloc fails, mark the entry
+			 * as free and put on the tail.
+			 */
+			drp->dr_addr.maxlen = 0;
+			drp->dr_state = NFS4_DUP_FREE;
+			mutex_enter(&drc->lock);
+			list_insert_tail(&(drc->dr_cache), drp);
+			mutex_exit(&drc->lock);
+			return (NFS4_DUP_ERROR);
+		}
+	}
+
+
+	/*
+	 * Copy the address.
+	 */
+	drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
+
+	bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
+		(caddr_t)drp->dr_addr.buf,
+		drp->dr_addr.len);
+
+	drp->dr_xid = the_xid;
+	drp->dr_bkt = dr_bkt;
+
+	/*
+	 * Insert at the head of the bucket and
+	 * the drc lists..
+	 */
+	mutex_enter(&drc->lock);
+	list_insert_head(&drc->dr_cache, drp);
+	list_insert_head(dr_bkt, drp);
+	mutex_exit(&drc->lock);
+
+	*dup = drp;
+
+	return (NFS4_DUP_NEW);
+}
+
+/*
+ *
+ * This function handles the duplicate request cache,
+ * NULL_PROC and COMPOUND procedure calls for NFSv4;
+ *
+ * Passed into this function are:-
+ *
+ * 	disp	A pointer to our dispatch table entry
+ * 	req	The request to process
+ * 	xprt	The server transport handle
+ * 	ap	A pointer to the arguments
+ *
+ *
+ * When appropriate this function is responsible for inserting
+ * the reply into the duplicate cache or replaying an existing
+ * cached reply.
+ *
+ * dr_stat 	reflects the state of the duplicate request that
+ * 		has been inserted into or retrieved from the cache
+ *
+ * drp		is the duplicate request entry
+ *
+ */
+int
+rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
+		SVCXPRT *xprt, char *ap)
+{
+
+	COMPOUND4res res_buf, *rbp;
+	COMPOUND4args *cap;
+
+	cred_t 	*cr = NULL;
+	int	error = 0;
+	int 	dis_flags = 0;
+	int 	dr_stat = NFS4_NOT_DUP;
+	rfs4_dupreq_t *drp = NULL;
+
+	ASSERT(disp);
+
+	/*
+	 * Short circuit the RPC_NULL proc.
+	 */
+	if (disp->dis_proc == rpc_null) {
+		if (!svc_sendreply(xprt, xdr_void, NULL)) {
+			return (1);
+		}
+		return (0);
+	}
+
+	/* Only NFSv4 Compounds from this point onward */
+
+	rbp = &res_buf;
+	cap = (COMPOUND4args *)ap;
+
+	/*
+	 * Figure out the disposition of the whole COMPOUND
+	 * and record it's IDEMPOTENTCY.
+	 */
+	rfs4_compound_flagproc(cap, &dis_flags);
+
+	/*
+	 * If NON-IDEMPOTENT then we need to figure out if this
+	 * request can be replied from the duplicate cache.
+	 *
+	 * If this is a new request then we need to insert the
+	 * reply into the duplicate cache.
+	 */
+	if (!(dis_flags & RPC_IDEMPOTENT)) {
+		/* look for a replay from the cache or allocate */
+		dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);
+
+		switch (dr_stat) {
+
+		case NFS4_DUP_ERROR:
+			svcerr_systemerr(xprt);
+			return (1);
+			/* NOTREACHED */
+
+		case NFS4_DUP_PENDING:
+			/*
+			 * reply has previously been inserted into the
+			 * duplicate cache, however the reply has
+			 * not yet been sent via svc_sendreply()
+			 */
+			return (1);
+			/* NOTREACHED */
+
+		case NFS4_DUP_NEW:
+			curthread->t_flag |= T_DONTPEND;
+			/* NON-IDEMPOTENT proc call */
+			rfs4_compound(cap, rbp, NULL, req, cr);
+
+			curthread->t_flag &= ~T_DONTPEND;
+			if (curthread->t_flag & T_WOULDBLOCK) {
+				curthread->t_flag &= ~T_WOULDBLOCK;
+				/*
+				 * mark this entry as FREE and plop
+				 * on the end of the cache list
+				 */
+				mutex_enter(&drp->drc->lock);
+				rfs4_dr_chstate(drp, NFS4_DUP_FREE);
+				list_insert_tail(&(drp->drc->dr_cache), drp);
+				mutex_exit(&drp->drc->lock);
+				return (1);
+			}
+			drp->dr_res = res_buf;
+			break;
+
+		case NFS4_DUP_REPLAY:
+			/* replay from the cache */
+			rbp = &(drp->dr_res);
+			break;
+		}
+	} else {
+		curthread->t_flag |= T_DONTPEND;
+		/* IDEMPOTENT proc call */
+		rfs4_compound(cap, rbp, NULL, req, cr);
+
+		curthread->t_flag &= ~T_DONTPEND;
+		if (curthread->t_flag & T_WOULDBLOCK) {
+			curthread->t_flag &= ~T_WOULDBLOCK;
+			return (1);
+		}
+	}
+
+	/*
+	 * Send out the replayed reply or the 'real' one.
+	 */
+	if (!svc_sendreply(xprt,  xdr_COMPOUND4res, (char *)rbp)) {
+		DTRACE_PROBE2(nfss__e__dispatch_sendfail,
+			struct svc_req *, xprt,
+			char *, rbp);
+		error++;
+	}
+
+	/*
+	 * If this reply was just inserted into the duplicate cache
+	 * mark it as available for replay
+	 */
+	if (dr_stat == NFS4_DUP_NEW) {
+		mutex_enter(&drp->drc->lock);
+		rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
+		mutex_exit(&drp->drc->lock);
+	} else if (dr_stat == NFS4_NOT_DUP) {
+		rfs4_compound_free(rbp);
+	}
+
+	return (error);
+}