upstream/illumos/illumos-gate: usr/src/uts/common/fs/cachefs/cachefs


/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/mman.h>
#include <sys/tiuser.h>
#include <sys/pathname.h>
#include <sys/dirent.h>
#include <sys/conf.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/disp.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/mount.h>
#include <sys/bootconf.h>
#include <sys/dnlc.h>
#include <sys/stat.h>
#include <sys/acl.h>
#include <sys/policy.h>
#include <rpc/types.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <sys/fs/cachefs_fs.h>
#include <sys/fs/cachefs_dir.h>
#include <sys/fs/cachefs_dlog.h>
#include <sys/fs/cachefs_ioctl.h>
#include <sys/fs/cachefs_log.h>
#include <fs/fs_subr.h>

int cachefs_dnlc;	/* use dnlc, debugging */

static void cachefs_attr_setup(vattr_t *srcp, vattr_t *targp, cnode_t *cp,
    cred_t *cr);
static void cachefs_creategid(cnode_t *dcp, cnode_t *newcp, vattr_t *vap,
    cred_t *cr);
static void cachefs_createacl(cnode_t *dcp, cnode_t *newcp);
static int cachefs_getaclfromcache(cnode_t *cp, vsecattr_t *vsec);
static int cachefs_getacldirvp(cnode_t *cp);
static void cachefs_acl2perm(cnode_t *cp, vsecattr_t *vsec);
static int cachefs_access_local(void *cp, int mode, cred_t *cr);
static int cachefs_acl_access(struct cnode *cp, int mode, cred_t *cr);
static int cachefs_push_connected(vnode_t *vp, struct buf *bp, size_t iolen,
    u_offset_t iooff, cred_t *cr);
static int cachefs_push_front(vnode_t *vp, struct buf *bp, size_t iolen,
    u_offset_t iooff, cred_t *cr);
static int cachefs_setattr_connected(vnode_t *vp, vattr_t *vap, int flags,
    cred_t *cr, caller_context_t *ct);
static int cachefs_setattr_disconnected(vnode_t *vp, vattr_t *vap,
    int flags, cred_t *cr, caller_context_t *ct);
static int cachefs_access_connected(struct vnode *vp, int mode,
    int flags, cred_t *cr);
static int cachefs_lookup_back(vnode_t *dvp, char *nm, vnode_t **vpp,
    cred_t *cr);
static int cachefs_symlink_connected(vnode_t *dvp, char *lnm, vattr_t *tva,
    char *tnm, cred_t *cr);
static int cachefs_symlink_disconnected(vnode_t *dvp, char *lnm,
    vattr_t *tva, char *tnm, cred_t *cr);
static int cachefs_link_connected(vnode_t *tdvp, vnode_t *fvp, char *tnm,
    cred_t *cr);
static int cachefs_link_disconnected(vnode_t *tdvp, vnode_t *fvp,
    char *tnm, cred_t *cr);
static int cachefs_mkdir_connected(vnode_t *dvp, char *nm, vattr_t *vap,
    vnode_t **vpp, cred_t *cr);
static int cachefs_mkdir_disconnected(vnode_t *dvp, char *nm, vattr_t *vap,
    vnode_t **vpp, cred_t *cr);
static int cachefs_stickyrmchk(struct cnode *dcp, struct cnode *cp, cred_t *cr);
static int cachefs_rmdir_connected(vnode_t *dvp, char *nm,
    vnode_t *cdir, cred_t *cr, vnode_t *vp);
static int cachefs_rmdir_disconnected(vnode_t *dvp, char *nm,
    vnode_t *cdir, cred_t *cr, vnode_t *vp);
static char *cachefs_newname(void);
static int cachefs_remove_dolink(vnode_t *dvp, vnode_t *vp, char *nm,
    cred_t *cr);
static int cachefs_rename_connected(vnode_t *odvp, char *onm,
    vnode_t *ndvp, char *nnm, cred_t *cr, vnode_t *delvp);
static int cachefs_rename_disconnected(vnode_t *odvp, char *onm,
    vnode_t *ndvp, char *nnm, cred_t *cr, vnode_t *delvp);
static int cachefs_readdir_connected(vnode_t *vp, uio_t *uiop, cred_t *cr,
    int *eofp);
static int cachefs_readdir_disconnected(vnode_t *vp, uio_t *uiop,
    cred_t *cr, int *eofp);
static int cachefs_readback_translate(cnode_t *cp, uio_t *uiop,
	cred_t *cr, int *eofp);

static int cachefs_setattr_common(vnode_t *vp, vattr_t *vap, int flags,
    cred_t *cr, caller_context_t *ct);

static	int	cachefs_open(struct vnode **, int, cred_t *,
			caller_context_t *);
static	int	cachefs_close(struct vnode *, int, int, offset_t,
			cred_t *, caller_context_t *);
static	int	cachefs_read(struct vnode *, struct uio *, int, cred_t *,
			caller_context_t *);
static	int	cachefs_write(struct vnode *, struct uio *, int, cred_t *,
			caller_context_t *);
static	int	cachefs_ioctl(struct vnode *, int, intptr_t, int, cred_t *,
			int *, caller_context_t *);
static	int	cachefs_getattr(struct vnode *, struct vattr *, int,
			cred_t *, caller_context_t *);
static	int	cachefs_setattr(struct vnode *, struct vattr *,
			int, cred_t *, caller_context_t *);
static	int	cachefs_access(struct vnode *, int, int, cred_t *,
			caller_context_t *);
static	int	cachefs_lookup(struct vnode *, char *, struct vnode **,
			struct pathname *, int, struct vnode *, cred_t *,
			caller_context_t *, int *, pathname_t *);
static	int	cachefs_create(struct vnode *, char *, struct vattr *,
			enum vcexcl, int, struct vnode **, cred_t *, int,
			caller_context_t *, vsecattr_t *);
static	int	cachefs_create_connected(vnode_t *dvp, char *nm,
			vattr_t *vap, enum vcexcl exclusive, int mode,
			vnode_t **vpp, cred_t *cr);
static	int	cachefs_create_disconnected(vnode_t *dvp, char *nm,
			vattr_t *vap, enum vcexcl exclusive, int mode,
			vnode_t **vpp, cred_t *cr);
static	int	cachefs_remove(struct vnode *, char *, cred_t *,
			caller_context_t *, int);
static	int	cachefs_link(struct vnode *, struct vnode *, char *,
			cred_t *, caller_context_t *, int);
static	int	cachefs_rename(struct vnode *, char *, struct vnode *,
			char *, cred_t *, caller_context_t *, int);
static	int	cachefs_mkdir(struct vnode *, char *, struct
			vattr *, struct vnode **, cred_t *, caller_context_t *,
			int, vsecattr_t *);
static	int	cachefs_rmdir(struct vnode *, char *, struct vnode *,
			cred_t *, caller_context_t *, int);
static	int	cachefs_readdir(struct vnode *, struct uio *,
			cred_t *, int *, caller_context_t *, int);
static	int	cachefs_symlink(struct vnode *, char *, struct vattr *,
			char *, cred_t *, caller_context_t *, int);
static	int	cachefs_readlink(struct vnode *, struct uio *, cred_t *,
			caller_context_t *);
static int cachefs_readlink_connected(vnode_t *vp, uio_t *uiop, cred_t *cr);
static int cachefs_readlink_disconnected(vnode_t *vp, uio_t *uiop);
static	int	cachefs_fsync(struct vnode *, int, cred_t *,
			caller_context_t *);
static	void	cachefs_inactive(struct vnode *, cred_t *, caller_context_t *);
static	int	cachefs_fid(struct vnode *, struct fid *, caller_context_t *);
static	int	cachefs_rwlock(struct vnode *, int, caller_context_t *);
static	void	cachefs_rwunlock(struct vnode *, int, caller_context_t *);
static	int	cachefs_seek(struct vnode *, offset_t, offset_t *,
			caller_context_t *);
static	int	cachefs_frlock(struct vnode *, int, struct flock64 *,
			int, offset_t, struct flk_callback *, cred_t *,
			caller_context_t *);
static	int	cachefs_space(struct vnode *, int, struct flock64 *, int,
			offset_t, cred_t *, caller_context_t *);
static	int	cachefs_realvp(struct vnode *, struct vnode **,
			caller_context_t *);
static	int	cachefs_getpage(struct vnode *, offset_t, size_t, uint_t *,
			struct page *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *, caller_context_t *);
static	int	cachefs_getapage(struct vnode *, u_offset_t, size_t, uint_t *,
			struct page *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static	int	cachefs_getapage_back(struct vnode *, u_offset_t, size_t,
		uint_t *, struct page *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static	int	cachefs_putpage(struct vnode *, offset_t, size_t, int,
			cred_t *, caller_context_t *);
static	int	cachefs_map(struct vnode *, offset_t, struct as *,
			caddr_t *, size_t, uchar_t, uchar_t, uint_t, cred_t *,
			caller_context_t *);
static	int	cachefs_addmap(struct vnode *, offset_t, struct as *,
			caddr_t, size_t, uchar_t, uchar_t, uint_t, cred_t *,
			caller_context_t *);
static	int	cachefs_delmap(struct vnode *, offset_t, struct as *,
			caddr_t, size_t, uint_t, uint_t, uint_t, cred_t *,
			caller_context_t *);
static int	cachefs_setsecattr(vnode_t *vp, vsecattr_t *vsec,
			int flag, cred_t *cr, caller_context_t *);
static int	cachefs_getsecattr(vnode_t *vp, vsecattr_t *vsec,
			int flag, cred_t *cr, caller_context_t *);
static	int	cachefs_shrlock(vnode_t *, int, struct shrlock *, int,
			cred_t *, caller_context_t *);
static int cachefs_getsecattr_connected(vnode_t *vp, vsecattr_t *vsec, int flag,
    cred_t *cr);
static int cachefs_getsecattr_disconnected(vnode_t *vp, vsecattr_t *vsec,
    int flag, cred_t *cr);

static int	cachefs_dump(struct vnode *, caddr_t, offset_t, offset_t,
			caller_context_t *);
static int	cachefs_pageio(struct vnode *, page_t *,
		    u_offset_t, size_t, int, cred_t *, caller_context_t *);
static int	cachefs_writepage(struct vnode *vp, caddr_t base,
		    int tcount, struct uio *uiop);
static int	cachefs_pathconf(vnode_t *, int, ulong_t *, cred_t *,
			caller_context_t *);

static int	cachefs_read_backfs_nfsv4(vnode_t *vp, uio_t *uiop, int ioflag,
			cred_t *cr, caller_context_t *ct);
static int	cachefs_write_backfs_nfsv4(vnode_t *vp, uio_t *uiop, int ioflag,
			cred_t *cr, caller_context_t *ct);
static int	cachefs_getattr_backfs_nfsv4(vnode_t *vp, vattr_t *vap,
			int flags, cred_t *cr, caller_context_t *ct);
static int	cachefs_remove_backfs_nfsv4(vnode_t *dvp, char *nm, cred_t *cr,
			vnode_t *vp);
static int	cachefs_getpage_backfs_nfsv4(struct vnode *vp, offset_t off,
			size_t len, uint_t *protp, struct page *pl[],
			size_t plsz, struct seg *seg, caddr_t addr,
			enum seg_rw rw, cred_t *cr);
static int	cachefs_putpage_backfs_nfsv4(vnode_t *vp, offset_t off,
			size_t len, int flags, cred_t *cr);
static int	cachefs_map_backfs_nfsv4(struct vnode *vp, offset_t off,
			struct as *as, caddr_t *addrp, size_t len, uchar_t prot,
			uchar_t maxprot, uint_t flags, cred_t *cr);
static int	cachefs_space_backfs_nfsv4(struct vnode *vp, int cmd,
			struct flock64 *bfp, int flag, offset_t offset,
			cred_t *cr, caller_context_t *ct);

struct vnodeops *cachefs_vnodeops;

static const fs_operation_def_t cachefs_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = cachefs_open },
	VOPNAME_CLOSE,		{ .vop_close = cachefs_close },
	VOPNAME_READ,		{ .vop_read = cachefs_read },
	VOPNAME_WRITE,		{ .vop_write = cachefs_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = cachefs_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = cachefs_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = cachefs_setattr },
	VOPNAME_ACCESS,		{ .vop_access = cachefs_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = cachefs_lookup },
	VOPNAME_CREATE,		{ .vop_create = cachefs_create },
	VOPNAME_REMOVE,		{ .vop_remove = cachefs_remove },
	VOPNAME_LINK,		{ .vop_link = cachefs_link },
	VOPNAME_RENAME,		{ .vop_rename = cachefs_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = cachefs_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = cachefs_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = cachefs_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = cachefs_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = cachefs_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = cachefs_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = cachefs_inactive },
	VOPNAME_FID,		{ .vop_fid = cachefs_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = cachefs_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = cachefs_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = cachefs_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = cachefs_frlock },
	VOPNAME_SPACE,		{ .vop_space = cachefs_space },
	VOPNAME_REALVP,		{ .vop_realvp = cachefs_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = cachefs_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = cachefs_putpage },
	VOPNAME_MAP,		{ .vop_map = cachefs_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = cachefs_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = cachefs_delmap },
	VOPNAME_DUMP,		{ .vop_dump = cachefs_dump },
	VOPNAME_PATHCONF,	{ .vop_pathconf = cachefs_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = cachefs_pageio },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = cachefs_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = cachefs_getsecattr },
	VOPNAME_SHRLOCK,	{ .vop_shrlock = cachefs_shrlock },
	NULL,			NULL
};

/* forward declarations of statics */
static void cachefs_modified(cnode_t *cp);
static int cachefs_modified_alloc(cnode_t *cp);

int
cachefs_init_vnops(char *name)
{
	return (vn_make_ops(name,
	    cachefs_vnodeops_template, &cachefs_vnodeops));
}

struct vnodeops *
cachefs_getvnodeops(void)
{
	return (cachefs_vnodeops);
}

static int
cachefs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	int error = 0;
	cnode_t *cp = VTOC(*vpp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int held = 0;
	int type;
	int connected = 0;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_open: ENTER vpp %p flag %x\n",
		    (void *)vpp, flag);
#endif
	if (getzoneid() != GLOBAL_ZONEID) {
		error = EPERM;
		goto out;
	}
	if ((flag & FWRITE) &&
	    ((*vpp)->v_type == VDIR || (*vpp)->v_type == VLNK)) {
		error = EISDIR;
		goto out;
	}

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the open operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	for (;;) {
		/* get (or renew) access to the file system */
		if (held) {
			/* Won't loop with NFSv4 connected behavior */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 0);
		if (error)
			goto out;
		held = 1;

		mutex_enter(&cp->c_statelock);

		/* grab creds if we do not have any yet */
		if (cp->c_cred == NULL) {
			crhold(cr);
			cp->c_cred = cr;
		}
		cp->c_flags |= CN_NEEDOPEN;

		/* if we are disconnected */
		if (fscp->fs_cdconnected != CFS_CD_CONNECTED) {
			/* if we cannot write to the file system */
			if ((flag & FWRITE) && CFS_ISFS_WRITE_AROUND(fscp)) {
				mutex_exit(&cp->c_statelock);
				connected = 1;
				continue;
			}
			/*
			 * Allow read only requests to continue
			 */
			if ((flag & (FWRITE|FREAD)) == FREAD) {
				/* track the flag for opening the backvp */
				cp->c_rdcnt++;
				mutex_exit(&cp->c_statelock);
				error = 0;
				break;
			}

			/*
			 * check credentials  - if this procs
			 * credentials don't match the creds in the
			 * cnode disallow writing while disconnected.
			 */
			if (crcmp(cp->c_cred, CRED()) != 0 &&
			    secpolicy_vnode_access2(CRED(), *vpp,
			    cp->c_attr.va_uid, 0, VWRITE) != 0) {
				mutex_exit(&cp->c_statelock);
				connected = 1;
				continue;
			}
			/* to get here, we know that the WRITE flag is on */
			cp->c_wrcnt++;
			if (flag & FREAD)
				cp->c_rdcnt++;
		}

		/* else if we are connected */
		else {
			/* if cannot use the cached copy of the file */
			if ((flag & FWRITE) && CFS_ISFS_WRITE_AROUND(fscp) &&
			    ((cp->c_flags & CN_NOCACHE) == 0))
				cachefs_nocache(cp);

			/* pass open to the back file */
			if (cp->c_backvp) {
				cp->c_flags &= ~CN_NEEDOPEN;
				CFS_DPRINT_BACKFS_NFSV4(fscp,
				    ("cachefs_open (nfsv4): cnode %p, "
				    "backvp %p\n", cp, cp->c_backvp));
				error = VOP_OPEN(&cp->c_backvp, flag, cr, ct);
				if (CFS_TIMEOUT(fscp, error)) {
					mutex_exit(&cp->c_statelock);
					cachefs_cd_release(fscp);
					held = 0;
					cachefs_cd_timedout(fscp);
					continue;
				} else if (error) {
					mutex_exit(&cp->c_statelock);
					break;
				}
			} else {
				/* backvp will be VOP_OPEN'd later */
				if (flag & FREAD)
					cp->c_rdcnt++;
				if (flag & FWRITE)
					cp->c_wrcnt++;
			}

			/*
			 * Now perform a consistency check on the file.
			 * If strict consistency then force a check to
			 * the backfs even if the timeout has not expired
			 * for close-to-open consistency.
			 */
			type = 0;
			if (fscp->fs_consttype == CFS_FS_CONST_STRICT)
				type = C_BACK_CHECK;
			error = CFSOP_CHECK_COBJECT(fscp, cp, type, cr);
			if (CFS_TIMEOUT(fscp, error)) {
				mutex_exit(&cp->c_statelock);
				cachefs_cd_release(fscp);
				held = 0;
				cachefs_cd_timedout(fscp);
				continue;
			}
		}
		mutex_exit(&cp->c_statelock);
		break;
	}
	if (held)
		cachefs_cd_release(fscp);
out:
#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_open: EXIT vpp %p error %d\n",
		    (void *)vpp, error);
#endif
	return (error);
}

/* ARGSUSED */
static int
cachefs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
	caller_context_t *ct)
{
	int error = 0;
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int held = 0;
	int connected = 0;
	int close_cnt = 1;
	cachefscache_t *cachep;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_close: ENTER vp %p\n", (void *)vp);
#endif
	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the close operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	/*
	 * File could have been passed in or inherited from the global zone, so
	 * we don't want to flat out reject the request; we'll just leave things
	 * the way they are and let the backfs (NFS) deal with it.
	 */
	/* get rid of any local locks */
	if (CFS_ISFS_LLOCK(fscp)) {
		(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	}

	/* clean up if this is the daemon closing down */
	if ((fscp->fs_cddaemonid == ttoproc(curthread)->p_pid) &&
	    ((ttoproc(curthread)->p_pid) != 0) &&
	    (vp == fscp->fs_rootvp) &&
	    (count == 1)) {
		mutex_enter(&fscp->fs_cdlock);
		fscp->fs_cddaemonid = 0;
		if (fscp->fs_dlogfile)
			fscp->fs_cdconnected = CFS_CD_DISCONNECTED;
		else
			fscp->fs_cdconnected = CFS_CD_CONNECTED;
		cv_broadcast(&fscp->fs_cdwaitcv);
		mutex_exit(&fscp->fs_cdlock);
		if (fscp->fs_flags & CFS_FS_ROOTFS) {
			cachep = fscp->fs_cache;
			mutex_enter(&cachep->c_contentslock);
			ASSERT(cachep->c_rootdaemonid != 0);
			cachep->c_rootdaemonid = 0;
			mutex_exit(&cachep->c_contentslock);
		}
		return (0);
	}

	for (;;) {
		/* get (or renew) access to the file system */
		if (held) {
			/* Won't loop with NFSv4 connected behavior */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 0);
		if (error)
			goto out;
		held = 1;
		connected = 0;

		/* if not the last close */
		if (count > 1) {
			if (fscp->fs_cdconnected != CFS_CD_CONNECTED)
				goto out;
			mutex_enter(&cp->c_statelock);
			if (cp->c_backvp) {
				CFS_DPRINT_BACKFS_NFSV4(fscp,
				    ("cachefs_close (nfsv4): cnode %p, "
				    "backvp %p\n", cp, cp->c_backvp));
				error = VOP_CLOSE(cp->c_backvp, flag, count,
				    offset, cr, ct);
				if (CFS_TIMEOUT(fscp, error)) {
					mutex_exit(&cp->c_statelock);
					cachefs_cd_release(fscp);
					held = 0;
					cachefs_cd_timedout(fscp);
					continue;
				}
			}
			mutex_exit(&cp->c_statelock);
			goto out;
		}

		/*
		 * If the file is an unlinked file, then flush the lookup
		 * cache so that inactive will be called if this is
		 * the last reference.  It will invalidate all of the
		 * cached pages, without writing them out.  Writing them
		 * out is not required because they will be written to a
		 * file which will be immediately removed.
		 */
		if (cp->c_unldvp != NULL) {
			dnlc_purge_vp(vp);
			mutex_enter(&cp->c_statelock);
			error = cp->c_error;
			cp->c_error = 0;
			mutex_exit(&cp->c_statelock);
			/* always call VOP_CLOSE() for back fs vnode */
		}

		/* force dirty data to stable storage */
		else if ((vp->v_type == VREG) && (flag & FWRITE) &&
		    !CFS_ISFS_BACKFS_NFSV4(fscp)) {
			/* clean the cachefs pages synchronously */
			error = cachefs_putpage_common(vp, (offset_t)0,
			    0, 0, cr);
			if (CFS_TIMEOUT(fscp, error)) {
				if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
					cachefs_cd_release(fscp);
					held = 0;
					cachefs_cd_timedout(fscp);
					continue;
				} else {
					connected = 1;
					continue;
				}
			}

			/* if no space left in cache, wait until connected */
			if ((error == ENOSPC) &&
			    (fscp->fs_cdconnected != CFS_CD_CONNECTED)) {
				connected = 1;
				continue;
			}

			/* clear the cnode error if putpage worked */
			if ((error == 0) && cp->c_error) {
				mutex_enter(&cp->c_statelock);
				cp->c_error = 0;
				mutex_exit(&cp->c_statelock);
			}

			/* if any other important error */
			if (cp->c_error) {
				/* get rid of the pages */
				(void) cachefs_putpage_common(vp,
				    (offset_t)0, 0, B_INVAL | B_FORCE, cr);
				dnlc_purge_vp(vp);
			}
		}

		mutex_enter(&cp->c_statelock);
		if (cp->c_backvp &&
		    (fscp->fs_cdconnected == CFS_CD_CONNECTED)) {
			error = VOP_CLOSE(cp->c_backvp, flag, close_cnt,
			    offset, cr, ct);
			if (CFS_TIMEOUT(fscp, error)) {
				mutex_exit(&cp->c_statelock);
				cachefs_cd_release(fscp);
				held = 0;
				cachefs_cd_timedout(fscp);
				/* don't decrement the vnode counts again */
				close_cnt = 0;
				continue;
			}
		}
		mutex_exit(&cp->c_statelock);
		break;
	}

	mutex_enter(&cp->c_statelock);
	if (!error)
		error = cp->c_error;
	cp->c_error = 0;
	mutex_exit(&cp->c_statelock);

out:
	if (held)
		cachefs_cd_release(fscp);
#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_close: EXIT vp %p\n", (void *)vp);
#endif
	return (error);
}

/*ARGSUSED*/
static int
cachefs_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	struct cnode *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	register u_offset_t off;
	register int mapoff;
	register caddr_t base;
	int n;
	offset_t diff;
	uint_t flags = 0;
	int error = 0;

#if 0
	if (vp->v_flag & VNOCACHE)
		flags = SM_INVAL;
#endif
	if (getzoneid() != GLOBAL_ZONEID)
		return (EPERM);
	if (vp->v_type != VREG)
		return (EISDIR);

	ASSERT(RW_READ_HELD(&cp->c_rwlock));

	if (uiop->uio_resid == 0)
		return (0);


	if (uiop->uio_loffset < (offset_t)0)
		return (EINVAL);

	/*
	 * Call backfilesystem to read if NFSv4, the cachefs code
	 * does the read from the back filesystem asynchronously
	 * which is not supported by pass-through functionality.
	 */
	if (CFS_ISFS_BACKFS_NFSV4(fscp)) {
		error = cachefs_read_backfs_nfsv4(vp, uiop, ioflag, cr, ct);
		goto out;
	}

	if (MANDLOCK(vp, cp->c_attr.va_mode)) {
		error = chklock(vp, FREAD, (offset_t)uiop->uio_loffset,
		    uiop->uio_resid, uiop->uio_fmode, ct);
		if (error)
			return (error);
	}

	/*
	 * Sit in a loop and transfer (uiomove) the data in up to
	 * MAXBSIZE chunks. Each chunk is mapped into the kernel's
	 * address space as needed and then released.
	 */
	do {
		/*
		 *	off	Offset of current MAXBSIZE chunk
		 *	mapoff	Offset within the current chunk
		 *	n	Number of bytes to move from this chunk
		 *	base	kernel address of mapped in chunk
		 */
		off = uiop->uio_loffset & (offset_t)MAXBMASK;
		mapoff = uiop->uio_loffset & MAXBOFFSET;
		n = MAXBSIZE - mapoff;
		if (n > uiop->uio_resid)
			n = (uint_t)uiop->uio_resid;

		/* perform consistency check */
		error = cachefs_cd_access(fscp, 0, 0);
		if (error)
			break;
		mutex_enter(&cp->c_statelock);
		error = CFSOP_CHECK_COBJECT(fscp, cp, 0, cr);
		diff = cp->c_size - uiop->uio_loffset;
		mutex_exit(&cp->c_statelock);
		if (CFS_TIMEOUT(fscp, error)) {
			cachefs_cd_release(fscp);
			cachefs_cd_timedout(fscp);
			error = 0;
			continue;
		}
		cachefs_cd_release(fscp);

		if (error)
			break;

		if (diff <= (offset_t)0)
			break;
		if (diff < (offset_t)n)
			n = diff;

		base = segmap_getmapflt(segkmap, vp, off, (uint_t)n, 1, S_READ);

		error = segmap_fault(kas.a_hat, segkmap, base, n,
		    F_SOFTLOCK, S_READ);
		if (error) {
			(void) segmap_release(segkmap, base, 0);
			if (FC_CODE(error) == FC_OBJERR)
				error =  FC_ERRNO(error);
			else
				error = EIO;
			break;
		}
		error = uiomove(base+mapoff, n, UIO_READ, uiop);
		(void) segmap_fault(kas.a_hat, segkmap, base, n,
		    F_SOFTUNLOCK, S_READ);
		if (error == 0) {
			/*
			 * if we read a whole page(s), or to eof,
			 *  we won't need this page(s) again soon.
			 */
			if (n + mapoff == MAXBSIZE ||
			    uiop->uio_loffset == cp->c_size)
				flags |= SM_DONTNEED;
		}
		(void) segmap_release(segkmap, base, flags);
	} while (error == 0 && uiop->uio_resid > 0);

out:
#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_read: EXIT error %d resid %ld\n", error,
		    uiop->uio_resid);
#endif
	return (error);
}

/*
 * cachefs_read_backfs_nfsv4
 *
 * Call NFSv4 back filesystem to handle the read (cachefs
 * pass-through support for NFSv4).
 */
static int
cachefs_read_backfs_nfsv4(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
			caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	vnode_t *backvp;
	int error;

	/*
	 * For NFSv4 pass-through to work, only connected operation
	 * is supported, the cnode backvp must exist, and cachefs
	 * optional (eg., disconnectable) flags are turned off. Assert
	 * these conditions for the read operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	/* Call backfs vnode op after extracting backvp */
	mutex_enter(&cp->c_statelock);
	backvp = cp->c_backvp;
	mutex_exit(&cp->c_statelock);

	CFS_DPRINT_BACKFS_NFSV4(fscp, ("cachefs_read_backfs_nfsv4: cnode %p, "
	    "backvp %p\n", cp, backvp));

	(void) VOP_RWLOCK(backvp, V_WRITELOCK_FALSE, ct);
	error = VOP_READ(backvp, uiop, ioflag, cr, ct);
	VOP_RWUNLOCK(backvp, V_WRITELOCK_FALSE, ct);

	/* Increment cache miss counter */
	fscp->fs_stats.st_misses++;

	return (error);
}

/*ARGSUSED*/
static int
cachefs_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	struct cnode *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int error = 0;
	u_offset_t off;
	caddr_t base;
	uint_t bsize;
	uint_t flags;
	int n, on;
	rlim64_t limit = uiop->uio_llimit;
	ssize_t resid;
	offset_t offset;
	offset_t remainder;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf(
		"cachefs_write: ENTER vp %p offset %llu count %ld cflags %x\n",
		    (void *)vp, uiop->uio_loffset, uiop->uio_resid,
		    cp->c_flags);
#endif
	if (getzoneid() != GLOBAL_ZONEID) {
		error = EPERM;
		goto out;
	}
	if (vp->v_type != VREG) {
		error = EISDIR;
		goto out;
	}

	ASSERT(RW_WRITE_HELD(&cp->c_rwlock));

	if (uiop->uio_resid == 0) {
		goto out;
	}

	/* Call backfilesystem to write if NFSv4 */
	if (CFS_ISFS_BACKFS_NFSV4(fscp)) {
		error = cachefs_write_backfs_nfsv4(vp, uiop, ioflag, cr, ct);
		goto out2;
	}

	if (MANDLOCK(vp, cp->c_attr.va_mode)) {
		error = chklock(vp, FWRITE, (offset_t)uiop->uio_loffset,
		    uiop->uio_resid, uiop->uio_fmode, ct);
		if (error)
			goto out;
	}

	if (ioflag & FAPPEND) {
		for (;;) {
			/* do consistency check to get correct file size */
			error = cachefs_cd_access(fscp, 0, 1);
			if (error)
				goto out;
			mutex_enter(&cp->c_statelock);
			error = CFSOP_CHECK_COBJECT(fscp, cp, 0, cr);
			uiop->uio_loffset = cp->c_size;
			mutex_exit(&cp->c_statelock);
			if (CFS_TIMEOUT(fscp, error)) {
				cachefs_cd_release(fscp);
				cachefs_cd_timedout(fscp);
				continue;
			}
			cachefs_cd_release(fscp);
			if (error)
				goto out;
			break;
		}
	}

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	if (uiop->uio_loffset >= limit) {
		proc_t *p = ttoproc(curthread);

		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
		    p, RCA_UNSAFE_SIGINFO);
		mutex_exit(&p->p_lock);
		error = EFBIG;
		goto out;
	}
	if (uiop->uio_loffset > fscp->fs_offmax) {
		error = EFBIG;
		goto out;
	}

	if (limit > fscp->fs_offmax)
		limit = fscp->fs_offmax;

	if (uiop->uio_loffset < (offset_t)0) {
		error = EINVAL;
		goto out;
	}

	offset = uiop->uio_loffset + uiop->uio_resid;
	/*
	 * Check to make sure that the process will not exceed
	 * its limit on file size.  It is okay to write up to
	 * the limit, but not beyond.  Thus, the write which
	 * reaches the limit will be short and the next write
	 * will return an error.
	 */
	remainder = 0;
	if (offset > limit) {
		remainder = (int)(offset - (u_offset_t)limit);
		uiop->uio_resid = limit - uiop->uio_loffset;
		if (uiop->uio_resid <= 0) {
			proc_t *p = ttoproc(curthread);

			uiop->uio_resid += remainder;
			mutex_enter(&p->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
			mutex_exit(&p->p_lock);
			error = EFBIG;
			goto out;
		}
	}

	resid = uiop->uio_resid;
	offset = uiop->uio_loffset;
	bsize = vp->v_vfsp->vfs_bsize;

	/* loop around and do the write in MAXBSIZE chunks */
	do {
		/* mapping offset */
		off = uiop->uio_loffset & (offset_t)MAXBMASK;
		on = uiop->uio_loffset & MAXBOFFSET; /* Rel. offset */
		n = MAXBSIZE - on;
		if (n > uiop->uio_resid)
			n = (int)uiop->uio_resid;

		/*
		 * Touch the page and fault it in if it is not in
		 * core before segmap_getmapflt can lock it. This
		 * is to avoid the deadlock if the buffer is mapped
		 * to the same file through mmap which we want to
		 * write to.
		 */
		uio_prefaultpages((long)n, uiop);

		base = segmap_getmap(segkmap, vp, off);
		error = cachefs_writepage(vp, (base + on), n, uiop);
		if (error == 0) {
			flags = 0;
			/*
			 * Have written a whole block.Start an
			 * asynchronous write and mark the buffer to
			 * indicate that it won't be needed again
			 * soon.
			 */
			if (n + on == bsize) {
				flags = SM_WRITE |SM_ASYNC |SM_DONTNEED;
			}
#if 0
			/* XXX need to understand this */
			if ((ioflag & (FSYNC|FDSYNC)) ||
			    (cp->c_backvp && vn_has_flocks(cp->c_backvp))) {
				flags &= ~SM_ASYNC;
				flags |= SM_WRITE;
			}
#else
			if (ioflag & (FSYNC|FDSYNC)) {
				flags &= ~SM_ASYNC;
				flags |= SM_WRITE;
			}
#endif
			error = segmap_release(segkmap, base, flags);
		} else {
			(void) segmap_release(segkmap, base, 0);
		}
	} while (error == 0 && uiop->uio_resid > 0);

out:
	if (error == EINTR && (ioflag & (FSYNC|FDSYNC))) {
		uiop->uio_resid = resid;
		uiop->uio_loffset = offset;
	} else
		uiop->uio_resid += remainder;

out2:
#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_write: EXIT error %d\n", error);
#endif
	return (error);
}

/*
 * cachefs_write_backfs_nfsv4
 *
 * Call NFSv4 back filesystem to handle the write (cachefs
 * pass-through support for NFSv4).
 */
static int
cachefs_write_backfs_nfsv4(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
			caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	vnode_t *backvp;
	int error;

	/*
	 * For NFSv4 pass-through to work, only connected operation
	 * is supported, the cnode backvp must exist, and cachefs
	 * optional (eg., disconnectable) flags are turned off. Assert
	 * these conditions for the read operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	/* Call backfs vnode op after extracting the backvp */
	mutex_enter(&cp->c_statelock);
	backvp = cp->c_backvp;
	mutex_exit(&cp->c_statelock);

	CFS_DPRINT_BACKFS_NFSV4(fscp, ("cachefs_write_backfs_nfsv4: cnode %p, "
	    "backvp %p\n", cp, backvp));
	(void) VOP_RWLOCK(backvp, V_WRITELOCK_TRUE, ct);
	error = VOP_WRITE(backvp, uiop, ioflag, cr, ct);
	VOP_RWUNLOCK(backvp, V_WRITELOCK_TRUE, ct);

	return (error);
}

/*
 * see if we've charged ourselves for frontfile data at
 * the given offset.  If not, allocate a block for it now.
 */
static int
cachefs_charge_page(struct cnode *cp, u_offset_t offset)
{
	u_offset_t blockoff;
	int error;
	int inc;

	ASSERT(MUTEX_HELD(&cp->c_statelock));
	/*LINTED*/
	ASSERT(PAGESIZE <= MAXBSIZE);

	error = 0;
	blockoff = offset & (offset_t)MAXBMASK;

	/* get the front file if necessary so allocblocks works */
	if ((cp->c_frontvp == NULL) &&
	    ((cp->c_flags & CN_NOCACHE) == 0)) {
		(void) cachefs_getfrontfile(cp);
	}
	if (cp->c_flags & CN_NOCACHE)
		return (1);

	if (cachefs_check_allocmap(cp, blockoff))
		return (0);

	for (inc = PAGESIZE; inc < MAXBSIZE; inc += PAGESIZE)
		if (cachefs_check_allocmap(cp, blockoff+inc))
			return (0);

	error = cachefs_allocblocks(C_TO_FSCACHE(cp)->fs_cache, 1,
	    cp->c_metadata.md_rltype);
	if (error == 0) {
		cp->c_metadata.md_frontblks++;
		cp->c_flags |= CN_UPDATED;
	}
	return (error);
}

/*
 * Called only by cachefs_write to write 1 page or less of data.
 *	base   - base address kernel addr space
 *	tcount - Total bytes to move - < MAXBSIZE
 */
static int
cachefs_writepage(vnode_t *vp, caddr_t base, int tcount, uio_t *uiop)
{
	struct cnode *cp =  VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	register int n;
	register u_offset_t offset;
	int error = 0, terror;
	extern struct as kas;
	u_offset_t lastpage_off;
	int pagecreate = 0;
	int newpage;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf(
		    "cachefs_writepage: ENTER vp %p offset %llu len %ld\\\n",
		    (void *)vp, uiop->uio_loffset, uiop->uio_resid);
#endif

	/*
	 * Move bytes in PAGESIZE chunks. We must avoid spanning pages in
	 * uiomove() because page faults may cause the cache to be invalidated
	 * out from under us.
	 */
	do {
		offset = uiop->uio_loffset;
		lastpage_off = (cp->c_size - 1) & (offset_t)PAGEMASK;

		/*
		 * If not connected then need to make sure we have space
		 * to perform the write.  We could make this check
		 * a little tighter by only doing it if we are growing the file.
		 */
		if (fscp->fs_cdconnected != CFS_CD_CONNECTED) {
			error = cachefs_allocblocks(fscp->fs_cache, 1,
			    cp->c_metadata.md_rltype);
			if (error)
				break;
			cachefs_freeblocks(fscp->fs_cache, 1,
			    cp->c_metadata.md_rltype);
		}

		/*
		 * n is the number of bytes required to satisfy the request
		 * or the number of bytes to fill out the page.
		 */
		n = (int)(PAGESIZE - ((uintptr_t)base & PAGEOFFSET));
		if (n > tcount)
			n = tcount;

		/*
		 * The number of bytes of data in the last page can not
		 * be accurately be determined while page is being
		 * uiomove'd to and the size of the file being updated.
		 * Thus, inform threads which need to know accurately
		 * how much data is in the last page of the file.  They
		 * will not do the i/o immediately, but will arrange for
		 * the i/o to happen later when this modify operation
		 * will have finished.
		 *
		 * in similar NFS code, this is done right before the
		 * uiomove(), which is best.  but here in cachefs, we
		 * have two uiomove()s, so we must do it here.
		 */
		ASSERT(!(cp->c_flags & CN_CMODINPROG));
		mutex_enter(&cp->c_statelock);
		cp->c_flags |= CN_CMODINPROG;
		cp->c_modaddr = (offset & (offset_t)MAXBMASK);
		mutex_exit(&cp->c_statelock);

		/*
		 * Check to see if we can skip reading in the page
		 * and just allocate the memory.  We can do this
		 * if we are going to rewrite the entire mapping
		 * or if we are going to write to or beyond the current
		 * end of file from the beginning of the mapping.
		 */
		if ((offset > (lastpage_off + PAGEOFFSET)) ||
		    ((cp->c_size == 0) && (offset < PAGESIZE)) ||
		    ((uintptr_t)base & PAGEOFFSET) == 0 && (n == PAGESIZE ||
		    ((offset + n) >= cp->c_size))) {
			pagecreate = 1;

			/*
			 * segmap_pagecreate() returns 1 if it calls
			 * page_create_va() to allocate any pages.
			 */
			newpage = segmap_pagecreate(segkmap,
			    (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK),
			    PAGESIZE, 0);
			/* do not zero page if we are overwriting all of it */
			if (!((((uintptr_t)base & PAGEOFFSET) == 0) &&
			    (n == PAGESIZE))) {
				(void) kzero((void *)
				    ((uintptr_t)base & (uintptr_t)PAGEMASK),
				    PAGESIZE);
			}
			error = uiomove(base, n, UIO_WRITE, uiop);

			/*
			 * Unlock the page allocated by page_create_va()
			 * in segmap_pagecreate()
			 */
			if (newpage)
				segmap_pageunlock(segkmap,
				    (caddr_t)((uintptr_t)base &
				    (uintptr_t)PAGEMASK),
				    PAGESIZE, S_WRITE);
		} else {
			/*
			 * KLUDGE ! Use segmap_fault instead of faulting and
			 * using as_fault() to avoid a recursive readers lock
			 * on kas.
			 */
			error = segmap_fault(kas.a_hat, segkmap, (caddr_t)
			    ((uintptr_t)base & (uintptr_t)PAGEMASK),
			    PAGESIZE, F_SOFTLOCK, S_WRITE);
			if (error) {
				if (FC_CODE(error) == FC_OBJERR)
					error =  FC_ERRNO(error);
				else
					error = EIO;
				break;
			}
			error = uiomove(base, n, UIO_WRITE, uiop);
			(void) segmap_fault(kas.a_hat, segkmap, (caddr_t)
			    ((uintptr_t)base & (uintptr_t)PAGEMASK),
			    PAGESIZE, F_SOFTUNLOCK, S_WRITE);
		}
		n = (int)(uiop->uio_loffset - offset); /* n = # bytes written */
		base += n;
		tcount -= n;

		/* get access to the file system */
		if ((terror = cachefs_cd_access(fscp, 0, 1)) != 0) {
			error = terror;
			break;
		}

		/*
		 * cp->c_attr.va_size is the maximum number of
		 * bytes known to be in the file.
		 * Make sure it is at least as high as the
		 * last byte we just wrote into the buffer.
		 */
		mutex_enter(&cp->c_statelock);
		if (cp->c_size < uiop->uio_loffset) {
			cp->c_size = uiop->uio_loffset;
		}
		if (cp->c_size != cp->c_attr.va_size) {
			cp->c_attr.va_size = cp->c_size;
			cp->c_flags |= CN_UPDATED;
		}
		/* c_size is now correct, so we can clear modinprog */
		cp->c_flags &= ~CN_CMODINPROG;
		if (error == 0) {
			cp->c_flags |= CDIRTY;
			if (pagecreate && (cp->c_flags & CN_NOCACHE) == 0) {
				/*
				 * if we're not in NOCACHE mode
				 * (i.e., single-writer), we update the
				 * allocmap here rather than waiting until
				 * cachefspush is called.  This prevents
				 * getpage from clustering up pages from
				 * the backfile and stomping over the changes
				 * we make here.
				 */
				if (cachefs_charge_page(cp, offset) == 0) {
					cachefs_update_allocmap(cp,
					    offset & (offset_t)PAGEMASK,
					    (size_t)PAGESIZE);
				}

				/* else we ran out of space */
				else {
					/* nocache file if connected */
					if (fscp->fs_cdconnected ==
					    CFS_CD_CONNECTED)
						cachefs_nocache(cp);
					/*
					 * If disconnected then cannot
					 * nocache the file.  Let it have
					 * the space.
					 */
					else {
						cp->c_metadata.md_frontblks++;
						cp->c_flags |= CN_UPDATED;
						cachefs_update_allocmap(cp,
						    offset & (offset_t)PAGEMASK,
						    (size_t)PAGESIZE);
					}
				}
			}
		}
		mutex_exit(&cp->c_statelock);
		cachefs_cd_release(fscp);
	} while (tcount > 0 && error == 0);

	if (cp->c_flags & CN_CMODINPROG) {
		/* XXX assert error != 0?  FC_ERRNO() makes this more risky. */
		mutex_enter(&cp->c_statelock);
		cp->c_flags &= ~CN_CMODINPROG;
		mutex_exit(&cp->c_statelock);
	}

#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_writepage: EXIT error %d\n", error);
#endif

	return (error);
}

/*
 * Pushes out pages to the back and/or front file system.
 */
static int
cachefs_push(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
    int flags, cred_t *cr)
{
	struct cnode *cp = VTOC(vp);
	struct buf *bp;
	int error;
	fscache_t *fscp = C_TO_FSCACHE(cp);
	u_offset_t iooff;
	size_t iolen;
	u_offset_t lbn;
	u_offset_t lbn_off;
	uint_t bsize;

	ASSERT((flags & B_ASYNC) == 0);
	ASSERT(!vn_is_readonly(vp));
	ASSERT(pp != NULL);
	ASSERT(cr != NULL);

	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
	lbn = pp->p_offset / bsize;
	lbn_off = lbn * bsize;

	/*
	 * Find a kluster that fits in one block, or in
	 * one page if pages are bigger than blocks.  If
	 * there is less file space allocated than a whole
	 * page, we'll shorten the i/o request below.
	 */

	pp = pvn_write_kluster(vp, pp, &iooff, &iolen, lbn_off,
	    roundup(bsize, PAGESIZE), flags);

	/*
	 * The CN_CMODINPROG flag makes sure that we use a correct
	 * value of c_size, below.  CN_CMODINPROG is set in
	 * cachefs_writepage().  When CN_CMODINPROG is set it
	 * indicates that a uiomove() is in progress and the c_size
	 * has not been made consistent with the new size of the
	 * file. When the uiomove() completes the c_size is updated
	 * and the CN_CMODINPROG flag is cleared.
	 *
	 * The CN_CMODINPROG flag makes sure that cachefs_push_front
	 * and cachefs_push_connected see a consistent value of
	 * c_size.  Without this handshaking, it is possible that
	 * these routines will pick up the old value of c_size before
	 * the uiomove() in cachefs_writepage() completes.  This will
	 * result in the vn_rdwr() being too small, and data loss.
	 *
	 * More precisely, there is a window between the time the
	 * uiomove() completes and the time the c_size is updated. If
	 * a VOP_PUTPAGE() operation intervenes in this window, the
	 * page will be picked up, because it is dirty; it will be
	 * unlocked, unless it was pagecreate'd. When the page is
	 * picked up as dirty, the dirty bit is reset
	 * (pvn_getdirty()). In cachefs_push_connected(), c_size is
	 * checked.  This will still be the old size.  Therefore, the
	 * page will not be written out to the correct length, and the
	 * page will be clean, so the data may disappear.
	 */
	if (cp->c_flags & CN_CMODINPROG) {
		mutex_enter(&cp->c_statelock);
		if ((cp->c_flags & CN_CMODINPROG) &&
		    cp->c_modaddr + MAXBSIZE > iooff &&
		    cp->c_modaddr < iooff + iolen) {
			page_t *plist;

			/*
			 * A write is in progress for this region of
			 * the file.  If we did not detect
			 * CN_CMODINPROG here then this path through
			 * cachefs_push_connected() would eventually
			 * do the vn_rdwr() and may not write out all
			 * of the data in the pages.  We end up losing
			 * data. So we decide to set the modified bit
			 * on each page in the page list and mark the
			 * cnode with CDIRTY.  This push will be
			 * restarted at some later time.
			 */

			plist = pp;
			while (plist != NULL) {
				pp = plist;
				page_sub(&plist, pp);
				hat_setmod(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
			cp->c_flags |= CDIRTY;
			mutex_exit(&cp->c_statelock);
			if (offp)
				*offp = iooff;
			if (lenp)
				*lenp = iolen;
			return (0);
		}
		mutex_exit(&cp->c_statelock);
	}

	/*
	 * Set the pages up for pageout.
	 */
	bp = pageio_setup(pp, iolen, CTOV(cp), B_WRITE | flags);
	if (bp == NULL) {

		/*
		 * currently, there is no way for pageio_setup() to
		 * return NULL, since it uses its own scheme for
		 * kmem_alloc()ing that shouldn't return NULL, and
		 * since pageio_setup() itself dereferences the thing
		 * it's about to return.  still, we need to be ready
		 * in case this ever does start happening.
		 */

		error = ENOMEM;
		goto writedone;
	}
	/*
	 * pageio_setup should have set b_addr to 0.  This
	 * is correct since we want to do I/O on a page
	 * boundary.  bp_mapin will use this addr to calculate
	 * an offset, and then set b_addr to the kernel virtual
	 * address it allocated for us.
	 */
	bp->b_edev = 0;
	bp->b_dev = 0;
	bp->b_lblkno = (diskaddr_t)lbtodb(iooff);
	bp_mapin(bp);

	iolen  = cp->c_size - ldbtob(bp->b_blkno);
	if (iolen > bp->b_bcount)
		iolen  = bp->b_bcount;

	/* if connected */
	if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
		/* write to the back file first */
		error = cachefs_push_connected(vp, bp, iolen, iooff, cr);

		/* write to the front file if allowed */
		if ((error == 0) && CFS_ISFS_NONSHARED(fscp) &&
		    ((cp->c_flags & CN_NOCACHE) == 0)) {
			/* try to write to the front file */
			(void) cachefs_push_front(vp, bp, iolen, iooff, cr);
		}
	}

	/* else if disconnected */
	else {
		/* try to write to the front file */
		error = cachefs_push_front(vp, bp, iolen, iooff, cr);
	}

	bp_mapout(bp);
	pageio_done(bp);

writedone:

	pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags);
	if (offp)
		*offp = iooff;
	if (lenp)
		*lenp = iolen;

	/* XXX ask bob mastors how to fix this someday */
	mutex_enter(&cp->c_statelock);
	if (error) {
		if (error == ENOSPC) {
			if ((fscp->fs_cdconnected == CFS_CD_CONNECTED) ||
			    CFS_ISFS_SOFT(fscp)) {
				CFSOP_INVALIDATE_COBJECT(fscp, cp, cr);
				cp->c_error = error;
			}
		} else if ((CFS_TIMEOUT(fscp, error) == 0) &&
		    (error != EINTR)) {
			CFSOP_INVALIDATE_COBJECT(fscp, cp, cr);
			cp->c_error = error;
		}
	} else if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
		CFSOP_MODIFY_COBJECT(fscp, cp, cr);
	}
	mutex_exit(&cp->c_statelock);

	return (error);
}

/*
 * Pushes out pages to the back file system.
 */
static int
cachefs_push_connected(vnode_t *vp, struct buf *bp, size_t iolen,
    u_offset_t iooff, cred_t *cr)
{
	struct cnode *cp = VTOC(vp);
	int error = 0;
	int mode = 0;
	fscache_t *fscp = C_TO_FSCACHE(cp);
	ssize_t resid;
	vnode_t *backvp;

	/* get the back file if necessary */
	mutex_enter(&cp->c_statelock);
	if (cp->c_backvp == NULL) {
		error = cachefs_getbackvp(fscp, cp);
		if (error) {
			mutex_exit(&cp->c_statelock);
			goto out;
		}
	}
	backvp = cp->c_backvp;
	VN_HOLD(backvp);
	mutex_exit(&cp->c_statelock);

	if (CFS_ISFS_NONSHARED(fscp) && CFS_ISFS_SNR(fscp))
		mode = FSYNC;

	/* write to the back file */
	error = bp->b_error = vn_rdwr(UIO_WRITE, backvp, bp->b_un.b_addr,
	    iolen, iooff, UIO_SYSSPACE, mode,
	    RLIM64_INFINITY, cr, &resid);
	if (error) {
#ifdef CFSDEBUG
		CFS_DEBUG(CFSDEBUG_VOPS | CFSDEBUG_BACK)
			printf("cachefspush: error %d cr %p\n",
			    error, (void *)cr);
#endif
		bp->b_flags |= B_ERROR;
	}
	VN_RELE(backvp);
out:
	return (error);
}

/*
 * Pushes out pages to the front file system.
 * Called for both connected and disconnected states.
 */
static int
cachefs_push_front(vnode_t *vp, struct buf *bp, size_t iolen,
    u_offset_t iooff, cred_t *cr)
{
	struct cnode *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int error = 0;
	ssize_t resid;
	u_offset_t popoff;
	off_t commit = 0;
	uint_t seq;
	enum cachefs_rl_type type;
	vnode_t *frontvp = NULL;

	mutex_enter(&cp->c_statelock);

	if (!CFS_ISFS_NONSHARED(fscp)) {
		error = ETIMEDOUT;
		goto out;
	}

	/* get the front file if necessary */
	if ((cp->c_frontvp == NULL) &&
	    ((cp->c_flags & CN_NOCACHE) == 0)) {
		(void) cachefs_getfrontfile(cp);
	}
	if (cp->c_flags & CN_NOCACHE) {
		error = ETIMEDOUT;
		goto out;
	}

	/* if disconnected, needs to be populated and have good attributes */
	if ((fscp->fs_cdconnected != CFS_CD_CONNECTED) &&
	    (((cp->c_metadata.md_flags & MD_POPULATED) == 0) ||
	    (cp->c_metadata.md_flags & MD_NEEDATTRS))) {
		error = ETIMEDOUT;
		goto out;
	}

	for (popoff = iooff; popoff < (iooff + iolen); popoff += MAXBSIZE) {
		if (cachefs_charge_page(cp, popoff)) {
			if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
				cachefs_nocache(cp);
				goto out;
			} else {
				error = ENOSPC;
				goto out;
			}
		}
	}

	if (fscp->fs_cdconnected != CFS_CD_CONNECTED) {
		/* log the first putpage to a file */
		if ((cp->c_metadata.md_flags & MD_PUTPAGE) == 0) {
			/* uses open's creds if we have them */
			if (cp->c_cred)
				cr = cp->c_cred;

			if ((cp->c_metadata.md_flags & MD_MAPPING) == 0) {
				error = cachefs_dlog_cidmap(fscp);
				if (error) {
					error = ENOSPC;
					goto out;
				}
				cp->c_metadata.md_flags |= MD_MAPPING;
			}

			commit = cachefs_dlog_modify(fscp, cp, cr, &seq);
			if (commit == 0) {
				/* out of space */
				error = ENOSPC;
				goto out;
			}

			cp->c_metadata.md_seq = seq;
			type = cp->c_metadata.md_rltype;
			cachefs_modified(cp);
			cp->c_metadata.md_flags |= MD_PUTPAGE;
			cp->c_metadata.md_flags &= ~MD_PUSHDONE;
			cp->c_flags |= CN_UPDATED;
		}

		/* subsequent putpages just get a new sequence number */
		else {
			/* but only if it matters */
			if (cp->c_metadata.md_seq != fscp->fs_dlogseq) {
				seq = cachefs_dlog_seqnext(fscp);
				if (seq == 0) {
					error = ENOSPC;
					goto out;
				}
				cp->c_metadata.md_seq = seq;
				cp->c_flags |= CN_UPDATED;
				/* XXX maybe should do write_metadata here */
			}
		}
	}

	frontvp = cp->c_frontvp;
	VN_HOLD(frontvp);
	mutex_exit(&cp->c_statelock);
	error = bp->b_error = vn_rdwr(UIO_WRITE, frontvp,
	    bp->b_un.b_addr, iolen, iooff, UIO_SYSSPACE, 0,
	    RLIM64_INFINITY, kcred, &resid);
	mutex_enter(&cp->c_statelock);
	VN_RELE(frontvp);
	frontvp = NULL;
	if (error) {
		if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
			cachefs_nocache(cp);
			error = 0;
			goto out;
		} else {
			goto out;
		}
	}

	(void) cachefs_update_allocmap(cp, iooff, iolen);
	cp->c_flags |= (CN_UPDATED | CN_NEED_FRONT_SYNC |
	    CN_POPULATION_PENDING);
	if (fscp->fs_cdconnected != CFS_CD_CONNECTED) {
		gethrestime(&cp->c_metadata.md_localmtime);
		cp->c_metadata.md_flags |= MD_LOCALMTIME;
	}

out:
	if (commit) {
		/* commit the log record */
		ASSERT(fscp->fs_cdconnected == CFS_CD_DISCONNECTED);
		if (cachefs_dlog_commit(fscp, commit, error)) {
			/*EMPTY*/
			/* XXX fix on panic */
		}
	}

	if (error && commit) {
		cp->c_metadata.md_flags &= ~MD_PUTPAGE;
		cachefs_rlent_moveto(fscp->fs_cache, type,
		    cp->c_metadata.md_rlno, cp->c_metadata.md_frontblks);
		cp->c_metadata.md_rltype = type;
		cp->c_flags |= CN_UPDATED;
	}
	mutex_exit(&cp->c_statelock);
	return (error);
}

/*ARGSUSED*/
static int
cachefs_dump(struct vnode *vp, caddr_t foo1, offset_t foo2, offset_t foo3,
    caller_context_t *ct)
{
	return (ENOSYS); /* should we panic if we get here? */
}

/*ARGSUSED*/
static int
cachefs_ioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, cred_t *cred,
	int *rvalp, caller_context_t *ct)
{
	int error;
	struct cnode *cp = VTOC(vp);
	struct fscache *fscp = C_TO_FSCACHE(cp);
	struct cachefscache *cachep;
	extern kmutex_t cachefs_cachelock;
	extern cachefscache_t *cachefs_cachelist;
	cachefsio_pack_t *packp;
	STRUCT_DECL(cachefsio_dcmd, dcmd);
	int	inlen, outlen;	/* LP64: generic int for struct in/out len */
	void *dinp, *doutp;
	int (*dcmd_routine)(vnode_t *, void *, void *);

	if (getzoneid() != GLOBAL_ZONEID)
		return (EPERM);

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions which ensure
	 * that only a subset of the ioctls are "truly supported"
	 * for NFSv4 (these are CFSDCMD_DAEMONID and CFSDCMD_GETSTATS.
	 * The packing operations are meaningless since there is
	 * no caching for NFSv4, and the called functions silently
	 * return if the backfilesystem is NFSv4. The daemon
	 * commands except for those above are essentially used
	 * for disconnectable operation support (including log
	 * rolling), so in each called function, we assert that
	 * NFSv4 is not in use. The _FIO* calls (except _FIOCOD)
	 * are from "cfsfstype" which is not a documented
	 * command. However, the command is visible in
	 * /usr/lib/fs/cachefs so the commands are simply let
	 * through (don't seem to impact pass-through functionality).
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	switch (cmd) {
	case CACHEFSIO_PACK:
		packp = cachefs_kmem_alloc(sizeof (cachefsio_pack_t), KM_SLEEP);
		error = xcopyin((void *)arg, packp, sizeof (cachefsio_pack_t));
		if (!error)
			error = cachefs_pack(vp, packp->p_name, cred);
		cachefs_kmem_free(packp, sizeof (cachefsio_pack_t));
		break;

	case CACHEFSIO_UNPACK:
		packp = cachefs_kmem_alloc(sizeof (cachefsio_pack_t), KM_SLEEP);
		error = xcopyin((void *)arg, packp, sizeof (cachefsio_pack_t));
		if (!error)
			error = cachefs_unpack(vp, packp->p_name, cred);
		cachefs_kmem_free(packp, sizeof (cachefsio_pack_t));
		break;

	case CACHEFSIO_PACKINFO:
		packp = cachefs_kmem_alloc(sizeof (cachefsio_pack_t), KM_SLEEP);
		error = xcopyin((void *)arg, packp, sizeof (cachefsio_pack_t));
		if (!error)
			error = cachefs_packinfo(vp, packp->p_name,
			    &packp->p_status, cred);
		if (!error)
			error = xcopyout(packp, (void *)arg,
			    sizeof (cachefsio_pack_t));
		cachefs_kmem_free(packp, sizeof (cachefsio_pack_t));
		break;

	case CACHEFSIO_UNPACKALL:
		error = cachefs_unpackall(vp);
		break;

	case CACHEFSIO_DCMD:
		/*
		 * This is a private interface between the cachefsd and
		 * this file system.
		 */

		/* must be root to use these commands */
		if (secpolicy_fs_config(cred, vp->v_vfsp) != 0)
			return (EPERM);

		/* get the command packet */
		STRUCT_INIT(dcmd, flag & DATAMODEL_MASK);
		error = xcopyin((void *)arg, STRUCT_BUF(dcmd),
		    SIZEOF_STRUCT(cachefsio_dcmd, DATAMODEL_NATIVE));
		if (error)
			return (error);

		/* copy in the data for the operation */
		dinp = NULL;
		if ((inlen = STRUCT_FGET(dcmd, d_slen)) > 0) {
			dinp = cachefs_kmem_alloc(inlen, KM_SLEEP);
			error = xcopyin(STRUCT_FGETP(dcmd, d_sdata), dinp,
			    inlen);
			if (error)
				return (error);
		}

		/* allocate space for the result */
		doutp = NULL;
		if ((outlen = STRUCT_FGET(dcmd, d_rlen)) > 0)
			doutp = cachefs_kmem_alloc(outlen, KM_SLEEP);

		/*
		 * Assert NFSv4 only allows the daemonid and getstats
		 * daemon requests
		 */
		ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0 ||
		    STRUCT_FGET(dcmd, d_cmd) == CFSDCMD_DAEMONID ||
		    STRUCT_FGET(dcmd, d_cmd) == CFSDCMD_GETSTATS);

		/* get the routine to execute */
		dcmd_routine = NULL;
		switch (STRUCT_FGET(dcmd, d_cmd)) {
		case CFSDCMD_DAEMONID:
			dcmd_routine = cachefs_io_daemonid;
			break;
		case CFSDCMD_STATEGET:
			dcmd_routine = cachefs_io_stateget;
			break;
		case CFSDCMD_STATESET:
			dcmd_routine = cachefs_io_stateset;
			break;
		case CFSDCMD_XWAIT:
			dcmd_routine = cachefs_io_xwait;
			break;
		case CFSDCMD_EXISTS:
			dcmd_routine = cachefs_io_exists;
			break;
		case CFSDCMD_LOSTFOUND:
			dcmd_routine = cachefs_io_lostfound;
			break;
		case CFSDCMD_GETINFO:
			dcmd_routine = cachefs_io_getinfo;
			break;
		case CFSDCMD_CIDTOFID:
			dcmd_routine = cachefs_io_cidtofid;
			break;
		case CFSDCMD_GETATTRFID:
			dcmd_routine = cachefs_io_getattrfid;
			break;
		case CFSDCMD_GETATTRNAME:
			dcmd_routine = cachefs_io_getattrname;
			break;
		case CFSDCMD_GETSTATS:
			dcmd_routine = cachefs_io_getstats;
			break;
		case CFSDCMD_ROOTFID:
			dcmd_routine = cachefs_io_rootfid;
			break;
		case CFSDCMD_CREATE:
			dcmd_routine = cachefs_io_create;
			break;
		case CFSDCMD_REMOVE:
			dcmd_routine = cachefs_io_remove;
			break;
		case CFSDCMD_LINK:
			dcmd_routine = cachefs_io_link;
			break;
		case CFSDCMD_RENAME:
			dcmd_routine = cachefs_io_rename;
			break;
		case CFSDCMD_MKDIR:
			dcmd_routine = cachefs_io_mkdir;
			break;
		case CFSDCMD_RMDIR:
			dcmd_routine = cachefs_io_rmdir;
			break;
		case CFSDCMD_SYMLINK:
			dcmd_routine = cachefs_io_symlink;
			break;
		case CFSDCMD_SETATTR:
			dcmd_routine = cachefs_io_setattr;
			break;
		case CFSDCMD_SETSECATTR:
			dcmd_routine = cachefs_io_setsecattr;
			break;
		case CFSDCMD_PUSHBACK:
			dcmd_routine = cachefs_io_pushback;
			break;
		default:
			error = ENOTTY;
			break;
		}

		/* execute the routine */
		if (dcmd_routine)
			error = (*dcmd_routine)(vp, dinp, doutp);

		/* copy out the result */
		if ((error == 0) && doutp)
			error = xcopyout(doutp, STRUCT_FGETP(dcmd, d_rdata),
			    outlen);

		/* free allocated memory */
		if (dinp)
			cachefs_kmem_free(dinp, inlen);
		if (doutp)
			cachefs_kmem_free(doutp, outlen);

		break;

	case _FIOCOD:
		if (secpolicy_fs_config(cred, vp->v_vfsp) != 0) {
			error = EPERM;
			break;
		}

		error = EBUSY;
		if (arg) {
			/* non-zero arg means do all filesystems */
			mutex_enter(&cachefs_cachelock);
			for (cachep = cachefs_cachelist; cachep != NULL;
			    cachep = cachep->c_next) {
				mutex_enter(&cachep->c_fslistlock);
				for (fscp = cachep->c_fslist;
				    fscp != NULL;
				    fscp = fscp->fs_next) {
					if (CFS_ISFS_CODCONST(fscp)) {
						gethrestime(&fscp->fs_cod_time);
						error = 0;
					}
				}
				mutex_exit(&cachep->c_fslistlock);
			}
			mutex_exit(&cachefs_cachelock);
		} else {
			if (CFS_ISFS_CODCONST(fscp)) {
				gethrestime(&fscp->fs_cod_time);
				error = 0;
			}
		}
		break;

	case _FIOSTOPCACHE:
		error = cachefs_stop_cache(cp);
		break;

	default:
		error = ENOTTY;
		break;
	}

	/* return the result */
	return (error);
}

ino64_t
cachefs_fileno_conflict(fscache_t *fscp, ino64_t old)
{
	ino64_t new;

	ASSERT(MUTEX_HELD(&fscp->fs_fslock));

	for (;;) {
		fscp->fs_info.fi_localfileno++;
		if (fscp->fs_info.fi_localfileno == 0)
			fscp->fs_info.fi_localfileno = 3;
		fscp->fs_flags |= CFS_FS_DIRTYINFO;

		new = fscp->fs_info.fi_localfileno;
		if (! cachefs_fileno_inuse(fscp, new))
			break;
	}

	cachefs_inum_register(fscp, old, new);
	cachefs_inum_register(fscp, new, 0);
	return (new);
}

/*ARGSUSED*/
static int
cachefs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
	caller_context_t *ct)
{
	struct cnode *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int error = 0;
	int held = 0;
	int connected = 0;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_getattr: ENTER vp %p\n", (void *)vp);
#endif

	if (getzoneid() != GLOBAL_ZONEID)
		return (EPERM);

	/* Call backfilesystem getattr if NFSv4 */
	if (CFS_ISFS_BACKFS_NFSV4(fscp)) {
		error = cachefs_getattr_backfs_nfsv4(vp, vap, flags, cr, ct);
		goto out;
	}

	/*
	 * If it has been specified that the return value will
	 * just be used as a hint, and we are only being asked
	 * for size, fsid or rdevid, then return the client's
	 * notion of these values without checking to make sure
	 * that the attribute cache is up to date.
	 * The whole point is to avoid an over the wire GETATTR
	 * call.
	 */
	if (flags & ATTR_HINT) {
		if (vap->va_mask ==
		    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
			if (vap->va_mask | AT_SIZE)
				vap->va_size = cp->c_size;
			/*
			 * Return the FSID of the cachefs filesystem,
			 * not the back filesystem
			 */
			if (vap->va_mask | AT_FSID)
				vap->va_fsid = vp->v_vfsp->vfs_dev;
			if (vap->va_mask | AT_RDEV)
				vap->va_rdev = cp->c_attr.va_rdev;
			return (0);
		}
	}

	/*
	 * Only need to flush pages if asking for the mtime
	 * and if there any dirty pages.
	 */
	if (vap->va_mask & AT_MTIME) {
		/*EMPTY*/
#if 0
		/*
		 * XXX bob: stolen from nfs code, need to do something similar
		 */
		rp = VTOR(vp);
		if ((rp->r_flags & RDIRTY) || rp->r_iocnt > 0)
			(void) nfs3_putpage(vp, (offset_t)0, 0, 0, cr);
#endif
	}

	for (;;) {
		/* get (or renew) access to the file system */
		if (held) {
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 0);
		if (error)
			goto out;
		held = 1;

		/*
		 * If it has been specified that the return value will
		 * just be used as a hint, and we are only being asked
		 * for size, fsid or rdevid, then return the client's
		 * notion of these values without checking to make sure
		 * that the attribute cache is up to date.
		 * The whole point is to avoid an over the wire GETATTR
		 * call.
		 */
		if (flags & ATTR_HINT) {
			if (vap->va_mask ==
			    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
				if (vap->va_mask | AT_SIZE)
					vap->va_size = cp->c_size;
				/*
				 * Return the FSID of the cachefs filesystem,
				 * not the back filesystem
				 */
				if (vap->va_mask | AT_FSID)
					vap->va_fsid = vp->v_vfsp->vfs_dev;
				if (vap->va_mask | AT_RDEV)
					vap->va_rdev = cp->c_attr.va_rdev;
				goto out;
			}
		}

		mutex_enter(&cp->c_statelock);
		if ((cp->c_metadata.md_flags & MD_NEEDATTRS) &&
		    (fscp->fs_cdconnected != CFS_CD_CONNECTED)) {
			mutex_exit(&cp->c_statelock);
			connected = 1;
			continue;
		}

		error = CFSOP_CHECK_COBJECT(fscp, cp, 0, cr);
		if (CFS_TIMEOUT(fscp, error)) {
			mutex_exit(&cp->c_statelock);
			cachefs_cd_release(fscp);
			held = 0;
			cachefs_cd_timedout(fscp);
			continue;
		}
		if (error) {
			mutex_exit(&cp->c_statelock);
			break;
		}

		/* check for fileno conflict */
		if ((fscp->fs_inum_size > 0) &&
		    ((cp->c_metadata.md_flags & MD_LOCALFILENO) == 0)) {
			ino64_t fakenum;

			mutex_exit(&cp->c_statelock);
			mutex_enter(&fscp->fs_fslock);
			fakenum = cachefs_inum_real2fake(fscp,
			    cp->c_attr.va_nodeid);
			if (fakenum == 0) {
				fakenum = cachefs_fileno_conflict(fscp,
				    cp->c_attr.va_nodeid);
			}
			mutex_exit(&fscp->fs_fslock);

			mutex_enter(&cp->c_statelock);
			cp->c_metadata.md_flags |= MD_LOCALFILENO;
			cp->c_metadata.md_localfileno = fakenum;
			cp->c_flags |= CN_UPDATED;
		}

		/* copy out the attributes */
		*vap = cp->c_attr;

		/*
		 * return the FSID of the cachefs filesystem,
		 * not the back filesystem
		 */
		vap->va_fsid = vp->v_vfsp->vfs_dev;

		/* return our idea of the size */
		if (cp->c_size > vap->va_size)
			vap->va_size = cp->c_size;

		/* overwrite with our version of fileno and timestamps */
		vap->va_nodeid = cp->c_metadata.md_localfileno;
		vap->va_mtime = cp->c_metadata.md_localmtime;
		vap->va_ctime = cp->c_metadata.md_localctime;

		mutex_exit(&cp->c_statelock);
		break;
	}
out:
	if (held)
		cachefs_cd_release(fscp);
#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_getattr: EXIT error = %d\n", error);
#endif
	return (error);
}

/*
 * cachefs_getattr_backfs_nfsv4
 *
 * Call NFSv4 back filesystem to handle the getattr (cachefs
 * pass-through support for NFSv4).
 */
static int
cachefs_getattr_backfs_nfsv4(vnode_t *vp, vattr_t *vap,
    int flags, cred_t *cr, caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	vnode_t *backvp;
	int error;

	/*
	 * For NFSv4 pass-through to work, only connected operation
	 * is supported, the cnode backvp must exist, and cachefs
	 * optional (eg., disconnectable) flags are turned off. Assert
	 * these conditions for the getattr operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	/* Call backfs vnode op after extracting backvp */
	mutex_enter(&cp->c_statelock);
	backvp = cp->c_backvp;
	mutex_exit(&cp->c_statelock);

	CFS_DPRINT_BACKFS_NFSV4(fscp, ("cachefs_getattr_backfs_nfsv4: cnode %p,"
	    " backvp %p\n", cp, backvp));
	error = VOP_GETATTR(backvp, vap, flags, cr, ct);

	/* Update attributes */
	cp->c_attr = *vap;

	/*
	 * return the FSID of the cachefs filesystem,
	 * not the back filesystem
	 */
	vap->va_fsid = vp->v_vfsp->vfs_dev;

	return (error);
}

/*ARGSUSED4*/
static int
cachefs_setattr(
	vnode_t *vp,
	vattr_t *vap,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int error;
	int connected;
	int held = 0;

	if (getzoneid() != GLOBAL_ZONEID)
		return (EPERM);

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the setattr operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	connected = 0;
	for (;;) {
		/* drop hold on file system */
		if (held) {
			/* Won't loop with NFSv4 connected behavior */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			cachefs_cd_release(fscp);
			held = 0;
		}

		/* acquire access to the file system */
		error = cachefs_cd_access(fscp, connected, 1);
		if (error)
			break;
		held = 1;

		/* perform the setattr */
		error = cachefs_setattr_common(vp, vap, flags, cr, ct);
		if (error) {
			/* if connected */
			if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
				if (CFS_TIMEOUT(fscp, error)) {
					cachefs_cd_release(fscp);
					held = 0;
					cachefs_cd_timedout(fscp);
					connected = 0;
					continue;
				}
			}

			/* else must be disconnected */
			else {
				if (CFS_TIMEOUT(fscp, error)) {
					connected = 1;
					continue;
				}
			}
		}
		break;
	}

	if (held) {
		cachefs_cd_release(fscp);
	}
#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
	return (error);
}

static int
cachefs_setattr_common(
	vnode_t *vp,
	vattr_t *vap,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	cachefscache_t *cachep = fscp->fs_cache;
	uint_t mask = vap->va_mask;
	int error = 0;
	uint_t bcnt;

	/* Cannot set these attributes. */
	if (mask & AT_NOSET)
		return (EINVAL);

	/*
	 * Truncate file.  Must have write permission and not be a directory.
	 */
	if (mask & AT_SIZE) {
		if (vp->v_type == VDIR) {
			if (CACHEFS_LOG_LOGGING(cachep, CACHEFS_LOG_TRUNCATE))
				cachefs_log_truncate(cachep, EISDIR,
				    fscp->fs_cfsvfsp,
				    &cp->c_metadata.md_cookie,
				    cp->c_id.cid_fileno,
				    crgetuid(cr), vap->va_size);
			return (EISDIR);
		}
	}

	/*
	 * Gotta deal with one special case here, where we're setting the
	 * size of the file. First, we zero out part of the page after the
	 * new size of the file. Then we toss (not write) all pages after
	 * page in which the new offset occurs. Note that the NULL passed
	 * in instead of a putapage() fn parameter is correct, since
	 * no dirty pages will be found (B_TRUNC | B_INVAL).
	 */

	rw_enter(&cp->c_rwlock, RW_WRITER);

	/* sync dirty pages */
	if (!CFS_ISFS_BACKFS_NFSV4(fscp)) {
		error = cachefs_putpage_common(vp, (offset_t)0, 0, 0, cr);
		if (error == EINTR)
			goto out;
	}
	error = 0;

	/* if connected */
	if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
		error = cachefs_setattr_connected(vp, vap, flags, cr, ct);
	}
	/* else must be disconnected */
	else {
		error = cachefs_setattr_disconnected(vp, vap, flags, cr, ct);
	}
	if (error)
		goto out;

	/*
	 * If the file size has been changed then
	 * toss whole pages beyond the end of the file and zero
	 * the portion of the last page that is beyond the end of the file.
	 */
	if (mask & AT_SIZE && !CFS_ISFS_BACKFS_NFSV4(fscp)) {
		bcnt = (uint_t)(cp->c_size & PAGEOFFSET);
		if (bcnt)
			pvn_vpzero(vp, cp->c_size, PAGESIZE - bcnt);
		(void) pvn_vplist_dirty(vp, cp->c_size, cachefs_push,
		    B_TRUNC | B_INVAL, cr);
	}

out:
	rw_exit(&cp->c_rwlock);

	if ((mask & AT_SIZE) &&
	    (CACHEFS_LOG_LOGGING(cachep, CACHEFS_LOG_TRUNCATE)))
		cachefs_log_truncate(cachep, error, fscp->fs_cfsvfsp,
		    &cp->c_metadata.md_cookie, cp->c_id.cid_fileno,
		    crgetuid(cr), vap->va_size);

	return (error);
}

static int
cachefs_setattr_connected(
	vnode_t *vp,
	vattr_t *vap,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	uint_t mask = vap->va_mask;
	int error = 0;
	int setsize;

	mutex_enter(&cp->c_statelock);

	if (cp->c_backvp == NULL) {
		error = cachefs_getbackvp(fscp, cp);
		if (error)
			goto out;
	}

	error = CFSOP_CHECK_COBJECT(fscp, cp, 0, cr);
	if (error)
		goto out;

	CFS_DPRINT_BACKFS_NFSV4(fscp, ("cachefs_setattr (nfsv4): cnode %p, "
	    "backvp %p\n", cp, cp->c_backvp));
	error = VOP_SETATTR(cp->c_backvp, vap, flags, cr, ct);
	if (error) {
		goto out;
	}

	/* if the size of the file is being changed */
	if (mask & AT_SIZE) {
		cp->c_size = vap->va_size;
		error = 0;
		setsize = 0;

		/* see if okay to try to set the file size */
		if (((cp->c_flags & CN_NOCACHE) == 0) &&
		    CFS_ISFS_NONSHARED(fscp)) {
			/* okay to set size if file is populated */
			if (cp->c_metadata.md_flags & MD_POPULATED)
				setsize = 1;

			/*
			 * Okay to set size if front file exists and setting
			 * file size to zero.
			 */
			if ((cp->c_metadata.md_flags & MD_FILE) &&
			    (vap->va_size == 0))
				setsize = 1;
		}

		/* if okay to try to set the file size */
		if (setsize) {
			error = 0;
			if (cp->c_frontvp == NULL)
				error = cachefs_getfrontfile(cp);
			if (error == 0)
				error = cachefs_frontfile_size(cp, cp->c_size);
		} else if (cp->c_metadata.md_flags & MD_FILE) {
			/* make sure file gets nocached */
			error = EEXIST;
		}

		/* if we have to nocache the file */
		if (error) {
			if ((cp->c_flags & CN_NOCACHE) == 0 &&
			    !CFS_ISFS_BACKFS_NFSV4(fscp))
				cachefs_nocache(cp);
			error = 0;
		}
	}

	cp->c_flags |= CN_UPDATED;

	/* XXX bob: given what modify_cobject does this seems unnecessary */
	cp->c_attr.va_mask = AT_ALL;
	error = VOP_GETATTR(cp->c_backvp, &cp->c_attr, 0, cr, ct);
	if (error)
		goto out;

	cp->c_attr.va_size = MAX(cp->c_attr.va_size, cp->c_size);
	cp->c_size = cp->c_attr.va_size;

	CFSOP_MODIFY_COBJECT(fscp, cp, cr);
out:
	mutex_exit(&cp->c_statelock);
	return (error);
}

/*
 * perform the setattr on the local file system
 */
/*ARGSUSED4*/
static int
cachefs_setattr_disconnected(
	vnode_t *vp,
	vattr_t *vap,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int mask;
	int error;
	int newfile;
	off_t commit = 0;

	if (CFS_ISFS_WRITE_AROUND(fscp))
		return (ETIMEDOUT);

	/* if we do not have good attributes */
	if (cp->c_metadata.md_flags & MD_NEEDATTRS)
		return (ETIMEDOUT);

	/* primary concern is to keep this routine as much like ufs_setattr */

	mutex_enter(&cp->c_statelock);

	error = secpolicy_vnode_setattr(cr, vp, vap, &cp->c_attr, flags,
	    cachefs_access_local, cp);

	if (error)
		goto out;

	mask = vap->va_mask;

	/* if changing the size of the file */
	if (mask & AT_SIZE) {
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto out;
		}

		if (vp->v_type == VFIFO) {
			error = 0;
			goto out;
		}

		if ((vp->v_type != VREG) &&
		    !((vp->v_type == VLNK) && (vap->va_size == 0))) {
			error = EINVAL;
			goto out;
		}

		if (vap->va_size > fscp->fs_offmax) {
			error = EFBIG;
			goto out;
		}

		/* if the file is not populated and we are not truncating it */
		if (((cp->c_metadata.md_flags & MD_POPULATED) == 0) &&
		    (vap->va_size != 0)) {
			error = ETIMEDOUT;
			goto out;
		}

		if ((cp->c_metadata.md_flags & MD_MAPPING) == 0) {
			error = cachefs_dlog_cidmap(fscp);
			if (error) {
				error = ENOSPC;
				goto out;
			}
			cp->c_metadata.md_flags |= MD_MAPPING;
		}

		/* log the operation */
		commit = cachefs_dlog_setattr(fscp, vap, flags, cp, cr);
		if (commit == 0) {
			error = ENOSPC;
			goto out;
		}
		cp->c_flags &= ~CN_NOCACHE;

		/* special case truncating fast sym links */
		if ((vp->v_type == VLNK) &&
		    (cp->c_metadata.md_flags & MD_FASTSYMLNK)) {
			/* XXX how can we get here */
			/* XXX should update mtime */
			cp->c_size = 0;
			error = 0;
			goto out;
		}

		/* get the front file, this may create one */
		newfile = (cp->c_metadata.md_flags & MD_FILE) ? 0 : 1;
		if (cp->c_frontvp == NULL) {
			error = cachefs_getfrontfile(cp);
			if (error)
				goto out;
		}
		ASSERT(cp->c_frontvp);
		if (newfile && (cp->c_flags & CN_UPDATED)) {
			/* allocate space for the metadata */
			ASSERT((cp->c_flags & CN_ALLOC_PENDING) == 0);
			ASSERT((cp->c_filegrp->fg_flags & CFS_FG_ALLOC_ATTR)
			    == 0);
			error = filegrp_write_metadata(cp->c_filegrp,
			    &cp->c_id, &cp->c_metadata);
			if (error)
				goto out;
		}

		/* change the size of the front file */
		error = cachefs_frontfile_size(cp, vap->va_size);
		if (error)
			goto out;
		cp->c_attr.va_size = cp->c_size = vap->va_size;
		gethrestime(&cp->c_metadata.md_localmtime);
		cp->c_metadata.md_flags |= MD_POPULATED | MD_LOCALMTIME;
		cachefs_modified(cp);
		cp->c_flags |= CN_UPDATED;
	}

	if (mask & AT_MODE) {
		/* mark as modified */
		if (cachefs_modified_alloc(cp)) {
			error = ENOSPC;
			goto out;
		}

		if ((cp->c_metadata.md_flags & MD_MAPPING) == 0) {
			error = cachefs_dlog_cidmap(fscp);
			if (error) {
				error = ENOSPC;
				goto out;
			}
			cp->c_metadata.md_flags |= MD_MAPPING;
		}

		/* log the operation if not already logged */
		if (commit == 0) {
			commit = cachefs_dlog_setattr(fscp, vap, flags, cp, cr);
			if (commit == 0) {
				error = ENOSPC;
				goto out;
			}
		}

		cp->c_attr.va_mode &= S_IFMT;
		cp->c_attr.va_mode |= vap->va_mode & ~S_IFMT;
		gethrestime(&cp->c_metadata.md_localctime);
		cp->c_metadata.md_flags |= MD_LOCALCTIME;
		cp->c_flags |= CN_UPDATED;
	}

	if (mask & (AT_UID|AT_GID)) {

		/* mark as modified */
		if (cachefs_modified_alloc(cp)) {
			error = ENOSPC;
			goto out;
		}

		if ((cp->c_metadata.md_flags & MD_MAPPING) == 0) {
			error = cachefs_dlog_cidmap(fscp);
			if (error) {
				error = ENOSPC;
				goto out;
			}
			cp->c_metadata.md_flags |= MD_MAPPING;
		}

		/* log the operation if not already logged */
		if (commit == 0) {
			commit = cachefs_dlog_setattr(fscp, vap, flags, cp, cr);
			if (commit == 0) {
				error = ENOSPC;
				goto out;
			}
		}

		if (mask & AT_UID)
			cp->c_attr.va_uid = vap->va_uid;

		if (mask & AT_GID)
			cp->c_attr.va_gid = vap->va_gid;
		gethrestime(&cp->c_metadata.md_localctime);
		cp->c_metadata.md_flags |= MD_LOCALCTIME;
		cp->c_flags |= CN_UPDATED;
	}


	if (mask & (AT_MTIME|AT_ATIME)) {
		/* mark as modified */
		if (cachefs_modified_alloc(cp)) {
			error = ENOSPC;
			goto out;
		}

		if ((cp->c_metadata.md_flags & MD_MAPPING) == 0) {
			error = cachefs_dlog_cidmap(fscp);
			if (error) {
				error = ENOSPC;
				goto out;
			}
			cp->c_metadata.md_flags |= MD_MAPPING;
		}

		/* log the operation if not already logged */
		if (commit == 0) {
			commit = cachefs_dlog_setattr(fscp, vap, flags, cp, cr);
			if (commit == 0) {
				error = ENOSPC;
				goto out;
			}
		}

		if (mask & AT_MTIME) {
			cp->c_metadata.md_localmtime = vap->va_mtime;
			cp->c_metadata.md_flags |= MD_LOCALMTIME;
		}
		if (mask & AT_ATIME)
			cp->c_attr.va_atime = vap->va_atime;
		gethrestime(&cp->c_metadata.md_localctime);
		cp->c_metadata.md_flags |= MD_LOCALCTIME;
		cp->c_flags |= CN_UPDATED;
	}

out:
	mutex_exit(&cp->c_statelock);

	/* commit the log entry */
	if (commit) {
		if (cachefs_dlog_commit(fscp, commit, error)) {
			/*EMPTY*/
			/* XXX bob: fix on panic */
		}
	}
	return (error);
}

/* ARGSUSED */
static int
cachefs_access(vnode_t *vp, int mode, int flags, cred_t *cr,
	caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int error;
	int held = 0;
	int connected = 0;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_access: ENTER vp %p\n", (void *)vp);
#endif
	if (getzoneid() != GLOBAL_ZONEID) {
		error = EPERM;
		goto out;
	}

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the access operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	for (;;) {
		/* get (or renew) access to the file system */
		if (held) {
			/* Won't loop with NFSv4 connected behavior */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 0);
		if (error)
			break;
		held = 1;

		if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
			error = cachefs_access_connected(vp, mode, flags,
			    cr);
			if (CFS_TIMEOUT(fscp, error)) {
				cachefs_cd_release(fscp);
				held = 0;
				cachefs_cd_timedout(fscp);
				connected = 0;
				continue;
			}
		} else {
			mutex_enter(&cp->c_statelock);
			error = cachefs_access_local(cp, mode, cr);
			mutex_exit(&cp->c_statelock);
			if (CFS_TIMEOUT(fscp, error)) {
				if (cachefs_cd_access_miss(fscp)) {
					mutex_enter(&cp->c_statelock);
					if (cp->c_backvp == NULL) {
						(void) cachefs_getbackvp(fscp,
						    cp);
					}
					mutex_exit(&cp->c_statelock);
					error = cachefs_access_connected(vp,
					    mode, flags, cr);
					if (!CFS_TIMEOUT(fscp, error))
						break;
					delay(5*hz);
					connected = 0;
					continue;
				}
				connected = 1;
				continue;
			}
		}
		break;
	}
	if (held)
		cachefs_cd_release(fscp);
#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
out:
#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_access: EXIT error = %d\n", error);
#endif
	return (error);
}

static int
cachefs_access_connected(struct vnode *vp, int mode, int flags, cred_t *cr)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int error = 0;

	mutex_enter(&cp->c_statelock);

	/* Make sure the cnode attrs are valid first. */
	error = CFSOP_CHECK_COBJECT(fscp, cp, 0, cr);
	if (error)
		goto out;

	/* see if can do a local file system check */
	if ((fscp->fs_info.fi_mntflags & CFS_ACCESS_BACKFS) == 0 &&
	    !CFS_ISFS_BACKFS_NFSV4(fscp)) {
		error = cachefs_access_local(cp, mode, cr);
		goto out;
	}

	/* else do a remote file system check */
	else {
		if (cp->c_backvp == NULL) {
			error = cachefs_getbackvp(fscp, cp);
			if (error)
				goto out;
		}

		CFS_DPRINT_BACKFS_NFSV4(fscp,
		    ("cachefs_access (nfsv4): cnode %p, backvp %p\n",
		    cp, cp->c_backvp));
		error = VOP_ACCESS(cp->c_backvp, mode, flags, cr, NULL);

		/*
		 * even though we don't `need' the ACL to do access
		 * via the backvp, we should cache it here to make our
		 * behavior more reasonable if we go disconnected.
		 */

		if (((fscp->fs_info.fi_mntflags & CFS_NOACL) == 0) &&
		    (cachefs_vtype_aclok(vp)) &&
		    ((cp->c_flags & CN_NOCACHE) == 0) &&
		    (!CFS_ISFS_BACKFS_NFSV4(fscp)) &&
		    ((cp->c_metadata.md_flags & MD_ACL) == 0))
			(void) cachefs_cacheacl(cp, NULL);
	}
out:
	/*
	 * If NFS returned ESTALE, mark this cnode as stale, so that
	 * the vn_open retry will read the file anew from backfs
	 */
	if (error == ESTALE)
		cachefs_cnode_stale(cp);

	mutex_exit(&cp->c_statelock);
	return (error);
}

/*
 * CFS has a fastsymlink scheme. If the size of the link is < C_FSL_SIZE, then
 * the link is placed in the metadata itself (no front file is allocated).
 */
/*ARGSUSED*/
static int
cachefs_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct)
{
	int error = 0;
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	cachefscache_t *cachep = fscp->fs_cache;
	int held = 0;
	int connected = 0;

	if (getzoneid() != GLOBAL_ZONEID)
		return (EPERM);

	if (vp->v_type != VLNK)
		return (EINVAL);

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the readlink operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	for (;;) {
		/* get (or renew) access to the file system */
		if (held) {
			/* Won't loop with NFSv4 connected behavior */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 0);
		if (error)
			break;
		held = 1;

		if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
			/*
			 * since readlink_connected will call stuffsymlink
			 * on success, have to serialize access
			 */
			if (!rw_tryenter(&cp->c_rwlock, RW_WRITER)) {
				cachefs_cd_release(fscp);
				rw_enter(&cp->c_rwlock, RW_WRITER);
				error = cachefs_cd_access(fscp, connected, 0);
				if (error) {
					held = 0;
					rw_exit(&cp->c_rwlock);
					break;
				}
			}
			error = cachefs_readlink_connected(vp, uiop, cr);
			rw_exit(&cp->c_rwlock);
			if (CFS_TIMEOUT(fscp, error)) {
				cachefs_cd_release(fscp);
				held = 0;
				cachefs_cd_timedout(fscp);
				connected = 0;
				continue;
			}
		} else {
			error = cachefs_readlink_disconnected(vp, uiop);
			if (CFS_TIMEOUT(fscp, error)) {
				if (cachefs_cd_access_miss(fscp)) {
					/* as above */
					if (!rw_tryenter(&cp->c_rwlock,
					    RW_WRITER)) {
						cachefs_cd_release(fscp);
						rw_enter(&cp->c_rwlock,
						    RW_WRITER);
						error = cachefs_cd_access(fscp,
						    connected, 0);
						if (error) {
							held = 0;
							rw_exit(&cp->c_rwlock);
							break;
						}
					}
					error = cachefs_readlink_connected(vp,
					    uiop, cr);
					rw_exit(&cp->c_rwlock);
					if (!CFS_TIMEOUT(fscp, error))
						break;
					delay(5*hz);
					connected = 0;
					continue;
				}
				connected = 1;
				continue;
			}
		}
		break;
	}
	if (CACHEFS_LOG_LOGGING(cachep, CACHEFS_LOG_READLINK))
		cachefs_log_readlink(cachep, error, fscp->fs_cfsvfsp,
		    &cp->c_metadata.md_cookie, cp->c_id.cid_fileno,
		    crgetuid(cr), cp->c_size);

	if (held)
		cachefs_cd_release(fscp);
#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif

	/*
	 * The over the wire error for attempting to readlink something
	 * other than a symbolic link is ENXIO.  However, we need to
	 * return EINVAL instead of ENXIO, so we map it here.
	 */
	return (error == ENXIO ? EINVAL : error);
}

static int
cachefs_readlink_connected(vnode_t *vp, uio_t *uiop, cred_t *cr)
{
	int error;
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	caddr_t buf;
	int buflen;
	int readcache = 0;

	mutex_enter(&cp->c_statelock);

	error = CFSOP_CHECK_COBJECT(fscp, cp, 0, cr);
	if (error)
		goto out;

	/* if the sym link is cached as a fast sym link */
	if (cp->c_metadata.md_flags & MD_FASTSYMLNK) {
		ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
		error = uiomove(cp->c_metadata.md_allocinfo,
		    MIN(cp->c_size, uiop->uio_resid), UIO_READ, uiop);
#ifdef CFSDEBUG
		readcache = 1;
		goto out;
#else /* CFSDEBUG */
		/* XXX KLUDGE! correct for insidious 0-len symlink */
		if (cp->c_size != 0) {
			readcache = 1;
			goto out;
		}
#endif /* CFSDEBUG */
	}

	/* if the sym link is cached in a front file */
	if (cp->c_metadata.md_flags & MD_POPULATED) {
		ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
		ASSERT(cp->c_metadata.md_flags & MD_FILE);
		if (cp->c_frontvp == NULL) {
			(void) cachefs_getfrontfile(cp);
		}
		if (cp->c_metadata.md_flags & MD_POPULATED) {
			/* read symlink data from frontfile */
			uiop->uio_offset = 0;
			(void) VOP_RWLOCK(cp->c_frontvp,
			    V_WRITELOCK_FALSE, NULL);
			error = VOP_READ(cp->c_frontvp, uiop, 0, kcred, NULL);
			VOP_RWUNLOCK(cp->c_frontvp, V_WRITELOCK_FALSE, NULL);

			/* XXX KLUDGE! correct for insidious 0-len symlink */
			if (cp->c_size != 0) {
				readcache = 1;
				goto out;
			}
		}
	}

	/* get the sym link contents from the back fs */
	error = cachefs_readlink_back(cp, cr, &buf, &buflen);
	if (error)
		goto out;

	/* copy the contents out to the user */
	error = uiomove(buf, MIN(buflen, uiop->uio_resid), UIO_READ, uiop);

	/*
	 * try to cache the sym link, note that its a noop if NOCACHE is set
	 * or if NFSv4 pass-through is enabled.
	 */
	if (cachefs_stuffsymlink(cp, buf, buflen)) {
		cachefs_nocache(cp);
	}

	cachefs_kmem_free(buf, MAXPATHLEN);

out:
	mutex_exit(&cp->c_statelock);
	if (error == 0) {
		if (readcache)
			fscp->fs_stats.st_hits++;
		else
			fscp->fs_stats.st_misses++;
	}
	return (error);
}

static int
cachefs_readlink_disconnected(vnode_t *vp, uio_t *uiop)
{
	int error;
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int readcache = 0;

	mutex_enter(&cp->c_statelock);

	/* if the sym link is cached as a fast sym link */
	if (cp->c_metadata.md_flags & MD_FASTSYMLNK) {
		error = uiomove(cp->c_metadata.md_allocinfo,
		    MIN(cp->c_size, uiop->uio_resid), UIO_READ, uiop);
		readcache = 1;
		goto out;
	}

	/* if the sym link is cached in a front file */
	if (cp->c_metadata.md_flags & MD_POPULATED) {
		ASSERT(cp->c_metadata.md_flags & MD_FILE);
		if (cp->c_frontvp == NULL) {
			(void) cachefs_getfrontfile(cp);
		}
		if (cp->c_metadata.md_flags & MD_POPULATED) {
			/* read symlink data from frontfile */
			uiop->uio_offset = 0;
			(void) VOP_RWLOCK(cp->c_frontvp,
			    V_WRITELOCK_FALSE, NULL);
			error = VOP_READ(cp->c_frontvp, uiop, 0, kcred, NULL);
			VOP_RWUNLOCK(cp->c_frontvp, V_WRITELOCK_FALSE, NULL);
			readcache = 1;
			goto out;
		}
	}
	error = ETIMEDOUT;

out:
	mutex_exit(&cp->c_statelock);
	if (error == 0) {
		if (readcache)
			fscp->fs_stats.st_hits++;
		else
			fscp->fs_stats.st_misses++;
	}
	return (error);
}

/*ARGSUSED*/
static int
cachefs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	int error = 0;
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int held = 0;
	int connected = 0;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_fsync: ENTER vp %p\n", (void *)vp);
#endif

	if (getzoneid() != GLOBAL_ZONEID) {
		error = EPERM;
		goto out;
	}

	if (fscp->fs_backvfsp && fscp->fs_backvfsp->vfs_flag & VFS_RDONLY)
		goto out;

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the fsync operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	for (;;) {
		/* get (or renew) access to the file system */
		if (held) {
			/* Won't loop with NFSv4 connected behavior */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 1);
		if (error)
			break;
		held = 1;
		connected = 0;

		/* if a regular file, write out the pages */
		if ((vp->v_type == VREG) && vn_has_cached_data(vp) &&
		    !CFS_ISFS_BACKFS_NFSV4(fscp)) {
			error = cachefs_putpage_common(vp, (offset_t)0,
			    0, 0, cr);
			if (CFS_TIMEOUT(fscp, error)) {
				if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
					cachefs_cd_release(fscp);
					held = 0;
					cachefs_cd_timedout(fscp);
					continue;
				} else {
					connected = 1;
					continue;
				}
			}

			/* if no space left in cache, wait until connected */
			if ((error == ENOSPC) &&
			    (fscp->fs_cdconnected != CFS_CD_CONNECTED)) {
				connected = 1;
				continue;
			}

			/* clear the cnode error if putpage worked */
			if ((error == 0) && cp->c_error) {
				mutex_enter(&cp->c_statelock);
				cp->c_error = 0;
				mutex_exit(&cp->c_statelock);
			}

			if (error)
				break;
		}

		/* if connected, sync the backvp */
		if ((fscp->fs_cdconnected == CFS_CD_CONNECTED) &&
		    cp->c_backvp) {
			mutex_enter(&cp->c_statelock);
			if (cp->c_backvp) {
				CFS_DPRINT_BACKFS_NFSV4(fscp,
				    ("cachefs_fsync (nfsv4): cnode %p, "
				    "backvp %p\n", cp, cp->c_backvp));
				error = VOP_FSYNC(cp->c_backvp, syncflag, cr,
				    ct);
				if (CFS_TIMEOUT(fscp, error)) {
					mutex_exit(&cp->c_statelock);
					cachefs_cd_release(fscp);
					held = 0;
					cachefs_cd_timedout(fscp);
					continue;
				} else if (error && (error != EINTR))
					cp->c_error = error;
			}
			mutex_exit(&cp->c_statelock);
		}

		/* sync the metadata and the front file to the front fs */
		if (!CFS_ISFS_BACKFS_NFSV4(fscp)) {
			error = cachefs_sync_metadata(cp);
			if (error &&
			    (fscp->fs_cdconnected == CFS_CD_CONNECTED))
				error = 0;
		}
		break;
	}

	if (error == 0)
		error = cp->c_error;

	if (held)
		cachefs_cd_release(fscp);

out:
#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_fsync: EXIT vp %p\n", (void *)vp);
#endif
	return (error);
}

/*
 * Called from cachefs_inactive(), to make sure all the data goes out to disk.
 */
int
cachefs_sync_metadata(cnode_t *cp)
{
	int error = 0;
	struct filegrp *fgp;
	struct vattr va;
	fscache_t *fscp = C_TO_FSCACHE(cp);

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("c_sync_metadata: ENTER cp %p cflag %x\n",
		    (void *)cp, cp->c_flags);
#endif

	mutex_enter(&cp->c_statelock);
	if ((cp->c_flags & CN_UPDATED) == 0)
		goto out;
	if (cp->c_flags & (CN_STALE | CN_DESTROY))
		goto out;
	fgp = cp->c_filegrp;
	if ((fgp->fg_flags & CFS_FG_WRITE) == 0)
		goto out;
	if (CFS_ISFS_BACKFS_NFSV4(fscp))
		goto out;

	if (fgp->fg_flags & CFS_FG_ALLOC_ATTR) {
		mutex_exit(&cp->c_statelock);
		error = filegrp_allocattr(fgp);
		mutex_enter(&cp->c_statelock);
		if (error) {
			error = 0;
			goto out;
		}
	}

	if (cp->c_flags & CN_ALLOC_PENDING) {
		error = filegrp_create_metadata(fgp, &cp->c_metadata,
		    &cp->c_id);
		if (error)
			goto out;
		cp->c_flags &= ~CN_ALLOC_PENDING;
	}

	if (cp->c_flags & CN_NEED_FRONT_SYNC) {
		if (cp->c_frontvp != NULL) {
			error = VOP_FSYNC(cp->c_frontvp, FSYNC, kcred, NULL);
			if (error) {
				cp->c_metadata.md_timestamp.tv_sec = 0;
			} else {
				va.va_mask = AT_MTIME;
				error = VOP_GETATTR(cp->c_frontvp, &va, 0,
				    kcred, NULL);
				if (error)
					goto out;
				cp->c_metadata.md_timestamp = va.va_mtime;
				cp->c_flags &=
				    ~(CN_NEED_FRONT_SYNC |
				    CN_POPULATION_PENDING);
			}
		} else {
			cp->c_flags &=
			    ~(CN_NEED_FRONT_SYNC | CN_POPULATION_PENDING);
		}
	}

	/*
	 * XXX tony: How can CN_ALLOC_PENDING still be set??
	 * XXX tony: How can CN_UPDATED not be set?????
	 */
	if ((cp->c_flags & CN_ALLOC_PENDING) == 0 &&
	    (cp->c_flags & CN_UPDATED)) {
		error = filegrp_write_metadata(fgp, &cp->c_id,
		    &cp->c_metadata);
		if (error)
			goto out;
	}
out:
	if (error) {
		/* XXX modified files? */
		if (cp->c_metadata.md_rlno) {
			cachefs_removefrontfile(&cp->c_metadata,
			    &cp->c_id, fgp);
			cachefs_rlent_moveto(C_TO_FSCACHE(cp)->fs_cache,
			    CACHEFS_RL_FREE, cp->c_metadata.md_rlno, 0);
			cp->c_metadata.md_rlno = 0;
			cp->c_metadata.md_rltype = CACHEFS_RL_NONE;
			if (cp->c_frontvp) {
				VN_RELE(cp->c_frontvp);
				cp->c_frontvp = NULL;
			}
		}
		if ((cp->c_flags & CN_ALLOC_PENDING) == 0)
			(void) filegrp_destroy_metadata(fgp, &cp->c_id);
		cp->c_flags |= CN_ALLOC_PENDING;
		cachefs_nocache(cp);
	}
	/*
	 * we clear the updated bit even on errors because a retry
	 * will probably fail also.
	 */
	cp->c_flags &= ~CN_UPDATED;
	mutex_exit(&cp->c_statelock);

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("c_sync_metadata: EXIT cp %p cflag %x\n",
		    (void *)cp, cp->c_flags);
#endif

	return (error);
}

/*
 * This is the vop entry point for inactivating a vnode.
 * It just queues the request for the async thread which
 * calls cachefs_inactive.
 * Because of the dnlc, it is not safe to grab most locks here.
 */
/*ARGSUSED*/
static void
cachefs_inactive(struct vnode *vp, cred_t *cr, caller_context_t *ct)
{
	cnode_t *cp;
	struct cachefs_req *rp;
	fscache_t *fscp;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_inactive: ENTER vp %p\n", (void *)vp);
#endif

	cp = VTOC(vp);
	fscp = C_TO_FSCACHE(cp);

	ASSERT((cp->c_flags & CN_IDLE) == 0);

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the inactive operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	/* vn_rele() set the v_count == 1 */

	cp->c_ipending = 1;

	rp = kmem_cache_alloc(cachefs_req_cache, KM_SLEEP);
	rp->cfs_cmd = CFS_IDLE;
	rp->cfs_cr = cr;
	crhold(rp->cfs_cr);
	rp->cfs_req_u.cu_idle.ci_vp = vp;
	cachefs_addqueue(rp, &(C_TO_FSCACHE(cp)->fs_workq));

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_inactive: EXIT vp %p\n", (void *)vp);
#endif
}

/* ARGSUSED */
static int
cachefs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
    struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
    caller_context_t *ct, int *direntflags, pathname_t *realpnp)

{
	int error = 0;
	cnode_t *dcp = VTOC(dvp);
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	int held = 0;
	int connected = 0;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_lookup: ENTER dvp %p nm %s\n", (void *)dvp, nm);
#endif

	if (getzoneid() != GLOBAL_ZONEID) {
		error = EPERM;
		goto out;
	}

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the lookup operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(dcp);

	for (;;) {
		/* get (or renew) access to the file system */
		if (held) {
			/* Won't loop with NFSv4 connected behavior */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 0);
		if (error)
			break;
		held = 1;

		error = cachefs_lookup_common(dvp, nm, vpp, pnp,
			flags, rdir, cr);
		if (CFS_TIMEOUT(fscp, error)) {
			if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
				cachefs_cd_release(fscp);
				held = 0;
				cachefs_cd_timedout(fscp);
				connected = 0;
				continue;
			} else {
				if (cachefs_cd_access_miss(fscp)) {
					rw_enter(&dcp->c_rwlock, RW_READER);
					error = cachefs_lookup_back(dvp, nm,
					    vpp, cr);
					rw_exit(&dcp->c_rwlock);
					if (!CFS_TIMEOUT(fscp, error))
						break;
					delay(5*hz);
					connected = 0;
					continue;
				}
				connected = 1;
				continue;
			}
		}
		break;
	}
	if (held)
		cachefs_cd_release(fscp);

	if (error == 0 && IS_DEVVP(*vpp)) {
		struct vnode *newvp;
		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		VN_RELE(*vpp);
		if (newvp == NULL) {
			error = ENOSYS;
		} else {
			*vpp = newvp;
		}
	}

#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
out:
#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_lookup: EXIT error = %d\n", error);
#endif

	return (error);
}

/* ARGSUSED */
int
cachefs_lookup_common(vnode_t *dvp, char *nm, vnode_t **vpp,
    struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr)
{
	int error = 0;
	cnode_t *cp, *dcp = VTOC(dvp);
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	struct fid cookie;
	u_offset_t d_offset;
	struct cachefs_req *rp;
	cfs_cid_t cid, dircid;
	uint_t flag;
	uint_t uncached = 0;

	*vpp = NULL;

	/*
	 * If lookup is for "", just return dvp.  Don't need
	 * to send it over the wire, look it up in the dnlc,
	 * or perform any access checks.
	 */
	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/* can't do lookups in non-directories */
	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/* perform access check, also does consistency check if connected */
	if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
		error = cachefs_access_connected(dvp, VEXEC, 0, cr);
	} else {
		mutex_enter(&dcp->c_statelock);
		error = cachefs_access_local(dcp, VEXEC, cr);
		mutex_exit(&dcp->c_statelock);
	}
	if (error)
		return (error);

	/*
	 * If lookup is for ".", just return dvp.  Don't need
	 * to send it over the wire or look it up in the dnlc,
	 * just need to check access.
	 */
	if (strcmp(nm, ".") == 0) {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/* check the dnlc */
	*vpp = (vnode_t *)dnlc_lookup(dvp, nm);
	if (*vpp)
		return (0);

	/* read lock the dir before starting the search */
	rw_enter(&dcp->c_rwlock, RW_READER);

	mutex_enter(&dcp->c_statelock);
	dircid = dcp->c_id;

	dcp->c_usage++;

	/* if front file is not usable, lookup on the back fs */
	if ((dcp->c_flags & (CN_NOCACHE | CN_ASYNC_POPULATE)) ||
	    CFS_ISFS_BACKFS_NFSV4(fscp) ||
	    ((dcp->c_filegrp->fg_flags & CFS_FG_READ) == 0)) {
		mutex_exit(&dcp->c_statelock);
		if (fscp->fs_cdconnected == CFS_CD_CONNECTED)
			error = cachefs_lookup_back(dvp, nm, vpp, cr);
		else
			error = ETIMEDOUT;
		goto out;
	}

	/* if the front file is not populated, try to populate it */
	if ((dcp->c_metadata.md_flags & MD_POPULATED) == 0) {
		if (fscp->fs_cdconnected != CFS_CD_CONNECTED) {
			error = ETIMEDOUT;
			mutex_exit(&dcp->c_statelock);
			goto out;
		}

		if (cachefs_async_okay()) {
			/* cannot populate if cache is not writable */
			ASSERT((dcp->c_flags &
			    (CN_ASYNC_POPULATE | CN_NOCACHE)) == 0);
			dcp->c_flags |= CN_ASYNC_POPULATE;

			rp = kmem_cache_alloc(cachefs_req_cache, KM_SLEEP);
			rp->cfs_cmd = CFS_POPULATE;
			rp->cfs_req_u.cu_populate.cpop_vp = dvp;
			rp->cfs_cr = cr;

			crhold(cr);
			VN_HOLD(dvp);

			cachefs_addqueue(rp, &fscp->fs_workq);
		} else if (fscp->fs_info.fi_mntflags & CFS_NOACL) {
			error = cachefs_dir_fill(dcp, cr);
			if (error != 0) {
				mutex_exit(&dcp->c_statelock);
				goto out;
			}
		}
		/* no populate if too many asyncs and we have to cache ACLs */

		mutex_exit(&dcp->c_statelock);

		if (fscp->fs_cdconnected == CFS_CD_CONNECTED)
			error = cachefs_lookup_back(dvp, nm, vpp, cr);
		else
			error = ETIMEDOUT;
		goto out;
	}

	/* by now we have a valid cached front file that we can search */

	ASSERT((dcp->c_flags & CN_ASYNC_POPULATE) == 0);
	error = cachefs_dir_look(dcp, nm, &cookie, &flag,
	    &d_offset, &cid);
	mutex_exit(&dcp->c_statelock);

	if (error) {
		/* if the entry does not have the fid, go get it */
		if (error == EINVAL) {
			if (fscp->fs_cdconnected == CFS_CD_CONNECTED)
				error = cachefs_lookup_back(dvp, nm, vpp, cr);
			else
				error = ETIMEDOUT;
		}

		/* errors other than does not exist */
		else if (error != ENOENT) {
			if (fscp->fs_cdconnected == CFS_CD_CONNECTED)
				error = cachefs_lookup_back(dvp, nm, vpp, cr);
			else
				error = ETIMEDOUT;
		}
		goto out;
	}

	/*
	 * Else we found the entry in the cached directory.
	 * Make a cnode for it.
	 */
	error = cachefs_cnode_make(&cid, fscp, &cookie, NULL, NULL,
	    cr, 0, &cp);
	if (error == ESTALE) {
		ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
		mutex_enter(&dcp->c_statelock);
		cachefs_nocache(dcp);
		mutex_exit(&dcp->c_statelock);
		if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
			error = cachefs_lookup_back(dvp, nm, vpp, cr);
			uncached = 1;
		} else
			error = ETIMEDOUT;
	} else if (error == 0) {
		*vpp = CTOV(cp);
	}

out:
	if (error == 0) {
		/* put the entry in the dnlc */
		if (cachefs_dnlc)
			dnlc_enter(dvp, nm, *vpp);

		/* save the cid of the parent so can find the name */
		cp = VTOC(*vpp);
		if (bcmp(&cp->c_metadata.md_parent, &dircid,
		    sizeof (cfs_cid_t)) != 0) {
			mutex_enter(&cp->c_statelock);
			cp->c_metadata.md_parent = dircid;
			cp->c_flags |= CN_UPDATED;
			mutex_exit(&cp->c_statelock);
		}
	}

	rw_exit(&dcp->c_rwlock);
	if (uncached && dcp->c_metadata.md_flags & MD_PACKED)
		(void) cachefs_pack_common(dvp, cr);
	return (error);
}

/*
 * Called from cachefs_lookup_common when the back file system needs to be
 * examined to perform the lookup.
 */
static int
cachefs_lookup_back(vnode_t *dvp, char *nm, vnode_t **vpp,
    cred_t *cr)
{
	int error = 0;
	cnode_t *cp, *dcp = VTOC(dvp);
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	vnode_t *backvp = NULL;
	struct vattr va;
	struct fid cookie;
	cfs_cid_t cid;
	uint32_t valid_fid;

	mutex_enter(&dcp->c_statelock);

	/* do a lookup on the back FS to get the back vnode */
	if (dcp->c_backvp == NULL) {
		error = cachefs_getbackvp(fscp, dcp);
		if (error)
			goto out;
	}

	CFS_DPRINT_BACKFS_NFSV4(fscp,
	    ("cachefs_lookup (nfsv4): dcp %p, dbackvp %p, name %s\n",
	    dcp, dcp->c_backvp, nm));
	error = VOP_LOOKUP(dcp->c_backvp, nm, &backvp, (struct pathname *)NULL,
	    0, (vnode_t *)NULL, cr, NULL, NULL, NULL);
	if (error)
		goto out;
	if (IS_DEVVP(backvp)) {
		struct vnode *devvp = backvp;

		if (VOP_REALVP(devvp, &backvp, NULL) == 0) {
			VN_HOLD(backvp);
			VN_RELE(devvp);
		}
	}

	/* get the fid and attrs from the back fs */
	valid_fid = (CFS_ISFS_BACKFS_NFSV4(fscp) ? FALSE : TRUE);
	error = cachefs_getcookie(backvp, &cookie, &va, cr, valid_fid);
	if (error)
		goto out;

	cid.cid_fileno = va.va_nodeid;
	cid.cid_flags = 0;

#if 0
	/* XXX bob: this is probably no longer necessary */
	/* if the directory entry was incomplete, we can complete it now */
	if ((dcp->c_metadata.md_flags & MD_POPULATED) &&
	    ((dcp->c_flags & CN_ASYNC_POPULATE) == 0) &&
	    (dcp->c_filegrp->fg_flags & CFS_FG_WRITE)) {
		cachefs_dir_modentry(dcp, d_offset, &cookie, &cid);
	}
#endif

out:
	mutex_exit(&dcp->c_statelock);

	/* create the cnode */
	if (error == 0) {
		error = cachefs_cnode_make(&cid, fscp,
		    (valid_fid ? &cookie : NULL),
		    &va, backvp, cr, 0, &cp);
		if (error == 0) {
			*vpp = CTOV(cp);
		}
	}

	if (backvp)
		VN_RELE(backvp);

	return (error);
}

/*ARGSUSED7*/
static int
cachefs_create(vnode_t *dvp, char *nm, vattr_t *vap,
    vcexcl_t exclusive, int mode, vnode_t **vpp, cred_t *cr, int flag,
    caller_context_t *ct, vsecattr_t *vsecp)

{
	cnode_t *dcp = VTOC(dvp);
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	cachefscache_t *cachep = fscp->fs_cache;
	int error;
	int connected = 0;
	int held = 0;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_create: ENTER dvp %p, nm %s\n",
		    (void *)dvp, nm);
#endif
	if (getzoneid() != GLOBAL_ZONEID) {
		error = EPERM;
		goto out;
	}

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the create operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(dcp);

	for (;;) {
		/* get (or renew) access to the file system */
		if (held) {
			/* Won't loop with NFSv4 connected behavior */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 1);
		if (error)
			break;
		held = 1;

		/*
		 * if we are connected, perform the remote portion of the
		 * create.
		 */
		if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
			error = cachefs_create_connected(dvp, nm, vap,
			    exclusive, mode, vpp, cr);
			if (CFS_TIMEOUT(fscp, error)) {
				cachefs_cd_release(fscp);
				held = 0;
				cachefs_cd_timedout(fscp);
				connected = 0;
				continue;
			} else if (error) {
				break;
			}
		}

		/* else we must be disconnected */
		else {
			error = cachefs_create_disconnected(dvp, nm, vap,
			    exclusive, mode, vpp, cr);
			if (CFS_TIMEOUT(fscp, error)) {
				connected = 1;
				continue;
			} else if (error) {
				break;
			}
		}
		break;
	}

	if (CACHEFS_LOG_LOGGING(cachep, CACHEFS_LOG_CREATE)) {
		fid_t *fidp = NULL;
		ino64_t fileno = 0;
		cnode_t *cp = NULL;
		if (error == 0)
			cp = VTOC(*vpp);

		if (cp != NULL) {
			fidp = &cp->c_metadata.md_cookie;
			fileno = cp->c_id.cid_fileno;
		}
		cachefs_log_create(cachep, error, fscp->fs_cfsvfsp,
		    fidp, fileno, crgetuid(cr));
	}

	if (held)
		cachefs_cd_release(fscp);

	if (error == 0 && CFS_ISFS_NONSHARED(fscp))
		(void) cachefs_pack(dvp, nm, cr);
	if (error == 0 && IS_DEVVP(*vpp)) {
		struct vnode *spcvp;

		spcvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		VN_RELE(*vpp);
		if (spcvp == NULL) {
			error = ENOSYS;
		} else {
			*vpp = spcvp;
		}
	}

#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
out:
#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_create: EXIT error %d\n", error);
#endif
	return (error);
}


static int
cachefs_create_connected(vnode_t *dvp, char *nm, vattr_t *vap,
    enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr)
{
	cnode_t *dcp = VTOC(dvp);
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	int error;
	vnode_t *tvp = NULL;
	vnode_t *devvp;
	fid_t cookie;
	vattr_t va;
	cnode_t *ncp;
	cfs_cid_t cid;
	vnode_t *vp;
	uint32_t valid_fid;

	/* special case if file already exists */
	error = cachefs_lookup_common(dvp, nm, &vp, NULL, 0, NULL, cr);
	if (CFS_TIMEOUT(fscp, error))
		return (error);
	if (error == 0) {
		if (exclusive == EXCL)
			error = EEXIST;
		else if (vp->v_type == VDIR && (mode & VWRITE))
			error = EISDIR;
		else if ((error =
		    cachefs_access_connected(vp, mode, 0, cr)) == 0) {
			if ((vap->va_mask & AT_SIZE) && (vp->v_type == VREG)) {
				vap->va_mask = AT_SIZE;
				error = cachefs_setattr_common(vp, vap, 0,
				    cr, NULL);
			}
		}
		if (error) {
			VN_RELE(vp);
		} else
			*vpp = vp;
		return (error);
	}

	rw_enter(&dcp->c_rwlock, RW_WRITER);
	mutex_enter(&dcp->c_statelock);

	/* consistency check the directory */
	error = CFSOP_CHECK_COBJECT(fscp, dcp, 0, cr);
	if (error) {
		mutex_exit(&dcp->c_statelock);
		goto out;
	}

	/* get the backvp if necessary */
	if (dcp->c_backvp == NULL) {
		error = cachefs_getbackvp(fscp, dcp);
		if (error) {
			mutex_exit(&dcp->c_statelock);
			goto out;
		}
	}

	/* create the file on the back fs */
	CFS_DPRINT_BACKFS_NFSV4(fscp,
	    ("cachefs_create (nfsv4): dcp %p, dbackvp %p,"
	    "name %s\n", dcp, dcp->c_backvp, nm));
	error = VOP_CREATE(dcp->c_backvp, nm, vap, exclusive, mode,
	    &devvp, cr, 0, NULL, NULL);
	mutex_exit(&dcp->c_statelock);
	if (error)
		goto out;
	if (VOP_REALVP(devvp, &tvp, NULL) == 0) {
		VN_HOLD(tvp);
		VN_RELE(devvp);
	} else {
		tvp = devvp;
	}

	/* get the fid and attrs from the back fs */
	valid_fid = (CFS_ISFS_BACKFS_NFSV4(fscp) ? FALSE : TRUE);
	error = cachefs_getcookie(tvp, &cookie, &va, cr, valid_fid);
	if (error)
		goto out;

	/* make the cnode */
	cid.cid_fileno = va.va_nodeid;
	cid.cid_flags = 0;
	error = cachefs_cnode_make(&cid, fscp, (valid_fid ? &cookie : NULL),
	    &va, tvp, cr, 0, &ncp);
	if (error)
		goto out;

	*vpp = CTOV(ncp);

	/* enter it in the parent directory */
	mutex_enter(&dcp->c_statelock);
	if (CFS_ISFS_NONSHARED(fscp) &&
	    (dcp->c_metadata.md_flags & MD_POPULATED)) {
		/* see if entry already exists */
		ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
		error = cachefs_dir_look(dcp, nm, NULL, NULL, NULL, NULL);
		if (error == ENOENT) {
			/* entry, does not exist, add the new file */
			error = cachefs_dir_enter(dcp, nm, &ncp->c_cookie,
			    &ncp->c_id, SM_ASYNC);
			if (error) {
				cachefs_nocache(dcp);
				error = 0;
			}
			/* XXX should this be done elsewhere, too? */
			dnlc_enter(dvp, nm, *vpp);
		} else {
			/* entry exists or some other problem */
			cachefs_nocache(dcp);
			error = 0;
		}
	}
	CFSOP_MODIFY_COBJECT(fscp, dcp, cr);
	mutex_exit(&dcp->c_statelock);

out:
	rw_exit(&dcp->c_rwlock);
	if (tvp)
		VN_RELE(tvp);

	return (error);
}

static int
cachefs_create_disconnected(vnode_t *dvp, char *nm, vattr_t *vap,
	enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr)
{
	cnode_t *dcp = VTOC(dvp);
	cnode_t *cp;
	cnode_t *ncp = NULL;
	vnode_t *vp;
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	int error = 0;
	struct vattr va;
	timestruc_t current_time;
	off_t commit = 0;
	fid_t cookie;
	cfs_cid_t cid;

	rw_enter(&dcp->c_rwlock, RW_WRITER);
	mutex_enter(&dcp->c_statelock);

	/* give up if the directory is not populated */
	if ((dcp->c_metadata.md_flags & MD_POPULATED) == 0) {
		mutex_exit(&dcp->c_statelock);
		rw_exit(&dcp->c_rwlock);
		return (ETIMEDOUT);
	}

	/* special case if file already exists */
	error = cachefs_dir_look(dcp, nm, &cookie, NULL, NULL, &cid);
	if (error == EINVAL) {
		mutex_exit(&dcp->c_statelock);
		rw_exit(&dcp->c_rwlock);
		return (ETIMEDOUT);
	}
	if (error == 0) {
		mutex_exit(&dcp->c_statelock);
		rw_exit(&dcp->c_rwlock);
		error = cachefs_cnode_make(&cid, fscp, &cookie, NULL, NULL,
		    cr, 0, &cp);
		if (error) {
			return (error);
		}
		vp = CTOV(cp);

		if (cp->c_metadata.md_flags & MD_NEEDATTRS)
			error = ETIMEDOUT;
		else if (exclusive == EXCL)
			error = EEXIST;
		else if (vp->v_type == VDIR && (mode & VWRITE))
			error = EISDIR;
		else {
			mutex_enter(&cp->c_statelock);
			error = cachefs_access_local(cp, mode, cr);
			mutex_exit(&cp->c_statelock);
			if (!error) {
				if ((vap->va_mask & AT_SIZE) &&
				    (vp->v_type == VREG)) {
					vap->va_mask = AT_SIZE;
					error = cachefs_setattr_common(vp,
					    vap, 0, cr, NULL);
				}
			}
		}
		if (error) {
			VN_RELE(vp);
		} else
			*vpp = vp;
		return (error);
	}

	/* give up if cannot modify the cache */
	if (CFS_ISFS_WRITE_AROUND(fscp)) {
		mutex_exit(&dcp->c_statelock);
		error = ETIMEDOUT;
		goto out;
	}

	/* check access */
	if (error = cachefs_access_local(dcp, VWRITE, cr)) {
		mutex_exit(&dcp->c_statelock);
		goto out;
	}

	/* mark dir as modified */
	cachefs_modified(dcp);
	mutex_exit(&dcp->c_statelock);

	/* must be privileged to set sticky bit */
	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
		vap->va_mode &= ~VSVTX;

	/* make up a reasonable set of attributes */
	cachefs_attr_setup(vap, &va, dcp, cr);

	/* create the cnode */
	error = cachefs_cnode_create(fscp, &va, 0, &ncp);
	if (error)
		goto out;

	mutex_enter(&ncp->c_statelock);

	/* get the front file now instead of later */
	if (vap->va_type == VREG) {
		error = cachefs_getfrontfile(ncp);
		if (error) {
			mutex_exit(&ncp->c_statelock);
			goto out;
		}
		ASSERT(ncp->c_frontvp != NULL);
		ASSERT((ncp->c_flags & CN_ALLOC_PENDING) == 0);
		ncp->c_metadata.md_flags |= MD_POPULATED;
	} else {
		ASSERT(ncp->c_flags & CN_ALLOC_PENDING);
		if (ncp->c_filegrp->fg_flags & CFS_FG_ALLOC_ATTR) {
			(void) filegrp_allocattr(ncp->c_filegrp);
		}
		error = filegrp_create_metadata(ncp->c_filegrp,
		    &ncp->c_metadata, &ncp->c_id);
		if (error) {
			mutex_exit(&ncp->c_statelock);
			goto out;
		}
		ncp->c_flags &= ~CN_ALLOC_PENDING;
	}
	mutex_enter(&dcp->c_statelock);
	cachefs_creategid(dcp, ncp, vap, cr);
	cachefs_createacl(dcp, ncp);
	mutex_exit(&dcp->c_statelock);

	/* set times on the file */
	gethrestime(&current_time);
	ncp->c_metadata.md_vattr.va_atime = current_time;
	ncp->c_metadata.md_localctime = current_time;
	ncp->c_metadata.md_localmtime = current_time;
	ncp->c_metadata.md_flags |= MD_LOCALMTIME | MD_LOCALCTIME;

	/* reserve space for the daemon cid mapping */
	error = cachefs_dlog_cidmap(fscp);
	if (error) {
		mutex_exit(&ncp->c_statelock);
		goto out;
	}
	ncp->c_metadata.md_flags |= MD_MAPPING;

	/* mark the new file as modified */
	if (cachefs_modified_alloc(ncp)) {
		mutex_exit(&ncp->c_statelock);
		error = ENOSPC;
		goto out;
	}
	ncp->c_flags |= CN_UPDATED;

	/*
	 * write the metadata now rather than waiting until
	 * inactive so that if there's no space we can let
	 * the caller know.
	 */
	ASSERT((ncp->c_flags & CN_ALLOC_PENDING) == 0);
	ASSERT((ncp->c_filegrp->fg_flags & CFS_FG_ALLOC_ATTR) == 0);
	error = filegrp_write_metadata(ncp->c_filegrp,
	    &ncp->c_id, &ncp->c_metadata);
	if (error) {
		mutex_exit(&ncp->c_statelock);
		goto out;
	}

	/* log the operation */
	commit = cachefs_dlog_create(fscp, dcp, nm, vap, exclusive,
	    mode, ncp, 0, cr);
	if (commit == 0) {
		mutex_exit(&ncp->c_statelock);
		error = ENOSPC;
		goto out;
	}

	mutex_exit(&ncp->c_statelock);

	mutex_enter(&dcp->c_statelock);

	/* update parent dir times */
	dcp->c_metadata.md_localmtime = current_time;
	dcp->c_metadata.md_flags |= MD_LOCALMTIME;
	dcp->c_flags |= CN_UPDATED;

	/* enter new file name in the parent directory */
	if (dcp->c_metadata.md_flags & MD_POPULATED) {
		error = cachefs_dir_enter(dcp, nm, &ncp->c_cookie,
		    &ncp->c_id, 0);
		if (error) {
			cachefs_nocache(dcp);
			mutex_exit(&dcp->c_statelock);
			error = ETIMEDOUT;
			goto out;
		}
		dnlc_enter(dvp, nm, CTOV(ncp));
	} else {
		mutex_exit(&dcp->c_statelock);
		error = ETIMEDOUT;
		goto out;
	}
	mutex_exit(&dcp->c_statelock);

out:
	rw_exit(&dcp->c_rwlock);

	if (commit) {
		if (cachefs_dlog_commit(fscp, commit, error)) {
			/*EMPTY*/
			/* XXX bob: fix on panic */
		}
	}
	if (error) {
		/* destroy the cnode we created */
		if (ncp) {
			mutex_enter(&ncp->c_statelock);
			ncp->c_flags |= CN_DESTROY;
			mutex_exit(&ncp->c_statelock);
			VN_RELE(CTOV(ncp));
		}
	} else {
		*vpp = CTOV(ncp);
	}
	return (error);
}

/*ARGSUSED*/
static int
cachefs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
    int flags)
{
	cnode_t *dcp = VTOC(dvp);
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	cachefscache_t *cachep = fscp->fs_cache;
	int error = 0;
	int held = 0;
	int connected = 0;
	size_t namlen;
	vnode_t *vp = NULL;
	int vfslock = 0;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_remove: ENTER dvp %p name %s\n",
		    (void *)dvp, nm);
#endif
	if (getzoneid() != GLOBAL_ZONEID) {
		error = EPERM;
		goto out;
	}

	if (fscp->fs_cache->c_flags & (CACHE_NOFILL | CACHE_NOCACHE))
		ASSERT(dcp->c_flags & CN_NOCACHE);

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the remove operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(dcp);

	for (;;) {
		if (vfslock) {
			vn_vfsunlock(vp);
			vfslock = 0;
		}
		if (vp) {
			VN_RELE(vp);
			vp = NULL;
		}

		/* get (or renew) access to the file system */
		if (held) {
			/* Won't loop with NFSv4 connected behavior */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 1);
		if (error)
			break;
		held = 1;

		/* if disconnected, do some extra error checking */
		if (fscp->fs_cdconnected != CFS_CD_CONNECTED) {
			/* check permissions */
			mutex_enter(&dcp->c_statelock);
			error = cachefs_access_local(dcp, (VEXEC|VWRITE), cr);
			mutex_exit(&dcp->c_statelock);
			if (CFS_TIMEOUT(fscp, error)) {
				connected = 1;
				continue;
			}
			if (error)
				break;

			namlen = strlen(nm);
			if (namlen == 0) {
				error = EINVAL;
				break;
			}

			/* cannot remove . and .. */
			if (nm[0] == '.') {
				if (namlen == 1) {
					error = EINVAL;
					break;
				} else if (namlen == 2 && nm[1] == '.') {
					error = EEXIST;
					break;
				}
			}

		}

		/* get the cnode of the file to delete */
		error = cachefs_lookup_common(dvp, nm, &vp, NULL, 0, NULL, cr);
		if (error) {
			if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
				if (CFS_TIMEOUT(fscp, error)) {
					cachefs_cd_release(fscp);
					held = 0;
					cachefs_cd_timedout(fscp);
					connected = 0;
					continue;
				}
			} else {
				if (CFS_TIMEOUT(fscp, error)) {
					connected = 1;
					continue;
				}
			}
			if (CACHEFS_LOG_LOGGING(cachep, CACHEFS_LOG_REMOVE)) {
				struct fid foo;

				bzero(&foo, sizeof (foo));
				cachefs_log_remove(cachep, error,
				    fscp->fs_cfsvfsp, &foo, 0, crgetuid(cr));
			}
			break;
		}

		if (vp->v_type == VDIR) {
			/* must be privileged to remove dirs with unlink() */
			if ((error = secpolicy_fs_linkdir(cr, vp->v_vfsp)) != 0)
				break;

			/* see ufs_dirremove for why this is done, mount race */
			if (vn_vfswlock(vp)) {
				error = EBUSY;
				break;
			}
			vfslock = 1;
			if (vn_mountedvfs(vp) != NULL) {
				error = EBUSY;
				break;
			}
		}

		if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
			error = cachefs_remove_connected(dvp, nm, cr, vp);
			if (CFS_TIMEOUT(fscp, error)) {
				cachefs_cd_release(fscp);
				held = 0;
				cachefs_cd_timedout(fscp);
				connected = 0;
				continue;
			}
		} else {
			error = cachefs_remove_disconnected(dvp, nm, cr,
			    vp);
			if (CFS_TIMEOUT(fscp, error)) {
				connected = 1;
				continue;
			}
		}
		break;
	}

#if 0
	if (CACHEFS_LOG_LOGGING(cachep, CACHEFS_LOG_REMOVE))
		cachefs_log_remove(cachep, error, fscp->fs_cfsvfsp,
		    &cp->c_metadata.md_cookie, cp->c_id.cid_fileno,
		    crgetuid(cr));
#endif

	if (held)
		cachefs_cd_release(fscp);

	if (vfslock)
		vn_vfsunlock(vp);

	if (vp)
		VN_RELE(vp);

#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
out:
#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_remove: EXIT dvp %p\n", (void *)dvp);
#endif

	return (error);
}

int
cachefs_remove_connected(vnode_t *dvp, char *nm, cred_t *cr, vnode_t *vp)
{
	cnode_t *dcp = VTOC(dvp);
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	int error = 0;

	/*
	 * Acquire the rwlock (WRITER) on the directory to prevent other
	 * activity on the directory.
	 */
	rw_enter(&dcp->c_rwlock, RW_WRITER);

	/* purge dnlc of this entry so can get accurate vnode count */
	dnlc_purge_vp(vp);

	/*
	 * If the cnode is active, make a link to the file
	 * so operations on the file will continue.
	 */
	if ((vp->v_type != VDIR) &&
	    !((vp->v_count == 1) || ((vp->v_count == 2) && cp->c_ipending))) {
		error = cachefs_remove_dolink(dvp, vp, nm, cr);
		if (error)
			goto out;
	}

	/* else call backfs NFSv4 handler if NFSv4 */
	else if (CFS_ISFS_BACKFS_NFSV4(fscp)) {
		error = cachefs_remove_backfs_nfsv4(dvp, nm, cr, vp);
		goto out;
	}

	/* else drop the backvp so nfs does not do rename */
	else if (cp->c_backvp) {
		mutex_enter(&cp->c_statelock);
		if (cp->c_backvp) {
			VN_RELE(cp->c_backvp);
			cp->c_backvp = NULL;
		}
		mutex_exit(&cp->c_statelock);
	}

	mutex_enter(&dcp->c_statelock);

	/* get the backvp */
	if (dcp->c_backvp == NULL) {
		error = cachefs_getbackvp(fscp, dcp);
		if (error) {
			mutex_exit(&dcp->c_statelock);
			goto out;
		}
	}

	/* check directory consistency */
	error = CFSOP_CHECK_COBJECT(fscp, dcp, 0, cr);
	if (error) {
		mutex_exit(&dcp->c_statelock);
		goto out;
	}

	/* perform the remove on the back fs */
	error = VOP_REMOVE(dcp->c_backvp, nm, cr, NULL, 0);
	if (error) {
		mutex_exit(&dcp->c_statelock);
		goto out;
	}

	/* the dir has been modified */
	CFSOP_MODIFY_COBJECT(fscp, dcp, cr);

	/* remove the entry from the populated directory */
	if (CFS_ISFS_NONSHARED(fscp) &&
	    (dcp->c_metadata.md_flags & MD_POPULATED)) {
		error = cachefs_dir_rmentry(dcp, nm);
		if (error) {
			cachefs_nocache(dcp);
			error = 0;
		}
	}
	mutex_exit(&dcp->c_statelock);

	/* fix up the file we deleted */
	mutex_enter(&cp->c_statelock);
	if (cp->c_attr.va_nlink == 1)
		cp->c_flags |= CN_DESTROY;
	else
		cp->c_flags |= CN_UPDATED;

	cp->c_attr.va_nlink--;
	CFSOP_MODIFY_COBJECT(fscp, cp, cr);
	mutex_exit(&cp->c_statelock);

out:
	rw_exit(&dcp->c_rwlock);
	return (error);
}

/*
 * cachefs_remove_backfs_nfsv4
 *
 * Call NFSv4 back filesystem to handle the remove (cachefs
 * pass-through support for NFSv4).
 */
int
cachefs_remove_backfs_nfsv4(vnode_t *dvp, char *nm, cred_t *cr, vnode_t *vp)
{
	cnode_t *dcp = VTOC(dvp);
	cnode_t *cp = VTOC(vp);
	vnode_t *dbackvp;
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	int error = 0;

	/*
	 * For NFSv4 pass-through to work, only connected operation
	 * is supported, the cnode backvp must exist, and cachefs
	 * optional (eg., disconnectable) flags are turned off. Assert
	 * these conditions for the getattr operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	/* Should hold the directory readwrite lock to update directory */
	ASSERT(RW_WRITE_HELD(&dcp->c_rwlock));

	/*
	 * Update attributes for directory. Note that
	 * CFSOP_CHECK_COBJECT asserts for c_statelock being
	 * held, so grab it before calling the routine.
	 */
	mutex_enter(&dcp->c_statelock);
	error = CFSOP_CHECK_COBJECT(fscp, dcp, 0, cr);
	mutex_exit(&dcp->c_statelock);
	if (error)
		goto out;

	/*
	 * Update attributes for cp. Note that CFSOP_CHECK_COBJECT
	 * asserts for c_statelock being held, so grab it before
	 * calling the routine.
	 */
	mutex_enter(&cp->c_statelock);
	error = CFSOP_CHECK_COBJECT(fscp, cp, 0, cr);
	if (error) {
		mutex_exit(&cp->c_statelock);
		goto out;
	}

	/*
	 * Drop the backvp so nfs if the link count is 1 so that
	 * nfs does not do rename. Ensure that we will destroy the cnode
	 * since this cnode no longer contains the backvp. Note that we
	 * maintain lock on this cnode to prevent change till the remove
	 * completes, otherwise other operations will encounter an ESTALE
	 * if they try to use the cnode with CN_DESTROY set (see
	 * cachefs_get_backvp()), or change the state of the cnode
	 * while we're removing it.
	 */
	if (cp->c_attr.va_nlink == 1) {
		/*
		 * The unldvp information is created for the case
		 * when there is more than one reference on the
		 * vnode when a remove operation is called. If the
		 * remove itself was holding a reference to the
		 * vnode, then a subsequent remove will remove the
		 * backvp, so we need to get rid of the unldvp
		 * before removing the backvp. An alternate would
		 * be to simply ignore the remove and let the
		 * inactivation routine do the deletion of the
		 * unldvp.
		 */
		if (cp->c_unldvp) {
			VN_RELE(cp->c_unldvp);
			cachefs_kmem_free(cp->c_unlname, MAXNAMELEN);
			crfree(cp->c_unlcred);
			cp->c_unldvp = NULL;
			cp->c_unlcred = NULL;
		}
		cp->c_flags |= CN_DESTROY;
		cp->c_attr.va_nlink = 0;
		VN_RELE(cp->c_backvp);
		cp->c_backvp = NULL;
	}

	/* perform the remove on back fs after extracting directory backvp */
	mutex_enter(&dcp->c_statelock);
	dbackvp = dcp->c_backvp;
	mutex_exit(&dcp->c_statelock);

	CFS_DPRINT_BACKFS_NFSV4(fscp,
	    ("cachefs_remove (nfsv4): dcp %p, dbackvp %p, name %s\n",
	    dcp, dbackvp, nm));
	error = VOP_REMOVE(dbackvp, nm, cr, NULL, 0);
	if (error) {
		mutex_exit(&cp->c_statelock);
		goto out;
	}

	/* fix up the file we deleted, if not destroying the cnode */
	if ((cp->c_flags & CN_DESTROY) == 0) {
		cp->c_attr.va_nlink--;
		cp->c_flags |= CN_UPDATED;
	}

	mutex_exit(&cp->c_statelock);

out:
	return (error);
}

int
cachefs_remove_disconnected(vnode_t *dvp, char *nm, cred_t *cr,
    vnode_t *vp)
{
	cnode_t *dcp = VTOC(dvp);
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	int error = 0;
	off_t commit = 0;
	timestruc_t current_time;

	if (CFS_ISFS_WRITE_AROUND(fscp))
		return (ETIMEDOUT);

	if (cp->c_metadata.md_flags & MD_NEEDATTRS)
		return (ETIMEDOUT);

	/*
	 * Acquire the rwlock (WRITER) on the directory to prevent other
	 * activity on the directory.
	 */
	rw_enter(&dcp->c_rwlock, RW_WRITER);

	/* dir must be populated */
	if ((dcp->c_metadata.md_flags & MD_POPULATED) == 0) {
		error = ETIMEDOUT;
		goto out;
	}

	mutex_enter(&dcp->c_statelock);
	mutex_enter(&cp->c_statelock);

	error = cachefs_stickyrmchk(dcp, cp, cr);

	mutex_exit(&cp->c_statelock);
	mutex_exit(&dcp->c_statelock);
	if (error)
		goto out;

	/* purge dnlc of this entry so can get accurate vnode count */
	dnlc_purge_vp(vp);

	/*
	 * If the cnode is active, make a link to the file
	 * so operations on the file will continue.
	 */
	if ((vp->v_type != VDIR) &&
	    !((vp->v_count == 1) || ((vp->v_count == 2) && cp->c_ipending))) {
		error = cachefs_remove_dolink(dvp, vp, nm, cr);
		if (error)
			goto out;
	}

	if (cp->c_attr.va_nlink > 1) {
		mutex_enter(&cp->c_statelock);
		if (cachefs_modified_alloc(cp)) {
			mutex_exit(&cp->c_statelock);
			error = ENOSPC;
			goto out;
		}
		if ((cp->c_metadata.md_flags & MD_MAPPING) == 0) {
			error = cachefs_dlog_cidmap(fscp);
			if (error) {
				mutex_exit(&cp->c_statelock);
				error = ENOSPC;
				goto out;
			}
			cp->c_metadata.md_flags |= MD_MAPPING;
			cp->c_flags |= CN_UPDATED;
		}
		mutex_exit(&cp->c_statelock);
	}

	/* log the remove */
	commit = cachefs_dlog_remove(fscp, dcp, nm, cp, cr);
	if (commit == 0) {
		error = ENOSPC;
		goto out;
	}

	/* remove the file from the dir */
	mutex_enter(&dcp->c_statelock);
	if ((dcp->c_metadata.md_flags & MD_POPULATED) == 0) {
		mutex_exit(&dcp->c_statelock);
		error = ETIMEDOUT;
		goto out;

	}
	cachefs_modified(dcp);
	error = cachefs_dir_rmentry(dcp, nm);
	if (error) {
		mutex_exit(&dcp->c_statelock);
		if (error == ENOTDIR)
			error = ETIMEDOUT;
		goto out;
	}

	/* update parent dir times */
	gethrestime(&current_time);
	dcp->c_metadata.md_localctime = current_time;
	dcp->c_metadata.md_localmtime = current_time;
	dcp->c_metadata.md_flags |= MD_LOCALCTIME | MD_LOCALMTIME;
	dcp->c_flags |= CN_UPDATED;
	mutex_exit(&dcp->c_statelock);

	/* adjust file we are deleting */
	mutex_enter(&cp->c_statelock);
	cp->c_attr.va_nlink--;
	cp->c_metadata.md_localctime = current_time;
	cp->c_metadata.md_flags |= MD_LOCALCTIME;
	if (cp->c_attr.va_nlink == 0) {
		cp->c_flags |= CN_DESTROY;
	} else {
		cp->c_flags |= CN_UPDATED;
	}
	mutex_exit(&cp->c_statelock);

out:
	if (commit) {
		/* commit the log entry */
		if (cachefs_dlog_commit(fscp, commit, error)) {
			/*EMPTY*/
			/* XXX bob: fix on panic */
		}
	}

	rw_exit(&dcp->c_rwlock);
	return (error);
}

/*ARGSUSED*/
static int
cachefs_link(vnode_t *tdvp, vnode_t *fvp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	fscache_t *fscp = VFS_TO_FSCACHE(tdvp->v_vfsp);
	cnode_t *tdcp = VTOC(tdvp);
	struct vnode *realvp;
	int error = 0;
	int held = 0;
	int connected = 0;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_link: ENTER fvp %p tdvp %p tnm %s\n",
		    (void *)fvp, (void *)tdvp, tnm);
#endif

	if (getzoneid() != GLOBAL_ZONEID) {
		error = EPERM;
		goto out;
	}

	if (fscp->fs_cache->c_flags & (CACHE_NOFILL | CACHE_NOCACHE))
		ASSERT(tdcp->c_flags & CN_NOCACHE);

	if (VOP_REALVP(fvp, &realvp, ct) == 0) {
		fvp = realvp;
	}

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the link operation.
	 */

	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(tdcp);

	for (;;) {
		/* get (or renew) access to the file system */
		if (held) {
			/* Won't loop with NFSv4 connected behavior */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			rw_exit(&tdcp->c_rwlock);
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 1);
		if (error)
			break;
		rw_enter(&tdcp->c_rwlock, RW_WRITER);
		held = 1;

		if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
			error = cachefs_link_connected(tdvp, fvp, tnm, cr);
			if (CFS_TIMEOUT(fscp, error)) {
				rw_exit(&tdcp->c_rwlock);
				cachefs_cd_release(fscp);
				held = 0;
				cachefs_cd_timedout(fscp);
				connected = 0;
				continue;
			}
		} else {
			error = cachefs_link_disconnected(tdvp, fvp, tnm,
			    cr);
			if (CFS_TIMEOUT(fscp, error)) {
				connected = 1;
				continue;
			}
		}
		break;
	}

	if (held) {
		rw_exit(&tdcp->c_rwlock);
		cachefs_cd_release(fscp);
	}

#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
out:
#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_link: EXIT fvp %p tdvp %p tnm %s\n",
		    (void *)fvp, (void *)tdvp, tnm);
#endif
	return (error);
}

static int
cachefs_link_connected(vnode_t *tdvp, vnode_t *fvp, char *tnm, cred_t *cr)
{
	cnode_t *tdcp = VTOC(tdvp);
	cnode_t *fcp = VTOC(fvp);
	fscache_t *fscp = VFS_TO_FSCACHE(tdvp->v_vfsp);
	int error = 0;
	vnode_t *backvp = NULL;

	if (tdcp != fcp) {
		mutex_enter(&fcp->c_statelock);

		if (fcp->c_backvp == NULL) {
			error = cachefs_getbackvp(fscp, fcp);
			if (error) {
				mutex_exit(&fcp->c_statelock);
				goto out;
			}
		}

		error = CFSOP_CHECK_COBJECT(fscp, fcp, 0, cr);
		if (error) {
			mutex_exit(&fcp->c_statelock);
			goto out;
		}
		backvp = fcp->c_backvp;
		VN_HOLD(backvp);
		mutex_exit(&fcp->c_statelock);
	}

	mutex_enter(&tdcp->c_statelock);

	/* get backvp of target directory */
	if (tdcp->c_backvp == NULL) {
		error = cachefs_getbackvp(fscp, tdcp);
		if (error) {
			mutex_exit(&tdcp->c_statelock);
			goto out;
		}
	}

	/* consistency check target directory */
	error = CFSOP_CHECK_COBJECT(fscp, tdcp, 0, cr);
	if (error) {
		mutex_exit(&tdcp->c_statelock);
		goto out;
	}
	if (backvp == NULL) {
		backvp = tdcp->c_backvp;
		VN_HOLD(backvp);
	}

	/* perform the link on the back fs */
	CFS_DPRINT_BACKFS_NFSV4(fscp,
	    ("cachefs_link (nfsv4): tdcp %p, tdbackvp %p, "
	    "name %s\n", tdcp, tdcp->c_backvp, tnm));
	error = VOP_LINK(tdcp->c_backvp, backvp, tnm, cr, NULL, 0);
	if (error) {
		mutex_exit(&tdcp->c_statelock);
		goto out;
	}

	CFSOP_MODIFY_COBJECT(fscp, tdcp, cr);

	/* if the dir is populated, add the new link */
	if (CFS_ISFS_NONSHARED(fscp) &&
	    (tdcp->c_metadata.md_flags & MD_POPULATED)) {
		error = cachefs_dir_enter(tdcp, tnm, &fcp->c_cookie,
		    &fcp->c_id, SM_ASYNC);
		if (error) {
			cachefs_nocache(tdcp);
			error = 0;
		}
	}
	mutex_exit(&tdcp->c_statelock);

	/* get the new link count on the file */
	mutex_enter(&fcp->c_statelock);
	fcp->c_flags |= CN_UPDATED;
	CFSOP_MODIFY_COBJECT(fscp, fcp, cr);
	if (fcp->c_backvp == NULL) {
		error = cachefs_getbackvp(fscp, fcp);
		if (error) {
			mutex_exit(&fcp->c_statelock);
			goto out;
		}
	}

	/* XXX bob: given what modify_cobject does this seems unnecessary */
	fcp->c_attr.va_mask = AT_ALL;
	error = VOP_GETATTR(fcp->c_backvp, &fcp->c_attr, 0, cr, NULL);
	mutex_exit(&fcp->c_statelock);
out:
	if (backvp)
		VN_RELE(backvp);

	return (error);
}

static int
cachefs_link_disconnected(vnode_t *tdvp, vnode_t *fvp, char *tnm,
    cred_t *cr)
{
	cnode_t *tdcp = VTOC(tdvp);
	cnode_t *fcp = VTOC(fvp);
	fscache_t *fscp = VFS_TO_FSCACHE(tdvp->v_vfsp);
	int error = 0;
	timestruc_t current_time;
	off_t commit = 0;

	if (fvp->v_type == VDIR && secpolicy_fs_linkdir(cr, fvp->v_vfsp) != 0 ||
	    fcp->c_attr.va_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)
		return (EPERM);

	if (CFS_ISFS_WRITE_AROUND(fscp))
		return (ETIMEDOUT);

	if (fcp->c_metadata.md_flags & MD_NEEDATTRS)
		return (ETIMEDOUT);

	mutex_enter(&tdcp->c_statelock);

	/* check permissions */
	if (error = cachefs_access_local(tdcp, (VEXEC|VWRITE), cr)) {
		mutex_exit(&tdcp->c_statelock);
		goto out;
	}

	/* the directory front file must be populated */
	if ((tdcp->c_metadata.md_flags & MD_POPULATED) == 0) {
		error = ETIMEDOUT;
		mutex_exit(&tdcp->c_statelock);
		goto out;
	}

	/* make sure tnm does not already exist in the directory */
	error = cachefs_dir_look(tdcp, tnm, NULL, NULL, NULL, NULL);
	if (error == ENOTDIR) {
		error = ETIMEDOUT;
		mutex_exit(&tdcp->c_statelock);
		goto out;
	}
	if (error != ENOENT) {
		error = EEXIST;
		mutex_exit(&tdcp->c_statelock);
		goto out;
	}

	mutex_enter(&fcp->c_statelock);

	/* create a mapping for the file if necessary */
	if ((fcp->c_metadata.md_flags & MD_MAPPING) == 0) {
		error = cachefs_dlog_cidmap(fscp);
		if (error) {
			mutex_exit(&fcp->c_statelock);
			mutex_exit(&tdcp->c_statelock);
			error = ENOSPC;
			goto out;
		}
		fcp->c_metadata.md_flags |= MD_MAPPING;
		fcp->c_flags |= CN_UPDATED;
	}

	/* mark file as modified */
	if (cachefs_modified_alloc(fcp)) {
		mutex_exit(&fcp->c_statelock);
		mutex_exit(&tdcp->c_statelock);
		error = ENOSPC;
		goto out;
	}
	mutex_exit(&fcp->c_statelock);

	/* log the operation */
	commit = cachefs_dlog_link(fscp, tdcp, tnm, fcp, cr);
	if (commit == 0) {
		mutex_exit(&tdcp->c_statelock);
		error = ENOSPC;
		goto out;
	}

	gethrestime(&current_time);

	/* make the new link */
	cachefs_modified(tdcp);
	error = cachefs_dir_enter(tdcp, tnm, &fcp->c_cookie,
	    &fcp->c_id, SM_ASYNC);
	if (error) {
		error = 0;
		mutex_exit(&tdcp->c_statelock);
		goto out;
	}

	/* Update mtime/ctime of parent dir */
	tdcp->c_metadata.md_localmtime = current_time;
	tdcp->c_metadata.md_localctime = current_time;
	tdcp->c_metadata.md_flags |= MD_LOCALCTIME | MD_LOCALMTIME;
	tdcp->c_flags |= CN_UPDATED;
	mutex_exit(&tdcp->c_statelock);

	/* update the file we linked to */
	mutex_enter(&fcp->c_statelock);
	fcp->c_attr.va_nlink++;
	fcp->c_metadata.md_localctime = current_time;
	fcp->c_metadata.md_flags |= MD_LOCALCTIME;
	fcp->c_flags |= CN_UPDATED;
	mutex_exit(&fcp->c_statelock);

out:
	if (commit) {
		/* commit the log entry */
		if (cachefs_dlog_commit(fscp, commit, error)) {
			/*EMPTY*/
			/* XXX bob: fix on panic */
		}
	}

	return (error);
}

/*
 * Serialize all renames in CFS, to avoid deadlocks - We have to hold two
 * cnodes atomically.
 */
kmutex_t cachefs_rename_lock;

/*ARGSUSED*/
static int
cachefs_rename(vnode_t *odvp, char *onm, vnode_t *ndvp,
    char *nnm, cred_t *cr, caller_context_t *ct, int flags)
{
	fscache_t *fscp = C_TO_FSCACHE(VTOC(odvp));
	cachefscache_t *cachep = fscp->fs_cache;
	int error = 0;
	int held = 0;
	int connected = 0;
	vnode_t *delvp = NULL;
	vnode_t *tvp = NULL;
	int vfslock = 0;
	struct vnode *realvp;

	if (getzoneid() != GLOBAL_ZONEID)
		return (EPERM);

	if (VOP_REALVP(ndvp, &realvp, ct) == 0)
		ndvp = realvp;

	/*
	 * if the fs NOFILL or NOCACHE flags are on, then the old and new
	 * directory cnodes better indicate NOCACHE mode as well.
	 */
	ASSERT(
	    (fscp->fs_cache->c_flags & (CACHE_NOFILL | CACHE_NOCACHE)) == 0 ||
	    ((VTOC(odvp)->c_flags & CN_NOCACHE) &&
	    (VTOC(ndvp)->c_flags & CN_NOCACHE)));

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the rename operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(VTOC(odvp));
	CFS_BACKFS_NFSV4_ASSERT_CNODE(VTOC(ndvp));

	for (;;) {
		if (vfslock) {
			vn_vfsunlock(delvp);
			vfslock = 0;
		}
		if (delvp) {
			VN_RELE(delvp);
			delvp = NULL;
		}

		/* get (or renew) access to the file system */
		if (held) {
			/* Won't loop for NFSv4 connected support */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 1);
		if (error)
			break;
		held = 1;

		/* sanity check */
		if ((odvp->v_type != VDIR) || (ndvp->v_type != VDIR)) {
			error = EINVAL;
			break;
		}

		/* cannot rename from or to . or .. */
		if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
		    strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0) {
			error = EINVAL;
			break;
		}

		if (odvp != ndvp) {
			/*
			 * if moving a directory, its notion
			 * of ".." will change
			 */
			error = cachefs_lookup_common(odvp, onm, &tvp,
			    NULL, 0, NULL, cr);
			if (error == 0) {
				ASSERT(tvp != NULL);
				if (tvp->v_type == VDIR) {
					cnode_t *cp = VTOC(tvp);

					dnlc_remove(tvp, "..");

					mutex_enter(&cp->c_statelock);
					CFSOP_MODIFY_COBJECT(fscp, cp, cr);
					mutex_exit(&cp->c_statelock);
				}
			} else {
				tvp = NULL;
				if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
					if (CFS_TIMEOUT(fscp, error)) {
						cachefs_cd_release(fscp);
						held = 0;
						cachefs_cd_timedout(fscp);
						connected = 0;
						continue;
					}
				} else {
					if (CFS_TIMEOUT(fscp, error)) {
						connected = 1;
						continue;
					}
				}
				break;
			}
		}

		/* get the cnode if file being deleted */
		error = cachefs_lookup_common(ndvp, nnm, &delvp, NULL, 0,
		    NULL, cr);
		if (error) {
			delvp = NULL;
			if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
				if (CFS_TIMEOUT(fscp, error)) {
					cachefs_cd_release(fscp);
					held = 0;
					cachefs_cd_timedout(fscp);
					connected = 0;
					continue;
				}
			} else {
				if (CFS_TIMEOUT(fscp, error)) {
					connected = 1;
					continue;
				}
			}
			if (error != ENOENT)
				break;
		}

		if (delvp && delvp->v_type == VDIR) {
			/* see ufs_dirremove for why this is done, mount race */
			if (vn_vfswlock(delvp)) {
				error = EBUSY;
				break;
			}
			vfslock = 1;
			if (vn_mountedvfs(delvp) != NULL) {
				error = EBUSY;
				break;
			}
		}

		if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
			error = cachefs_rename_connected(odvp, onm,
			    ndvp, nnm, cr, delvp);
			if (CFS_TIMEOUT(fscp, error)) {
				cachefs_cd_release(fscp);
				held = 0;
				cachefs_cd_timedout(fscp);
				connected = 0;
				continue;
			}
		} else {
			error = cachefs_rename_disconnected(odvp, onm,
			    ndvp, nnm, cr, delvp);
			if (CFS_TIMEOUT(fscp, error)) {
				connected = 1;
				continue;
			}
		}
		break;
	}

	if (CACHEFS_LOG_LOGGING(cachep, CACHEFS_LOG_RENAME)) {
		struct fid gone;

		bzero(&gone, sizeof (gone));
		gone.fid_len = MAXFIDSZ;
		if (delvp != NULL)
			(void) VOP_FID(delvp, &gone, ct);

		cachefs_log_rename(cachep, error, fscp->fs_cfsvfsp,
		    &gone, 0, (delvp != NULL), crgetuid(cr));
	}

	if (held)
		cachefs_cd_release(fscp);

	if (vfslock)
		vn_vfsunlock(delvp);

	if (delvp)
		VN_RELE(delvp);
	if (tvp)
		VN_RELE(tvp);

#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
	return (error);
}

static int
cachefs_rename_connected(vnode_t *odvp, char *onm, vnode_t *ndvp,
    char *nnm, cred_t *cr, vnode_t *delvp)
{
	cnode_t *odcp = VTOC(odvp);
	cnode_t *ndcp = VTOC(ndvp);
	vnode_t *revp = NULL;
	cnode_t *recp;
	cnode_t *delcp;
	fscache_t *fscp = C_TO_FSCACHE(odcp);
	int error = 0;
	struct fid cookie;
	struct fid *cookiep;
	cfs_cid_t cid;
	int gotdirent;

	/* find the file we are renaming */
	error = cachefs_lookup_common(odvp, onm, &revp, NULL, 0, NULL, cr);
	if (error)
		return (error);
	recp = VTOC(revp);

	/*
	 * To avoid deadlock, we acquire this global rename lock before
	 * we try to get the locks for the source and target directories.
	 */
	mutex_enter(&cachefs_rename_lock);
	rw_enter(&odcp->c_rwlock, RW_WRITER);
	if (odcp != ndcp) {
		rw_enter(&ndcp->c_rwlock, RW_WRITER);
	}
	mutex_exit(&cachefs_rename_lock);

	ASSERT((odcp->c_flags & CN_ASYNC_POP_WORKING) == 0);
	ASSERT((ndcp->c_flags & CN_ASYNC_POP_WORKING) == 0);

	mutex_enter(&odcp->c_statelock);
	if (odcp->c_backvp == NULL) {
		error = cachefs_getbackvp(fscp, odcp);
		if (error) {
			mutex_exit(&odcp->c_statelock);
			goto out;
		}
	}

	error = CFSOP_CHECK_COBJECT(fscp, odcp, 0, cr);
	if (error) {
		mutex_exit(&odcp->c_statelock);
		goto out;
	}
	mutex_exit(&odcp->c_statelock);

	if (odcp != ndcp) {
		mutex_enter(&ndcp->c_statelock);
		if (ndcp->c_backvp == NULL) {
			error = cachefs_getbackvp(fscp, ndcp);
			if (error) {
				mutex_exit(&ndcp->c_statelock);
				goto out;
			}
		}

		error = CFSOP_CHECK_COBJECT(fscp, ndcp, 0, cr);
		if (error) {
			mutex_exit(&ndcp->c_statelock);
			goto out;
		}
		mutex_exit(&ndcp->c_statelock);
	}

	/* if a file is being deleted because of this rename */
	if (delvp) {
		/* if src and dest file are same */
		if (delvp == revp) {
			error = 0;
			goto out;
		}

		/*
		 * If the cnode is active, make a link to the file
		 * so operations on the file will continue.
		 */
		dnlc_purge_vp(delvp);
		delcp = VTOC(delvp);
		if ((delvp->v_type != VDIR) &&
		    !((delvp->v_count == 1) ||
		    ((delvp->v_count == 2) && delcp->c_ipending))) {
			error = cachefs_remove_dolink(ndvp, delvp, nnm, cr);
			if (error)
				goto out;
		}
	}

	/* do the rename on the back fs */
	CFS_DPRINT_BACKFS_NFSV4(fscp,
	    ("cachefs_rename (nfsv4): odcp %p, odbackvp %p, "
	    " ndcp %p, ndbackvp %p, onm %s, nnm %s\n",
	    odcp, odcp->c_backvp, ndcp, ndcp->c_backvp, onm, nnm));
	error = VOP_RENAME(odcp->c_backvp, onm, ndcp->c_backvp, nnm, cr, NULL,
	    0);
	if (error)
		goto out;

	/* purge mappings to file in the old directory */
	dnlc_purge_vp(odvp);

	/* purge mappings in the new dir if we deleted a file */
	if (delvp && (odvp != ndvp))
		dnlc_purge_vp(ndvp);

	/* update the file we just deleted */
	if (delvp) {
		mutex_enter(&delcp->c_statelock);
		if (delcp->c_attr.va_nlink == 1) {
			delcp->c_flags |= CN_DESTROY;
		} else {
			delcp->c_flags |= CN_UPDATED;
		}
		delcp->c_attr.va_nlink--;
		CFSOP_MODIFY_COBJECT(fscp, delcp, cr);
		mutex_exit(&delcp->c_statelock);
	}

	/* find the entry in the old directory */
	mutex_enter(&odcp->c_statelock);
	gotdirent = 0;
	cookiep = NULL;
	if (CFS_ISFS_NONSHARED(fscp) &&
	    (odcp->c_metadata.md_flags & MD_POPULATED)) {
		error = cachefs_dir_look(odcp, onm, &cookie,
		    NULL, NULL, &cid);
		if (error == 0 || error == EINVAL) {
			gotdirent = 1;
			if (error == 0)
				cookiep = &cookie;
		} else {
			cachefs_inval_object(odcp);
		}
	}
	error = 0;

	/* remove the directory entry from the old directory */
	if (gotdirent) {
		error = cachefs_dir_rmentry(odcp, onm);
		if (error) {
			cachefs_nocache(odcp);
			error = 0;
		}
	}
	CFSOP_MODIFY_COBJECT(fscp, odcp, cr);
	mutex_exit(&odcp->c_statelock);

	/* install the directory entry in the new directory */
	mutex_enter(&ndcp->c_statelock);
	if (CFS_ISFS_NONSHARED(fscp) &&
	    (ndcp->c_metadata.md_flags & MD_POPULATED)) {
		error = 1;
		if (gotdirent) {
			ASSERT(cid.cid_fileno != 0);
			error = 0;
			if (delvp) {
				error = cachefs_dir_rmentry(ndcp, nnm);
			}
			if (error == 0) {
				error = cachefs_dir_enter(ndcp, nnm, cookiep,
				    &cid, SM_ASYNC);
			}
		}
		if (error) {
			cachefs_nocache(ndcp);
			error = 0;
		}
	}
	if (odcp != ndcp)
		CFSOP_MODIFY_COBJECT(fscp, ndcp, cr);
	mutex_exit(&ndcp->c_statelock);

	/* ctime of renamed file has changed */
	mutex_enter(&recp->c_statelock);
	CFSOP_MODIFY_COBJECT(fscp, recp, cr);
	mutex_exit(&recp->c_statelock);

out:
	if (odcp != ndcp)
		rw_exit(&ndcp->c_rwlock);
	rw_exit(&odcp->c_rwlock);

	VN_RELE(revp);

	return (error);
}

static int
cachefs_rename_disconnected(vnode_t *odvp, char *onm, vnode_t *ndvp,
    char *nnm, cred_t *cr, vnode_t *delvp)
{
	cnode_t *odcp = VTOC(odvp);
	cnode_t *ndcp = VTOC(ndvp);
	cnode_t *delcp = NULL;
	vnode_t *revp = NULL;
	cnode_t *recp;
	fscache_t *fscp = C_TO_FSCACHE(odcp);
	int error = 0;
	struct fid cookie;
	struct fid *cookiep;
	cfs_cid_t cid;
	off_t commit = 0;
	timestruc_t current_time;

	if (CFS_ISFS_WRITE_AROUND(fscp))
		return (ETIMEDOUT);

	/* find the file we are renaming */
	error = cachefs_lookup_common(odvp, onm, &revp, NULL, 0, NULL, cr);
	if (error)
		return (error);
	recp = VTOC(revp);

	/*
	 * To avoid deadlock, we acquire this global rename lock before
	 * we try to get the locks for the source and target directories.
	 */
	mutex_enter(&cachefs_rename_lock);
	rw_enter(&odcp->c_rwlock, RW_WRITER);
	if (odcp != ndcp) {
		rw_enter(&ndcp->c_rwlock, RW_WRITER);
	}
	mutex_exit(&cachefs_rename_lock);

	if (recp->c_metadata.md_flags & MD_NEEDATTRS) {
		error = ETIMEDOUT;
		goto out;
	}

	if ((recp->c_metadata.md_flags & MD_MAPPING) == 0) {
		mutex_enter(&recp->c_statelock);
		if ((recp->c_metadata.md_flags & MD_MAPPING) == 0) {
			error = cachefs_dlog_cidmap(fscp);
			if (error) {
				mutex_exit(&recp->c_statelock);
				error = ENOSPC;
				goto out;
			}
			recp->c_metadata.md_flags |= MD_MAPPING;
			recp->c_flags |= CN_UPDATED;
		}
		mutex_exit(&recp->c_statelock);
	}

	/* check permissions */
	/* XXX clean up this mutex junk sometime */
	mutex_enter(&odcp->c_statelock);
	error = cachefs_access_local(odcp, (VEXEC|VWRITE), cr);
	mutex_exit(&odcp->c_statelock);
	if (error != 0)
		goto out;
	mutex_enter(&ndcp->c_statelock);
	error = cachefs_access_local(ndcp, (VEXEC|VWRITE), cr);
	mutex_exit(&ndcp->c_statelock);
	if (error != 0)
		goto out;
	mutex_enter(&odcp->c_statelock);
	error = cachefs_stickyrmchk(odcp, recp, cr);
	mutex_exit(&odcp->c_statelock);
	if (error != 0)
		goto out;

	/* dirs must be populated */
	if (((odcp->c_metadata.md_flags & MD_POPULATED) == 0) ||
	    ((ndcp->c_metadata.md_flags & MD_POPULATED) == 0)) {
		error = ETIMEDOUT;
		goto out;
	}

	/* for now do not allow moving dirs because could cause cycles */
	if ((((revp->v_type == VDIR) && (odvp != ndvp))) ||
	    (revp == odvp)) {
		error = ETIMEDOUT;
		goto out;
	}

	/* if a file is being deleted because of this rename */
	if (delvp) {
		delcp = VTOC(delvp);

		/* if src and dest file are the same */
		if (delvp == revp) {
			error = 0;
			goto out;
		}

		if (delcp->c_metadata.md_flags & MD_NEEDATTRS) {
			error = ETIMEDOUT;
			goto out;
		}

		/* if there are hard links to this file */
		if (delcp->c_attr.va_nlink > 1) {
			mutex_enter(&delcp->c_statelock);
			if (cachefs_modified_alloc(delcp)) {
				mutex_exit(&delcp->c_statelock);
				error = ENOSPC;
				goto out;
			}

			if ((delcp->c_metadata.md_flags & MD_MAPPING) == 0) {
				error = cachefs_dlog_cidmap(fscp);
				if (error) {
					mutex_exit(&delcp->c_statelock);
					error = ENOSPC;
					goto out;
				}
				delcp->c_metadata.md_flags |= MD_MAPPING;
				delcp->c_flags |= CN_UPDATED;
			}
			mutex_exit(&delcp->c_statelock);
		}

		/* make sure we can delete file */
		mutex_enter(&ndcp->c_statelock);
		error = cachefs_stickyrmchk(ndcp, delcp, cr);
		mutex_exit(&ndcp->c_statelock);
		if (error != 0)
			goto out;

		/*
		 * If the cnode is active, make a link to the file
		 * so operations on the file will continue.
		 */
		dnlc_purge_vp(delvp);
		if ((delvp->v_type != VDIR) &&
		    !((delvp->v_count == 1) ||
		    ((delvp->v_count == 2) && delcp->c_ipending))) {
			error = cachefs_remove_dolink(ndvp, delvp, nnm, cr);
			if (error)
				goto out;
		}
	}

	/* purge mappings to file in the old directory */
	dnlc_purge_vp(odvp);

	/* purge mappings in the new dir if we deleted a file */
	if (delvp && (odvp != ndvp))
		dnlc_purge_vp(ndvp);

	/* find the entry in the old directory */
	mutex_enter(&odcp->c_statelock);
	if ((odcp->c_metadata.md_flags & MD_POPULATED) == 0) {
		mutex_exit(&odcp->c_statelock);
		error = ETIMEDOUT;
		goto out;
	}
	cookiep = NULL;
	error = cachefs_dir_look(odcp, onm, &cookie, NULL, NULL, &cid);
	if (error == 0 || error == EINVAL) {
		if (error == 0)
			cookiep = &cookie;
	} else {
		mutex_exit(&odcp->c_statelock);
		if (error == ENOTDIR)
			error = ETIMEDOUT;
		goto out;
	}
	error = 0;

	/* write the log entry */
	commit = cachefs_dlog_rename(fscp, odcp, onm, ndcp, nnm, cr,
	    recp, delcp);
	if (commit == 0) {
		mutex_exit(&odcp->c_statelock);
		error = ENOSPC;
		goto out;
	}

	/* remove the directory entry from the old directory */
	cachefs_modified(odcp);
	error = cachefs_dir_rmentry(odcp, onm);
	if (error) {
		mutex_exit(&odcp->c_statelock);
		if (error == ENOTDIR)
			error = ETIMEDOUT;
		goto out;
	}
	mutex_exit(&odcp->c_statelock);

	/* install the directory entry in the new directory */
	mutex_enter(&ndcp->c_statelock);
	error = ENOTDIR;
	if (ndcp->c_metadata.md_flags & MD_POPULATED) {
		ASSERT(cid.cid_fileno != 0);
		cachefs_modified(ndcp);
		error = 0;
		if (delvp) {
			error = cachefs_dir_rmentry(ndcp, nnm);
		}
		if (error == 0) {
			error = cachefs_dir_enter(ndcp, nnm, cookiep,
			    &cid, SM_ASYNC);
		}
	}
	if (error) {
		cachefs_nocache(ndcp);
		mutex_exit(&ndcp->c_statelock);
		mutex_enter(&odcp->c_statelock);
		cachefs_nocache(odcp);
		mutex_exit(&odcp->c_statelock);
		if (error == ENOTDIR)
			error = ETIMEDOUT;
		goto out;
	}
	mutex_exit(&ndcp->c_statelock);

	gethrestime(&current_time);

	/* update the file we just deleted */
	if (delvp) {
		mutex_enter(&delcp->c_statelock);
		delcp->c_attr.va_nlink--;
		delcp->c_metadata.md_localctime = current_time;
		delcp->c_metadata.md_flags |= MD_LOCALCTIME;
		if (delcp->c_attr.va_nlink == 0) {
			delcp->c_flags |= CN_DESTROY;
		} else {
			delcp->c_flags |= CN_UPDATED;
		}
		mutex_exit(&delcp->c_statelock);
	}

	/* update the file we renamed */
	mutex_enter(&recp->c_statelock);
	recp->c_metadata.md_localctime = current_time;
	recp->c_metadata.md_flags |= MD_LOCALCTIME;
	recp->c_flags |= CN_UPDATED;
	mutex_exit(&recp->c_statelock);

	/* update the source directory */
	mutex_enter(&odcp->c_statelock);
	odcp->c_metadata.md_localctime = current_time;
	odcp->c_metadata.md_localmtime = current_time;
	odcp->c_metadata.md_flags |= MD_LOCALCTIME | MD_LOCALMTIME;
	odcp->c_flags |= CN_UPDATED;
	mutex_exit(&odcp->c_statelock);

	/* update the destination directory */
	if (odcp != ndcp) {
		mutex_enter(&ndcp->c_statelock);
		ndcp->c_metadata.md_localctime = current_time;
		ndcp->c_metadata.md_localmtime = current_time;
		ndcp->c_metadata.md_flags |= MD_LOCALCTIME | MD_LOCALMTIME;
		ndcp->c_flags |= CN_UPDATED;
		mutex_exit(&ndcp->c_statelock);
	}

out:
	if (commit) {
		/* commit the log entry */
		if (cachefs_dlog_commit(fscp, commit, error)) {
			/*EMPTY*/
			/* XXX bob: fix on panic */
		}
	}

	if (odcp != ndcp)
		rw_exit(&ndcp->c_rwlock);
	rw_exit(&odcp->c_rwlock);

	VN_RELE(revp);

	return (error);
}

/*ARGSUSED*/
static int
cachefs_mkdir(vnode_t *dvp, char *nm, vattr_t *vap, vnode_t **vpp,
    cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	cnode_t *dcp = VTOC(dvp);
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	cachefscache_t *cachep = fscp->fs_cache;
	int error = 0;
	int held = 0;
	int connected = 0;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_mkdir: ENTER dvp %p\n", (void *)dvp);
#endif

	if (getzoneid() != GLOBAL_ZONEID) {
		error = EPERM;
		goto out;
	}

	if (fscp->fs_cache->c_flags & (CACHE_NOFILL | CACHE_NOCACHE))
		ASSERT(dcp->c_flags & CN_NOCACHE);

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the mkdir operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(dcp);

	for (;;) {
		/* get (or renew) access to the file system */
		if (held) {
			/* Won't loop with NFSv4 connected behavior */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			rw_exit(&dcp->c_rwlock);
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 1);
		if (error)
			break;
		rw_enter(&dcp->c_rwlock, RW_WRITER);
		held = 1;

		if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
			error = cachefs_mkdir_connected(dvp, nm, vap,
			    vpp, cr);
			if (CFS_TIMEOUT(fscp, error)) {
				rw_exit(&dcp->c_rwlock);
				cachefs_cd_release(fscp);
				held = 0;
				cachefs_cd_timedout(fscp);
				connected = 0;
				continue;
			}
		} else {
			error = cachefs_mkdir_disconnected(dvp, nm, vap,
			    vpp, cr);
			if (CFS_TIMEOUT(fscp, error)) {
				connected = 1;
				continue;
			}
		}
		break;
	}

	if (CACHEFS_LOG_LOGGING(cachep, CACHEFS_LOG_MKDIR)) {
		fid_t *fidp = NULL;
		ino64_t fileno = 0;
		cnode_t *cp = NULL;
		if (error == 0)
			cp = VTOC(*vpp);

		if (cp != NULL) {
			fidp = &cp->c_metadata.md_cookie;
			fileno = cp->c_id.cid_fileno;
		}

		cachefs_log_mkdir(cachep, error, fscp->fs_cfsvfsp,
		    fidp, fileno, crgetuid(cr));
	}

	if (held) {
		rw_exit(&dcp->c_rwlock);
		cachefs_cd_release(fscp);
	}
	if (error == 0 && CFS_ISFS_NONSHARED(fscp))
		(void) cachefs_pack(dvp, nm, cr);

#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
out:
#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_mkdir: EXIT error = %d\n", error);
#endif
	return (error);
}

static int
cachefs_mkdir_connected(vnode_t *dvp, char *nm, vattr_t *vap,
    vnode_t **vpp, cred_t *cr)
{
	cnode_t *newcp = NULL, *dcp = VTOC(dvp);
	struct vnode *vp = NULL;
	int error = 0;
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	struct fid cookie;
	struct vattr attr;
	cfs_cid_t cid, dircid;
	uint32_t valid_fid;

	if (fscp->fs_cache->c_flags & (CACHE_NOFILL | CACHE_NOCACHE))
		ASSERT(dcp->c_flags & CN_NOCACHE);

	mutex_enter(&dcp->c_statelock);

	/* get backvp of dir */
	if (dcp->c_backvp == NULL) {
		error = cachefs_getbackvp(fscp, dcp);
		if (error) {
			mutex_exit(&dcp->c_statelock);
			goto out;
		}
	}

	/* consistency check the directory */
	error = CFSOP_CHECK_COBJECT(fscp, dcp, 0, cr);
	if (error) {
		mutex_exit(&dcp->c_statelock);
		goto out;
	}
	dircid = dcp->c_id;

	/* make the dir on the back fs */
	CFS_DPRINT_BACKFS_NFSV4(fscp,
	    ("cachefs_mkdir (nfsv4): dcp %p, dbackvp %p, "
	    "name %s\n", dcp, dcp->c_backvp, nm));
	error = VOP_MKDIR(dcp->c_backvp, nm, vap, &vp, cr, NULL, 0, NULL);
	mutex_exit(&dcp->c_statelock);
	if (error) {
		goto out;
	}

	/* get the cookie and make the cnode */
	attr.va_mask = AT_ALL;
	valid_fid = (CFS_ISFS_BACKFS_NFSV4(fscp) ? FALSE : TRUE);
	error = cachefs_getcookie(vp, &cookie, &attr, cr, valid_fid);
	if (error) {
		goto out;
	}
	cid.cid_flags = 0;
	cid.cid_fileno = attr.va_nodeid;
	error = cachefs_cnode_make(&cid, fscp, (valid_fid ? &cookie : NULL),
	    &attr, vp, cr, 0, &newcp);
	if (error) {
		goto out;
	}
	ASSERT(CTOV(newcp)->v_type == VDIR);
	*vpp = CTOV(newcp);

	/* if the dir is populated, add the new entry */
	mutex_enter(&dcp->c_statelock);
	if (CFS_ISFS_NONSHARED(fscp) &&
	    (dcp->c_metadata.md_flags & MD_POPULATED)) {
		error = cachefs_dir_enter(dcp, nm, &cookie, &newcp->c_id,
		    SM_ASYNC);
		if (error) {
			cachefs_nocache(dcp);
			error = 0;
		}
	}
	dcp->c_attr.va_nlink++;
	dcp->c_flags |= CN_UPDATED;
	CFSOP_MODIFY_COBJECT(fscp, dcp, cr);
	mutex_exit(&dcp->c_statelock);

	/* XXX bob: should we do a filldir here? or just add . and .. */
	/* maybe should kick off an async filldir so caller does not wait */

	/* put the entry in the dnlc */
	if (cachefs_dnlc)
		dnlc_enter(dvp, nm, *vpp);

	/* save the fileno of the parent so can find the name */
	if (bcmp(&newcp->c_metadata.md_parent, &dircid,
	    sizeof (cfs_cid_t)) != 0) {
		mutex_enter(&newcp->c_statelock);
		newcp->c_metadata.md_parent = dircid;
		newcp->c_flags |= CN_UPDATED;
		mutex_exit(&newcp->c_statelock);
	}
out:
	if (vp)
		VN_RELE(vp);

	return (error);
}

static int
cachefs_mkdir_disconnected(vnode_t *dvp, char *nm, vattr_t *vap,
    vnode_t **vpp, cred_t *cr)
{
	cnode_t *dcp = VTOC(dvp);
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	int error;
	cnode_t *newcp = NULL;
	struct vattr va;
	timestruc_t current_time;
	off_t commit = 0;
	char *s;
	int namlen;

	/* don't allow '/' characters in pathname component */
	for (s = nm, namlen = 0; *s; s++, namlen++)
		if (*s == '/')
			return (EACCES);
	if (namlen == 0)
		return (EINVAL);

	if (CFS_ISFS_WRITE_AROUND(fscp))
		return (ETIMEDOUT);

	mutex_enter(&dcp->c_statelock);

	/* check permissions */
	if (error = cachefs_access_local(dcp, (VEXEC|VWRITE), cr)) {
		mutex_exit(&dcp->c_statelock);
		goto out;
	}

	/* the directory front file must be populated */
	if ((dcp->c_metadata.md_flags & MD_POPULATED) == 0) {
		error = ETIMEDOUT;
		mutex_exit(&dcp->c_statelock);
		goto out;
	}

	/* make sure nm does not already exist in the directory */
	error = cachefs_dir_look(dcp, nm, NULL, NULL, NULL, NULL);
	if (error == ENOTDIR) {
		error = ETIMEDOUT;
		mutex_exit(&dcp->c_statelock);
		goto out;
	}
	if (error != ENOENT) {
		error = EEXIST;
		mutex_exit(&dcp->c_statelock);
		goto out;
	}

	/* make up a reasonable set of attributes */
	cachefs_attr_setup(vap, &va, dcp, cr);
	va.va_type = VDIR;
	va.va_mode |= S_IFDIR;
	va.va_nlink = 2;

	mutex_exit(&dcp->c_statelock);

	/* create the cnode */
	error = cachefs_cnode_create(fscp, &va, 0, &newcp);
	if (error)
		goto out;

	mutex_enter(&newcp->c_statelock);

	error = cachefs_dlog_cidmap(fscp);
	if (error) {
		mutex_exit(&newcp->c_statelock);
		goto out;
	}

	cachefs_creategid(dcp, newcp, vap, cr);
	mutex_enter(&dcp->c_statelock);
	cachefs_createacl(dcp, newcp);
	mutex_exit(&dcp->c_statelock);
	gethrestime(&current_time);
	newcp->c_metadata.md_vattr.va_atime = current_time;
	newcp->c_metadata.md_localctime = current_time;
	newcp->c_metadata.md_localmtime = current_time;
	newcp->c_metadata.md_flags |= MD_MAPPING | MD_LOCALMTIME |
	    MD_LOCALCTIME;
	newcp->c_flags |= CN_UPDATED;

	/* make a front file for the new directory, add . and .. */
	error = cachefs_dir_new(dcp, newcp);
	if (error) {
		mutex_exit(&newcp->c_statelock);
		goto out;
	}
	cachefs_modified(newcp);

	/*
	 * write the metadata now rather than waiting until
	 * inactive so that if there's no space we can let
	 * the caller know.
	 */
	ASSERT(newcp->c_frontvp);
	ASSERT((newcp->c_filegrp->fg_flags & CFS_FG_ALLOC_ATTR) == 0);
	ASSERT((newcp->c_flags & CN_ALLOC_PENDING) == 0);
	error = filegrp_write_metadata(newcp->c_filegrp,
	    &newcp->c_id, &newcp->c_metadata);
	if (error) {
		mutex_exit(&newcp->c_statelock);
		goto out;
	}
	mutex_exit(&newcp->c_statelock);

	/* log the operation */
	commit = cachefs_dlog_mkdir(fscp, dcp, newcp, nm, &va, cr);
	if (commit == 0) {
		error = ENOSPC;
		goto out;
	}

	mutex_enter(&dcp->c_statelock);

	/* make sure directory is still populated */
	if ((dcp->c_metadata.md_flags & MD_POPULATED) == 0) {
		mutex_exit(&dcp->c_statelock);
		error = ETIMEDOUT;
		goto out;
	}
	cachefs_modified(dcp);

	/* enter the new file in the directory */
	error = cachefs_dir_enter(dcp, nm, &newcp->c_metadata.md_cookie,
	    &newcp->c_id, SM_ASYNC);
	if (error) {
		mutex_exit(&dcp->c_statelock);
		goto out;
	}

	/* update parent dir times */
	dcp->c_metadata.md_localctime = current_time;
	dcp->c_metadata.md_localmtime = current_time;
	dcp->c_metadata.md_flags |= MD_LOCALCTIME | MD_LOCALMTIME;
	dcp->c_attr.va_nlink++;
	dcp->c_flags |= CN_UPDATED;
	mutex_exit(&dcp->c_statelock);

out:
	if (commit) {
		/* commit the log entry */
		if (cachefs_dlog_commit(fscp, commit, error)) {
			/*EMPTY*/
			/* XXX bob: fix on panic */
		}
	}
	if (error) {
		if (newcp) {
			mutex_enter(&newcp->c_statelock);
			newcp->c_flags |= CN_DESTROY;
			mutex_exit(&newcp->c_statelock);
			VN_RELE(CTOV(newcp));
		}
	} else {
		*vpp = CTOV(newcp);
	}
	return (error);
}

/*ARGSUSED*/
static int
cachefs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
    caller_context_t *ct, int flags)
{
	cnode_t *dcp = VTOC(dvp);
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	cachefscache_t *cachep = fscp->fs_cache;
	int error = 0;
	int held = 0;
	int connected = 0;
	size_t namlen;
	vnode_t *vp = NULL;
	int vfslock = 0;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_rmdir: ENTER vp %p\n", (void *)dvp);
#endif

	if (getzoneid() != GLOBAL_ZONEID) {
		error = EPERM;
		goto out;
	}

	if (fscp->fs_cache->c_flags & (CACHE_NOFILL | CACHE_NOCACHE))
		ASSERT(dcp->c_flags & CN_NOCACHE);

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the rmdir operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(dcp);

	for (;;) {
		if (vfslock) {
			vn_vfsunlock(vp);
			vfslock = 0;
		}
		if (vp) {
			VN_RELE(vp);
			vp = NULL;
		}

		/* get (or renew) access to the file system */
		if (held) {
			/* Won't loop with NFSv4 connected behavior */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 1);
		if (error)
			break;
		held = 1;

		/* if disconnected, do some extra error checking */
		if (fscp->fs_cdconnected != CFS_CD_CONNECTED) {
			/* check permissions */
			mutex_enter(&dcp->c_statelock);
			error = cachefs_access_local(dcp, (VEXEC|VWRITE), cr);
			mutex_exit(&dcp->c_statelock);
			if (CFS_TIMEOUT(fscp, error)) {
				connected = 1;
				continue;
			}
			if (error)
				break;

			namlen = strlen(nm);
			if (namlen == 0) {
				error = EINVAL;
				break;
			}

			/* cannot remove . and .. */
			if (nm[0] == '.') {
				if (namlen == 1) {
					error = EINVAL;
					break;
				} else if (namlen == 2 && nm[1] == '.') {
					error = EEXIST;
					break;
				}
			}

		}

		/* get the cnode of the dir to remove */
		error = cachefs_lookup_common(dvp, nm, &vp, NULL, 0, NULL, cr);
		if (error) {
			if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
				if (CFS_TIMEOUT(fscp, error)) {
					cachefs_cd_release(fscp);
					held = 0;
					cachefs_cd_timedout(fscp);
					connected = 0;
					continue;
				}
			} else {
				if (CFS_TIMEOUT(fscp, error)) {
					connected = 1;
					continue;
				}
			}
			break;
		}

		/* must be a dir */
		if (vp->v_type != VDIR) {
			error = ENOTDIR;
			break;
		}

		/* must not be current dir */
		if (VOP_CMP(vp, cdir, ct)) {
			error = EINVAL;
			break;
		}

		/* see ufs_dirremove for why this is done, mount race */
		if (vn_vfswlock(vp)) {
			error = EBUSY;
			break;
		}
		vfslock = 1;
		if (vn_mountedvfs(vp) != NULL) {
			error = EBUSY;
			break;
		}

		if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
			error = cachefs_rmdir_connected(dvp, nm, cdir,
			    cr, vp);
			if (CFS_TIMEOUT(fscp, error)) {
				cachefs_cd_release(fscp);
				held = 0;
				cachefs_cd_timedout(fscp);
				connected = 0;
				continue;
			}
		} else {
			error = cachefs_rmdir_disconnected(dvp, nm, cdir,
			    cr, vp);
			if (CFS_TIMEOUT(fscp, error)) {
				connected = 1;
				continue;
			}
		}
		break;
	}

	if (CACHEFS_LOG_LOGGING(cachep, CACHEFS_LOG_RMDIR)) {
		ino64_t fileno = 0;
		fid_t *fidp = NULL;
		cnode_t *cp = NULL;
		if (vp)
			cp = VTOC(vp);

		if (cp != NULL) {
			fidp = &cp->c_metadata.md_cookie;
			fileno = cp->c_id.cid_fileno;
		}

		cachefs_log_rmdir(cachep, error, fscp->fs_cfsvfsp,
		    fidp, fileno, crgetuid(cr));
	}

	if (held) {
		cachefs_cd_release(fscp);
	}

	if (vfslock)
		vn_vfsunlock(vp);

	if (vp)
		VN_RELE(vp);

#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
out:
#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_rmdir: EXIT error = %d\n", error);
#endif

	return (error);
}

static int
cachefs_rmdir_connected(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
    vnode_t *vp)
{
	cnode_t *dcp = VTOC(dvp);
	cnode_t *cp = VTOC(vp);
	int error = 0;
	fscache_t *fscp = C_TO_FSCACHE(dcp);

	rw_enter(&dcp->c_rwlock, RW_WRITER);
	mutex_enter(&dcp->c_statelock);
	mutex_enter(&cp->c_statelock);

	if (dcp->c_backvp == NULL) {
		error = cachefs_getbackvp(fscp, dcp);
		if (error) {
			goto out;
		}
	}

	error = CFSOP_CHECK_COBJECT(fscp, dcp, 0, cr);
	if (error)
		goto out;

	/* rmdir on the back fs */
	CFS_DPRINT_BACKFS_NFSV4(fscp,
	    ("cachefs_rmdir (nfsv4): dcp %p, dbackvp %p, "
	    "name %s\n", dcp, dcp->c_backvp, nm));
	error = VOP_RMDIR(dcp->c_backvp, nm, cdir, cr, NULL, 0);
	if (error)
		goto out;

	/* if the dir is populated, remove the entry from it */
	if (CFS_ISFS_NONSHARED(fscp) &&
	    (dcp->c_metadata.md_flags & MD_POPULATED)) {
		error = cachefs_dir_rmentry(dcp, nm);
		if (error) {
			cachefs_nocache(dcp);
			error = 0;
		}
	}

	/*
	 * *if* the (hard) link count goes to 0, then we set the CDESTROY
	 * flag on the cnode. The cached object will then be destroyed
	 * at inactive time where the chickens come home to roost :-)
	 * The link cnt for directories is bumped down by 2 'cause the "."
	 * entry has to be elided too ! The link cnt for the parent goes down
	 * by 1 (because of "..").
	 */
	cp->c_attr.va_nlink -= 2;
	dcp->c_attr.va_nlink--;
	if (cp->c_attr.va_nlink == 0) {
		cp->c_flags |= CN_DESTROY;
	} else {
		cp->c_flags |= CN_UPDATED;
	}
	dcp->c_flags |= CN_UPDATED;

	dnlc_purge_vp(vp);
	CFSOP_MODIFY_COBJECT(fscp, dcp, cr);

out:
	mutex_exit(&cp->c_statelock);
	mutex_exit(&dcp->c_statelock);
	rw_exit(&dcp->c_rwlock);

	return (error);
}

static int
/*ARGSUSED*/
cachefs_rmdir_disconnected(vnode_t *dvp, char *nm, vnode_t *cdir,
    cred_t *cr, vnode_t *vp)
{
	cnode_t *dcp = VTOC(dvp);
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	int error = 0;
	off_t commit = 0;
	timestruc_t current_time;

	if (CFS_ISFS_WRITE_AROUND(fscp))
		return (ETIMEDOUT);

	rw_enter(&dcp->c_rwlock, RW_WRITER);
	mutex_enter(&dcp->c_statelock);
	mutex_enter(&cp->c_statelock);

	/* both directories must be populated */
	if (((dcp->c_metadata.md_flags & MD_POPULATED) == 0) ||
	    ((cp->c_metadata.md_flags & MD_POPULATED) == 0)) {
		error = ETIMEDOUT;
		goto out;
	}

	/* if sticky bit set on the dir, more access checks to perform */
	if (error = cachefs_stickyrmchk(dcp, cp, cr)) {
		goto out;
	}

	/* make sure dir is empty */
	if (cp->c_attr.va_nlink > 2) {
		error = cachefs_dir_empty(cp);
		if (error) {
			if (error == ENOTDIR)
				error = ETIMEDOUT;
			goto out;
		}
		cachefs_modified(cp);
	}
	cachefs_modified(dcp);

	/* log the operation */
	commit = cachefs_dlog_rmdir(fscp, dcp, nm, cp, cr);
	if (commit == 0) {
		error = ENOSPC;
		goto out;
	}

	/* remove name from parent dir */
	error = cachefs_dir_rmentry(dcp, nm);
	if (error == ENOTDIR) {
		error = ETIMEDOUT;
		goto out;
	}
	if (error)
		goto out;

	gethrestime(&current_time);

	/* update deleted dir values */
	cp->c_attr.va_nlink -= 2;
	if (cp->c_attr.va_nlink == 0)
		cp->c_flags |= CN_DESTROY;
	else {
		cp->c_metadata.md_localctime = current_time;
		cp->c_metadata.md_flags |= MD_LOCALCTIME;
		cp->c_flags |= CN_UPDATED;
	}

	/* update parent values */
	dcp->c_metadata.md_localctime = current_time;
	dcp->c_metadata.md_localmtime = current_time;
	dcp->c_metadata.md_flags |= MD_LOCALCTIME | MD_LOCALMTIME;
	dcp->c_attr.va_nlink--;
	dcp->c_flags |= CN_UPDATED;

out:
	mutex_exit(&cp->c_statelock);
	mutex_exit(&dcp->c_statelock);
	rw_exit(&dcp->c_rwlock);
	if (commit) {
		/* commit the log entry */
		if (cachefs_dlog_commit(fscp, commit, error)) {
			/*EMPTY*/
			/* XXX bob: fix on panic */
		}
		dnlc_purge_vp(vp);
	}
	return (error);
}

/*ARGSUSED*/
static int
cachefs_symlink(vnode_t *dvp, char *lnm, vattr_t *tva,
    char *tnm, cred_t *cr, caller_context_t *ct, int flags)
{
	cnode_t *dcp = VTOC(dvp);
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	cachefscache_t *cachep = fscp->fs_cache;
	int error = 0;
	int held = 0;
	int connected = 0;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_symlink: ENTER dvp %p lnm %s tnm %s\n",
		    (void *)dvp, lnm, tnm);
#endif

	if (getzoneid() != GLOBAL_ZONEID) {
		error = EPERM;
		goto out;
	}

	if (fscp->fs_cache->c_flags & CACHE_NOCACHE)
		ASSERT(dcp->c_flags & CN_NOCACHE);

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the symlink operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(dcp);

	for (;;) {
		/* get (or renew) access to the file system */
		if (held) {
			/* Won't loop with NFSv4 connected behavior */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			rw_exit(&dcp->c_rwlock);
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 1);
		if (error)
			break;
		rw_enter(&dcp->c_rwlock, RW_WRITER);
		held = 1;

		if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
			error = cachefs_symlink_connected(dvp, lnm, tva,
			    tnm, cr);
			if (CFS_TIMEOUT(fscp, error)) {
				rw_exit(&dcp->c_rwlock);
				cachefs_cd_release(fscp);
				held = 0;
				cachefs_cd_timedout(fscp);
				connected = 0;
				continue;
			}
		} else {
			error = cachefs_symlink_disconnected(dvp, lnm, tva,
			    tnm, cr);
			if (CFS_TIMEOUT(fscp, error)) {
				connected = 1;
				continue;
			}
		}
		break;
	}

	if (CACHEFS_LOG_LOGGING(cachep, CACHEFS_LOG_SYMLINK))
		cachefs_log_symlink(cachep, error, fscp->fs_cfsvfsp,
		    &dcp->c_metadata.md_cookie, dcp->c_id.cid_fileno,
		    crgetuid(cr), (uint_t)strlen(tnm));

	if (held) {
		rw_exit(&dcp->c_rwlock);
		cachefs_cd_release(fscp);
	}

#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
out:
#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_symlink: EXIT error = %d\n", error);
#endif
	return (error);
}

static int
cachefs_symlink_connected(vnode_t *dvp, char *lnm, vattr_t *tva,
    char *tnm, cred_t *cr)
{
	cnode_t *dcp = VTOC(dvp);
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	int error = 0;
	vnode_t *backvp = NULL;
	cnode_t *newcp = NULL;
	struct vattr va;
	struct fid cookie;
	cfs_cid_t cid;
	uint32_t valid_fid;

	mutex_enter(&dcp->c_statelock);

	if (dcp->c_backvp == NULL) {
		error = cachefs_getbackvp(fscp, dcp);
		if (error) {
			cachefs_nocache(dcp);
			mutex_exit(&dcp->c_statelock);
			goto out;
		}
	}

	error = CFSOP_CHECK_COBJECT(fscp, dcp, 0, cr);
	if (error) {
		mutex_exit(&dcp->c_statelock);
		goto out;
	}
	CFS_DPRINT_BACKFS_NFSV4(fscp,
	    ("cachefs_symlink (nfsv4): dcp %p, dbackvp %p, "
	    "lnm %s, tnm %s\n", dcp, dcp->c_backvp, lnm, tnm));
	error = VOP_SYMLINK(dcp->c_backvp, lnm, tva, tnm, cr, NULL, 0);
	if (error) {
		mutex_exit(&dcp->c_statelock);
		goto out;
	}
	if ((dcp->c_filegrp->fg_flags & CFS_FG_WRITE) == 0 &&
	    !CFS_ISFS_BACKFS_NFSV4(fscp)) {
		cachefs_nocache(dcp);
		mutex_exit(&dcp->c_statelock);
		goto out;
	}

	CFSOP_MODIFY_COBJECT(fscp, dcp, cr);

	/* lookup the symlink we just created and get its fid and attrs */
	(void) VOP_LOOKUP(dcp->c_backvp, lnm, &backvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL);
	if (backvp == NULL) {
		if (CFS_ISFS_BACKFS_NFSV4(fscp) == 0)
			cachefs_nocache(dcp);
		mutex_exit(&dcp->c_statelock);
		goto out;
	}

	valid_fid = (CFS_ISFS_BACKFS_NFSV4(fscp) ? FALSE : TRUE);
	error = cachefs_getcookie(backvp, &cookie, &va, cr, valid_fid);
	if (error) {
		ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
		error = 0;
		cachefs_nocache(dcp);
		mutex_exit(&dcp->c_statelock);
		goto out;
	}
	cid.cid_fileno = va.va_nodeid;
	cid.cid_flags = 0;

	/* if the dir is cached, add the symlink to it */
	if (CFS_ISFS_NONSHARED(fscp) &&
	    (dcp->c_metadata.md_flags & MD_POPULATED)) {
		error = cachefs_dir_enter(dcp, lnm, &cookie, &cid, SM_ASYNC);
		if (error) {
			cachefs_nocache(dcp);
			error = 0;
		}
	}
	mutex_exit(&dcp->c_statelock);

	/* make the cnode for the sym link */
	error = cachefs_cnode_make(&cid, fscp, (valid_fid ? &cookie : NULL),
	    &va, backvp, cr, 0, &newcp);
	if (error) {
		ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
		cachefs_nocache(dcp);
		error = 0;
		goto out;
	}

	/* try to cache the symlink contents */
	rw_enter(&newcp->c_rwlock, RW_WRITER);
	mutex_enter(&newcp->c_statelock);

	/*
	 * try to cache the sym link, note that its a noop if NOCACHE
	 * or NFSv4 is set
	 */
	error = cachefs_stuffsymlink(newcp, tnm, (int)newcp->c_size);
	if (error) {
		cachefs_nocache(newcp);
		error = 0;
	}
	mutex_exit(&newcp->c_statelock);
	rw_exit(&newcp->c_rwlock);

out:
	if (backvp)
		VN_RELE(backvp);
	if (newcp)
		VN_RELE(CTOV(newcp));
	return (error);
}

static int
cachefs_symlink_disconnected(vnode_t *dvp, char *lnm, vattr_t *tva,
    char *tnm, cred_t *cr)
{
	cnode_t *dcp = VTOC(dvp);
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	int error;
	cnode_t *newcp = NULL;
	struct vattr va;
	timestruc_t current_time;
	off_t commit = 0;

	if (CFS_ISFS_WRITE_AROUND(fscp))
		return (ETIMEDOUT);

	mutex_enter(&dcp->c_statelock);

	/* check permissions */
	if (error = cachefs_access_local(dcp, (VEXEC|VWRITE), cr)) {
		mutex_exit(&dcp->c_statelock);
		goto out;
	}

	/* the directory front file must be populated */
	if ((dcp->c_metadata.md_flags & MD_POPULATED) == 0) {
		error = ETIMEDOUT;
		mutex_exit(&dcp->c_statelock);
		goto out;
	}

	/* make sure lnm does not already exist in the directory */
	error = cachefs_dir_look(dcp, lnm, NULL, NULL, NULL, NULL);
	if (error == ENOTDIR) {
		error = ETIMEDOUT;
		mutex_exit(&dcp->c_statelock);
		goto out;
	}
	if (error != ENOENT) {
		error = EEXIST;
		mutex_exit(&dcp->c_statelock);
		goto out;
	}

	/* make up a reasonable set of attributes */
	cachefs_attr_setup(tva, &va, dcp, cr);
	va.va_type = VLNK;
	va.va_mode |= S_IFLNK;
	va.va_size = strlen(tnm);

	mutex_exit(&dcp->c_statelock);

	/* create the cnode */
	error = cachefs_cnode_create(fscp, &va, 0, &newcp);
	if (error)
		goto out;

	rw_enter(&newcp->c_rwlock, RW_WRITER);
	mutex_enter(&newcp->c_statelock);

	error = cachefs_dlog_cidmap(fscp);
	if (error) {
		mutex_exit(&newcp->c_statelock);
		rw_exit(&newcp->c_rwlock);
		error = ENOSPC;
		goto out;
	}

	cachefs_creategid(dcp, newcp, tva, cr);
	mutex_enter(&dcp->c_statelock);
	cachefs_createacl(dcp, newcp);
	mutex_exit(&dcp->c_statelock);
	gethrestime(&current_time);
	newcp->c_metadata.md_vattr.va_atime = current_time;
	newcp->c_metadata.md_localctime = current_time;
	newcp->c_metadata.md_localmtime = current_time;
	newcp->c_metadata.md_flags |= MD_MAPPING | MD_LOCALMTIME |
	    MD_LOCALCTIME;
	newcp->c_flags |= CN_UPDATED;

	/* log the operation */
	commit = cachefs_dlog_symlink(fscp, dcp, newcp, lnm, tva, tnm, cr);
	if (commit == 0) {
		mutex_exit(&newcp->c_statelock);
		rw_exit(&newcp->c_rwlock);
		error = ENOSPC;
		goto out;
	}

	/* store the symlink contents */
	error = cachefs_stuffsymlink(newcp, tnm, (int)newcp->c_size);
	if (error) {
		mutex_exit(&newcp->c_statelock);
		rw_exit(&newcp->c_rwlock);
		goto out;
	}
	if (cachefs_modified_alloc(newcp)) {
		mutex_exit(&newcp->c_statelock);
		rw_exit(&newcp->c_rwlock);
		error = ENOSPC;
		goto out;
	}

	/*
	 * write the metadata now rather than waiting until
	 * inactive so that if there's no space we can let
	 * the caller know.
	 */
	if (newcp->c_flags & CN_ALLOC_PENDING) {
		if (newcp->c_filegrp->fg_flags & CFS_FG_ALLOC_ATTR) {
			(void) filegrp_allocattr(newcp->c_filegrp);
		}
		error = filegrp_create_metadata(newcp->c_filegrp,
		    &newcp->c_metadata, &newcp->c_id);
		if (error) {
			mutex_exit(&newcp->c_statelock);
			rw_exit(&newcp->c_rwlock);
			goto out;
		}
		newcp->c_flags &= ~CN_ALLOC_PENDING;
	}
	error = filegrp_write_metadata(newcp->c_filegrp,
	    &newcp->c_id, &newcp->c_metadata);
	if (error) {
		mutex_exit(&newcp->c_statelock);
		rw_exit(&newcp->c_rwlock);
		goto out;
	}
	mutex_exit(&newcp->c_statelock);
	rw_exit(&newcp->c_rwlock);

	mutex_enter(&dcp->c_statelock);

	/* enter the new file in the directory */
	if ((dcp->c_metadata.md_flags & MD_POPULATED) == 0) {
		error = ETIMEDOUT;
		mutex_exit(&dcp->c_statelock);
		goto out;
	}
	cachefs_modified(dcp);
	error = cachefs_dir_enter(dcp, lnm, &newcp->c_metadata.md_cookie,
	    &newcp->c_id, SM_ASYNC);
	if (error) {
		mutex_exit(&dcp->c_statelock);
		goto out;
	}

	/* update parent dir times */
	dcp->c_metadata.md_localctime = current_time;
	dcp->c_metadata.md_localmtime = current_time;
	dcp->c_metadata.md_flags |= MD_LOCALMTIME | MD_LOCALCTIME;
	dcp->c_flags |= CN_UPDATED;
	mutex_exit(&dcp->c_statelock);

out:
	if (commit) {
		/* commit the log entry */
		if (cachefs_dlog_commit(fscp, commit, error)) {
			/*EMPTY*/
			/* XXX bob: fix on panic */
		}
	}

	if (error) {
		if (newcp) {
			mutex_enter(&newcp->c_statelock);
			newcp->c_flags |= CN_DESTROY;
			mutex_exit(&newcp->c_statelock);
		}
	}
	if (newcp) {
		VN_RELE(CTOV(newcp));
	}

	return (error);
}

/*ARGSUSED*/
static int
cachefs_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	cnode_t *dcp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	cachefscache_t *cachep = fscp->fs_cache;
	int error = 0;
	int held = 0;
	int connected = 0;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_readdir: ENTER vp %p\n", (void *)vp);
#endif
	if (getzoneid() != GLOBAL_ZONEID) {
		error = EPERM;
		goto out;
	}

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the readdir operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(dcp);

	for (;;) {
		/* get (or renew) access to the file system */
		if (held) {
			/* Won't loop with NFSv4 connected behavior */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			rw_exit(&dcp->c_rwlock);
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 0);
		if (error)
			break;
		rw_enter(&dcp->c_rwlock, RW_READER);
		held = 1;

		/* quit if link count of zero (posix) */
		if (dcp->c_attr.va_nlink == 0) {
			if (eofp)
				*eofp = 1;
			error = 0;
			break;
		}

		if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
			error = cachefs_readdir_connected(vp, uiop, cr,
			    eofp);
			if (CFS_TIMEOUT(fscp, error)) {
				rw_exit(&dcp->c_rwlock);
				cachefs_cd_release(fscp);
				held = 0;
				cachefs_cd_timedout(fscp);
				connected = 0;
				continue;
			}
		} else {
			error = cachefs_readdir_disconnected(vp, uiop, cr,
			    eofp);
			if (CFS_TIMEOUT(fscp, error)) {
				if (cachefs_cd_access_miss(fscp)) {
					error = cachefs_readdir_connected(vp,
					    uiop, cr, eofp);
					if (!CFS_TIMEOUT(fscp, error))
						break;
					delay(5*hz);
					connected = 0;
					continue;
				}
				connected = 1;
				continue;
			}
		}
		break;
	}

	if (CACHEFS_LOG_LOGGING(cachep, CACHEFS_LOG_READDIR))
		cachefs_log_readdir(cachep, error, fscp->fs_cfsvfsp,
		    &dcp->c_metadata.md_cookie, dcp->c_id.cid_fileno,
		    crgetuid(cr), uiop->uio_loffset, *eofp);

	if (held) {
		rw_exit(&dcp->c_rwlock);
		cachefs_cd_release(fscp);
	}

#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
out:
#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_readdir: EXIT error = %d\n", error);
#endif

	return (error);
}

static int
cachefs_readdir_connected(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp)
{
	cnode_t *dcp = VTOC(vp);
	int error;
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	struct cachefs_req *rp;

	mutex_enter(&dcp->c_statelock);

	/* check directory consistency */
	error = CFSOP_CHECK_COBJECT(fscp, dcp, 0, cr);
	if (error)
		goto out;
	dcp->c_usage++;

	/* if dir was modified, toss old contents */
	if (dcp->c_metadata.md_flags & MD_INVALREADDIR) {
		ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
		cachefs_inval_object(dcp);
	}

	error = 0;
	if (((dcp->c_metadata.md_flags & MD_POPULATED) == 0) &&
	    ((dcp->c_flags & (CN_ASYNC_POPULATE | CN_NOCACHE)) == 0) &&
	    !CFS_ISFS_BACKFS_NFSV4(fscp) &&
	    (fscp->fs_cdconnected == CFS_CD_CONNECTED)) {

		if (cachefs_async_okay()) {

			/*
			 * Set up asynchronous request to fill this
			 * directory.
			 */

			dcp->c_flags |= CN_ASYNC_POPULATE;

			rp = kmem_cache_alloc(cachefs_req_cache, KM_SLEEP);
			rp->cfs_cmd = CFS_POPULATE;
			rp->cfs_req_u.cu_populate.cpop_vp = vp;
			rp->cfs_cr = cr;

			crhold(cr);
			VN_HOLD(vp);

			cachefs_addqueue(rp, &fscp->fs_workq);
		} else {
			error = cachefs_dir_fill(dcp, cr);
			if (error != 0)
				cachefs_nocache(dcp);
		}
	}

	/* if front file is populated */
	if (((dcp->c_flags & (CN_NOCACHE | CN_ASYNC_POPULATE)) == 0) &&
	    !CFS_ISFS_BACKFS_NFSV4(fscp) &&
	    (dcp->c_metadata.md_flags & MD_POPULATED)) {
		ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
		error = cachefs_dir_read(dcp, uiop, eofp);
		if (error == 0)
			fscp->fs_stats.st_hits++;
	}

	/* if front file could not be used */
	if ((error != 0) ||
	    CFS_ISFS_BACKFS_NFSV4(fscp) ||
	    (dcp->c_flags & (CN_NOCACHE | CN_ASYNC_POPULATE)) ||
	    ((dcp->c_metadata.md_flags & MD_POPULATED) == 0)) {

		if (error && !(dcp->c_flags & CN_NOCACHE) &&
		    !CFS_ISFS_BACKFS_NFSV4(fscp))
			cachefs_nocache(dcp);

		/* get the back vp */
		if (dcp->c_backvp == NULL) {
			error = cachefs_getbackvp(fscp, dcp);
			if (error)
				goto out;
		}

		if (fscp->fs_inum_size > 0) {
			error = cachefs_readback_translate(dcp, uiop, cr, eofp);
		} else {
			/* do the dir read from the back fs */
			(void) VOP_RWLOCK(dcp->c_backvp,
			    V_WRITELOCK_FALSE, NULL);
			CFS_DPRINT_BACKFS_NFSV4(fscp,
			    ("cachefs_readdir (nfsv4): "
			    "dcp %p, dbackvp %p\n", dcp, dcp->c_backvp));
			error = VOP_READDIR(dcp->c_backvp, uiop, cr, eofp,
			    NULL, 0);
			VOP_RWUNLOCK(dcp->c_backvp, V_WRITELOCK_FALSE, NULL);
		}

		if (error == 0)
			fscp->fs_stats.st_misses++;
	}

out:
	mutex_exit(&dcp->c_statelock);

	return (error);
}

static int
cachefs_readback_translate(cnode_t *cp, uio_t *uiop, cred_t *cr, int *eofp)
{
	int error = 0;
	fscache_t *fscp = C_TO_FSCACHE(cp);
	caddr_t buffy = NULL;
	int buffysize = MAXBSIZE;
	caddr_t chrp, end;
	ino64_t newinum;
	struct dirent64 *de;
	uio_t uioin;
	iovec_t iov;

	ASSERT(cp->c_backvp != NULL);
	ASSERT(fscp->fs_inum_size > 0);

	if (uiop->uio_resid < buffysize)
		buffysize = (int)uiop->uio_resid;
	buffy = cachefs_kmem_alloc(buffysize, KM_SLEEP);

	iov.iov_base = buffy;
	iov.iov_len = buffysize;
	uioin.uio_iov = &iov;
	uioin.uio_iovcnt = 1;
	uioin.uio_segflg = UIO_SYSSPACE;
	uioin.uio_fmode = 0;
	uioin.uio_extflg = UIO_COPY_CACHED;
	uioin.uio_loffset = uiop->uio_loffset;
	uioin.uio_resid = buffysize;

	(void) VOP_RWLOCK(cp->c_backvp, V_WRITELOCK_FALSE, NULL);
	error = VOP_READDIR(cp->c_backvp, &uioin, cr, eofp, NULL, 0);
	VOP_RWUNLOCK(cp->c_backvp, V_WRITELOCK_FALSE, NULL);

	if (error != 0)
		goto out;

	end = buffy + buffysize - uioin.uio_resid;

	mutex_exit(&cp->c_statelock);
	mutex_enter(&fscp->fs_fslock);


	for (chrp = buffy; chrp < end; chrp += de->d_reclen) {
		de = (dirent64_t *)chrp;
		newinum = cachefs_inum_real2fake(fscp, de->d_ino);
		if (newinum == 0)
			newinum = cachefs_fileno_conflict(fscp, de->d_ino);
		de->d_ino = newinum;
	}
	mutex_exit(&fscp->fs_fslock);
	mutex_enter(&cp->c_statelock);

	error = uiomove(buffy, end - buffy, UIO_READ, uiop);
	uiop->uio_loffset = uioin.uio_loffset;

out:

	if (buffy != NULL)
		cachefs_kmem_free(buffy, buffysize);

	return (error);
}

static int
/*ARGSUSED*/
cachefs_readdir_disconnected(vnode_t *vp, uio_t *uiop, cred_t *cr,
    int *eofp)
{
	cnode_t *dcp = VTOC(vp);
	int error;

	mutex_enter(&dcp->c_statelock);
	if ((dcp->c_metadata.md_flags & MD_POPULATED) == 0) {
		error = ETIMEDOUT;
	} else {
		error = cachefs_dir_read(dcp, uiop, eofp);
		if (error == ENOTDIR)
			error = ETIMEDOUT;
	}
	mutex_exit(&dcp->c_statelock);

	return (error);
}

/*ARGSUSED*/
static int
cachefs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
{
	int error = 0;
	struct cnode *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions, then bail
	 * as  NFSv4 doesn't support VOP_FID.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);
	if (CFS_ISFS_BACKFS_NFSV4(fscp)) {
		return (ENOTSUP);
	}

	mutex_enter(&cp->c_statelock);
	if (fidp->fid_len < cp->c_metadata.md_cookie.fid_len) {
		fidp->fid_len = cp->c_metadata.md_cookie.fid_len;
		error = ENOSPC;
	} else {
		bcopy(cp->c_metadata.md_cookie.fid_data, fidp->fid_data,
		    cp->c_metadata.md_cookie.fid_len);
		fidp->fid_len = cp->c_metadata.md_cookie.fid_len;
	}
	mutex_exit(&cp->c_statelock);
	return (error);
}

/* ARGSUSED2 */
static int
cachefs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
{
	cnode_t *cp = VTOC(vp);

	/*
	 * XXX - This is ifdef'ed out for now. The problem -
	 * getdents() acquires the read version of rwlock, then we come
	 * into cachefs_readdir() and that wants to acquire the write version
	 * of this lock (if its going to populate the directory). This is
	 * a problem, this can be solved by introducing another lock in the
	 * cnode.
	 */
/* XXX */
	if (vp->v_type != VREG)
		return (-1);
	if (write_lock)
		rw_enter(&cp->c_rwlock, RW_WRITER);
	else
		rw_enter(&cp->c_rwlock, RW_READER);
	return (write_lock);
}

/* ARGSUSED */
static void
cachefs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
{
	cnode_t *cp = VTOC(vp);
	if (vp->v_type != VREG)
		return;
	rw_exit(&cp->c_rwlock);
}

/* ARGSUSED */
static int
cachefs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
    caller_context_t *ct)
{
	return (0);
}

static int cachefs_lostpage = 0;
/*
 * Return all the pages from [off..off+len] in file
 */
/*ARGSUSED*/
static int
cachefs_getpage(struct vnode *vp, offset_t off, size_t len,
	uint_t *protp, struct page *pl[], size_t plsz, struct seg *seg,
	caddr_t addr, enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	int error;
	fscache_t *fscp = C_TO_FSCACHE(cp);
	cachefscache_t *cachep = fscp->fs_cache;
	int held = 0;
	int connected = 0;

#ifdef CFSDEBUG
	u_offset_t offx = (u_offset_t)off;

	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_getpage: ENTER vp %p off %lld len %lu rw %d\n",
		    (void *)vp, offx, len, rw);
#endif
	if (getzoneid() != GLOBAL_ZONEID) {
		error = EPERM;
		goto out;
	}

	if (vp->v_flag & VNOMAP) {
		error = ENOSYS;
		goto out;
	}

	/* Call backfilesystem if NFSv4 */
	if (CFS_ISFS_BACKFS_NFSV4(fscp)) {
		error = cachefs_getpage_backfs_nfsv4(vp, off, len, protp, pl,
		    plsz, seg, addr, rw, cr);
		goto out;
	}

	/* XXX sam: make this do an async populate? */
	if (pl == NULL) {
		error = 0;
		goto out;
	}
	if (protp != NULL)
		*protp = PROT_ALL;

	for (;;) {
		/* get (or renew) access to the file system */
		if (held) {
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 0);
		if (error)
			break;
		held = 1;

		/*
		 * If we are getting called as a side effect of a
		 * cachefs_write()
		 * operation the local file size might not be extended yet.
		 * In this case we want to be able to return pages of zeroes.
		 */
		if ((u_offset_t)off + len >
		    ((cp->c_size + PAGEOFFSET) & (offset_t)PAGEMASK)) {
			if (seg != segkmap) {
				error = EFAULT;
				break;
			}
		}
		if (len <= PAGESIZE)
			error = cachefs_getapage(vp, (u_offset_t)off, len,
			    protp, pl, plsz, seg, addr, rw, cr);
		else
			error = pvn_getpages(cachefs_getapage, vp,
			    (u_offset_t)off, len, protp, pl, plsz, seg, addr,
			    rw, cr);
		if (error == 0)
			break;

		if (((cp->c_flags & CN_NOCACHE) && (error == ENOSPC)) ||
		    error == EAGAIN) {
			connected = 0;
			continue;
		}
		if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
			if (CFS_TIMEOUT(fscp, error)) {
				cachefs_cd_release(fscp);
				held = 0;
				cachefs_cd_timedout(fscp);
				connected = 0;
				continue;
			}
		} else {
			if (CFS_TIMEOUT(fscp, error)) {
				if (cachefs_cd_access_miss(fscp)) {
					if (len <= PAGESIZE)
						error = cachefs_getapage_back(
						    vp, (u_offset_t)off,
						    len, protp, pl,
						    plsz, seg, addr, rw, cr);
					else
						error = pvn_getpages(
						    cachefs_getapage_back, vp,
						    (u_offset_t)off, len,
						    protp, pl,
						    plsz, seg, addr, rw, cr);
					if (!CFS_TIMEOUT(fscp, error) &&
					    (error != EAGAIN))
						break;
					delay(5*hz);
					connected = 0;
					continue;
				}
				connected = 1;
				continue;
			}
		}
		break;
	}

	if (CACHEFS_LOG_LOGGING(cachep, CACHEFS_LOG_GETPAGE))
		cachefs_log_getpage(cachep, error, vp->v_vfsp,
		    &cp->c_metadata.md_cookie, cp->c_id.cid_fileno,
		    crgetuid(cr), off, len);

	if (held) {
		cachefs_cd_release(fscp);
	}

out:
#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_getpage: EXIT vp %p error %d\n",
		    (void *)vp, error);
#endif
	return (error);
}

/*
 * cachefs_getpage_backfs_nfsv4
 *
 * Call NFSv4 back filesystem to handle the getpage (cachefs
 * pass-through support for NFSv4).
 */
static int
cachefs_getpage_backfs_nfsv4(struct vnode *vp, offset_t off, size_t len,
			uint_t *protp, struct page *pl[], size_t plsz,
			struct seg *seg, caddr_t addr, enum seg_rw rw,
			cred_t *cr)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	vnode_t *backvp;
	int error;

	/*
	 * For NFSv4 pass-through to work, only connected operation is
	 * supported, the cnode backvp must exist, and cachefs optional
	 * (eg., disconnectable) flags are turned off. Assert these
	 * conditions for the getpage operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	/* Call backfs vnode op after extracting backvp */
	mutex_enter(&cp->c_statelock);
	backvp = cp->c_backvp;
	mutex_exit(&cp->c_statelock);

	CFS_DPRINT_BACKFS_NFSV4(fscp,
	    ("cachefs_getpage_backfs_nfsv4: cnode %p, backvp %p\n",
	    cp, backvp));
	error = VOP_GETPAGE(backvp, off, len, protp, pl, plsz, seg,
	    addr, rw, cr, NULL);

	return (error);
}

/*
 * Called from pvn_getpages or cachefs_getpage to get a particular page.
 */
/*ARGSUSED*/
static int
cachefs_getapage(struct vnode *vp, u_offset_t off, size_t len, uint_t *protp,
	struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
	enum seg_rw rw, cred_t *cr)
{
	cnode_t *cp = VTOC(vp);
	page_t **ppp, *pp = NULL;
	fscache_t *fscp = C_TO_FSCACHE(cp);
	cachefscache_t *cachep = fscp->fs_cache;
	int error = 0;
	struct page **ourpl;
	struct page *ourstackpl[17]; /* see ASSERT() below for 17 */
	int index = 0;
	int downgrade;
	int have_statelock = 0;
	u_offset_t popoff;
	size_t popsize = 0;

	/*LINTED*/
	ASSERT(((DEF_POP_SIZE / PAGESIZE) + 1) <= 17);

	if (fscp->fs_info.fi_popsize > DEF_POP_SIZE)
		ourpl = cachefs_kmem_alloc(sizeof (struct page *) *
		    ((fscp->fs_info.fi_popsize / PAGESIZE) + 1), KM_SLEEP);
	else
		ourpl = ourstackpl;

	ourpl[0] = NULL;
	off = off & (offset_t)PAGEMASK;
again:
	/*
	 * Look for the page
	 */
	if (page_exists(vp, off) == 0) {
		/*
		 * Need to do work to get the page.
		 * Grab our lock because we are going to
		 * modify the state of the cnode.
		 */
		if (! have_statelock) {
			mutex_enter(&cp->c_statelock);
			have_statelock = 1;
		}
		/*
		 * If we're in NOCACHE mode, we will need a backvp
		 */
		if (cp->c_flags & CN_NOCACHE) {
			if (fscp->fs_cdconnected != CFS_CD_CONNECTED) {
				error = ETIMEDOUT;
				goto out;
			}
			if (cp->c_backvp == NULL) {
				error = cachefs_getbackvp(fscp, cp);
				if (error)
					goto out;
			}
			error = VOP_GETPAGE(cp->c_backvp, off,
			    PAGESIZE, protp, ourpl, PAGESIZE, seg,
			    addr, S_READ, cr, NULL);
			/*
			 * backfs returns EFAULT when we are trying for a
			 * page beyond EOF but cachefs has the knowledge that
			 * it is not beyond EOF be cause cp->c_size is
			 * greater then the offset requested.
			 */
			if (error == EFAULT) {
				error = 0;
				pp = page_create_va(vp, off, PAGESIZE,
				    PG_EXCL | PG_WAIT, seg, addr);
				if (pp == NULL)
					goto again;
				pagezero(pp, 0, PAGESIZE);
				pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw);
				goto out;
			}
			if (error)
				goto out;
			goto getpages;
		}
		/*
		 * We need a front file. If we can't get it,
		 * put the cnode in NOCACHE mode and try again.
		 */
		if (cp->c_frontvp == NULL) {
			error = cachefs_getfrontfile(cp);
			if (error) {
				cachefs_nocache(cp);
				error = EAGAIN;
				goto out;
			}
		}
		/*
		 * Check if the front file needs population.
		 * If population is necessary, make sure we have a
		 * backvp as well. We will get the page from the backvp.
		 * bug 4152459-
		 * But if the file system is in disconnected mode
		 * and the file is a local file then do not check the
		 * allocmap.
		 */
		if (((fscp->fs_cdconnected == CFS_CD_CONNECTED) ||
		    ((cp->c_metadata.md_flags & MD_LOCALFILENO) == 0)) &&
		    (cachefs_check_allocmap(cp, off) == 0)) {
			if (fscp->fs_cdconnected != CFS_CD_CONNECTED) {
				error = ETIMEDOUT;
				goto out;
			}
			if (cp->c_backvp == NULL) {
				error = cachefs_getbackvp(fscp, cp);
				if (error)
					goto out;
			}
			if (cp->c_filegrp->fg_flags & CFS_FG_WRITE) {
				cachefs_cluster_allocmap(off, &popoff,
				    &popsize,
				    fscp->fs_info.fi_popsize, cp);
				if (popsize != 0) {
					error = cachefs_populate(cp,
					    popoff, popsize,
					    cp->c_frontvp, cp->c_backvp,
					    cp->c_size, cr);
					if (error) {
						cachefs_nocache(cp);
						error = EAGAIN;
						goto out;
					} else {
						cp->c_flags |=
						    CN_UPDATED |
						    CN_NEED_FRONT_SYNC |
						    CN_POPULATION_PENDING;
					}
					popsize = popsize - (off - popoff);
				} else {
					popsize = PAGESIZE;
				}
			}
			/* else XXX assert CN_NOCACHE? */
			error = VOP_GETPAGE(cp->c_backvp, (offset_t)off,
			    PAGESIZE, protp, ourpl, popsize,
			    seg, addr, S_READ, cr, NULL);
			if (error)
				goto out;
			fscp->fs_stats.st_misses++;
		} else {
			if (cp->c_flags & CN_POPULATION_PENDING) {
				error = VOP_FSYNC(cp->c_frontvp, FSYNC, cr,
				    NULL);
				cp->c_flags &= ~CN_POPULATION_PENDING;
				if (error) {
					cachefs_nocache(cp);
					error = EAGAIN;
					goto out;
				}
			}
			/*
			 * File was populated so we get the page from the
			 * frontvp
			 */
			error = VOP_GETPAGE(cp->c_frontvp, (offset_t)off,
			    PAGESIZE, protp, ourpl, PAGESIZE, seg, addr,
			    rw, cr, NULL);
			if (CACHEFS_LOG_LOGGING(cachep, CACHEFS_LOG_GPFRONT))
				cachefs_log_gpfront(cachep, error,
				    fscp->fs_cfsvfsp,
				    &cp->c_metadata.md_cookie, cp->c_fileno,
				    crgetuid(cr), off, PAGESIZE);
			if (error) {
				cachefs_nocache(cp);
				error = EAGAIN;
				goto out;
			}
			fscp->fs_stats.st_hits++;
		}
getpages:
		ASSERT(have_statelock);
		if (have_statelock) {
			mutex_exit(&cp->c_statelock);
			have_statelock = 0;
		}
		downgrade = 0;
		for (ppp = ourpl; *ppp; ppp++) {
			if ((*ppp)->p_offset < off) {
				index++;
				page_unlock(*ppp);
				continue;
			}
			if (PAGE_SHARED(*ppp)) {
				if (page_tryupgrade(*ppp) == 0) {
					for (ppp = &ourpl[index]; *ppp; ppp++)
						page_unlock(*ppp);
					error = EAGAIN;
					goto out;
				}
				downgrade = 1;
			}
			ASSERT(PAGE_EXCL(*ppp));
			(void) hat_pageunload((*ppp), HAT_FORCE_PGUNLOAD);
			page_rename(*ppp, vp, (*ppp)->p_offset);
		}
		pl[0] = ourpl[index];
		pl[1] = NULL;
		if (downgrade) {
			page_downgrade(ourpl[index]);
		}
		/* Unlock the rest of the pages from the cluster */
		for (ppp = &ourpl[index+1]; *ppp; ppp++)
			page_unlock(*ppp);
	} else {
		ASSERT(! have_statelock);
		if (have_statelock) {
			mutex_exit(&cp->c_statelock);
			have_statelock = 0;
		}
		/* XXX SE_SHARED probably isn't what we *always* want */
		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
			cachefs_lostpage++;
			goto again;
		}
		pl[0] = pp;
		pl[1] = NULL;
		/* XXX increment st_hits?  i don't think so, but... */
	}

out:
	if (have_statelock) {
		mutex_exit(&cp->c_statelock);
		have_statelock = 0;
	}
	if (fscp->fs_info.fi_popsize > DEF_POP_SIZE)
		cachefs_kmem_free(ourpl, sizeof (struct page *) *
		    ((fscp->fs_info.fi_popsize / PAGESIZE) + 1));
	return (error);
}

/* gets a page but only from the back fs */
/*ARGSUSED*/
static int
cachefs_getapage_back(struct vnode *vp, u_offset_t off, size_t len,
    uint_t *protp, struct page *pl[], size_t plsz, struct seg *seg,
    caddr_t addr, enum seg_rw rw, cred_t *cr)
{
	cnode_t *cp = VTOC(vp);
	page_t **ppp, *pp = NULL;
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int error = 0;
	struct page *ourpl[17];
	int index = 0;
	int have_statelock = 0;
	int downgrade;

	/*
	 * Grab the cnode statelock so the cnode state won't change
	 * while we're in here.
	 */
	ourpl[0] = NULL;
	off = off & (offset_t)PAGEMASK;
again:
	if (page_exists(vp, off) == 0) {
		if (! have_statelock) {
			mutex_enter(&cp->c_statelock);
			have_statelock = 1;
		}

		if (cp->c_backvp == NULL) {
			error = cachefs_getbackvp(fscp, cp);
			if (error)
				goto out;
		}
		error = VOP_GETPAGE(cp->c_backvp, (offset_t)off,
		    PAGESIZE, protp, ourpl, PAGESIZE, seg,
		    addr, S_READ, cr, NULL);
		if (error)
			goto out;

		if (have_statelock) {
			mutex_exit(&cp->c_statelock);
			have_statelock = 0;
		}
		downgrade = 0;
		for (ppp = ourpl; *ppp; ppp++) {
			if ((*ppp)->p_offset < off) {
				index++;
				page_unlock(*ppp);
				continue;
			}
			if (PAGE_SHARED(*ppp)) {
				if (page_tryupgrade(*ppp) == 0) {
					for (ppp = &ourpl[index]; *ppp; ppp++)
						page_unlock(*ppp);
					error = EAGAIN;
					goto out;
				}
				downgrade = 1;
			}
			ASSERT(PAGE_EXCL(*ppp));
			(void) hat_pageunload((*ppp), HAT_FORCE_PGUNLOAD);
			page_rename(*ppp, vp, (*ppp)->p_offset);
		}
		pl[0] = ourpl[index];
		pl[1] = NULL;
		if (downgrade) {
			page_downgrade(ourpl[index]);
		}
		/* Unlock the rest of the pages from the cluster */
		for (ppp = &ourpl[index+1]; *ppp; ppp++)
			page_unlock(*ppp);
	} else {
		ASSERT(! have_statelock);
		if (have_statelock) {
			mutex_exit(&cp->c_statelock);
			have_statelock = 0;
		}
		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
			cachefs_lostpage++;
			goto again;
		}
		pl[0] = pp;
		pl[1] = NULL;
	}

out:
	if (have_statelock) {
		mutex_exit(&cp->c_statelock);
		have_statelock = 0;
	}
	return (error);
}

/*ARGSUSED*/
static int
cachefs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
    caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	int error = 0;
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int held = 0;
	int connected = 0;

	if (getzoneid() != GLOBAL_ZONEID)
		return (EPERM);

	/* Call backfilesytem if NFSv4 */
	if (CFS_ISFS_BACKFS_NFSV4(fscp)) {
		error = cachefs_putpage_backfs_nfsv4(vp, off, len, flags, cr);
		goto out;
	}

	for (;;) {
		/* get (or renew) access to the file system */
		if (held) {
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 1);
		if (error)
			break;
		held = 1;

		error = cachefs_putpage_common(vp, off, len, flags, cr);
		if (error == 0)
			break;

		if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
			if (CFS_TIMEOUT(fscp, error)) {
				cachefs_cd_release(fscp);
				held = 0;
				cachefs_cd_timedout(fscp);
				connected = 0;
				continue;
			}
		} else {
			if (NOMEMWAIT()) {
				error = 0;
				goto out;
			}
			if (CFS_TIMEOUT(fscp, error)) {
				connected = 1;
				continue;
			}
		}
		break;
	}

out:

	if (held) {
		cachefs_cd_release(fscp);
	}

#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
	return (error);
}

/*
 * cachefs_putpage_backfs_nfsv4
 *
 * Call NFSv4 back filesystem to handle the putpage (cachefs
 * pass-through support for NFSv4).
 */
static int
cachefs_putpage_backfs_nfsv4(vnode_t *vp, offset_t off, size_t len, int flags,
			cred_t *cr)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	vnode_t *backvp;
	int error;

	/*
	 * For NFSv4 pass-through to work, only connected operation is
	 * supported, the cnode backvp must exist, and cachefs optional
	 * (eg., disconnectable) flags are turned off. Assert these
	 * conditions for the putpage operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	/* Call backfs vnode op after extracting backvp */
	mutex_enter(&cp->c_statelock);
	backvp = cp->c_backvp;
	mutex_exit(&cp->c_statelock);

	CFS_DPRINT_BACKFS_NFSV4(fscp,
	    ("cachefs_putpage_backfs_nfsv4: cnode %p, backvp %p\n",
	    cp, backvp));
	error = VOP_PUTPAGE(backvp, off, len, flags, cr, NULL);

	return (error);
}

/*
 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
 * If len == 0, do from off to EOF.
 *
 * The normal cases should be len == 0 & off == 0 (entire vp list),
 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
 * (from pageout).
 */

/*ARGSUSED*/
int
cachefs_putpage_common(struct vnode *vp, offset_t off, size_t len,
    int flags, cred_t *cr)
{
	struct cnode *cp  = VTOC(vp);
	struct page *pp;
	size_t io_len;
	u_offset_t eoff, io_off;
	int error = 0;
	fscache_t *fscp = C_TO_FSCACHE(cp);
	cachefscache_t *cachep = fscp->fs_cache;

	if (len == 0 && (flags & B_INVAL) == 0 && vn_is_readonly(vp)) {
		return (0);
	}
	if (!vn_has_cached_data(vp) || (off >= cp->c_size &&
	    (flags & B_INVAL) == 0))
		return (0);

	/*
	 * Should never have cached data for the cachefs vnode
	 * if NFSv4 is in use.
	 */
	ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);

	/*
	 * If this is an async putpage let a thread handle it.
	 */
	if (flags & B_ASYNC) {
		struct cachefs_req *rp;
		int tflags = (flags & ~(B_ASYNC|B_DONTNEED));

		if (ttoproc(curthread) == proc_pageout) {
			/*
			 * If this is the page daemon we
			 * do the push synchronously (Dangerous!) and hope
			 * we can free enough to keep running...
			 */
			flags &= ~B_ASYNC;
			goto again;
		}

		if (! cachefs_async_okay()) {

			/*
			 * this is somewhat like NFS's behavior.  keep
			 * the system from thrashing.  we've seen
			 * cases where async queues get out of
			 * control, especially if
			 * madvise(MADV_SEQUENTIAL) is done on a large
			 * mmap()ed file that is read sequentially.
			 */

			flags &= ~B_ASYNC;
			goto again;
		}

		/*
		 * if no flags other than B_ASYNC were set,
		 * we coalesce putpage requests into a single one for the
		 * whole file (len = off = 0).  If such a request is
		 * already queued, we're done.
		 *
		 * If there are other flags set (e.g., B_INVAL), we don't
		 * attempt to coalesce and we use the specified length and
		 * offset.
		 */
		rp = kmem_cache_alloc(cachefs_req_cache, KM_SLEEP);
		mutex_enter(&cp->c_iomutex);
		if ((cp->c_ioflags & CIO_PUTPAGES) == 0 || tflags != 0) {
			rp->cfs_cmd = CFS_PUTPAGE;
			rp->cfs_req_u.cu_putpage.cp_vp = vp;
			if (tflags == 0) {
				off = len = 0;
				cp->c_ioflags |= CIO_PUTPAGES;
			}
			rp->cfs_req_u.cu_putpage.cp_off = off;
			rp->cfs_req_u.cu_putpage.cp_len = (uint_t)len;
			rp->cfs_req_u.cu_putpage.cp_flags = flags & ~B_ASYNC;
			rp->cfs_cr = cr;
			crhold(rp->cfs_cr);
			VN_HOLD(vp);
			cp->c_nio++;
			cachefs_addqueue(rp, &(C_TO_FSCACHE(cp)->fs_workq));
		} else {
			kmem_cache_free(cachefs_req_cache, rp);
		}

		mutex_exit(&cp->c_iomutex);
		return (0);
	}


again:
	if (len == 0) {
		/*
		 * Search the entire vp list for pages >= off
		 */
		error = pvn_vplist_dirty(vp, off, cachefs_push, flags, cr);
	} else {
		/*
		 * Do a range from [off...off + len] looking for pages
		 * to deal with.
		 */
		eoff = (u_offset_t)off + len;
		for (io_off = off; io_off < eoff && io_off < cp->c_size;
		    io_off += io_len) {
			/*
			 * If we are not invalidating, synchronously
			 * freeing or writing pages use the routine
			 * page_lookup_nowait() to prevent reclaiming
			 * them from the free list.
			 */
			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
				pp = page_lookup(vp, io_off,
				    (flags & (B_INVAL | B_FREE)) ?
				    SE_EXCL : SE_SHARED);
			} else {
				/* XXX this looks like dead code */
				pp = page_lookup_nowait(vp, io_off,
				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
			}

			if (pp == NULL || pvn_getdirty(pp, flags) == 0)
				io_len = PAGESIZE;
			else {
				error = cachefs_push(vp, pp, &io_off,
				    &io_len, flags, cr);
				if (error != 0)
					break;
				/*
				 * "io_off" and "io_len" are returned as
				 * the range of pages we actually wrote.
				 * This allows us to skip ahead more quickly
				 * since several pages may've been dealt
				 * with by this iteration of the loop.
				 */
			}
		}
	}

	if (error == 0 && off == 0 && (len == 0 || len >= cp->c_size)) {
		cp->c_flags &= ~CDIRTY;
	}

	if (CACHEFS_LOG_LOGGING(cachep, CACHEFS_LOG_PUTPAGE))
		cachefs_log_putpage(cachep, error, fscp->fs_cfsvfsp,
		    &cp->c_metadata.md_cookie, cp->c_id.cid_fileno,
		    crgetuid(cr), off, len);

	return (error);

}

/*ARGSUSED*/
static int
cachefs_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	struct segvn_crargs vn_a;
	int error;
	int held = 0;
	int writing;
	int connected = 0;

#ifdef CFSDEBUG
	u_offset_t offx = (u_offset_t)off;

	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_map: ENTER vp %p off %lld len %lu flags %d\n",
		    (void *)vp, offx, len, flags);
#endif
	if (getzoneid() != GLOBAL_ZONEID) {
		error = EPERM;
		goto out;
	}

	if (vp->v_flag & VNOMAP) {
		error = ENOSYS;
		goto out;
	}
	if (off < 0 || (offset_t)(off + len) < 0) {
		error = ENXIO;
		goto out;
	}
	if (vp->v_type != VREG) {
		error = ENODEV;
		goto out;
	}

	/*
	 * Check to see if the vnode is currently marked as not cachable.
	 * If so, we have to refuse the map request as this violates the
	 * don't cache attribute.
	 */
	if (vp->v_flag & VNOCACHE)
		return (EAGAIN);

#ifdef OBSOLETE
	/*
	 * If file is being locked, disallow mapping.
	 */
	if (vn_has_flocks(vp)) {
		error = EAGAIN;
		goto out;
	}
#endif

	/* call backfilesystem if NFSv4 */
	if (CFS_ISFS_BACKFS_NFSV4(fscp)) {
		error = cachefs_map_backfs_nfsv4(vp, off, as, addrp, len, prot,
		    maxprot, flags, cr);
		goto out;
	}

	writing = (prot & PROT_WRITE && ((flags & MAP_PRIVATE) == 0));

	for (;;) {
		/* get (or renew) access to the file system */
		if (held) {
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, writing);
		if (error)
			break;
		held = 1;

		if (writing) {
			mutex_enter(&cp->c_statelock);
			if (CFS_ISFS_WRITE_AROUND(fscp)) {
				if (fscp->fs_cdconnected != CFS_CD_CONNECTED) {
					connected = 1;
					continue;
				} else {
					cachefs_nocache(cp);
				}
			}

			/*
			 * CN_MAPWRITE is for an optimization in cachefs_delmap.
			 * If CN_MAPWRITE is not set then cachefs_delmap does
			 * not need to try to push out any pages.
			 * This bit gets cleared when the cnode goes inactive.
			 */
			cp->c_flags |= CN_MAPWRITE;

			mutex_exit(&cp->c_statelock);
		}
		break;
	}

	if (held) {
		cachefs_cd_release(fscp);
	}

	as_rangelock(as);
	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
	if (error != 0) {
		as_rangeunlock(as);
		goto out;
	}

	/*
	 * package up all the data passed in into a segvn_args struct and
	 * call as_map with segvn_create function to create a new segment
	 * in the address space.
	 */
	vn_a.vp = vp;
	vn_a.offset = off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = (uchar_t)prot;
	vn_a.maxprot = (uchar_t)maxprot;
	vn_a.cred = cr;
	vn_a.amp = NULL;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;
	error = as_map(as, *addrp, len, segvn_create, &vn_a);
	as_rangeunlock(as);
out:

#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_map: EXIT vp %p error %d\n", (void *)vp, error);
#endif
	return (error);
}

/*
 * cachefs_map_backfs_nfsv4
 *
 * Call NFSv4 back filesystem to handle the map (cachefs
 * pass-through support for NFSv4).
 */
static int
cachefs_map_backfs_nfsv4(struct vnode *vp, offset_t off, struct as *as,
			caddr_t *addrp, size_t len, uchar_t prot,
			uchar_t maxprot, uint_t flags, cred_t *cr)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	vnode_t *backvp;
	int error;

	/*
	 * For NFSv4 pass-through to work, only connected operation is
	 * supported, the cnode backvp must exist, and cachefs optional
	 * (eg., disconnectable) flags are turned off. Assert these
	 * conditions for the map operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	/* Call backfs vnode op after extracting backvp */
	mutex_enter(&cp->c_statelock);
	backvp = cp->c_backvp;
	mutex_exit(&cp->c_statelock);

	CFS_DPRINT_BACKFS_NFSV4(fscp,
	    ("cachefs_map_backfs_nfsv4: cnode %p, backvp %p\n",
	    cp, backvp));
	error = VOP_MAP(backvp, off, as, addrp, len, prot, maxprot, flags, cr,
	    NULL);

	return (error);
}

/*ARGSUSED*/
static int
cachefs_addmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    cred_t *cr, caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);

	if (getzoneid() != GLOBAL_ZONEID)
		return (EPERM);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	/*
	 * Check this is not an NFSv4 filesystem, as the mapping
	 * is not done on the cachefs filesystem if NFSv4 is in
	 * use.
	 */
	ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);

	mutex_enter(&cp->c_statelock);
	cp->c_mapcnt += btopr(len);
	mutex_exit(&cp->c_statelock);
	return (0);
}

/*ARGSUSED*/
static int
cachefs_delmap(struct vnode *vp, offset_t off, struct as *as,
	caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
	cred_t *cr, caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int error;
	int connected = 0;
	int held = 0;

	/*
	 * The file may be passed in to (or inherited into) the zone, so we
	 * need to let this operation go through since it happens as part of
	 * exiting.
	 */
	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	/*
	 * Check this is not an NFSv4 filesystem, as the mapping
	 * is not done on the cachefs filesystem if NFSv4 is in
	 * use.
	 */
	ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);

	mutex_enter(&cp->c_statelock);
	cp->c_mapcnt -= btopr(len);
	ASSERT(cp->c_mapcnt >= 0);
	mutex_exit(&cp->c_statelock);

	if (cp->c_mapcnt || !vn_has_cached_data(vp) ||
	    ((cp->c_flags & CN_MAPWRITE) == 0))
		return (0);

	for (;;) {
		/* get (or renew) access to the file system */
		if (held) {
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 1);
		if (error)
			break;
		held = 1;
		connected = 0;

		error = cachefs_putpage_common(vp, (offset_t)0,
		    (uint_t)0, 0, cr);
		if (CFS_TIMEOUT(fscp, error)) {
			if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
				cachefs_cd_release(fscp);
				held = 0;
				cachefs_cd_timedout(fscp);
				continue;
			} else {
				connected = 1;
				continue;
			}
		}

		/* if no space left in cache, wait until connected */
		if ((error == ENOSPC) &&
		    (fscp->fs_cdconnected != CFS_CD_CONNECTED)) {
			connected = 1;
			continue;
		}

		mutex_enter(&cp->c_statelock);
		if (!error)
			error = cp->c_error;
		cp->c_error = 0;
		mutex_exit(&cp->c_statelock);
		break;
	}

	if (held)
		cachefs_cd_release(fscp);

#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
	return (error);
}

/* ARGSUSED */
static int
cachefs_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
	offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
	caller_context_t *ct)
{
	struct cnode *cp = VTOC(vp);
	int error;
	struct fscache *fscp = C_TO_FSCACHE(cp);
	vnode_t *backvp;
	int held = 0;
	int connected = 0;

	if (getzoneid() != GLOBAL_ZONEID)
		return (EPERM);

	if ((cmd != F_GETLK) && (cmd != F_SETLK) && (cmd != F_SETLKW))
		return (EINVAL);

	/* Disallow locking of files that are currently mapped */
	if (((cmd == F_SETLK) || (cmd == F_SETLKW)) && (cp->c_mapcnt > 0)) {
		ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
		return (EAGAIN);
	}

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the frlock operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	/* XXX bob: nfs does a bunch more checks than we do */
	if (CFS_ISFS_LLOCK(fscp)) {
		ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
	}

	for (;;) {
		/* get (or renew) access to the file system */
		if (held) {
			/* Won't loop with NFSv4 connected behavior */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 0);
		if (error)
			break;
		held = 1;

		/* if not connected, quit or wait */
		if (fscp->fs_cdconnected != CFS_CD_CONNECTED) {
			connected = 1;
			continue;
		}

		/* nocache the file */
		if ((cp->c_flags & CN_NOCACHE) == 0 &&
		    !CFS_ISFS_BACKFS_NFSV4(fscp)) {
			mutex_enter(&cp->c_statelock);
			cachefs_nocache(cp);
			mutex_exit(&cp->c_statelock);
		}

		/*
		 * XXX bob: probably should do a consistency check
		 * Pass arguments unchanged if NFSv4 is the backfs.
		 */
		if (bfp->l_whence == 2 && CFS_ISFS_BACKFS_NFSV4(fscp) == 0) {
			bfp->l_start += cp->c_size;
			bfp->l_whence = 0;
		}

		/* get the back vp */
		mutex_enter(&cp->c_statelock);
		if (cp->c_backvp == NULL) {
			error = cachefs_getbackvp(fscp, cp);
			if (error) {
				mutex_exit(&cp->c_statelock);
				break;
			}
		}
		backvp = cp->c_backvp;
		VN_HOLD(backvp);
		mutex_exit(&cp->c_statelock);

		/*
		 * make sure we can flush currently dirty pages before
		 * allowing the lock
		 */
		if (bfp->l_type != F_UNLCK && cmd != F_GETLK &&
		    !CFS_ISFS_BACKFS_NFSV4(fscp)) {
			error = cachefs_putpage(
			    vp, (offset_t)0, 0, B_INVAL, cr, ct);
			if (error) {
				error = ENOLCK;
				VN_RELE(backvp);
				break;
			}
		}

		/* do lock on the back file */
		CFS_DPRINT_BACKFS_NFSV4(fscp,
		    ("cachefs_frlock (nfsv4): cp %p, backvp %p\n",
		    cp, backvp));
		error = VOP_FRLOCK(backvp, cmd, bfp, flag, offset, NULL, cr,
		    ct);
		VN_RELE(backvp);
		if (CFS_TIMEOUT(fscp, error)) {
			connected = 1;
			continue;
		}
		break;
	}

	if (held) {
		cachefs_cd_release(fscp);
	}

	/*
	 * If we are setting a lock mark the vnode VNOCACHE so the page
	 * cache does not give inconsistent results on locked files shared
	 * between clients.  The VNOCACHE flag is never turned off as long
	 * as the vnode is active because it is hard to figure out when the
	 * last lock is gone.
	 * XXX - what if some already has the vnode mapped in?
	 * XXX bob: see nfs3_frlock, do not allow locking if vnode mapped in.
	 */
	if ((error == 0) && (bfp->l_type != F_UNLCK) && (cmd != F_GETLK) &&
	    !CFS_ISFS_BACKFS_NFSV4(fscp))
		vp->v_flag |= VNOCACHE;

#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
	return (error);
}

/*
 * Free storage space associated with the specified vnode.  The portion
 * to be freed is specified by bfp->l_start and bfp->l_len (already
 * normalized to a "whence" of 0).
 *
 * This is an experimental facility whose continued existence is not
 * guaranteed.  Currently, we only support the special case
 * of l_len == 0, meaning free to end of file.
 */
/* ARGSUSED */
static int
cachefs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
	offset_t offset, cred_t *cr, caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int error;

	ASSERT(vp->v_type == VREG);
	if (getzoneid() != GLOBAL_ZONEID)
		return (EPERM);
	if (cmd != F_FREESP)
		return (EINVAL);

	/* call backfilesystem if NFSv4 */
	if (CFS_ISFS_BACKFS_NFSV4(fscp)) {
		error = cachefs_space_backfs_nfsv4(vp, cmd, bfp, flag,
		    offset, cr, ct);
		goto out;
	}

	if ((error = convoff(vp, bfp, 0, offset)) == 0) {
		ASSERT(bfp->l_start >= 0);
		if (bfp->l_len == 0) {
			struct vattr va;

			va.va_size = bfp->l_start;
			va.va_mask = AT_SIZE;
			error = cachefs_setattr(vp, &va, 0, cr, ct);
		} else
			error = EINVAL;
	}

out:
	return (error);
}

/*
 * cachefs_space_backfs_nfsv4
 *
 * Call NFSv4 back filesystem to handle the space (cachefs
 * pass-through support for NFSv4).
 */
static int
cachefs_space_backfs_nfsv4(struct vnode *vp, int cmd, struct flock64 *bfp,
		int flag, offset_t offset, cred_t *cr, caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	vnode_t *backvp;
	int error;

	/*
	 * For NFSv4 pass-through to work, only connected operation is
	 * supported, the cnode backvp must exist, and cachefs optional
	 * (eg., disconnectable) flags are turned off. Assert these
	 * conditions for the space operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	/* Call backfs vnode op after extracting backvp */
	mutex_enter(&cp->c_statelock);
	backvp = cp->c_backvp;
	mutex_exit(&cp->c_statelock);

	CFS_DPRINT_BACKFS_NFSV4(fscp,
	    ("cachefs_space_backfs_nfsv4: cnode %p, backvp %p\n",
	    cp, backvp));
	error = VOP_SPACE(backvp, cmd, bfp, flag, offset, cr, ct);

	return (error);
}

/*ARGSUSED*/
static int
cachefs_realvp(struct vnode *vp, struct vnode **vpp, caller_context_t *ct)
{
	return (EINVAL);
}

/*ARGSUSED*/
static int
cachefs_pageio(struct vnode *vp, page_t *pp, u_offset_t io_off, size_t io_len,
	int flags, cred_t *cr, caller_context_t *ct)
{
	return (ENOSYS);
}

static int
cachefs_setsecattr_connected(cnode_t *cp,
    vsecattr_t *vsec, int flag, cred_t *cr)
{
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int error = 0;

	ASSERT(RW_WRITE_HELD(&cp->c_rwlock));
	ASSERT((fscp->fs_info.fi_mntflags & CFS_NOACL) == 0);

	mutex_enter(&cp->c_statelock);

	if (cp->c_backvp == NULL) {
		error = cachefs_getbackvp(fscp, cp);
		if (error) {
			cachefs_nocache(cp);
			goto out;
		}
	}

	error = CFSOP_CHECK_COBJECT(fscp, cp, 0, cr);
	if (error)
		goto out;

	/* only owner can set acl */
	if (cp->c_metadata.md_vattr.va_uid != crgetuid(cr)) {
		error = EINVAL;
		goto out;
	}


	CFS_DPRINT_BACKFS_NFSV4(fscp,
	    ("cachefs_setsecattr (nfsv4): cp %p, backvp %p",
	    cp, cp->c_backvp));
	error = VOP_SETSECATTR(cp->c_backvp, vsec, flag, cr, NULL);
	if (error) {
		goto out;
	}

	if ((cp->c_filegrp->fg_flags & CFS_FG_WRITE) == 0 &&
	    !CFS_ISFS_BACKFS_NFSV4(fscp)) {
		cachefs_nocache(cp);
		goto out;
	}

	CFSOP_MODIFY_COBJECT(fscp, cp, cr);

	/* acl may have changed permissions -- handle this. */
	if (!CFS_ISFS_BACKFS_NFSV4(fscp))
		cachefs_acl2perm(cp, vsec);

	if ((cp->c_flags & CN_NOCACHE) == 0 &&
	    !CFS_ISFS_BACKFS_NFSV4(fscp)) {
		error = cachefs_cacheacl(cp, vsec);
		if (error != 0) {
#ifdef CFSDEBUG
			CFS_DEBUG(CFSDEBUG_VOPS)
				printf("cachefs_setacl: cacheacl: error %d\n",
				    error);
#endif /* CFSDEBUG */
			error = 0;
			cachefs_nocache(cp);
		}
	}

out:
	mutex_exit(&cp->c_statelock);

	return (error);
}

static int
cachefs_setsecattr_disconnected(cnode_t *cp,
    vsecattr_t *vsec, int flag, cred_t *cr)
{
	fscache_t *fscp = C_TO_FSCACHE(cp);
	mode_t failmode = cp->c_metadata.md_vattr.va_mode;
	off_t commit = 0;
	int error = 0;

	ASSERT((fscp->fs_info.fi_mntflags & CFS_NOACL) == 0);

	if (CFS_ISFS_WRITE_AROUND(fscp))
		return (ETIMEDOUT);

	mutex_enter(&cp->c_statelock);

	/* only owner can set acl */
	if (cp->c_metadata.md_vattr.va_uid != crgetuid(cr)) {
		error = EINVAL;
		goto out;
	}

	if (cp->c_metadata.md_flags & MD_NEEDATTRS) {
		error = ETIMEDOUT;
		goto out;
	}

	/* XXX do i need this?  is this right? */
	if (cp->c_flags & CN_ALLOC_PENDING) {
		if (cp->c_filegrp->fg_flags & CFS_FG_ALLOC_ATTR) {
			(void) filegrp_allocattr(cp->c_filegrp);
		}
		error = filegrp_create_metadata(cp->c_filegrp,
		    &cp->c_metadata, &cp->c_id);
		if (error) {
			goto out;
		}
		cp->c_flags &= ~CN_ALLOC_PENDING;
	}

	/* XXX is this right? */
	if ((cp->c_metadata.md_flags & MD_MAPPING) == 0) {
		error = cachefs_dlog_cidmap(fscp);
		if (error) {
			error = ENOSPC;
			goto out;
		}
		cp->c_metadata.md_flags |= MD_MAPPING;
		cp->c_flags |= CN_UPDATED;
	}

	commit = cachefs_dlog_setsecattr(fscp, vsec, flag, cp, cr);
	if (commit == 0)
		goto out;

	/* fix modes in metadata */
	cachefs_acl2perm(cp, vsec);

	if ((cp->c_flags & CN_NOCACHE) == 0) {
		error = cachefs_cacheacl(cp, vsec);
		if (error != 0) {
			goto out;
		}
	}

	/* XXX is this right? */
	if (cachefs_modified_alloc(cp)) {
		error = ENOSPC;
		goto out;
	}

out:
	if (error != 0)
		cp->c_metadata.md_vattr.va_mode = failmode;

	mutex_exit(&cp->c_statelock);

	if (commit) {
		if (cachefs_dlog_commit(fscp, commit, error)) {
			/*EMPTY*/
			/* XXX fix on panic? */
		}
	}

	return (error);
}

/*ARGSUSED*/
static int
cachefs_setsecattr(vnode_t *vp, vsecattr_t *vsec, int flag, cred_t *cr,
    caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int connected = 0;
	int held = 0;
	int error = 0;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_setsecattr: ENTER vp %p\n", (void *)vp);
#endif
	if (getzoneid() != GLOBAL_ZONEID) {
		error = EPERM;
		goto out;
	}

	if (fscp->fs_info.fi_mntflags & CFS_NOACL) {
		error = ENOSYS;
		goto out;
	}

	if (! cachefs_vtype_aclok(vp)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the setsecattr operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	for (;;) {
		/* drop hold on file system */
		if (held) {
			/* Won't loop with NFSv4 connected operation */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			cachefs_cd_release(fscp);
			held = 0;
		}

		/* acquire access to the file system */
		error = cachefs_cd_access(fscp, connected, 1);
		if (error)
			break;
		held = 1;

		/* perform the setattr */
		if (fscp->fs_cdconnected == CFS_CD_CONNECTED)
			error = cachefs_setsecattr_connected(cp,
			    vsec, flag, cr);
		else
			error = cachefs_setsecattr_disconnected(cp,
			    vsec, flag, cr);
		if (error) {
			/* if connected */
			if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
				if (CFS_TIMEOUT(fscp, error)) {
					cachefs_cd_release(fscp);
					held = 0;
					cachefs_cd_timedout(fscp);
					connected = 0;
					continue;
				}
			}

			/* else must be disconnected */
			else {
				if (CFS_TIMEOUT(fscp, error)) {
					connected = 1;
					continue;
				}
			}
		}
		break;
	}

	if (held) {
		cachefs_cd_release(fscp);
	}
	return (error);

out:
#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_setsecattr: EXIT error = %d\n", error);
#endif
	return (error);
}

/*
 * call this BEFORE calling cachefs_cacheacl(), as the latter will
 * sanitize the acl.
 */

static void
cachefs_acl2perm(cnode_t *cp, vsecattr_t *vsec)
{
	aclent_t *aclp;
	int i;

	for (i = 0; i < vsec->vsa_aclcnt; i++) {
		aclp = ((aclent_t *)vsec->vsa_aclentp) + i;
		switch (aclp->a_type) {
		case USER_OBJ:
			cp->c_metadata.md_vattr.va_mode &= (~0700);
			cp->c_metadata.md_vattr.va_mode |= (aclp->a_perm << 6);
			break;

		case GROUP_OBJ:
			cp->c_metadata.md_vattr.va_mode &= (~070);
			cp->c_metadata.md_vattr.va_mode |= (aclp->a_perm << 3);
			break;

		case OTHER_OBJ:
			cp->c_metadata.md_vattr.va_mode &= (~07);
			cp->c_metadata.md_vattr.va_mode |= (aclp->a_perm);
			break;

		case CLASS_OBJ:
			cp->c_metadata.md_aclclass = aclp->a_perm;
			break;
		}
	}

	cp->c_flags |= CN_UPDATED;
}

static int
cachefs_getsecattr(vnode_t *vp, vsecattr_t *vsec, int flag, cred_t *cr,
    caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int held = 0, connected = 0;
	int error = 0;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_getsecattr: ENTER vp %p\n", (void *)vp);
#endif

	if (getzoneid() != GLOBAL_ZONEID) {
		error = EPERM;
		goto out;
	}

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the getsecattr operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	if (fscp->fs_info.fi_mntflags & CFS_NOACL) {
		error = fs_fab_acl(vp, vsec, flag, cr, ct);
		goto out;
	}

	for (;;) {
		if (held) {
			/* Won't loop with NFSv4 connected behavior */
			ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
			cachefs_cd_release(fscp);
			held = 0;
		}
		error = cachefs_cd_access(fscp, connected, 0);
		if (error)
			break;
		held = 1;

		if (fscp->fs_cdconnected == CFS_CD_CONNECTED) {
			error = cachefs_getsecattr_connected(vp, vsec, flag,
			    cr);
			if (CFS_TIMEOUT(fscp, error)) {
				cachefs_cd_release(fscp);
				held = 0;
				cachefs_cd_timedout(fscp);
				connected = 0;
				continue;
			}
		} else {
			error = cachefs_getsecattr_disconnected(vp, vsec, flag,
			    cr);
			if (CFS_TIMEOUT(fscp, error)) {
				if (cachefs_cd_access_miss(fscp)) {
					error = cachefs_getsecattr_connected(vp,
					    vsec, flag, cr);
					if (!CFS_TIMEOUT(fscp, error))
						break;
					delay(5*hz);
					connected = 0;
					continue;
				}
				connected = 1;
				continue;
			}
		}
		break;
	}

out:
	if (held)
		cachefs_cd_release(fscp);

#ifdef CFS_CD_DEBUG
	ASSERT((curthread->t_flag & T_CD_HELD) == 0);
#endif
#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_getsecattr: EXIT error = %d\n", error);
#endif
	return (error);
}

static int
cachefs_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
    caller_context_t *ct)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int error = 0;
	vnode_t *backvp;

#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_shrlock: ENTER vp %p\n", (void *)vp);
#endif

	if (getzoneid() != GLOBAL_ZONEID) {
		error = EPERM;
		goto out;
	}

	/*
	 * Cachefs only provides pass-through support for NFSv4,
	 * and all vnode operations are passed through to the
	 * back file system. For NFSv4 pass-through to work, only
	 * connected operation is supported, the cnode backvp must
	 * exist, and cachefs optional (eg., disconnectable) flags
	 * are turned off. Assert these conditions to ensure that
	 * the backfilesystem is called for the shrlock operation.
	 */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(cp);

	mutex_enter(&cp->c_statelock);
	if (cp->c_backvp == NULL)
		error = cachefs_getbackvp(fscp, cp);
	backvp = cp->c_backvp;
	mutex_exit(&cp->c_statelock);
	ASSERT((error != 0) || (backvp != NULL));

	if (error == 0) {
		CFS_DPRINT_BACKFS_NFSV4(fscp,
		    ("cachefs_shrlock (nfsv4): cp %p, backvp %p",
		    cp, backvp));
		error = VOP_SHRLOCK(backvp, cmd, shr, flag, cr, ct);
	}

out:
#ifdef CFSDEBUG
	CFS_DEBUG(CFSDEBUG_VOPS)
		printf("cachefs_shrlock: EXIT error = %d\n", error);
#endif
	return (error);
}

static int
cachefs_getsecattr_connected(vnode_t *vp, vsecattr_t *vsec, int flag,
    cred_t *cr)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int hit = 0;
	int error = 0;


	mutex_enter(&cp->c_statelock);
	error = CFSOP_CHECK_COBJECT(fscp, cp, 0, cr);
	if (error)
		goto out;

	/* read from the cache if we can */
	if ((cp->c_metadata.md_flags & MD_ACL) &&
	    ((cp->c_flags & CN_NOCACHE) == 0) &&
	    !CFS_ISFS_BACKFS_NFSV4(fscp)) {
		ASSERT((cp->c_flags & CN_NOCACHE) == 0);
		error = cachefs_getaclfromcache(cp, vsec);
		if (error) {
			cachefs_nocache(cp);
			ASSERT((cp->c_metadata.md_flags & MD_ACL) == 0);
			error = 0;
		} else {
			hit = 1;
			goto out;
		}
	}

	ASSERT(error == 0);
	if (cp->c_backvp == NULL)
		error = cachefs_getbackvp(fscp, cp);
	if (error)
		goto out;

	CFS_DPRINT_BACKFS_NFSV4(fscp,
	    ("cachefs_getsecattr (nfsv4): cp %p, backvp %p",
	    cp, cp->c_backvp));
	error = VOP_GETSECATTR(cp->c_backvp, vsec, flag, cr, NULL);
	if (error)
		goto out;

	if (((fscp->fs_info.fi_mntflags & CFS_NOACL) == 0) &&
	    (cachefs_vtype_aclok(vp)) &&
	    ((cp->c_flags & CN_NOCACHE) == 0) &&
	    !CFS_ISFS_BACKFS_NFSV4(fscp)) {
		error = cachefs_cacheacl(cp, vsec);
		if (error) {
			error = 0;
			cachefs_nocache(cp);
		}
	}

out:
	if (error == 0) {
		if (hit)
			fscp->fs_stats.st_hits++;
		else
			fscp->fs_stats.st_misses++;
	}
	mutex_exit(&cp->c_statelock);

	return (error);
}

static int
/*ARGSUSED*/
cachefs_getsecattr_disconnected(vnode_t *vp, vsecattr_t *vsec, int flag,
    cred_t *cr)
{
	cnode_t *cp = VTOC(vp);
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int hit = 0;
	int error = 0;


	mutex_enter(&cp->c_statelock);

	/* read from the cache if we can */
	if (((cp->c_flags & CN_NOCACHE) == 0) &&
	    (cp->c_metadata.md_flags & MD_ACL)) {
		error = cachefs_getaclfromcache(cp, vsec);
		if (error) {
			cachefs_nocache(cp);
			ASSERT((cp->c_metadata.md_flags & MD_ACL) == 0);
			error = 0;
		} else {
			hit = 1;
			goto out;
		}
	}
	error = ETIMEDOUT;

out:
	if (error == 0) {
		if (hit)
			fscp->fs_stats.st_hits++;
		else
			fscp->fs_stats.st_misses++;
	}
	mutex_exit(&cp->c_statelock);

	return (error);
}

/*
 * cachefs_cacheacl() -- cache an ACL, which we do by applying it to
 * the frontfile if possible; otherwise, the adjunct directory.
 *
 * inputs:
 * cp - the cnode, with its statelock already held
 * vsecp - a pointer to a vsecattr_t you'd like us to cache as-is,
 *  or NULL if you want us to do the VOP_GETSECATTR(backvp).
 *
 * returns:
 * 0 - all is well
 * nonzero - errno
 */

int
cachefs_cacheacl(cnode_t *cp, vsecattr_t *vsecp)
{
	fscache_t *fscp = C_TO_FSCACHE(cp);
	vsecattr_t vsec;
	aclent_t *aclp;
	int gotvsec = 0;
	int error = 0;
	vnode_t *vp = NULL;
	void *aclkeep = NULL;
	int i;

	ASSERT(MUTEX_HELD(&cp->c_statelock));
	ASSERT((cp->c_flags & CN_NOCACHE) == 0);
	ASSERT(CFS_ISFS_BACKFS_NFSV4(fscp) == 0);
	ASSERT((fscp->fs_info.fi_mntflags & CFS_NOACL) == 0);
	ASSERT(cachefs_vtype_aclok(CTOV(cp)));

	if (fscp->fs_info.fi_mntflags & CFS_NOACL) {
		error = ENOSYS;
		goto out;
	}

	if (vsecp == NULL) {
		if (cp->c_backvp == NULL)
			error = cachefs_getbackvp(fscp, cp);
		if (error != 0)
			goto out;
		vsecp = &vsec;
		bzero(&vsec, sizeof (vsec));
		vsecp->vsa_mask =
		    VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT;
		error = VOP_GETSECATTR(cp->c_backvp, vsecp, 0, kcred, NULL);
		if (error != 0) {
			goto out;
		}
		gotvsec = 1;
	} else if (vsecp->vsa_mask & VSA_ACL) {
		aclkeep = vsecp->vsa_aclentp;
		vsecp->vsa_aclentp = cachefs_kmem_alloc(vsecp->vsa_aclcnt *
		    sizeof (aclent_t), KM_SLEEP);
		bcopy(aclkeep, vsecp->vsa_aclentp, vsecp->vsa_aclcnt *
		    sizeof (aclent_t));
	} else if ((vsecp->vsa_mask & (VSA_ACL | VSA_DFACL)) == 0) {
		/* unless there's real data, we can cache nothing. */
		return (0);
	}

	/*
	 * prevent the ACL from chmoding our frontfile, and
	 * snarf the class info
	 */

	if ((vsecp->vsa_mask & (VSA_ACL | VSA_ACLCNT)) ==
	    (VSA_ACL | VSA_ACLCNT)) {
		for (i = 0; i < vsecp->vsa_aclcnt; i++) {
			aclp = ((aclent_t *)vsecp->vsa_aclentp) + i;
			switch (aclp->a_type) {
			case CLASS_OBJ:
				cp->c_metadata.md_aclclass =
				    aclp->a_perm;
				/*FALLTHROUGH*/
			case USER_OBJ:
			case GROUP_OBJ:
			case OTHER_OBJ:
				aclp->a_perm = 06;
			}
		}
	}

	/*
	 * if the frontfile exists, then we always do the work.  but,
	 * if there's no frontfile, and the ACL isn't a `real' ACL,
	 * then we don't want to do the work.  otherwise, an `ls -l'
	 * will create tons of emtpy frontfiles.
	 */

	if (((cp->c_metadata.md_flags & MD_FILE) == 0) &&
	    ((vsecp->vsa_aclcnt + vsecp->vsa_dfaclcnt)
	    <= MIN_ACL_ENTRIES)) {
		cp->c_metadata.md_flags |= MD_ACL;
		cp->c_flags |= CN_UPDATED;
		goto out;
	}

	/*
	 * if we have a default ACL, then we need a
	 * real live directory in the frontfs that we
	 * can apply the ACL to.  if not, then we just
	 * use the frontfile.  we get the frontfile
	 * regardless -- that way, we know the
	 * directory for the frontfile exists.
	 */

	if (vsecp->vsa_dfaclcnt > 0) {
		if (cp->c_acldirvp == NULL)
			error = cachefs_getacldirvp(cp);
		if (error != 0)
			goto out;
		vp = cp->c_acldirvp;
	} else {
		if (cp->c_frontvp == NULL)
			error = cachefs_getfrontfile(cp);
		if (error != 0)
			goto out;
		vp = cp->c_frontvp;
	}
	ASSERT(vp != NULL);

	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	error = VOP_SETSECATTR(vp, vsecp, 0, kcred, NULL);
	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
	if (error != 0) {
#ifdef CFSDEBUG
		CFS_DEBUG(CFSDEBUG_VOPS)
			printf("cachefs_cacheacl: setsecattr: error %d\n",
			    error);
#endif /* CFSDEBUG */
		/*
		 * If there was an error, we don't want to call
		 * cachefs_nocache(); so, set error to 0.
		 * We will call cachefs_purgeacl(), in order to
		 * clean such things as adjunct ACL directories.
		 */
		cachefs_purgeacl(cp);
		error = 0;
		goto out;
	}
	if (vp == cp->c_frontvp)
		cp->c_flags |= CN_NEED_FRONT_SYNC;

	cp->c_metadata.md_flags |= MD_ACL;
	cp->c_flags |= CN_UPDATED;

out:
	if ((error) && (fscp->fs_cdconnected == CFS_CD_CONNECTED))
		cachefs_nocache(cp);

	if (gotvsec) {
		if (vsec.vsa_aclcnt)
			kmem_free(vsec.vsa_aclentp,
			    vsec.vsa_aclcnt * sizeof (aclent_t));
		if (vsec.vsa_dfaclcnt)
			kmem_free(vsec.vsa_dfaclentp,
			    vsec.vsa_dfaclcnt * sizeof (aclent_t));
	} else if (aclkeep != NULL) {
		cachefs_kmem_free(vsecp->vsa_aclentp,
		    vsecp->vsa_aclcnt * sizeof (aclent_t));
		vsecp->vsa_aclentp = aclkeep;
	}

	return (error);
}

void
cachefs_purgeacl(cnode_t *cp)
{
	ASSERT(MUTEX_HELD(&cp->c_statelock));

	ASSERT(!CFS_ISFS_BACKFS_NFSV4(C_TO_FSCACHE(cp)));

	if (cp->c_acldirvp != NULL) {
		VN_RELE(cp->c_acldirvp);
		cp->c_acldirvp = NULL;
	}

	if (cp->c_metadata.md_flags & MD_ACLDIR) {
		char name[CFS_FRONTFILE_NAME_SIZE + 2];

		ASSERT(cp->c_filegrp->fg_dirvp != NULL);
		make_ascii_name(&cp->c_id, name);
		(void) strcat(name, ".d");

		(void) VOP_RMDIR(cp->c_filegrp->fg_dirvp, name,
		    cp->c_filegrp->fg_dirvp, kcred, NULL, 0);
	}

	cp->c_metadata.md_flags &= ~(MD_ACL | MD_ACLDIR);
	cp->c_flags |= CN_UPDATED;
}

static int
cachefs_getacldirvp(cnode_t *cp)
{
	char name[CFS_FRONTFILE_NAME_SIZE + 2];
	int error = 0;

	ASSERT(MUTEX_HELD(&cp->c_statelock));
	ASSERT(cp->c_acldirvp == NULL);

	if (cp->c_frontvp == NULL)
		error = cachefs_getfrontfile(cp);
	if (error != 0)
		goto out;

	ASSERT(cp->c_filegrp->fg_dirvp != NULL);
	make_ascii_name(&cp->c_id, name);
	(void) strcat(name, ".d");
	error = VOP_LOOKUP(cp->c_filegrp->fg_dirvp,
	    name, &cp->c_acldirvp, NULL, 0, NULL, kcred, NULL, NULL, NULL);
	if ((error != 0) && (error != ENOENT))
		goto out;

	if (error != 0) {
		vattr_t va;

		va.va_mode = S_IFDIR | 0777;
		va.va_uid = 0;
		va.va_gid = 0;
		va.va_type = VDIR;
		va.va_mask = AT_TYPE | AT_MODE |
		    AT_UID | AT_GID;
		error =
		    VOP_MKDIR(cp->c_filegrp->fg_dirvp,
		    name, &va, &cp->c_acldirvp, kcred, NULL, 0, NULL);
		if (error != 0)
			goto out;
	}

	ASSERT(cp->c_acldirvp != NULL);
	cp->c_metadata.md_flags |= MD_ACLDIR;
	cp->c_flags |= CN_UPDATED;

out:
	if (error != 0)
		cp->c_acldirvp = NULL;
	return (error);
}

static int
cachefs_getaclfromcache(cnode_t *cp, vsecattr_t *vsec)
{
	aclent_t *aclp;
	int error = 0;
	vnode_t *vp = NULL;
	int i;

	ASSERT(cp->c_metadata.md_flags & MD_ACL);
	ASSERT(MUTEX_HELD(&cp->c_statelock));
	ASSERT(vsec->vsa_aclentp == NULL);

	if (cp->c_metadata.md_flags & MD_ACLDIR) {
		if (cp->c_acldirvp == NULL)
			error = cachefs_getacldirvp(cp);
		if (error != 0)
			goto out;
		vp = cp->c_acldirvp;
	} else if (cp->c_metadata.md_flags & MD_FILE) {
		if (cp->c_frontvp == NULL)
			error = cachefs_getfrontfile(cp);
		if (error != 0)
			goto out;
		vp = cp->c_frontvp;
	} else {

		/*
		 * if we get here, then we know that MD_ACL is on,
		 * meaning an ACL was successfully cached.  we also
		 * know that neither MD_ACLDIR nor MD_FILE are on, so
		 * this has to be an entry without a `real' ACL.
		 * thus, we forge whatever is necessary.
		 */

		if (vsec->vsa_mask & VSA_ACLCNT)
			vsec->vsa_aclcnt = MIN_ACL_ENTRIES;

		if (vsec->vsa_mask & VSA_ACL) {
			vsec->vsa_aclentp =
			    kmem_zalloc(MIN_ACL_ENTRIES *
			    sizeof (aclent_t), KM_SLEEP);
			aclp = (aclent_t *)vsec->vsa_aclentp;
			aclp->a_type = USER_OBJ;
			++aclp;
			aclp->a_type = GROUP_OBJ;
			++aclp;
			aclp->a_type = OTHER_OBJ;
			++aclp;
			aclp->a_type = CLASS_OBJ;
			ksort((caddr_t)vsec->vsa_aclentp, MIN_ACL_ENTRIES,
			    sizeof (aclent_t), cmp2acls);
		}

		ASSERT(vp == NULL);
	}

	if (vp != NULL) {
		if ((error = VOP_GETSECATTR(vp, vsec, 0, kcred, NULL)) != 0) {
#ifdef CFSDEBUG
			CFS_DEBUG(CFSDEBUG_VOPS)
				printf("cachefs_getaclfromcache: error %d\n",
				    error);
#endif /* CFSDEBUG */
			goto out;
		}
	}

	if (vsec->vsa_aclentp != NULL) {
		for (i = 0; i < vsec->vsa_aclcnt; i++) {
			aclp = ((aclent_t *)vsec->vsa_aclentp) + i;
			switch (aclp->a_type) {
			case USER_OBJ:
				aclp->a_id = cp->c_metadata.md_vattr.va_uid;
				aclp->a_perm =
				    cp->c_metadata.md_vattr.va_mode & 0700;
				aclp->a_perm >>= 6;
				break;

			case GROUP_OBJ:
				aclp->a_id = cp->c_metadata.md_vattr.va_gid;
				aclp->a_perm =
				    cp->c_metadata.md_vattr.va_mode & 070;
				aclp->a_perm >>= 3;
				break;

			case OTHER_OBJ:
				aclp->a_perm =
				    cp->c_metadata.md_vattr.va_mode & 07;
				break;

			case CLASS_OBJ:
				aclp->a_perm =
				    cp->c_metadata.md_aclclass;
				break;
			}
		}
	}

out:

	if (error != 0)
		cachefs_nocache(cp);

	return (error);
}

/*
 * Fills in targp with attribute information from srcp, cp
 * and if necessary the system.
 */
static void
cachefs_attr_setup(vattr_t *srcp, vattr_t *targp, cnode_t *cp, cred_t *cr)
{
	time_t	now;

	ASSERT((srcp->va_mask & (AT_TYPE | AT_MODE)) == (AT_TYPE | AT_MODE));

	/*
	 * Add code to fill in the va struct.  We use the fields from
	 * the srcp struct if they are populated, otherwise we guess
	 */

	targp->va_mask = 0;	/* initialize all fields */
	targp->va_mode = srcp->va_mode;
	targp->va_type = srcp->va_type;
	targp->va_nlink = 1;
	targp->va_nodeid = 0;

	if (srcp->va_mask & AT_UID)
		targp->va_uid = srcp->va_uid;
	else
		targp->va_uid = crgetuid(cr);

	if (srcp->va_mask & AT_GID)
		targp->va_gid = srcp->va_gid;
	else
		targp->va_gid = crgetgid(cr);

	if (srcp->va_mask & AT_FSID)
		targp->va_fsid = srcp->va_fsid;
	else
		targp->va_fsid = 0;	/* initialize all fields */

	now = gethrestime_sec();
	if (srcp->va_mask & AT_ATIME)
		targp->va_atime = srcp->va_atime;
	else
		targp->va_atime.tv_sec = now;

	if (srcp->va_mask & AT_MTIME)
		targp->va_mtime = srcp->va_mtime;
	else
		targp->va_mtime.tv_sec = now;

	if (srcp->va_mask & AT_CTIME)
		targp->va_ctime = srcp->va_ctime;
	else
		targp->va_ctime.tv_sec = now;


	if (srcp->va_mask & AT_SIZE)
		targp->va_size = srcp->va_size;
	else
		targp->va_size = 0;

	/*
	 * the remaing fields are set by the fs and not changable.
	 * we populate these entries useing the parent directory
	 * values.  It's a small hack, but should work.
	 */
	targp->va_blksize = cp->c_metadata.md_vattr.va_blksize;
	targp->va_rdev = cp->c_metadata.md_vattr.va_rdev;
	targp->va_nblocks = cp->c_metadata.md_vattr.va_nblocks;
	targp->va_seq = 0; /* Never keep the sequence number */
}

/*
 * set the gid for a newly created file.  The algorithm is as follows:
 *
 *	1) If the gid is set in the attribute list, then use it if
 *	   the caller is privileged, belongs to the target group, or
 *	   the group is the same as the parent directory.
 *
 *	2) If the parent directory's set-gid bit is clear, then use
 *	   the process gid
 *
 *	3) Otherwise, use the gid of the parent directory.
 *
 * Note: newcp->c_attr.va_{mode,type} must already be set before calling
 * this routine.
 */
static void
cachefs_creategid(cnode_t *dcp, cnode_t *newcp, vattr_t *vap, cred_t *cr)
{
	if ((vap->va_mask & AT_GID) &&
	    ((vap->va_gid == dcp->c_attr.va_gid) ||
	    groupmember(vap->va_gid, cr) ||
	    secpolicy_vnode_create_gid(cr) != 0)) {
		newcp->c_attr.va_gid = vap->va_gid;
	} else {
		if (dcp->c_attr.va_mode & S_ISGID)
			newcp->c_attr.va_gid = dcp->c_attr.va_gid;
		else
			newcp->c_attr.va_gid = crgetgid(cr);
	}

	/*
	 * if we're creating a directory, and the parent directory has the
	 * set-GID bit set, set it on the new directory.
	 * Otherwise, if the user is neither privileged nor a member of the
	 * file's new group, clear the file's set-GID bit.
	 */
	if (dcp->c_attr.va_mode & S_ISGID && newcp->c_attr.va_type == VDIR) {
		newcp->c_attr.va_mode |= S_ISGID;
	} else if ((newcp->c_attr.va_mode & S_ISGID) &&
	    secpolicy_vnode_setids_setgids(cr, newcp->c_attr.va_gid) != 0)
		newcp->c_attr.va_mode &= ~S_ISGID;
}

/*
 * create an acl for the newly created file.  should be called right
 * after cachefs_creategid.
 */

static void
cachefs_createacl(cnode_t *dcp, cnode_t *newcp)
{
	fscache_t *fscp = C_TO_FSCACHE(dcp);
	vsecattr_t vsec;
	int gotvsec = 0;
	int error = 0; /* placeholder */
	aclent_t *aclp;
	o_mode_t *classp = NULL;
	o_mode_t gunion = 0;
	int i;

	if ((fscp->fs_info.fi_mntflags & CFS_NOACL) ||
	    (! cachefs_vtype_aclok(CTOV(newcp))))
		return;

	ASSERT(dcp->c_metadata.md_flags & MD_ACL);
	ASSERT(MUTEX_HELD(&dcp->c_statelock));
	ASSERT(MUTEX_HELD(&newcp->c_statelock));

	/*
	 * XXX should probably not do VSA_ACL and VSA_ACLCNT, but that
	 * would hit code paths that isn't hit anywhere else.
	 */

	bzero(&vsec, sizeof (vsec));
	vsec.vsa_mask = VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT;
	error = cachefs_getaclfromcache(dcp, &vsec);
	if (error != 0)
		goto out;
	gotvsec = 1;

	if ((vsec.vsa_dfaclcnt > 0) && (vsec.vsa_dfaclentp != NULL)) {
		if ((vsec.vsa_aclcnt > 0) && (vsec.vsa_aclentp != NULL))
			kmem_free(vsec.vsa_aclentp,
			    vsec.vsa_aclcnt * sizeof (aclent_t));

		vsec.vsa_aclcnt = vsec.vsa_dfaclcnt;
		vsec.vsa_aclentp = vsec.vsa_dfaclentp;
		vsec.vsa_dfaclcnt = 0;
		vsec.vsa_dfaclentp = NULL;

		if (newcp->c_attr.va_type == VDIR) {
			vsec.vsa_dfaclentp = kmem_alloc(vsec.vsa_aclcnt *
			    sizeof (aclent_t), KM_SLEEP);
			vsec.vsa_dfaclcnt = vsec.vsa_aclcnt;
			bcopy(vsec.vsa_aclentp, vsec.vsa_dfaclentp,
			    vsec.vsa_aclcnt * sizeof (aclent_t));
		}

		/*
		 * this function should be called pretty much after
		 * the rest of the file creation stuff is done.  so,
		 * uid, gid, etc. should be `right'.  we'll go with
		 * that, rather than trying to determine whether to
		 * get stuff from cr or va.
		 */

		for (i = 0; i < vsec.vsa_aclcnt; i++) {
			aclp = ((aclent_t *)vsec.vsa_aclentp) + i;
			switch (aclp->a_type) {
			case DEF_USER_OBJ:
				aclp->a_type = USER_OBJ;
				aclp->a_id = newcp->c_metadata.md_vattr.va_uid;
				aclp->a_perm =
				    newcp->c_metadata.md_vattr.va_mode;
				aclp->a_perm &= 0700;
				aclp->a_perm >>= 6;
				break;

			case DEF_GROUP_OBJ:
				aclp->a_type = GROUP_OBJ;
				aclp->a_id = newcp->c_metadata.md_vattr.va_gid;
				aclp->a_perm =
				    newcp->c_metadata.md_vattr.va_mode;
				aclp->a_perm &= 070;
				aclp->a_perm >>= 3;
				gunion |= aclp->a_perm;
				break;

			case DEF_OTHER_OBJ:
				aclp->a_type = OTHER_OBJ;
				aclp->a_perm =
				    newcp->c_metadata.md_vattr.va_mode & 07;
				break;

			case DEF_CLASS_OBJ:
				aclp->a_type = CLASS_OBJ;
				classp = &(aclp->a_perm);
				break;

			case DEF_USER:
				aclp->a_type = USER;
				gunion |= aclp->a_perm;
				break;

			case DEF_GROUP:
				aclp->a_type = GROUP;
				gunion |= aclp->a_perm;
				break;
			}
		}

		/* XXX is this the POSIX thing to do? */
		if (classp != NULL)
			*classp &= gunion;

		/*
		 * we don't need to log this; rather, we clear the
		 * MD_ACL bit when we reconnect.
		 */

		error = cachefs_cacheacl(newcp, &vsec);
		if (error != 0)
			goto out;
	}

	newcp->c_metadata.md_aclclass = 07; /* XXX check posix */
	newcp->c_metadata.md_flags |= MD_ACL;
	newcp->c_flags |= CN_UPDATED;

out:

	if (gotvsec) {
		if ((vsec.vsa_aclcnt > 0) && (vsec.vsa_aclentp != NULL))
			kmem_free(vsec.vsa_aclentp,
			    vsec.vsa_aclcnt * sizeof (aclent_t));
		if ((vsec.vsa_dfaclcnt > 0) && (vsec.vsa_dfaclentp != NULL))
			kmem_free(vsec.vsa_dfaclentp,
			    vsec.vsa_dfaclcnt * sizeof (aclent_t));
	}
}

/*
 * this is translated from the UFS code for access checking.
 */

static int
cachefs_access_local(void *vcp, int mode, cred_t *cr)
{
	cnode_t *cp = vcp;
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int shift = 0;

	ASSERT(MUTEX_HELD(&cp->c_statelock));

	if (mode & VWRITE) {
		/*
		 * Disallow write attempts on read-only
		 * file systems, unless the file is special.
		 */
		struct vnode *vp = CTOV(cp);
		if (vn_is_readonly(vp)) {
			if (!IS_DEVVP(vp)) {
				return (EROFS);
			}
		}
	}

	/*
	 * if we need to do ACLs, do it.  this works whether anyone
	 * has explicitly made an ACL or not.
	 */

	if (((fscp->fs_info.fi_mntflags & CFS_NOACL) == 0) &&
	    (cachefs_vtype_aclok(CTOV(cp))))
		return (cachefs_acl_access(cp, mode, cr));

	if (crgetuid(cr) != cp->c_attr.va_uid) {
		shift += 3;
		if (!groupmember(cp->c_attr.va_gid, cr))
			shift += 3;
	}

	return (secpolicy_vnode_access2(cr, CTOV(cp), cp->c_attr.va_uid,
	    cp->c_attr.va_mode << shift, mode));
}

/*
 * This is transcribed from ufs_acl_access().  If that changes, then
 * this should, too.
 *
 * Check the cnode's ACL's to see if this mode of access is
 * allowed; return 0 if allowed, EACCES if not.
 *
 * We follow the procedure defined in Sec. 3.3.5, ACL Access
 * Check Algorithm, of the POSIX 1003.6 Draft Standard.
 */

#define	ACL_MODE_CHECK(M, PERM, C, I) \
    secpolicy_vnode_access2(C, CTOV(I), owner, (PERM), (M))

static int
cachefs_acl_access(struct cnode *cp, int mode, cred_t *cr)
{
	int error = 0;

	fscache_t *fscp = C_TO_FSCACHE(cp);

	int mask = ~0;
	int ismask = 0;

	int gperm = 0;
	int ngroup = 0;

	vsecattr_t vsec;
	int gotvsec = 0;
	aclent_t *aclp;

	uid_t owner = cp->c_attr.va_uid;

	int i;

	ASSERT(MUTEX_HELD(&cp->c_statelock));
	ASSERT((fscp->fs_info.fi_mntflags & CFS_NOACL) == 0);

	/*
	 * strictly speaking, we shouldn't set VSA_DFACL and DFACLCNT,
	 * but then i believe we'd be the only thing exercising those
	 * code paths -- probably a bad thing.
	 */

	bzero(&vsec, sizeof (vsec));
	vsec.vsa_mask = VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT;

	/* XXX KLUDGE! correct insidious 0-class problem */
	if (cp->c_metadata.md_aclclass == 0 &&
	    fscp->fs_cdconnected == CFS_CD_CONNECTED)
		cachefs_purgeacl(cp);
again:
	if (cp->c_metadata.md_flags & MD_ACL) {
		error = cachefs_getaclfromcache(cp, &vsec);
		if (error != 0) {
#ifdef CFSDEBUG
			if (error != ETIMEDOUT)
				CFS_DEBUG(CFSDEBUG_VOPS)
					printf("cachefs_acl_access():"
					    "error %d from getaclfromcache()\n",
					    error);
#endif /* CFSDEBUG */
			if ((cp->c_metadata.md_flags & MD_ACL) == 0) {
				goto again;
			} else {
				goto out;
			}
		}
	} else {
		if (cp->c_backvp == NULL) {
			if (fscp->fs_cdconnected == CFS_CD_CONNECTED)
				error = cachefs_getbackvp(fscp, cp);
			else
				error = ETIMEDOUT;
		}
		if (error == 0)
			error = VOP_GETSECATTR(cp->c_backvp, &vsec, 0, cr,
			    NULL);
		if (error != 0) {
#ifdef CFSDEBUG
			CFS_DEBUG(CFSDEBUG_VOPS)
				printf("cachefs_acl_access():"
				    "error %d from getsecattr(backvp)\n",
				    error);
#endif /* CFSDEBUG */
			goto out;
		}
		if ((cp->c_flags & CN_NOCACHE) == 0 &&
		    !CFS_ISFS_BACKFS_NFSV4(fscp))
			(void) cachefs_cacheacl(cp, &vsec);
	}
	gotvsec = 1;

	ASSERT(error == 0);
	for (i = 0; i < vsec.vsa_aclcnt; i++) {
		aclp = ((aclent_t *)vsec.vsa_aclentp) + i;
		switch (aclp->a_type) {
		case USER_OBJ:
			/*
			 * this might look cleaner in the 2nd loop
			 * below, but we do it here as an
			 * optimization.
			 */

			owner = aclp->a_id;
			if (crgetuid(cr) == owner) {
				error = ACL_MODE_CHECK(mode, aclp->a_perm << 6,
				    cr, cp);
				goto out;
			}
			break;

		case CLASS_OBJ:
			mask = aclp->a_perm;
			ismask = 1;
			break;
		}
	}

	ASSERT(error == 0);
	for (i = 0; i < vsec.vsa_aclcnt; i++) {
		aclp = ((aclent_t *)vsec.vsa_aclentp) + i;
		switch (aclp->a_type) {
		case USER:
			if (crgetuid(cr) == aclp->a_id) {
				error = ACL_MODE_CHECK(mode,
				    (aclp->a_perm & mask) << 6, cr, cp);
				goto out;
			}
			break;

		case GROUP_OBJ:
			if (groupmember(aclp->a_id, cr)) {
				++ngroup;
				gperm |= aclp->a_perm;
				if (! ismask) {
					error = ACL_MODE_CHECK(mode,
					    aclp->a_perm << 6,
					    cr, cp);
					goto out;
				}
			}
			break;

		case GROUP:
			if (groupmember(aclp->a_id, cr)) {
				++ngroup;
				gperm |= aclp->a_perm;
			}
			break;

		case OTHER_OBJ:
			if (ngroup == 0) {
				error = ACL_MODE_CHECK(mode, aclp->a_perm << 6,
				    cr, cp);
				goto out;
			}
			break;

		default:
			break;
		}
	}

	ASSERT(ngroup > 0);
	error = ACL_MODE_CHECK(mode, (gperm & mask) << 6, cr, cp);

out:
	if (gotvsec) {
		if (vsec.vsa_aclcnt && vsec.vsa_aclentp)
			kmem_free(vsec.vsa_aclentp,
			    vsec.vsa_aclcnt * sizeof (aclent_t));
		if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp)
			kmem_free(vsec.vsa_dfaclentp,
			    vsec.vsa_dfaclcnt * sizeof (aclent_t));
	}

	return (error);
}

/*
 * see if permissions allow for removal of the given file from
 * the given directory.
 */
static int
cachefs_stickyrmchk(struct cnode *dcp, struct cnode *cp, cred_t *cr)
{
	uid_t uid;
	/*
	 * If the containing directory is sticky, the user must:
	 *  - own the directory, or
	 *  - own the file, or
	 *  - be able to write the file (if it's a plain file), or
	 *  - be sufficiently privileged.
	 */
	if ((dcp->c_attr.va_mode & S_ISVTX) &&
	    ((uid = crgetuid(cr)) != dcp->c_attr.va_uid) &&
	    (uid != cp->c_attr.va_uid) &&
	    (cp->c_attr.va_type != VREG ||
	    cachefs_access_local(cp, VWRITE, cr) != 0))
		return (secpolicy_vnode_remove(cr));

	return (0);
}

/*
 * Returns a new name, may even be unique.
 * Stolen from nfs code.
 * Since now we will use renaming to .cfs* in place of .nfs*
 * for CacheFS. Both NFS and CacheFS will rename opened files.
 */
static char cachefs_prefix[] = ".cfs";
kmutex_t cachefs_newnum_lock;

static char *
cachefs_newname(void)
{
	static uint_t newnum = 0;
	char *news;
	char *s, *p;
	uint_t id;

	mutex_enter(&cachefs_newnum_lock);
	if (newnum == 0) {
		newnum = gethrestime_sec() & 0xfffff;
		newnum |= 0x10000;
	}
	id = newnum++;
	mutex_exit(&cachefs_newnum_lock);

	news = cachefs_kmem_alloc(MAXNAMELEN, KM_SLEEP);
	s = news;
	p = cachefs_prefix;
	while (*p != '\0')
		*s++ = *p++;
	while (id != 0) {
		*s++ = "0123456789ABCDEF"[id & 0x0f];
		id >>= 4;
	}
	*s = '\0';
	return (news);
}

/*
 * Called to rename the specified file to a temporary file so
 * operations to the file after remove work.
 * Must call this routine with the dir c_rwlock held as a writer.
 */
static int
/*ARGSUSED*/
cachefs_remove_dolink(vnode_t *dvp, vnode_t *vp, char *nm, cred_t *cr)
{
	cnode_t *cp = VTOC(vp);
	char *tmpname;
	fscache_t *fscp = C_TO_FSCACHE(cp);
	int error;

	ASSERT(RW_WRITE_HELD(&(VTOC(dvp)->c_rwlock)));

	/* get the new name for the file */
	tmpname = cachefs_newname();

	/* do the link */
	if (fscp->fs_cdconnected == CFS_CD_CONNECTED)
		error = cachefs_link_connected(dvp, vp, tmpname, cr);
	else
		error = cachefs_link_disconnected(dvp, vp, tmpname, cr);
	if (error) {
		cachefs_kmem_free(tmpname, MAXNAMELEN);
		return (error);
	}

	mutex_enter(&cp->c_statelock);
	if (cp->c_unldvp) {
		VN_RELE(cp->c_unldvp);
		cachefs_kmem_free(cp->c_unlname, MAXNAMELEN);
		crfree(cp->c_unlcred);
	}

	VN_HOLD(dvp);
	cp->c_unldvp = dvp;
	crhold(cr);
	cp->c_unlcred = cr;
	cp->c_unlname = tmpname;

	/* drop the backvp so NFS does not also do a rename */
	mutex_exit(&cp->c_statelock);

	return (0);
}

/*
 * Marks the cnode as modified.
 */
static void
cachefs_modified(cnode_t *cp)
{
	fscache_t *fscp = C_TO_FSCACHE(cp);
	struct vattr va;
	int error;

	ASSERT(MUTEX_HELD(&cp->c_statelock));
	ASSERT(cp->c_metadata.md_rlno);

	/* if not on the modify list */
	if (cp->c_metadata.md_rltype != CACHEFS_RL_MODIFIED) {
		/* put on modified list, also marks the file as modified */
		cachefs_rlent_moveto(fscp->fs_cache, CACHEFS_RL_MODIFIED,
		    cp->c_metadata.md_rlno, cp->c_metadata.md_frontblks);
		cp->c_metadata.md_rltype = CACHEFS_RL_MODIFIED;
		cp->c_flags |= CN_UPDATED;

		/* if a modified regular file that is not local */
		if (((cp->c_id.cid_flags & CFS_CID_LOCAL) == 0) &&
		    (cp->c_metadata.md_flags & MD_FILE) &&
		    (cp->c_attr.va_type == VREG)) {

			if (cp->c_frontvp == NULL)
				(void) cachefs_getfrontfile(cp);
			if (cp->c_frontvp) {
				/* identify file so fsck knows it is modified */
				va.va_mode = 0766;
				va.va_mask = AT_MODE;
				error = VOP_SETATTR(cp->c_frontvp,
				    &va, 0, kcred, NULL);
				if (error) {
					cmn_err(CE_WARN,
					    "Cannot change ff mode.\n");
				}
			}
		}
	}
}

/*
 * Marks the cnode as modified.
 * Allocates a rl slot for the cnode if necessary.
 * Returns 0 for success, !0 if cannot get an rl slot.
 */
static int
cachefs_modified_alloc(cnode_t *cp)
{
	fscache_t *fscp = C_TO_FSCACHE(cp);
	filegrp_t *fgp = cp->c_filegrp;
	int error;
	rl_entry_t rl_ent;

	ASSERT(MUTEX_HELD(&cp->c_statelock));

	/* get the rl slot if needed */
	if (cp->c_metadata.md_rlno == 0) {
		/* get a metadata slot if we do not have one yet */
		if (cp->c_flags & CN_ALLOC_PENDING) {
			if (cp->c_filegrp->fg_flags & CFS_FG_ALLOC_ATTR) {
				(void) filegrp_allocattr(cp->c_filegrp);
			}
			error = filegrp_create_metadata(cp->c_filegrp,
			    &cp->c_metadata, &cp->c_id);
			if (error)
				return (error);
			cp->c_flags &= ~CN_ALLOC_PENDING;
		}

		/* get a free rl entry */
		rl_ent.rl_fileno = cp->c_id.cid_fileno;
		rl_ent.rl_local = (cp->c_id.cid_flags & CFS_CID_LOCAL) ? 1 : 0;
		rl_ent.rl_fsid = fscp->fs_cfsid;
		rl_ent.rl_attrc = 0;
		error = cachefs_rl_alloc(fscp->fs_cache, &rl_ent,
		    &cp->c_metadata.md_rlno);
		if (error)
			return (error);
		cp->c_metadata.md_rltype = CACHEFS_RL_NONE;

		/* hold the filegrp so the attrcache file is not gc */
		error = filegrp_ffhold(fgp);
		if (error) {
			cachefs_rlent_moveto(fscp->fs_cache,
			    CACHEFS_RL_FREE, cp->c_metadata.md_rlno, 0);
			cp->c_metadata.md_rlno = 0;
			return (error);
		}
	}
	cachefs_modified(cp);
	return (0);
}

int
cachefs_vtype_aclok(vnode_t *vp)
{
	vtype_t *vtp, oktypes[] = {VREG, VDIR, VFIFO, VNON};

	if (vp->v_type == VNON)
		return (0);

	for (vtp = oktypes; *vtp != VNON; vtp++)
		if (vp->v_type == *vtp)
			break;

	return (*vtp != VNON);
}

static int
cachefs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	int error = 0;
	fscache_t *fscp = C_TO_FSCACHE(VTOC(vp));

	/* Assert cachefs compatibility if NFSv4 is in use */
	CFS_BACKFS_NFSV4_ASSERT_FSCACHE(fscp);
	CFS_BACKFS_NFSV4_ASSERT_CNODE(VTOC(vp));

	if (cmd == _PC_FILESIZEBITS) {
		u_offset_t maxsize = fscp->fs_offmax;
		(*valp) = 0;
		while (maxsize != 0) {
			maxsize >>= 1;
			(*valp)++;
		}
		(*valp)++;
	} else
		error = fs_pathconf(vp, cmd, valp, cr, ct);

	return (error);
}
author	Joshua M. Clulow <jmc@joyent.com>
	Mon, 04 Mar 2013 23:52:56 +0000
changeset 14188	afe390b9f1e0
parent 12273	63678502e95e
permissions	-rw-r--r--