usr/src/uts/common/fs/dev/sdev_vnops.c
author Casper H.S. Dik <Casper.Dik@Sun.COM>
Wed, 28 Apr 2010 10:01:37 +0200
changeset 12273 63678502e95e
parent 11134 8aa0c4ca6639
child 12387 e2a7d18f1d90
permissions -rw-r--r--
PSARC 2009/377 In-kernel pfexec implementation. PSARC 2009/378 Basic File Privileges PSARC 2010/072 RBAC update: user attrs from profiles 4912090 pfzsh(1) should exist 4912093 pfbash(1) should exist 4912096 pftcsh(1) should exist 6440298 Expand the basic privilege set in order to restrict file access 6859862 Move pfexec into the kernel 6919171 cred_t sidesteps kmem_debug; we need to be able to detect bad hold/free when they occur 6923721 The new SYS_SMB privilege is not backward compatible 6937562 autofs doesn't remove its door when the zone shuts down 6937727 Zones stuck on deathrow; netstack_zone keeps a credential reference to the zone 6940159 Implement PSARC 2010/072

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * vnode ops for the /dev filesystem
 *
 * - VDIR, VCHR, CBLK, and VLNK are considered must supported files
 * - VREG and VDOOR are used for some internal implementations in
 *    the global zone, e.g. devname and devfsadm communication
 * - other file types are unusual in this namespace and
 *    not supported for now
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/user.h>
#include <sys/time.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/vfs_opreg.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/cred.h>
#include <sys/dirent.h>
#include <sys/pathname.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/policy.h>
#include <vm/hat.h>
#include <vm/seg_vn.h>
#include <vm/seg_map.h>
#include <vm/seg.h>
#include <vm/as.h>
#include <vm/page.h>
#include <sys/proc.h>
#include <sys/mode.h>
#include <sys/sunndi.h>
#include <sys/ptms.h>
#include <fs/fs_subr.h>
#include <sys/fs/dv_node.h>
#include <sys/fs/sdev_impl.h>

/*ARGSUSED*/
static int
sdev_open(struct vnode **vpp, int flag, struct cred *cred, caller_context_t *ct)
{
	struct sdev_node *dv = VTOSDEV(*vpp);
	struct sdev_node *ddv = dv->sdev_dotdot;
	int error = 0;

	if ((*vpp)->v_type == VDIR)
		return (0);

	if (!SDEV_IS_GLOBAL(dv))
		return (ENOTSUP);

	if ((*vpp)->v_type == VLNK)
		return (ENOENT);
	ASSERT((*vpp)->v_type == VREG);
	if ((*vpp)->v_type != VREG)
		return (ENOTSUP);

	ASSERT(ddv);
	rw_enter(&ddv->sdev_contents, RW_READER);
	if (dv->sdev_attrvp == NULL) {
		rw_exit(&ddv->sdev_contents);
		return (ENOENT);
	}
	error = VOP_OPEN(&(dv->sdev_attrvp), flag, cred, ct);
	rw_exit(&ddv->sdev_contents);
	return (error);
}

/*ARGSUSED1*/
static int
sdev_close(struct vnode *vp, int flag, int count,
    offset_t offset, struct cred *cred, caller_context_t *ct)
{
	struct sdev_node *dv = VTOSDEV(vp);

	if (vp->v_type == VDIR) {
		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
		cleanshares(vp, ttoproc(curthread)->p_pid);
		return (0);
	}

	if (!SDEV_IS_GLOBAL(dv))
		return (ENOTSUP);

	ASSERT(vp->v_type == VREG);
	if (vp->v_type != VREG)
		return (ENOTSUP);

	ASSERT(dv->sdev_attrvp);
	return (VOP_CLOSE(dv->sdev_attrvp, flag, count, offset, cred, ct));
}

/*ARGSUSED*/
static int
sdev_read(struct vnode *vp, struct uio *uio, int ioflag, struct cred *cred,
	struct caller_context *ct)
{
	struct sdev_node *dv = (struct sdev_node *)VTOSDEV(vp);
	int	error;

	if (!SDEV_IS_GLOBAL(dv))
		return (EINVAL);

	if (vp->v_type == VDIR)
		return (EISDIR);

	/* only supporting regular files in /dev */
	ASSERT(vp->v_type == VREG);
	if (vp->v_type != VREG)
		return (EINVAL);

	ASSERT(RW_READ_HELD(&VTOSDEV(vp)->sdev_contents));
	ASSERT(dv->sdev_attrvp);
	(void) VOP_RWLOCK(dv->sdev_attrvp, 0, ct);
	error = VOP_READ(dv->sdev_attrvp, uio, ioflag, cred, ct);
	VOP_RWUNLOCK(dv->sdev_attrvp, 0, ct);
	return (error);
}

/*ARGSUSED*/
static int
sdev_write(struct vnode *vp, struct uio *uio, int ioflag, struct cred *cred,
	struct caller_context *ct)
{
	struct sdev_node *dv = VTOSDEV(vp);
	int	error = 0;

	if (!SDEV_IS_GLOBAL(dv))
		return (EINVAL);

	if (vp->v_type == VDIR)
		return (EISDIR);

	/* only supporting regular files in /dev */
	ASSERT(vp->v_type == VREG);
	if (vp->v_type != VREG)
		return (EINVAL);

	ASSERT(dv->sdev_attrvp);

	(void) VOP_RWLOCK(dv->sdev_attrvp, 1, ct);
	error = VOP_WRITE(dv->sdev_attrvp, uio, ioflag, cred, ct);
	VOP_RWUNLOCK(dv->sdev_attrvp, 1, ct);
	if (error == 0) {
		sdev_update_timestamps(dv->sdev_attrvp, kcred,
		    AT_MTIME);
	}
	return (error);
}

/*ARGSUSED*/
static int
sdev_ioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
    struct cred *cred, int *rvalp,  caller_context_t *ct)
{
	struct sdev_node *dv = VTOSDEV(vp);

	if (!SDEV_IS_GLOBAL(dv) || (vp->v_type == VDIR))
		return (ENOTTY);

	ASSERT(vp->v_type == VREG);
	if (vp->v_type != VREG)
		return (EINVAL);

	ASSERT(dv->sdev_attrvp);
	return (VOP_IOCTL(dv->sdev_attrvp, cmd, arg, flag, cred, rvalp, ct));
}

static int
sdev_getattr(struct vnode *vp, struct vattr *vap, int flags,
    struct cred *cr, caller_context_t *ct)
{
	int			error = 0;
	struct sdev_node	*dv = VTOSDEV(vp);
	struct sdev_node	*parent = dv->sdev_dotdot;

	ASSERT(parent);

	rw_enter(&parent->sdev_contents, RW_READER);
	ASSERT(dv->sdev_attr || dv->sdev_attrvp);

	/*
	 * search order:
	 * 	- for persistent nodes (SDEV_PERSIST): backstore
	 *	- for non-persistent nodes: module ops if global, then memory
	 */
	if (dv->sdev_attrvp) {
		rw_exit(&parent->sdev_contents);
		error = VOP_GETATTR(dv->sdev_attrvp, vap, flags, cr, ct);
		sdev_vattr_merge(dv, vap);
	} else {
		ASSERT(dv->sdev_attr);
		*vap = *dv->sdev_attr;
		sdev_vattr_merge(dv, vap);
		rw_exit(&parent->sdev_contents);
	}

	return (error);
}

/*ARGSUSED4*/
static int
sdev_setattr(struct vnode *vp, struct vattr *vap, int flags,
    struct cred *cred, caller_context_t *ctp)
{
	return (devname_setattr_func(vp, vap, flags, cred, NULL, 0));
}

static int
sdev_getsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
    struct cred *cr, caller_context_t *ct)
{
	int	error;
	struct sdev_node *dv = VTOSDEV(vp);
	struct vnode *avp = dv->sdev_attrvp;

	if (avp == NULL) {
		/* return fs_fab_acl() if flavor matches, else do nothing */
		if ((SDEV_ACL_FLAVOR(vp) == _ACL_ACLENT_ENABLED &&
		    (vsap->vsa_mask & (VSA_ACLCNT | VSA_DFACLCNT))) ||
		    (SDEV_ACL_FLAVOR(vp) == _ACL_ACE_ENABLED &&
		    (vsap->vsa_mask & (VSA_ACECNT | VSA_ACE))))
			return (fs_fab_acl(vp, vsap, flags, cr, ct));

		return (ENOSYS);
	}

	(void) VOP_RWLOCK(avp, 1, ct);
	error = VOP_GETSECATTR(avp, vsap, flags, cr, ct);
	VOP_RWUNLOCK(avp, 1, ct);
	return (error);
}

static int
sdev_setsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
    struct cred *cr, caller_context_t *ct)
{
	int	error;
	struct sdev_node *dv = VTOSDEV(vp);
	struct vnode *avp = dv->sdev_attrvp;

	if (dv->sdev_state == SDEV_ZOMBIE)
		return (0);

	if (avp == NULL) {
		if (SDEV_IS_GLOBAL(dv) && !SDEV_IS_PERSIST(dv))
			return (fs_nosys());
		ASSERT(dv->sdev_attr);
		/*
		 * if coming in directly, the acl system call will
		 * have held the read-write lock via VOP_RWLOCK()
		 * If coming in via specfs, specfs will have
		 * held the rw lock on the realvp i.e. us.
		 */
		ASSERT(RW_WRITE_HELD(&dv->sdev_contents));
		sdev_vattr_merge(dv, dv->sdev_attr);
		error = sdev_shadow_node(dv, cr);
		if (error) {
			return (fs_nosys());
		}

		ASSERT(dv->sdev_attrvp);
		/* clean out the memory copy if any */
		if (dv->sdev_attr) {
			kmem_free(dv->sdev_attr, sizeof (struct vattr));
			dv->sdev_attr = NULL;
		}
		avp = dv->sdev_attrvp;
	}
	ASSERT(avp);

	(void) VOP_RWLOCK(avp, V_WRITELOCK_TRUE, ct);
	error = VOP_SETSECATTR(avp, vsap, flags, cr, ct);
	VOP_RWUNLOCK(avp, V_WRITELOCK_TRUE, ct);
	return (error);
}

int
sdev_unlocked_access(void *vdv, int mode, struct cred *cr)
{
	struct sdev_node	*dv = vdv;
	int			shift = 0;
	uid_t			owner = dv->sdev_attr->va_uid;

	if (crgetuid(cr) != owner) {
		shift += 3;
		if (groupmember(dv->sdev_attr->va_gid, cr) == 0)
			shift += 3;
	}

	return (secpolicy_vnode_access2(cr, SDEVTOV(dv), owner,
	    dv->sdev_attr->va_mode << shift, mode));
}

static int
sdev_access(struct vnode *vp, int mode, int flags, struct cred *cr,
    caller_context_t *ct)
{
	struct sdev_node	*dv = VTOSDEV(vp);
	int ret = 0;

	ASSERT(dv->sdev_attr || dv->sdev_attrvp);

	if (dv->sdev_attrvp) {
		ret = VOP_ACCESS(dv->sdev_attrvp, mode, flags, cr, ct);
	} else if (dv->sdev_attr) {
		rw_enter(&dv->sdev_contents, RW_READER);
		ret = sdev_unlocked_access(dv, mode, cr);
		if (ret)
			ret = EACCES;
		rw_exit(&dv->sdev_contents);
	}

	return (ret);
}

/*
 * Lookup
 */
/*ARGSUSED3*/
static int
sdev_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
    struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
    caller_context_t *ct, int *direntflags, pathname_t *realpnp)
{
	struct sdev_node *parent;
	int error;

	parent = VTOSDEV(dvp);
	ASSERT(parent);

	/* execute access is required to search the directory */
	if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
		return (error);

	if (!SDEV_IS_GLOBAL(parent))
		return (prof_lookup(dvp, nm, vpp, cred));
	return (devname_lookup_func(parent, nm, vpp, cred, NULL, 0));
}

/*ARGSUSED2*/
static int
sdev_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
    int mode, struct vnode **vpp, struct cred *cred, int flag,
    caller_context_t *ct, vsecattr_t *vsecp)
{
	struct vnode		*vp = NULL;
	struct vnode		*avp;
	struct sdev_node	*parent;
	struct sdev_node	*self = NULL;
	int			error = 0;
	vtype_t			type = vap->va_type;

	ASSERT(type != VNON && type != VBAD);

	if ((type == VFIFO) || (type == VSOCK) ||
	    (type == VPROC) || (type == VPORT))
		return (ENOTSUP);

	parent = VTOSDEV(dvp);
	ASSERT(parent);

	rw_enter(&parent->sdev_dotdot->sdev_contents, RW_READER);
	if (parent->sdev_state == SDEV_ZOMBIE) {
		rw_exit(&parent->sdev_dotdot->sdev_contents);
		return (ENOENT);
	}

	/* non-global do not allow pure node creation */
	if (!SDEV_IS_GLOBAL(parent)) {
		rw_exit(&parent->sdev_dotdot->sdev_contents);
		return (prof_lookup(dvp, nm, vpp, cred));
	}
	rw_exit(&parent->sdev_dotdot->sdev_contents);

	/* execute access is required to search the directory */
	if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
		return (error);

	/* check existing name */
/* XXXci - We may need to translate the C-I flags on VOP_LOOKUP */
	error = VOP_LOOKUP(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL, NULL);

	/* name found */
	if (error == 0) {
		ASSERT(vp);
		if (excl == EXCL) {
			error = EEXIST;
		} else if ((vp->v_type == VDIR) && (mode & VWRITE)) {
			/* allowing create/read-only an existing directory */
			error = EISDIR;
		} else {
			error = VOP_ACCESS(vp, mode, 0, cred, ct);
		}

		if (error) {
			VN_RELE(vp);
			return (error);
		}

		/* truncation first */
		if ((vp->v_type == VREG) && (vap->va_mask & AT_SIZE) &&
		    (vap->va_size == 0)) {
			ASSERT(parent->sdev_attrvp);
			error = VOP_CREATE(parent->sdev_attrvp,
			    nm, vap, excl, mode, &avp, cred, flag, ct, vsecp);

			if (error) {
				VN_RELE(vp);
				return (error);
			}
		}

		sdev_update_timestamps(vp, kcred,
		    AT_CTIME|AT_MTIME|AT_ATIME);
		*vpp = vp;
		return (0);
	}

	/* bail out early */
	if (error != ENOENT)
		return (error);

	/* verify write access - compliance specifies ENXIO */
	if ((error = VOP_ACCESS(dvp, VEXEC|VWRITE, 0, cred, ct)) != 0) {
		if (error == EACCES)
			error = ENXIO;
		return (error);
	}

	/*
	 * For memory-based (ROFS) directory:
	 * 	- either disallow node creation;
	 *	- or implement VOP_CREATE of its own
	 */
	rw_enter(&parent->sdev_contents, RW_WRITER);
	if (!SDEV_IS_PERSIST(parent)) {
		rw_exit(&parent->sdev_contents);
		return (ENOTSUP);
	}
	ASSERT(parent->sdev_attrvp);
	error = sdev_mknode(parent, nm, &self, vap, NULL, NULL,
	    cred, SDEV_READY);
	if (error) {
		rw_exit(&parent->sdev_contents);
		if (self)
			SDEV_RELE(self);
		return (error);
	}
	rw_exit(&parent->sdev_contents);

	ASSERT(self);
	/* take care the timestamps for the node and its parent */
	sdev_update_timestamps(SDEVTOV(self), kcred,
	    AT_CTIME|AT_MTIME|AT_ATIME);
	sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME);
	if (SDEV_IS_GLOBAL(parent))
		atomic_inc_ulong(&parent->sdev_gdir_gen);

	/* wake up other threads blocked on looking up this node */
	mutex_enter(&self->sdev_lookup_lock);
	SDEV_UNBLOCK_OTHERS(self, SDEV_LOOKUP);
	mutex_exit(&self->sdev_lookup_lock);
	error = sdev_to_vp(self, vpp);
	return (error);
}

static int
sdev_remove(struct vnode *dvp, char *nm, struct cred *cred,
    caller_context_t *ct, int flags)
{
	int	error;
	struct sdev_node *parent = (struct sdev_node *)VTOSDEV(dvp);
	struct vnode *vp = NULL;
	struct sdev_node *dv = NULL;
	int len;
	int bkstore = 0;

	/* bail out early */
	len = strlen(nm);
	if (nm[0] == '.') {
		if (len == 1) {
			return (EINVAL);
		} else if (len == 2 && nm[1] == '.') {
			return (EEXIST);
		}
	}

	ASSERT(parent);
	rw_enter(&parent->sdev_contents, RW_READER);
	if (!SDEV_IS_GLOBAL(parent)) {
		rw_exit(&parent->sdev_contents);
		return (ENOTSUP);
	}

	/* execute access is required to search the directory */
	if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0) {
		rw_exit(&parent->sdev_contents);
		return (error);
	}

	/* check existence first */
	dv = sdev_cache_lookup(parent, nm);
	if (dv == NULL) {
		rw_exit(&parent->sdev_contents);
		return (ENOENT);
	}

	vp = SDEVTOV(dv);
	if ((dv->sdev_state == SDEV_INIT) ||
	    (dv->sdev_state == SDEV_ZOMBIE)) {
		rw_exit(&parent->sdev_contents);
		VN_RELE(vp);
		return (ENOENT);
	}

	/* write access is required to remove an entry */
	if ((error = VOP_ACCESS(dvp, VWRITE, 0, cred, ct)) != 0) {
		rw_exit(&parent->sdev_contents);
		VN_RELE(vp);
		return (error);
	}

	/*
	 * sdev_dirdelete does the real job of:
	 *  - make sure no open ref count
	 *  - destroying the sdev_node
	 *  - releasing the hold on attrvp
	 */
	bkstore = SDEV_IS_PERSIST(dv) ? 1 : 0;
	if (!rw_tryupgrade(&parent->sdev_contents)) {
		rw_exit(&parent->sdev_contents);
		rw_enter(&parent->sdev_contents, RW_WRITER);
	}
	error = sdev_cache_update(parent, &dv, nm, SDEV_CACHE_DELETE);
	rw_exit(&parent->sdev_contents);

	sdcmn_err2(("sdev_remove: cache_update error %d\n", error));
	if (error && (error != EBUSY)) {
		/* report errors other than EBUSY */
		VN_RELE(vp);
	} else {
		sdcmn_err2(("sdev_remove: cleaning node %s from cache "
		    " with error %d\n", nm, error));

		/*
		 * best efforts clean up the backing store
		 */
		if (bkstore) {
			ASSERT(parent->sdev_attrvp);
			error = VOP_REMOVE(parent->sdev_attrvp, nm, cred,
			    ct, flags);
			/*
			 * do not report BUSY error
			 * because the backing store ref count is released
			 * when the last ref count on the sdev_node is
			 * released.
			 */
			if (error == EBUSY) {
				sdcmn_err2(("sdev_remove: device %s is still on"
				    "disk %s\n", nm, parent->sdev_path));
				error = 0;
			}
		}

		if (error == EBUSY)
			error = 0;
	}

	return (error);
}

/*
 * Some restrictions for this file system:
 *  - both oldnm and newnm are in the scope of /dev file system,
 *    to simply the namespace management model.
 */
/*ARGSUSED6*/
static int
sdev_rename(struct vnode *odvp, char *onm, struct vnode *ndvp, char *nnm,
    struct cred *cred, caller_context_t *ct, int flags)
{
	struct sdev_node	*fromparent = NULL;
	struct vattr		vattr;
	struct sdev_node	*toparent;
	struct sdev_node	*fromdv = NULL;	/* source node */
	struct vnode 		*ovp = NULL;	/* source vnode */
	struct sdev_node	*todv = NULL;	/* destination node */
	struct vnode 		*nvp = NULL;	/* destination vnode */
	int			samedir = 0;	/* set if odvp == ndvp */
	struct vnode		*realvp;
	int error = 0;
	dev_t fsid;
	int bkstore = 0;
	vtype_t type;

	/* prevent modifying "." and ".." */
	if ((onm[0] == '.' &&
	    (onm[1] == '\0' || (onm[1] == '.' && onm[2] == '\0'))) ||
	    (nnm[0] == '.' &&
	    (nnm[1] == '\0' || (nnm[1] == '.' && nnm[2] == '\0')))) {
		return (EINVAL);
	}

	fromparent = VTOSDEV(odvp);
	toparent = VTOSDEV(ndvp);

	/* ZOMBIE parent doesn't allow new node creation */
	rw_enter(&fromparent->sdev_dotdot->sdev_contents, RW_READER);
	if (fromparent->sdev_state == SDEV_ZOMBIE) {
		rw_exit(&fromparent->sdev_dotdot->sdev_contents);
		return (ENOENT);
	}

	/* renaming only supported for global device nodes */
	if (!SDEV_IS_GLOBAL(fromparent)) {
		rw_exit(&fromparent->sdev_dotdot->sdev_contents);
		return (ENOTSUP);
	}
	rw_exit(&fromparent->sdev_dotdot->sdev_contents);

	rw_enter(&toparent->sdev_dotdot->sdev_contents, RW_READER);
	if (toparent->sdev_state == SDEV_ZOMBIE) {
		rw_exit(&toparent->sdev_dotdot->sdev_contents);
		return (ENOENT);
	}
	rw_exit(&toparent->sdev_dotdot->sdev_contents);

	/*
	 * acquire the global lock to prevent
	 * mount/unmount/other rename activities.
	 */
	mutex_enter(&sdev_lock);

	/* check existence of the source node */
/* XXXci - We may need to translate the C-I flags on VOP_LOOKUP */
	error = VOP_LOOKUP(odvp, onm, &ovp, NULL, 0, NULL, cred, ct,
	    NULL, NULL);
	if (error) {
		sdcmn_err2(("sdev_rename: the source node %s exists\n",
		    onm));
		mutex_exit(&sdev_lock);
		return (error);
	}

	if (VOP_REALVP(ovp, &realvp, ct) == 0) {
		VN_HOLD(realvp);
		VN_RELE(ovp);
		ovp = realvp;
	}

	/* check existence of destination */
/* XXXci - We may need to translate the C-I flags on VOP_LOOKUP */
	error = VOP_LOOKUP(ndvp, nnm, &nvp, NULL, 0, NULL, cred, ct,
	    NULL, NULL);
	if (error && (error != ENOENT)) {
		mutex_exit(&sdev_lock);
		VN_RELE(ovp);
		return (error);
	}

	if (nvp && (VOP_REALVP(nvp, &realvp, ct) == 0)) {
		VN_HOLD(realvp);
		VN_RELE(nvp);
		nvp = realvp;
	}

	/*
	 * make sure the source and the destination are
	 * in the same dev filesystem
	 */
	if (odvp != ndvp) {
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(odvp, &vattr, 0, cred, ct)) {
			mutex_exit(&sdev_lock);
			VN_RELE(ovp);
			return (error);
		}
		fsid = vattr.va_fsid;
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(ndvp, &vattr, 0, cred, ct)) {
			mutex_exit(&sdev_lock);
			VN_RELE(ovp);
			return (error);
		}
		if (fsid != vattr.va_fsid) {
			mutex_exit(&sdev_lock);
			VN_RELE(ovp);
			return (EXDEV);
		}
	}

	/* make sure the old entry can be deleted */
	error = VOP_ACCESS(odvp, VWRITE, 0, cred, ct);
	if (error) {
		mutex_exit(&sdev_lock);
		VN_RELE(ovp);
		return (error);
	}

	/* make sure the destination allows creation */
	samedir = (fromparent == toparent);
	if (!samedir) {
		error = VOP_ACCESS(ndvp, VEXEC|VWRITE, 0, cred, ct);
		if (error) {
			mutex_exit(&sdev_lock);
			VN_RELE(ovp);
			return (error);
		}
	}

	fromdv = VTOSDEV(ovp);
	ASSERT(fromdv);

	/* destination file exists */
	if (nvp) {
		todv = VTOSDEV(nvp);
		ASSERT(todv);
	}

	/*
	 * link source to new target in the memory
	 */
	error = sdev_rnmnode(fromparent, fromdv, toparent, &todv, nnm, cred);
	if (error) {
		sdcmn_err2(("sdev_rename: renaming %s to %s failed "
		    " with error %d\n", onm, nnm, error));
		mutex_exit(&sdev_lock);
		if (nvp)
			VN_RELE(nvp);
		VN_RELE(ovp);
		return (error);
	}

	/*
	 * unlink from source
	 */
	rw_enter(&fromparent->sdev_contents, RW_READER);
	fromdv = sdev_cache_lookup(fromparent, onm);
	if (fromdv == NULL) {
		rw_exit(&fromparent->sdev_contents);
		mutex_exit(&sdev_lock);
		sdcmn_err2(("sdev_rename: the source is deleted already\n"));
		return (0);
	}

	if (fromdv->sdev_state == SDEV_ZOMBIE) {
		rw_exit(&fromparent->sdev_contents);
		mutex_exit(&sdev_lock);
		VN_RELE(SDEVTOV(fromdv));
		sdcmn_err2(("sdev_rename: the source is being deleted\n"));
		return (0);
	}
	rw_exit(&fromparent->sdev_contents);
	ASSERT(SDEVTOV(fromdv) == ovp);
	VN_RELE(ovp);

	/* clean out the directory contents before it can be removed */
	type = SDEVTOV(fromdv)->v_type;
	if (type == VDIR) {
		error = sdev_cleandir(fromdv, NULL, 0);
		sdcmn_err2(("sdev_rename: cleandir finished with %d\n",
		    error));
		if (error == EBUSY)
			error = 0;
	}

	rw_enter(&fromparent->sdev_contents, RW_WRITER);
	bkstore = SDEV_IS_PERSIST(fromdv) ? 1 : 0;
	error = sdev_cache_update(fromparent, &fromdv, onm,
	    SDEV_CACHE_DELETE);

	/* best effforts clean up the backing store */
	if (bkstore) {
		ASSERT(fromparent->sdev_attrvp);
		if (type != VDIR) {
/* XXXci - We may need to translate the C-I flags on VOP_REMOVE */
			error = VOP_REMOVE(fromparent->sdev_attrvp,
			    onm, kcred, ct, 0);
		} else {
/* XXXci - We may need to translate the C-I flags on VOP_RMDIR */
			error = VOP_RMDIR(fromparent->sdev_attrvp,
			    onm, fromparent->sdev_attrvp, kcred, ct, 0);
		}

		if (error) {
			sdcmn_err2(("sdev_rename: device %s is "
			    "still on disk %s\n", onm,
			    fromparent->sdev_path));
			error = 0;
		}
	}
	rw_exit(&fromparent->sdev_contents);
	mutex_exit(&sdev_lock);

	/* once reached to this point, the rename is regarded successful */
	return (0);
}

/*
 * dev-fs version of "ln -s path dev-name"
 *	tnm - path, e.g. /devices/... or /dev/...
 *	lnm - dev_name
 */
/*ARGSUSED6*/
static int
sdev_symlink(struct vnode *dvp, char *lnm, struct vattr *tva,
    char *tnm, struct cred *cred, caller_context_t *ct, int flags)
{
	int error;
	struct vnode *vp = NULL;
	struct sdev_node *parent = (struct sdev_node *)VTOSDEV(dvp);
	struct sdev_node *self = (struct sdev_node *)NULL;

	ASSERT(parent);
	rw_enter(&parent->sdev_dotdot->sdev_contents, RW_READER);
	if (parent->sdev_state == SDEV_ZOMBIE) {
		rw_exit(&parent->sdev_dotdot->sdev_contents);
		sdcmn_err2(("sdev_symlink: parent %s is ZOMBIED \n",
		    parent->sdev_name));
		return (ENOENT);
	}

	if (!SDEV_IS_GLOBAL(parent)) {
		rw_exit(&parent->sdev_dotdot->sdev_contents);
		return (ENOTSUP);
	}
	rw_exit(&parent->sdev_dotdot->sdev_contents);

	/* execute access is required to search a directory */
	if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
		return (error);

	/* find existing name */
/* XXXci - We may need to translate the C-I flags here */
	error = VOP_LOOKUP(dvp, lnm, &vp, NULL, 0, NULL, cred, ct, NULL, NULL);
	if (error == 0) {
		ASSERT(vp);
		VN_RELE(vp);
		sdcmn_err2(("sdev_symlink: node %s already exists\n", lnm));
		return (EEXIST);
	}
	if (error != ENOENT)
		return (error);

	/* write access is required to create a symlink */
	if ((error = VOP_ACCESS(dvp, VWRITE, 0, cred, ct)) != 0)
		return (error);

	/* put it into memory cache */
	rw_enter(&parent->sdev_contents, RW_WRITER);
	error = sdev_mknode(parent, lnm, &self, tva, NULL, (void *)tnm,
	    cred, SDEV_READY);
	if (error) {
		rw_exit(&parent->sdev_contents);
		sdcmn_err2(("sdev_symlink: node %s creation failed\n", lnm));
		if (self)
			SDEV_RELE(self);

		return (error);
	}
	ASSERT(self && (self->sdev_state == SDEV_READY));
	rw_exit(&parent->sdev_contents);

	/* take care the timestamps for the node and its parent */
	sdev_update_timestamps(SDEVTOV(self), kcred,
	    AT_CTIME|AT_MTIME|AT_ATIME);
	sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME);
	if (SDEV_IS_GLOBAL(parent))
		atomic_inc_ulong(&parent->sdev_gdir_gen);

	/* wake up other threads blocked on looking up this node */
	mutex_enter(&self->sdev_lookup_lock);
	SDEV_UNBLOCK_OTHERS(self, SDEV_LOOKUP);
	mutex_exit(&self->sdev_lookup_lock);
	SDEV_RELE(self);	/* don't return with vnode held */
	return (0);
}

/*ARGSUSED6*/
static int
sdev_mkdir(struct vnode *dvp, char *nm, struct vattr *va, struct vnode **vpp,
    struct cred *cred, caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	int error;
	struct sdev_node *parent = (struct sdev_node *)VTOSDEV(dvp);
	struct sdev_node *self = NULL;
	struct vnode	*vp = NULL;

	ASSERT(parent && parent->sdev_dotdot);
	rw_enter(&parent->sdev_dotdot->sdev_contents, RW_READER);
	if (parent->sdev_state == SDEV_ZOMBIE) {
		rw_exit(&parent->sdev_dotdot->sdev_contents);
		return (ENOENT);
	}

	/* non-global do not allow pure directory creation */
	if (!SDEV_IS_GLOBAL(parent)) {
		rw_exit(&parent->sdev_dotdot->sdev_contents);
		return (prof_lookup(dvp, nm, vpp, cred));
	}
	rw_exit(&parent->sdev_dotdot->sdev_contents);

	/* execute access is required to search the directory */
	if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0) {
		return (error);
	}

	/* find existing name */
/* XXXci - We may need to translate the C-I flags on VOP_LOOKUP */
	error = VOP_LOOKUP(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL, NULL);
	if (error == 0) {
		VN_RELE(vp);
		return (EEXIST);
	}
	if (error != ENOENT)
		return (error);

	/* require write access to create a directory */
	if ((error = VOP_ACCESS(dvp, VWRITE, 0, cred, ct)) != 0) {
		return (error);
	}

	/* put it into memory */
	rw_enter(&parent->sdev_contents, RW_WRITER);
	error = sdev_mknode(parent, nm, &self,
	    va, NULL, NULL, cred, SDEV_READY);
	if (error) {
		rw_exit(&parent->sdev_contents);
		if (self)
			SDEV_RELE(self);
		return (error);
	}
	ASSERT(self && (self->sdev_state == SDEV_READY));
	rw_exit(&parent->sdev_contents);

	/* take care the timestamps for the node and its parent */
	sdev_update_timestamps(SDEVTOV(self), kcred,
	    AT_CTIME|AT_MTIME|AT_ATIME);
	sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME);
	if (SDEV_IS_GLOBAL(parent))
		atomic_inc_ulong(&parent->sdev_gdir_gen);

	/* wake up other threads blocked on looking up this node */
	mutex_enter(&self->sdev_lookup_lock);
	SDEV_UNBLOCK_OTHERS(self, SDEV_LOOKUP);
	mutex_exit(&self->sdev_lookup_lock);
	*vpp = SDEVTOV(self);
	return (0);
}

/*
 * allowing removing an empty directory under /dev
 */
/*ARGSUSED*/
static int
sdev_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred,
    caller_context_t *ct, int flags)
{
	int error = 0;
	struct sdev_node *parent = (struct sdev_node *)VTOSDEV(dvp);
	struct sdev_node *self = NULL;
	struct vnode *vp = NULL;

	/* bail out early */
	if (strcmp(nm, ".") == 0)
		return (EINVAL);
	if (strcmp(nm, "..") == 0)
		return (EEXIST); /* should be ENOTEMPTY */

	/* no destruction of non-global node */
	ASSERT(parent && parent->sdev_dotdot);
	rw_enter(&parent->sdev_dotdot->sdev_contents, RW_READER);
	if (!SDEV_IS_GLOBAL(parent)) {
		rw_exit(&parent->sdev_dotdot->sdev_contents);
		return (ENOTSUP);
	}
	rw_exit(&parent->sdev_dotdot->sdev_contents);

	/* execute access is required to search the directory */
	if ((error = VOP_ACCESS(dvp, VEXEC|VWRITE, 0, cred, ct)) != 0)
		return (error);

	/* check existing name */
	rw_enter(&parent->sdev_contents, RW_WRITER);
	self = sdev_cache_lookup(parent, nm);
	if (self == NULL) {
		rw_exit(&parent->sdev_contents);
		return (ENOENT);
	}

	vp = SDEVTOV(self);
	if ((self->sdev_state == SDEV_INIT) ||
	    (self->sdev_state == SDEV_ZOMBIE)) {
		rw_exit(&parent->sdev_contents);
		VN_RELE(vp);
		return (ENOENT);
	}

	/* some sanity checks */
	if (vp == dvp || vp == cdir) {
		rw_exit(&parent->sdev_contents);
		VN_RELE(vp);
		return (EINVAL);
	}

	if (vp->v_type != VDIR) {
		rw_exit(&parent->sdev_contents);
		VN_RELE(vp);
		return (ENOTDIR);
	}

	if (vn_vfswlock(vp)) {
		rw_exit(&parent->sdev_contents);
		VN_RELE(vp);
		return (EBUSY);
	}

	if (vn_mountedvfs(vp) != NULL) {
		rw_exit(&parent->sdev_contents);
		vn_vfsunlock(vp);
		VN_RELE(vp);
		return (EBUSY);
	}

	self = VTOSDEV(vp);
	/* bail out on a non-empty directory */
	rw_enter(&self->sdev_contents, RW_READER);
	if (self->sdev_nlink > 2) {
		rw_exit(&self->sdev_contents);
		rw_exit(&parent->sdev_contents);
		vn_vfsunlock(vp);
		VN_RELE(vp);
		return (ENOTEMPTY);
	}
	rw_exit(&self->sdev_contents);

	/* unlink it from the directory cache */
	error = sdev_cache_update(parent, &self, nm, SDEV_CACHE_DELETE);
	rw_exit(&parent->sdev_contents);
	vn_vfsunlock(vp);

	if (error && (error != EBUSY)) {
		VN_RELE(vp);
	} else {
		sdcmn_err2(("sdev_rmdir: cleaning node %s from directory "
		    " cache with error %d\n", nm, error));

		/* best effort to clean up the backing store */
		if (SDEV_IS_PERSIST(parent)) {
			ASSERT(parent->sdev_attrvp);
			error = VOP_RMDIR(parent->sdev_attrvp, nm,
			    parent->sdev_attrvp, kcred, ct, flags);
			sdcmn_err2(("sdev_rmdir: cleaning device %s is on"
			    " disk error %d\n", parent->sdev_path, error));
		}

		if (error == EBUSY)
			error = 0;
	}

	return (error);
}

/*
 * read the contents of a symbolic link
 */
static int
sdev_readlink(struct vnode *vp, struct uio *uiop, struct cred *cred,
    caller_context_t *ct)
{
	struct sdev_node *dv;
	int	error = 0;

	ASSERT(vp->v_type == VLNK);

	dv = VTOSDEV(vp);

	if (dv->sdev_attrvp) {
		/* non-NULL attrvp implys a persisted node at READY state */
		return (VOP_READLINK(dv->sdev_attrvp, uiop, cred, ct));
	} else if (dv->sdev_symlink != NULL) {
		/* memory nodes, e.g. local nodes */
		rw_enter(&dv->sdev_contents, RW_READER);
		sdcmn_err2(("sdev_readlink link is %s\n", dv->sdev_symlink));
		error = uiomove(dv->sdev_symlink, strlen(dv->sdev_symlink),
		    UIO_READ, uiop);
		rw_exit(&dv->sdev_contents);
		return (error);
	}

	return (ENOENT);
}

/*ARGSUSED4*/
static int
sdev_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp,
    caller_context_t *ct, int flags)
{
	struct sdev_node *parent = VTOSDEV(dvp);
	int error;

	/* execute access is required to search the directory */
	if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
		return (error);

	ASSERT(parent);
	if (!SDEV_IS_GLOBAL(parent))
		prof_filldir(parent);
	return (devname_readdir_func(dvp, uiop, cred, eofp, SDEV_BROWSE));
}

/*ARGSUSED1*/
static void
sdev_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
{
	devname_inactive_func(vp, cred, NULL);
}

/*ARGSUSED2*/
static int
sdev_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
{
	struct sdev_node	*dv = VTOSDEV(vp);
	struct sdev_fid	*sdev_fid;

	if (fidp->fid_len < (sizeof (struct sdev_fid) - sizeof (ushort_t))) {
		fidp->fid_len = sizeof (struct sdev_fid) - sizeof (ushort_t);
		return (ENOSPC);
	}

	sdev_fid = (struct sdev_fid *)fidp;
	bzero(sdev_fid, sizeof (struct sdev_fid));
	sdev_fid->sdevfid_len =
	    (int)sizeof (struct sdev_fid) - sizeof (ushort_t);
	sdev_fid->sdevfid_ino = dv->sdev_ino;

	return (0);
}

/*
 * This pair of routines bracket all VOP_READ, VOP_WRITE
 * and VOP_READDIR requests.  The contents lock stops things
 * moving around while we're looking at them.
 */
/*ARGSUSED2*/
static int
sdev_rwlock(struct vnode *vp, int write_flag, caller_context_t *ctp)
{
	rw_enter(&VTOSDEV(vp)->sdev_contents,
	    write_flag ? RW_WRITER : RW_READER);
	return (write_flag ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE);
}

/*ARGSUSED1*/
static void
sdev_rwunlock(struct vnode *vp, int write_flag, caller_context_t *ctp)
{
	rw_exit(&VTOSDEV(vp)->sdev_contents);
}

/*ARGSUSED1*/
static int
sdev_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
    caller_context_t *ct)
{
	struct vnode *attrvp = VTOSDEV(vp)->sdev_attrvp;

	ASSERT(vp->v_type != VCHR &&
	    vp->v_type != VBLK && vp->v_type != VLNK);

	if (vp->v_type == VDIR)
		return (fs_seek(vp, ooff, noffp, ct));

	ASSERT(attrvp);
	return (VOP_SEEK(attrvp, ooff, noffp, ct));
}

/*ARGSUSED1*/
static int
sdev_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
    offset_t offset, struct flk_callback *flk_cbp, struct cred *cr,
    caller_context_t *ct)
{
	int error;
	struct sdev_node *dv = VTOSDEV(vp);

	ASSERT(dv);
	ASSERT(dv->sdev_attrvp);
	error = VOP_FRLOCK(dv->sdev_attrvp, cmd, bfp, flag, offset,
	    flk_cbp, cr, ct);

	return (error);
}

static int
sdev_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	switch (cmd) {
	case _PC_ACL_ENABLED:
		*valp = SDEV_ACL_FLAVOR(vp);
		return (0);
	}

	return (fs_pathconf(vp, cmd, valp, cr, ct));
}

vnodeops_t *sdev_vnodeops;

const fs_operation_def_t sdev_vnodeops_tbl[] = {
	VOPNAME_OPEN,		{ .vop_open = sdev_open },
	VOPNAME_CLOSE,		{ .vop_close = sdev_close },
	VOPNAME_READ,		{ .vop_read = sdev_read },
	VOPNAME_WRITE,		{ .vop_write = sdev_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = sdev_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = sdev_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = sdev_setattr },
	VOPNAME_ACCESS,		{ .vop_access = sdev_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = sdev_lookup },
	VOPNAME_CREATE,		{ .vop_create = sdev_create },
	VOPNAME_RENAME,		{ .vop_rename = sdev_rename },
	VOPNAME_REMOVE,		{ .vop_remove = sdev_remove },
	VOPNAME_MKDIR,		{ .vop_mkdir = sdev_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = sdev_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = sdev_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = sdev_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = sdev_readlink },
	VOPNAME_INACTIVE,	{ .vop_inactive = sdev_inactive },
	VOPNAME_FID,		{ .vop_fid = sdev_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = sdev_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = sdev_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = sdev_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = sdev_frlock },
	VOPNAME_PATHCONF,	{ .vop_pathconf = sdev_pathconf },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = sdev_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = sdev_getsecattr },
	NULL,			NULL
};

int sdev_vnodeops_tbl_size = sizeof (sdev_vnodeops_tbl);