3978 renaming dynamic sdev nodes is problematic
authorRobert Mustacchi <rm@joyent.com>
Wed, 08 May 2013 01:46:41 +0000
changeset 14223 1652c59077c6
parent 14222 c3f8a4690b1f
child 14224 85a6c280af72
3978 renaming dynamic sdev nodes is problematic 3979 sdev dynamic directories can be created multiple times 3980 sdev vfs refcount assertion violation 3981 sdev backing store nodes not always present 3982 sdev could use some theory statements 3983 sdev_shadow_node slept through a vn_rele Reviewed by: Keith M Wesolowski <[email protected]> Reviewed by: Jerry Jelinek <[email protected]> Reviewed by: Gordon Ross <[email protected]> Reviewed by: Richard Lowe <[email protected]> Approved by: Garrett D'Amore <[email protected]>
usr/src/uts/common/fs/dev/sdev_ipnetops.c
usr/src/uts/common/fs/dev/sdev_netops.c
usr/src/uts/common/fs/dev/sdev_profile.c
usr/src/uts/common/fs/dev/sdev_ptsops.c
usr/src/uts/common/fs/dev/sdev_subr.c
usr/src/uts/common/fs/dev/sdev_vnops.c
usr/src/uts/common/fs/dev/sdev_vtops.c
usr/src/uts/common/fs/dev/sdev_zvolops.c
usr/src/uts/common/sys/fs/sdev_impl.h
--- a/usr/src/uts/common/fs/dev/sdev_ipnetops.c	Mon Jun 11 18:23:37 2012 +0000
+++ b/usr/src/uts/common/fs/dev/sdev_ipnetops.c	Wed May 08 01:46:41 2013 +0000
@@ -161,6 +161,14 @@
 	if (rw_tryupgrade(&ddv->sdev_contents) == NULL) {
 		rw_exit(&ddv->sdev_contents);
 		rw_enter(&ddv->sdev_contents, RW_WRITER);
+		/*
+		 * We've been made a zombie while we weren't looking. We'll bail
+		 * if that's the case.
+		 */
+		if (ddv->sdev_state == SDEV_ZOMBIE) {
+			rw_exit(&ddv->sdev_contents);
+			return;
+		}
 	}
 
 	for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = next) {
@@ -186,6 +194,7 @@
 		/* remove the cache node */
 		(void) sdev_cache_update(ddv, &dv, dv->sdev_name,
 		    SDEV_CACHE_DELETE);
+		SDEV_RELE(dv);
 	}
 
 	ipnet_walk_if(devipnet_filldir_entry, ddv, getzoneid());
--- a/usr/src/uts/common/fs/dev/sdev_netops.c	Mon Jun 11 18:23:37 2012 +0000
+++ b/usr/src/uts/common/fs/dev/sdev_netops.c	Wed May 08 01:46:41 2013 +0000
@@ -58,7 +58,6 @@
 	datalink_id_t linkid;
 	zoneid_t zoneid;
 
-	ASSERT(!(dv->sdev_flags & SDEV_STALE));
 	ASSERT(dv->sdev_state == SDEV_READY);
 
 	if (dls_mgmt_get_linkid(dv->sdev_name, &linkid) != 0)
@@ -149,13 +148,9 @@
 	 * directory cache lookup:
 	 */
 	if ((dv = sdev_cache_lookup(ddv, nm)) != NULL) {
-		if (dv->sdev_state == SDEV_READY) {
-			if (!(dv->sdev_flags & SDEV_ATTR_INVALID))
-				goto found;
-		} else {
-			ASSERT(dv->sdev_state == SDEV_ZOMBIE);
-			goto failed;
-		}
+		ASSERT(dv->sdev_state == SDEV_READY);
+		if (!(dv->sdev_flags & SDEV_ATTR_INVALID))
+			goto found;
 	}
 
 	/*
@@ -170,7 +165,6 @@
 
 	error = sdev_mknode(ddv, nm, &dv, &vattr, NULL, NULL, cred, SDEV_READY);
 	if (error != 0) {
-		ASSERT(dv == NULL);
 		dls_devnet_close(ddh);
 		goto failed;
 	}
@@ -286,6 +280,7 @@
 		/* remove the cache node */
 		(void) sdev_cache_update(ddv, &dv, dv->sdev_name,
 		    SDEV_CACHE_DELETE);
+		SDEV_RELE(dv);
 	}
 
 	if (((ddv->sdev_flags & SDEV_BUILD) == 0) && !dls_devnet_rebuild())
--- a/usr/src/uts/common/fs/dev/sdev_profile.c	Mon Jun 11 18:23:37 2012 +0000
+++ b/usr/src/uts/common/fs/dev/sdev_profile.c	Wed May 08 01:46:41 2013 +0000
@@ -161,7 +161,7 @@
 		return (rv);
 	}
 
-	rv = sdev_cache_update(dir, &dv, name, SDEV_CACHE_ADD);
+	sdev_cache_update(dir, &dv, name, SDEV_CACHE_ADD);
 	*newdv = dv;
 
 	/* put it in ready state */
@@ -173,6 +173,9 @@
 			sdcmn_err10(("sdev_origin for %s set to 0x%p\n",
 			    name, arg));
 		apply_glob_pattern(dir, *newdv);
+	} else {
+		sdev_cache_update(dir, &dv, name, SDEV_CACHE_DELETE);
+		SDEV_RELE(dv);
 	}
 	return (rv);
 }
@@ -198,8 +201,8 @@
 	if (newdv = sdev_cache_lookup(dir, name)) {
 		*dirp = newdv;
 		*gdirp = newdv->sdev_origin;
+		rw_exit(&dir->sdev_contents);
 		SDEV_RELE(dir);
-		rw_exit(&dir->sdev_contents);
 		return (0);
 	}
 	rw_exit(&dir->sdev_contents);
@@ -676,6 +679,10 @@
 	    == gdir->sdev_gdir_gen))
 		return;		/* already up to date */
 
+	/* We may have become a zombie (across a try) */
+	if (ddv->sdev_state == SDEV_ZOMBIE)
+		return;
+
 	if (firsttime && rw_tryupgrade(&ddv->sdev_contents) == 0) {
 		rw_exit(&ddv->sdev_contents);
 		firsttime = 0;
--- a/usr/src/uts/common/fs/dev/sdev_ptsops.c	Mon Jun 11 18:23:37 2012 +0000
+++ b/usr/src/uts/common/fs/dev/sdev_ptsops.c	Wed May 08 01:46:41 2013 +0000
@@ -107,7 +107,6 @@
 	timestruc_t now;
 	char *nm = dv->sdev_name;
 
-	ASSERT(!(dv->sdev_flags & SDEV_STALE));
 	ASSERT(dv->sdev_state == SDEV_READY);
 
 	/* validate only READY nodes */
@@ -258,6 +257,7 @@
 		/* remove the cache node */
 		(void) sdev_cache_update(ddv, &dv, dv->sdev_name,
 		    SDEV_CACHE_DELETE);
+		SDEV_RELE(dv);
 	}
 	rw_downgrade(&ddv->sdev_contents);
 }
--- a/usr/src/uts/common/fs/dev/sdev_subr.c	Mon Jun 11 18:23:37 2012 +0000
+++ b/usr/src/uts/common/fs/dev/sdev_subr.c	Wed May 08 01:46:41 2013 +0000
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 /*
@@ -370,7 +370,8 @@
 }
 
 /*
- * transition a sdev_node into SDEV_READY state
+ * Transition a sdev_node into SDEV_READY state. If this fails, it is up to the
+ * caller to transition the node to the SDEV_ZOMBIE state.
  */
 int
 sdev_nodeready(struct sdev_node *dv, struct vattr *vap, struct vnode *avp,
@@ -433,26 +434,12 @@
 		/* transition to READY state */
 		sdev_set_nodestate(dv, SDEV_READY);
 		sdev_nc_node_exists(dv);
-	} else {
-		sdev_set_nodestate(dv, SDEV_ZOMBIE);
 	}
 	rw_exit(&dv->sdev_contents);
 	return (error);
 }
 
 /*
- * setting ZOMBIE state
- */
-static int
-sdev_nodezombied(struct sdev_node *dv)
-{
-	rw_enter(&dv->sdev_contents, RW_WRITER);
-	sdev_set_nodestate(dv, SDEV_ZOMBIE);
-	rw_exit(&dv->sdev_contents);
-	return (0);
-}
-
-/*
  * Build the VROOT sdev_node.
  */
 /*ARGSUSED*/
@@ -568,23 +555,31 @@
 	{ NULL, NULL, NULL, NULL, NULL, 0}
 };
 
+/*
+ * We need to match off of the sdev_path, not the sdev_name. We are only allowed
+ * to exist directly under /dev.
+ */
 struct sdev_vop_table *
 sdev_match(struct sdev_node *dv)
 {
 	int vlen;
 	int i;
+	const char *path;
+
+	if (strlen(dv->sdev_path) <= 5)
+		return (NULL);
+
+	if (strncmp(dv->sdev_path, "/dev/", 5) != 0)
+		return (NULL);
+	path = dv->sdev_path + 5;
 
 	for (i = 0; vtab[i].vt_name; i++) {
-		if (strcmp(vtab[i].vt_name, dv->sdev_name) == 0)
+		if (strcmp(vtab[i].vt_name, path) == 0)
 			return (&vtab[i]);
 		if (vtab[i].vt_flags & SDEV_SUBDIR) {
-			char *ptr;
-
-			ASSERT(strlen(dv->sdev_path) > 5);
-			ptr = dv->sdev_path + 5;
 			vlen = strlen(vtab[i].vt_name);
-			if ((strncmp(vtab[i].vt_name, ptr,
-			    vlen - 1) == 0) && ptr[vlen] == '/')
+			if ((strncmp(vtab[i].vt_name, path,
+			    vlen - 1) == 0) && path[vlen] == '/')
 				return (&vtab[i]);
 		}
 
@@ -610,10 +605,14 @@
 	/* gets the vtab entry it matches */
 	if ((vtp = sdev_match(dv)) != NULL) {
 		dv->sdev_flags |= vtp->vt_flags;
+		if (SDEV_IS_PERSIST(dv->sdev_dotdot) &&
+		    (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv)))
+			dv->sdev_flags |= SDEV_PERSIST;
 
 		if (vtp->vt_vops) {
 			if (vtp->vt_global_vops)
 				*(vtp->vt_global_vops) = vtp->vt_vops;
+
 			return (vtp->vt_vops);
 		}
 
@@ -631,8 +630,10 @@
 				*(vtp->vt_global_vops) = vtp->vt_vops;
 			}
 			sdev_free_vtab(templ);
+
 			return (vtp->vt_vops);
 		}
+
 		return (sdev_vnodeops);
 	}
 
@@ -743,7 +744,6 @@
 	ASSERT(linkvp->v_type == VLNK);
 	err = sdev_getlink(linkvp, &link);
 	if (err) {
-		(void) sdev_nodezombied(dv);
 		dv->sdev_symlink = NULL;
 		return (ENOENT);
 	}
@@ -839,12 +839,7 @@
 		ASSERT(dv);
 
 		/* insert into the directory cache */
-		error = sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_ADD);
-		if (error) {
-			sdcmn_err9(("sdev_mknode: node %s can not"
-			    " be added into directory cache\n", nm));
-			return (ENOENT);
-		}
+		sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_ADD);
 	}
 
 	ASSERT(dv);
@@ -878,7 +873,13 @@
 		*newdv = dv;
 		ASSERT((*newdv)->sdev_state != SDEV_ZOMBIE);
 	} else {
-		SDEV_SIMPLE_RELE(dv);
+		sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_DELETE);
+		/*
+		 * We created this node, it wasn't passed into us. Therefore it
+		 * is up to us to delete it.
+		 */
+		if (*newdv == NULL)
+			SDEV_SIMPLE_RELE(dv);
 		*newdv = NULL;
 	}
 
@@ -986,6 +987,7 @@
 	if (dv) {
 		ASSERT(dv->sdev_dotdot == ddv);
 		ASSERT(strcmp(dv->sdev_name, nm) == 0);
+		ASSERT(dv->sdev_state != SDEV_ZOMBIE);
 		SDEV_HOLD(dv);
 		return (dv);
 	}
@@ -1004,6 +1006,7 @@
 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
 	ASSERT(ddv->sdev_nlink >= 2);
 	ASSERT(dv->sdev_nlink == 0);
+	ASSERT(dv->sdev_state != SDEV_ZOMBIE);
 
 	dv->sdev_dotdot = ddv;
 	VERIFY(avl_find(&ddv->sdev_entries, dv, &where) == NULL);
@@ -1019,60 +1022,55 @@
 static void
 decr_link(struct sdev_node *dv)
 {
-	if (dv->sdev_state != SDEV_INIT)
+	VERIFY(RW_WRITE_HELD(&dv->sdev_contents));
+	if (dv->sdev_state != SDEV_INIT) {
+		VERIFY(dv->sdev_nlink >= 1);
 		dv->sdev_nlink--;
-	else
-		ASSERT(dv->sdev_nlink == 0);
+	} else {
+		VERIFY(dv->sdev_nlink == 0);
+	}
 }
 
 /*
  * Delete an existing dv from directory cache
  *
- * In the case of a node is still held by non-zero reference count,
- *     the node is put into ZOMBIE state. Once the reference count
- *     reaches "0", the node is unlinked and destroyed,
- *     in sdev_inactive().
+ * In the case of a node is still held by non-zero reference count, the node is
+ * put into ZOMBIE state. The node is always unlinked from its parent, but it is
+ * not destroyed via sdev_inactive until its reference count reaches "0".
  */
-static int
+static void
 sdev_dirdelete(struct sdev_node *ddv, struct sdev_node *dv)
 {
 	struct vnode *vp;
+	sdev_node_state_t os;
 
 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
 
 	vp = SDEVTOV(dv);
 	mutex_enter(&vp->v_lock);
-
-	/* dv is held still */
-	if (vp->v_count > 1) {
-		rw_enter(&dv->sdev_contents, RW_WRITER);
-		if (dv->sdev_state == SDEV_READY) {
-			sdcmn_err9((
-			    "sdev_dirdelete: node %s busy with count %d\n",
-			    dv->sdev_name, vp->v_count));
-			dv->sdev_state = SDEV_ZOMBIE;
-		}
-		rw_exit(&dv->sdev_contents);
-		--vp->v_count;
-		mutex_exit(&vp->v_lock);
-		return (EBUSY);
-	}
-	ASSERT(vp->v_count == 1);
-
-	/* unlink from the memory cache */
-	ddv->sdev_nlink--;	/* .. to above */
-	if (vp->v_type == VDIR) {
-		decr_link(dv);		/* . to self */
-	}
-
+	rw_enter(&dv->sdev_contents, RW_WRITER);
+	os = dv->sdev_state;
+	ASSERT(os != SDEV_ZOMBIE);
+	dv->sdev_state = SDEV_ZOMBIE;
+
+	/*
+	 * unlink ourselves from the parent directory now to take care of the ..
+	 * link. However, if we're a directory, we don't remove our reference to
+	 * ourself eg. '.' until we are torn down in the inactive callback.
+	 */
+	decr_link(ddv);
 	avl_remove(&ddv->sdev_entries, dv);
-	decr_link(dv);	/* name, back to zero */
-	vp->v_count--;
+	/*
+	 * sdev_inactive expects nodes to have a link to themselves when we're
+	 * tearing them down. If we're transitioning from the initial state to
+	 * zombie and not via ready, then we're not going to have this link that
+	 * comes from the node being ready. As a result, we need to increment
+	 * our link count by one to account for this.
+	 */
+	if (os == SDEV_INIT && dv->sdev_nlink == 0)
+		dv->sdev_nlink++;
+	rw_exit(&dv->sdev_contents);
 	mutex_exit(&vp->v_lock);
-
-	/* destroy the node */
-	sdev_nodedestroy(dv, 0);
-	return (0);
 }
 
 /*
@@ -1169,6 +1167,35 @@
 			goto err_out;
 	}
 
+	/* fix the source for a symlink */
+	if (vattr.va_type == VLNK) {
+		if (odv->sdev_symlink == NULL) {
+			error = sdev_follow_link(odv);
+			if (error) {
+				/*
+				 * The underlying symlink doesn't exist. This
+				 * node probably shouldn't even exist. While
+				 * it's a bit jarring to consumers, we're going
+				 * to remove the node from /dev.
+				 */
+				if (SDEV_IS_PERSIST((*ndvp)))
+					bkstore = 1;
+				sdev_dirdelete(oddv, odv);
+				if (bkstore) {
+					ASSERT(nddv->sdev_attrvp);
+					error = VOP_REMOVE(nddv->sdev_attrvp,
+					    nnm, cred, NULL, 0);
+					if (error)
+						goto err_out;
+				}
+				error = ENOENT;
+				goto err_out;
+			}
+		}
+		ASSERT(odv->sdev_symlink);
+		link = i_ddi_strdup(odv->sdev_symlink, KM_SLEEP);
+	}
+
 	/* destination existing */
 	if (*ndvp) {
 		nvp = SDEVTOV(*ndvp);
@@ -1205,7 +1232,12 @@
 			}
 			vn_vfsunlock(nvp);
 
-			(void) sdev_dirdelete(nddv, *ndvp);
+			/*
+			 * We did not place the hold on *ndvp, so even though
+			 * we're deleting the node, we should not get rid of our
+			 * reference.
+			 */
+			sdev_dirdelete(nddv, *ndvp);
 			*ndvp = NULL;
 			ASSERT(nddv->sdev_attrvp);
 			error = VOP_RMDIR(nddv->sdev_attrvp, nnm,
@@ -1223,11 +1255,11 @@
 			}
 
 			/*
-			 * get rid of the node from the directory cache
-			 * note, in case EBUSY is returned, the ZOMBIE
-			 * node is taken care in sdev_mknode.
+			 * Get rid of the node from the directory cache note.
+			 * Don't forget that it's not up to us to remove the vn
+			 * ref on the sdev node, as we did not place it.
 			 */
-			(void) sdev_dirdelete(nddv, *ndvp);
+			sdev_dirdelete(nddv, *ndvp);
 			*ndvp = NULL;
 			if (bkstore) {
 				ASSERT(nddv->sdev_attrvp);
@@ -1239,19 +1271,6 @@
 		}
 	}
 
-	/* fix the source for a symlink */
-	if (vattr.va_type == VLNK) {
-		if (odv->sdev_symlink == NULL) {
-			error = sdev_follow_link(odv);
-			if (error) {
-				error = ENOENT;
-				goto err_out;
-			}
-		}
-		ASSERT(odv->sdev_symlink);
-		link = i_ddi_strdup(odv->sdev_symlink, KM_SLEEP);
-	}
-
 	/*
 	 * make a fresh node from the source attrs
 	 */
@@ -1259,8 +1278,10 @@
 	error = sdev_mknode(nddv, nnm, ndvp, &vattr,
 	    NULL, (void *)link, cred, SDEV_READY);
 
-	if (link)
+	if (link != NULL) {
 		kmem_free(link, strlen(link) + 1);
+		link = NULL;
+	}
 
 	if (error)
 		goto err_out;
@@ -1271,9 +1292,11 @@
 	if (doingdir) {
 		for (idv = SDEV_FIRST_ENTRY(odv); idv;
 		    idv = SDEV_NEXT_ENTRY(odv, idv)) {
+			SDEV_HOLD(idv);
 			error = sdev_rnmnode(odv, idv,
 			    (struct sdev_node *)(*ndvp), &ndv,
 			    idv->sdev_name, cred);
+			SDEV_RELE(idv);
 			if (error)
 				goto err_out;
 			ndv = NULL;
@@ -1307,6 +1330,11 @@
 	return (error);
 
 err_out:
+	if (link != NULL) {
+		kmem_free(link, strlen(link) + 1);
+		link = NULL;
+	}
+
 	rw_exit(&nddv->sdev_contents);
 	if (!samedir)
 		rw_exit(&oddv->sdev_contents);
@@ -1465,18 +1493,8 @@
 			vp = NULLVP;
 			dv = sdev_cache_lookup(ddv, nm);
 			if (dv) {
-				if (dv->sdev_state != SDEV_ZOMBIE) {
-					SDEV_SIMPLE_RELE(dv);
-				} else {
-					/*
-					 * A ZOMBIE node may not have been
-					 * cleaned up from the backing store,
-					 * bypass this entry in this case,
-					 * and clean it up from the directory
-					 * cache if this is the last call.
-					 */
-					(void) sdev_dirdelete(ddv, dv);
-				}
+				VERIFY(dv->sdev_state != SDEV_ZOMBIE);
+				SDEV_SIMPLE_RELE(dv);
 				continue;
 			}
 
@@ -1547,11 +1565,16 @@
 	vap->va_ctime = vap->va_atime;
 	for (i = 0; vtab[i].vt_name != NULL; i++) {
 		/*
-		 * This early, we may be in a read-only /dev
-		 * environment: leave the creation of any nodes we'd
-		 * attempt to persist to devfsadm.
+		 * This early, we may be in a read-only /dev environment: leave
+		 * the creation of any nodes we'd attempt to persist to
+		 * devfsadm. Because /dev itself is normally persistent, any
+		 * node which is not marked dynamic will end up being marked
+		 * persistent. However, some nodes are both dynamic and
+		 * persistent, mostly lofi and rlofi, so we need to be careful
+		 * in our check.
 		 */
-		if (vtab[i].vt_flags & SDEV_PERSIST)
+		if ((vtab[i].vt_flags & SDEV_PERSIST) ||
+		    !(vtab[i].vt_flags & SDEV_DYNAMIC))
 			continue;
 		nm = vtab[i].vt_name;
 		ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
@@ -1614,6 +1637,8 @@
 		error = VOP_MKDIR(rdvp, nm, vap, rvp, cred, NULL, 0, NULL);
 		sdcmn_err9(("sdev_shadow_node: mkdir vp %p error %d\n",
 		    (void *)(*rvp), error));
+		if (!error)
+			VN_RELE(*rvp);
 		break;
 	case VCHR:
 	case VBLK:
@@ -1647,74 +1672,49 @@
 	return (error);
 }
 
-static int
+static void
 sdev_cache_add(struct sdev_node *ddv, struct sdev_node **dv, char *nm)
 {
-	int error = 0;
 	struct sdev_node *dup = NULL;
 
 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
 	if ((dup = sdev_findbyname(ddv, nm)) == NULL) {
 		sdev_direnter(ddv, *dv);
 	} else {
-		if (dup->sdev_state == SDEV_ZOMBIE) {
-			error = sdev_dirdelete(ddv, dup);
-			/*
-			 * The ZOMBIE node is still hanging
-			 * around with more than one reference counts.
-			 * Fail the new node creation so that
-			 * the directory cache won't have
-			 * duplicate entries for the same named node
-			 */
-			if (error == EBUSY) {
-				SDEV_SIMPLE_RELE(*dv);
-				sdev_nodedestroy(*dv, 0);
-				*dv = NULL;
-				return (error);
-			}
-			sdev_direnter(ddv, *dv);
-		} else {
-			ASSERT((*dv)->sdev_state != SDEV_ZOMBIE);
-			SDEV_SIMPLE_RELE(*dv);
-			sdev_nodedestroy(*dv, 0);
-			*dv = dup;
-		}
+		VERIFY(dup->sdev_state != SDEV_ZOMBIE);
+		SDEV_SIMPLE_RELE(*dv);
+		sdev_nodedestroy(*dv, 0);
+		*dv = dup;
 	}
-
-	return (0);
 }
 
-static int
+static void
 sdev_cache_delete(struct sdev_node *ddv, struct sdev_node **dv)
 {
 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
-	return (sdev_dirdelete(ddv, *dv));
+	sdev_dirdelete(ddv, *dv);
 }
 
 /*
  * update the in-core directory cache
  */
-int
+void
 sdev_cache_update(struct sdev_node *ddv, struct sdev_node **dv, char *nm,
     sdev_cache_ops_t ops)
 {
-	int error = 0;
-
 	ASSERT((SDEV_HELD(*dv)));
 
 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
 	switch (ops) {
 	case SDEV_CACHE_ADD:
-		error = sdev_cache_add(ddv, dv, nm);
+		sdev_cache_add(ddv, dv, nm);
 		break;
 	case SDEV_CACHE_DELETE:
-		error = sdev_cache_delete(ddv, dv);
+		sdev_cache_delete(ddv, dv);
 		break;
 	default:
 		break;
 	}
-
-	return (error);
 }
 
 /*
@@ -2157,7 +2157,6 @@
 	}
 
 found:
-	ASSERT(!(dv->sdev_flags & SDEV_STALE));
 	ASSERT(dv->sdev_state == SDEV_READY);
 	if (vtor) {
 		/*
@@ -2176,13 +2175,11 @@
 				rw_exit(&ddv->sdev_contents);
 				rw_enter(&ddv->sdev_contents, RW_WRITER);
 			}
-			error = sdev_cache_update(ddv, &dv, nm,
-			    SDEV_CACHE_DELETE);
+			sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_DELETE);
 			rw_downgrade(&ddv->sdev_contents);
-			if (error == 0) {
-				dv = NULL;
-				goto lookup_create_node;
-			}
+			SDEV_RELE(dv);
+			dv = NULL;
+			goto lookup_create_node;
 			/* FALLTHRU */
 		case SDEV_VTOR_INVALID:
 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
@@ -2229,9 +2226,10 @@
 		 * changes. Re-check.
 		 */
 		if (dv->sdev_state == SDEV_INIT) {
-			(void) sdev_dirdelete(ddv, dv);
+			sdev_dirdelete(ddv, dv);
 			rw_exit(&ddv->sdev_contents);
 			sdev_lookup_failed(ddv, nm, failed_flags);
+			SDEV_RELE(dv);
 			*vpp = NULL;
 			return (ENOENT);
 		}
@@ -2263,15 +2261,14 @@
 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
 
 	rw_enter(&ddv->sdev_contents, RW_WRITER);
-	for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = SDEV_NEXT_ENTRY(ddv, dv)) {
+	while ((dv = SDEV_FIRST_ENTRY(ddv)) != NULL) {
 		vp = SDEVTOV(dv);
+		SDEV_HOLD(dv);
 		if (vp->v_type == VDIR)
 			sdev_stale(dv);
 
-		sdcmn_err9(("sdev_stale: setting stale %s\n",
-		    dv->sdev_path));
-		dv->sdev_flags |= SDEV_STALE;
-		avl_remove(&ddv->sdev_entries, dv);
+		sdev_dirdelete(ddv, dv);
+		SDEV_RELE(dv);
 	}
 	ddv->sdev_flags |= SDEV_BUILD;
 	rw_exit(&ddv->sdev_contents);
@@ -2289,7 +2286,7 @@
 	int error = 0;
 	int busy = 0;
 	struct vnode *vp;
-	struct sdev_node *dv, *next = NULL;
+	struct sdev_node *dv;
 	int bkstore = 0;
 	int len = 0;
 	char *bks_name = NULL;
@@ -2300,8 +2297,7 @@
 	 * We try our best to destroy all unused sdev_node's
 	 */
 	rw_enter(&ddv->sdev_contents, RW_WRITER);
-	for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = next) {
-		next = SDEV_NEXT_ENTRY(ddv, dv);
+	while ((dv = SDEV_FIRST_ENTRY(ddv)) != NULL) {
 		vp = SDEVTOV(dv);
 
 		if (expr && gmatch(dv->sdev_name, expr) == 0)
@@ -2338,15 +2334,10 @@
 			bcopy(dv->sdev_name, bks_name, len);
 		}
 
-		error = sdev_dirdelete(ddv, dv);
-
-		if (error == EBUSY) {
-			sdcmn_err9(("sdev_cleandir: dir busy\n"));
-			busy++;
-		}
+		sdev_dirdelete(ddv, dv);
 
 		/* take care the backing store clean up */
-		if (bkstore && (error == 0)) {
+		if (bkstore) {
 			ASSERT(bks_name);
 			ASSERT(ddv->sdev_attrvp);
 
@@ -2370,6 +2361,9 @@
 			bks_name = NULL;
 			len = 0;
 		}
+
+		ddv->sdev_flags |= SDEV_BUILD;
+		SDEV_RELE(dv);
 	}
 
 	ddv->sdev_flags |= SDEV_BUILD;
@@ -3114,42 +3108,41 @@
 {
 	int clean;
 	struct sdev_node *dv = VTOSDEV(vp);
-	struct sdev_node *ddv = dv->sdev_dotdot;
 	int state;
 
-	rw_enter(&ddv->sdev_contents, RW_WRITER);
-	state = dv->sdev_state;
-
 	mutex_enter(&vp->v_lock);
 	ASSERT(vp->v_count >= 1);
 
+
 	if (vp->v_count == 1 && callback != NULL)
 		callback(vp);
 
+	rw_enter(&dv->sdev_contents, RW_WRITER);
+	state = dv->sdev_state;
+
 	clean = (vp->v_count == 1) && (state == SDEV_ZOMBIE);
 
 	/*
-	 * last ref count on the ZOMBIE node is released.
-	 * clean up the sdev_node, and
-	 * release the hold on the backing store node so that
-	 * the ZOMBIE backing stores also cleaned out.
+	 * sdev is a rather bad public citizen. It violates the general
+	 * agreement that in memory nodes should always have a valid reference
+	 * count on their vnode. But that's not the case here. This means that
+	 * we do actually have to distinguish between getting inactive callbacks
+	 * for zombies and otherwise. This should probably be fixed.
 	 */
 	if (clean) {
-		ASSERT(ddv);
-
-		ddv->sdev_nlink--;
+		/* Remove the . entry to ourselves */
 		if (vp->v_type == VDIR) {
-			dv->sdev_nlink--;
+			decr_link(dv);
 		}
-		if ((dv->sdev_flags & SDEV_STALE) == 0)
-			avl_remove(&ddv->sdev_entries, dv);
-		dv->sdev_nlink--;
+		VERIFY(dv->sdev_nlink == 1);
+		decr_link(dv);
 		--vp->v_count;
+		rw_exit(&dv->sdev_contents);
 		mutex_exit(&vp->v_lock);
 		sdev_nodedestroy(dv, 0);
 	} else {
 		--vp->v_count;
+		rw_exit(&dv->sdev_contents);
 		mutex_exit(&vp->v_lock);
 	}
-	rw_exit(&ddv->sdev_contents);
 }
--- a/usr/src/uts/common/fs/dev/sdev_vnops.c	Mon Jun 11 18:23:37 2012 +0000
+++ b/usr/src/uts/common/fs/dev/sdev_vnops.c	Wed May 08 01:46:41 2013 +0000
@@ -21,6 +21,9 @@
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  */
+/*
+ * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
+ */
 
 /*
  * vnode ops for the /dev filesystem
@@ -32,6 +35,248 @@
  *    not supported for now
  */
 
+/*
+ * sdev has a few basic goals:
+ *   o Provide /dev for the global zone as well as various non-global zones.
+ *   o Provide the basic functionality that devfsadm might need (mknod,
+ *     symlinks, etc.)
+ *   o Allow persistent permissions on files in /dev.
+ *   o Allow for dynamic directories and nodes for use by various services (pts,
+ *     zvol, net, etc.)
+ *
+ * The sdev file system is primarily made up of sdev_node_t's which is sdev's
+ * counterpart to the vnode_t. There are two different classes of sdev_node_t's
+ * that we generally care about, dynamic and otherwise.
+ *
+ * Persisting Information
+ * ----------------------
+ *
+ * When sdev is mounted, it keeps track of the underlying file system it is
+ * mounted over. In certain situations, sdev will go and create entries in that
+ * underlying file system. These underlying 'back end' nodes are used as proxies
+ * for various changes in permissions. While specific sets of nodes, such as
+ * dynamic ones, are exempt, this process stores permission changes against
+ * these back end nodes. The point of all of this is to allow for these settings
+ * to persist across host and zone reboots. As an example, consider the entry
+ * /dev/dsk/c0t0d0 which is a character device and that / is in UFS. Upon
+ * changing the permissions on c0t0d0 you'd have the following logical
+ * relationships:
+ *
+ *    +------------------+   sdev_vnode     +--------------+
+ *    | sdev_node_t      |<---------------->| vnode_t      |
+ *    | /dev/dsk/c0t0d0  |<---------------->| for sdev     |
+ *    +------------------+                  +--------------+
+ *           |
+ *           | sdev_attrvp
+ *           |
+ *           |    +---------------------+
+ *           +--->| vnode_t for UFS|ZFS |
+ *                | /dev/dsk/c0t0d0     |
+ *                +---------------------+
+ *
+ * sdev is generally in memory. Therefore when a lookup happens and there is no
+ * entry already inside of a directory cache, it will next check the backing
+ * store. If the backing store exists, we will reconstitute the sdev_node based
+ * on the information that we persisted. When we create the backing store node,
+ * we use the struct vattr information that we already have in sdev_node_t.
+ * Because of this, we already know if the entry was previously a symlink,
+ * directory, or some other kind of type. Note that not all types of nodes are
+ * supported. Currently only VDIR, VCHR, VBLK, VREG, VDOOR, and VLNK are
+ * eligible to be persisted.
+ *
+ * When the sdev_node is created and the lookup is done, we grab a hold on the
+ * underlying vnode as part of the call to VOP_LOOKUP. That reference is held
+ * until the sdev_node becomes inactive. Once its reference count reaches one
+ * and the VOP_INACTIVE callback fires leading to the destruction of the node,
+ * the reference on the underlying vnode will be released.
+ *
+ * The backing store node will be deleted only when the node itself is deleted
+ * through the means of a VOP_REMOVE, VOP_RMDIR, or similar call.
+ *
+ * Not everything can be persisted, see The Rules section for more details.
+ *
+ * Dynamic Nodes
+ * -------------
+ *
+ * Dynamic nodes allow for specific interactions with various kernel subsystems
+ * when looking up directory entries. This allows the lookup and readdir
+ * functions to check against the kernel subsystem's for validity. eg. does a
+ * zvol or nic still exist.
+ *
+ * More specifically, when we create various directories we check if the
+ * directory name matches that of one of the names in the vtab[] (sdev_subr.c).
+ * If it does, we swap out the vnode operations into a new set which combine the
+ * normal sdev vnode operations with the dynamic set here.
+ *
+ * In addition, various dynamic nodes implement a verification entry point. This
+ * verification entry is used as a part of lookup and readdir. The goal for
+ * these dynamic nodes is to allow them to check with the underlying subsystems
+ * to ensure that these devices are still present, or if they have gone away, to
+ * remove them from the results. This is indicated by using the SDEV_VTOR flag
+ * in vtab[].
+ *
+ * Dynamic nodes have additional restrictions placed upon them. They may only
+ * appear at the top level directory of the file system. In addition, users
+ * cannot create dirents below any leve of a dynamic node aside from its special
+ * vnops.
+ *
+ * Profiles
+ * --------
+ *
+ * Profiles exist for the purpose of non-global zones. They work with the zone
+ * brands and zoneadmd to set up a filter of allowed devices that can appear in
+ * a non-global zone's /dev. These are sent to sdev by means of libdevinfo and a
+ * modctl system call. Specifically it allows one to add patterns of device
+ * paths to include and exclude. It allows for a collection of symlinks to be
+ * added and it allows for remapping names.
+ *
+ * When operating in a non-global zone, several of the sdev vnops are redirected
+ * to the profile versions. These impose additional restrictions such as
+ * enforcing that a non-global zone's /dev is read only.
+ *
+ * sdev_node_t States
+ * ------------------
+ *
+ * A given sdev_node_t has a field called the sdev_state which describes where
+ * in the sdev life cycle it is. There are three primary states: SDEV_INIT,
+ * SDEV_READY, and SDEV_ZOMBIE.
+ *
+ *	SDEV_INIT: When a new /dev file is first looked up, a sdev_node
+ *		   is allocated, initialized and added to the directory's
+ *		   sdev_node cache. A node at this state will also
+ *		   have the SDEV_LOOKUP flag set.
+ *
+ *		   Other threads that are trying to look up a node at
+ *		   this state will be blocked until the SDEV_LOOKUP flag
+ *		   is cleared.
+ *
+ *		   When the SDEV_LOOKUP flag is cleared, the node may
+ *		   transition into the SDEV_READY state for a successful
+ *		   lookup or the node is removed from the directory cache
+ *		   and destroyed if the named node can not be found.
+ *		   An ENOENT error is returned for the second case.
+ *
+ *	SDEV_READY: A /dev file has been successfully looked up and
+ *		    associated with a vnode. The /dev file is available
+ *		    for the supported /dev file system operations.
+ *
+ *	SDEV_ZOMBIE: Deletion of a /dev file has been explicitly issued
+ *		    to an SDEV_READY node. The node is transitioned into
+ *		    the SDEV_ZOMBIE state if the vnode reference count
+ *		    is still held. A SDEV_ZOMBIE node does not support
+ *		    any of the /dev file system operations. A SDEV_ZOMBIE
+ *		    node is immediately removed from the directory cache
+ *		    and destroyed once the reference count reaches zero.
+ *
+ * Historically nodes that were marked SDEV_ZOMBIE were not removed from the
+ * underlying directory caches. This has been the source of numerous bugs and
+ * thus to better mimic what happens on a real file system, it is no longer the
+ * case.
+ *
+ * The following state machine describes the life cycle of a given node and its
+ * associated states:
+ *
+ * node is . . . . .
+ * allocated via   .     +-------------+         . . . . . . . vnode_t refcount
+ * sdev_nodeinit() .     | Unallocated |         .             reaches zero and
+ *        +--------*-----|   Memory    |<--------*---+         sdev_inactive is
+ *        |              +-------------+             |         called.
+ *        |       +------------^                     |         called.
+ *        v       |                                  |
+ *  +-----------+ * . . sdev_nodeready()      +-------------+
+ *  | SDEV_INIT | |     or related setup      | SDEV_ZOMBIE |
+ *  +-----------+ |     failure               +-------------+
+ *        |       |                                  ^
+ *        |       |      +------------+              |
+ *        +-*----------->| SDEV_READY |--------*-----+
+ *          .            +------------+        .          The node is no longer
+ *          . . node successfully              . . . . .  valid or we've been
+ *              inserted into the                         asked to remove it.
+ *              directory cache                           This happens via
+ *              and sdev_nodready()                       sdev_dirdelete().
+ *              call successful.
+ *
+ * Adding and Removing Dirents, Zombie Nodes
+ * -----------------------------------------
+ *
+ * As part of doing a lookup, readdir, or an explicit creation operation like
+ * mkdir or create, nodes may be created. Every directory has an avl tree which
+ * contains its children, the sdev_entries tree. This is only used if the type
+ * is VDIR. Access to this is controlled by the sdev_node_t's contents_lock and
+ * it is managed through sdev_cache_update().
+ *
+ * Every sdev_node_t has a field sdev_state, which describes the current state
+ * of the node. A node is generally speaking in the SDEV_READY state. When it is
+ * there, it can be looked up, accessed, and operations performed on it. When a
+ * node is going to be removed from the directory cache it is marked as a
+ * zombie. Once a node becomes a zombie, no other file system operations will
+ * succeed and it will continue to exist as a node until the vnode count on the
+ * node reaches zero. At that point, the node will be freed.  However, once a
+ * node has been marked as a zombie, it will be removed immediately from the
+ * directory cache such that no one else may find it again.  This means that
+ * someone else can insert a new entry into that directory with the same name
+ * and without a problem.
+ *
+ * To remove a node, see the section on that in The Rules.
+ *
+ * The Rules
+ * ---------
+ * These are the rules to live by when working in sdev. These are not
+ * exhaustive.
+ *
+ * - Set 1: Working with Backing Nodes
+ *   o If there is a SDEV_READY sdev_node_t, it knows about its backing node.
+ *   o If we find a backing node when looking up an sdev_node_t for the first
+ *     time, we use its attributes to build our sdev_node_t.
+ *   o If there is a found backing node, or we create a backing node, that's
+ *     when we grab the hold on its vnode.
+ *   o If we mark an sdev_node_t a ZOMBIE, we must remove its backing node from
+ *     the underlying file system. It must not be searchable or findable.
+ *   o We release our hold on the backing node vnode when we destroy the
+ *     sdev_node_t.
+ *
+ * - Set 2: Locking rules for sdev (not exhaustive)
+ *   o The majority of nodes contain an sdev_contents rw lock. You must hold it
+ *     for read or write if manipulating its contents appropriately.
+ *   o You must lock your parent before yourself.
+ *   o If you need your vnode's v_lock and the sdev_contents rw lock, you must
+ *     grab the v_lock before the sdev_contents rw_lock.
+ *   o If you release a lock on the node as a part of upgrading it, you must
+ *     verify that the node has not become a zombie as a part of this process.
+ *
+ * - Set 3: Zombie Status and What it Means
+ *   o If you encounter a node that is a ZOMBIE, that means that it has been
+ *     unlinked from the backing store.
+ *   o If you release your contents lock and acquire it again (say as part of
+ *     trying to grab a write lock) you must check that the node has not become
+ *     a zombie.
+ *   o You should VERIFY that a looked up node is not a zombie. This follows
+ *     from the following logic. To mark something as a zombie means that it is
+ *     removed from the parents directory cache. To do that, you must have a
+ *     write lock on the parent's sdev_contents. To lookup through that
+ *     directory you must have a read lock. This then becomes a simple ordering
+ *     problem. If you've been granted the lock then the other operation cannot
+ *     be in progress or must have already succeeded.
+ *
+ * - Set 4: Removing Directory Entries (aka making nodes Zombies)
+ *   o Write lock must be held on the directory
+ *   o Write lock must be held on the node
+ *   o Remove the sdev_node_t from its parent cache
+ *   o Remove the corresponding backing store node, if it exists, eg. use
+ *     VOP_REMOVE or VOP_RMDIR.
+ *   o You must NOT make any change in the vnode reference count! Nodes should
+ *     only be cleaned up through VOP_INACTIVE callbacks.
+ *   o VOP_INACTIVE is the only one responsible for doing the final vn_rele of
+ *     the backing store vnode that was grabbed during lookup.
+ *
+ * - Set 5: What Nodes may be Persisted
+ *   o The root, /dev is always persisted
+ *   o Any node in vtab which is marked SDEV_DYNAMIC, may not be persisted
+ *     unless it is also marked SDEV_PERSIST
+ *   o Anything whose parent directory is marked SDEV_PERSIST will pass that
+ *     along to the child as long as it does not contradict the above rules
+ */
+
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/t_lock.h>
@@ -559,6 +804,12 @@
 	if (!rw_tryupgrade(&parent->sdev_contents)) {
 		rw_exit(&parent->sdev_contents);
 		rw_enter(&parent->sdev_contents, RW_WRITER);
+		/* Make sure we didn't become a zombie */
+		if (parent->sdev_state == SDEV_ZOMBIE) {
+			rw_exit(&parent->sdev_contents);
+			VN_RELE(vp);
+			return (ENOENT);
+		}
 	}
 
 	/* we do not support unlinking a non-empty directory */
@@ -574,39 +825,28 @@
 	 *  - destroying the sdev_node
 	 *  - releasing the hold on attrvp
 	 */
-	error = sdev_cache_update(parent, &dv, nm, SDEV_CACHE_DELETE);
+	sdev_cache_update(parent, &dv, nm, SDEV_CACHE_DELETE);
+	VN_RELE(vp);
 	rw_exit(&parent->sdev_contents);
 
-	sdcmn_err2(("sdev_remove: cache_update error %d\n", error));
-	if (error && (error != EBUSY)) {
-		/* report errors other than EBUSY */
-		VN_RELE(vp);
-	} else {
-		sdcmn_err2(("sdev_remove: cleaning node %s from cache "
-		    " with error %d\n", nm, error));
-
+	/*
+	 * best efforts clean up the backing store
+	 */
+	if (bkstore) {
+		ASSERT(parent->sdev_attrvp);
+		error = VOP_REMOVE(parent->sdev_attrvp, nm, cred,
+		    ct, flags);
 		/*
-		 * best efforts clean up the backing store
+		 * do not report BUSY error
+		 * because the backing store ref count is released
+		 * when the last ref count on the sdev_node is
+		 * released.
 		 */
-		if (bkstore) {
-			ASSERT(parent->sdev_attrvp);
-			error = VOP_REMOVE(parent->sdev_attrvp, nm, cred,
-			    ct, flags);
-			/*
-			 * do not report BUSY error
-			 * because the backing store ref count is released
-			 * when the last ref count on the sdev_node is
-			 * released.
-			 */
-			if (error == EBUSY) {
-				sdcmn_err2(("sdev_remove: device %s is still on"
-				    "disk %s\n", nm, parent->sdev_path));
-				error = 0;
-			}
+		if (error == EBUSY) {
+			sdcmn_err2(("sdev_remove: device %s is still on"
+			    "disk %s\n", nm, parent->sdev_path));
+			error = 0;
 		}
-
-		if (error == EBUSY)
-			error = 0;
 	}
 
 	return (error);
@@ -716,6 +956,8 @@
 		if (error = VOP_GETATTR(odvp, &vattr, 0, cred, ct)) {
 			mutex_exit(&sdev_lock);
 			VN_RELE(ovp);
+			if (nvp != NULL)
+				VN_RELE(nvp);
 			return (error);
 		}
 		fsid = vattr.va_fsid;
@@ -723,11 +965,15 @@
 		if (error = VOP_GETATTR(ndvp, &vattr, 0, cred, ct)) {
 			mutex_exit(&sdev_lock);
 			VN_RELE(ovp);
+			if (nvp != NULL)
+				VN_RELE(nvp);
 			return (error);
 		}
 		if (fsid != vattr.va_fsid) {
 			mutex_exit(&sdev_lock);
 			VN_RELE(ovp);
+			if (nvp != NULL)
+				VN_RELE(nvp);
 			return (EXDEV);
 		}
 	}
@@ -737,6 +983,8 @@
 	if (error) {
 		mutex_exit(&sdev_lock);
 		VN_RELE(ovp);
+		if (nvp != NULL)
+			VN_RELE(nvp);
 		return (error);
 	}
 
@@ -747,6 +995,8 @@
 		if (error) {
 			mutex_exit(&sdev_lock);
 			VN_RELE(ovp);
+			if (nvp != NULL)
+				VN_RELE(nvp);
 			return (error);
 		}
 	}
@@ -755,21 +1005,31 @@
 	ASSERT(fromdv);
 
 	/* destination file exists */
-	if (nvp) {
+	if (nvp != NULL) {
 		todv = VTOSDEV(nvp);
 		ASSERT(todv);
 	}
 
+	if ((fromdv->sdev_flags & SDEV_DYNAMIC) != 0 ||
+	    (todv != NULL && (todv->sdev_flags & SDEV_DYNAMIC) != 0)) {
+		mutex_exit(&sdev_lock);
+		if (nvp != NULL)
+			VN_RELE(nvp);
+		VN_RELE(ovp);
+		return (EACCES);
+	}
+
 	/*
-	 * link source to new target in the memory
+	 * link source to new target in the memory. Regardless of failure, we
+	 * must rele our hold on nvp.
 	 */
 	error = sdev_rnmnode(fromparent, fromdv, toparent, &todv, nnm, cred);
+	if (nvp != NULL)
+		VN_RELE(nvp);
 	if (error) {
 		sdcmn_err2(("sdev_rename: renaming %s to %s failed "
 		    " with error %d\n", onm, nnm, error));
 		mutex_exit(&sdev_lock);
-		if (nvp)
-			VN_RELE(nvp);
 		VN_RELE(ovp);
 		return (error);
 	}
@@ -782,6 +1042,7 @@
 	if (fromdv == NULL) {
 		rw_exit(&fromparent->sdev_contents);
 		mutex_exit(&sdev_lock);
+		VN_RELE(ovp);
 		sdcmn_err2(("sdev_rename: the source is deleted already\n"));
 		return (0);
 	}
@@ -790,6 +1051,7 @@
 		rw_exit(&fromparent->sdev_contents);
 		mutex_exit(&sdev_lock);
 		VN_RELE(SDEVTOV(fromdv));
+		VN_RELE(ovp);
 		sdcmn_err2(("sdev_rename: the source is being deleted\n"));
 		return (0);
 	}
@@ -809,8 +1071,9 @@
 
 	rw_enter(&fromparent->sdev_contents, RW_WRITER);
 	bkstore = SDEV_IS_PERSIST(fromdv) ? 1 : 0;
-	error = sdev_cache_update(fromparent, &fromdv, onm,
+	sdev_cache_update(fromparent, &fromdv, onm,
 	    SDEV_CACHE_DELETE);
+	VN_RELE(SDEVTOV(fromdv));
 
 	/* best effforts clean up the backing store */
 	if (bkstore) {
@@ -1078,27 +1341,23 @@
 	rw_exit(&self->sdev_contents);
 
 	/* unlink it from the directory cache */
-	error = sdev_cache_update(parent, &self, nm, SDEV_CACHE_DELETE);
+	sdev_cache_update(parent, &self, nm, SDEV_CACHE_DELETE);
 	rw_exit(&parent->sdev_contents);
 	vn_vfsunlock(vp);
+	VN_RELE(vp);
 
-	if (error && (error != EBUSY)) {
-		VN_RELE(vp);
-	} else {
-		sdcmn_err2(("sdev_rmdir: cleaning node %s from directory "
-		    " cache with error %d\n", nm, error));
+	/* best effort to clean up the backing store */
+	if (SDEV_IS_PERSIST(parent)) {
+		ASSERT(parent->sdev_attrvp);
+		error = VOP_RMDIR(parent->sdev_attrvp, nm,
+		    parent->sdev_attrvp, kcred, ct, flags);
 
-		/* best effort to clean up the backing store */
-		if (SDEV_IS_PERSIST(parent)) {
-			ASSERT(parent->sdev_attrvp);
-			error = VOP_RMDIR(parent->sdev_attrvp, nm,
-			    parent->sdev_attrvp, kcred, ct, flags);
+		if (error)
 			sdcmn_err2(("sdev_rmdir: cleaning device %s is on"
 			    " disk error %d\n", parent->sdev_path, error));
-		}
-
 		if (error == EBUSY)
 			error = 0;
+
 	}
 
 	return (error);
--- a/usr/src/uts/common/fs/dev/sdev_vtops.c	Mon Jun 11 18:23:37 2012 +0000
+++ b/usr/src/uts/common/fs/dev/sdev_vtops.c	Wed May 08 01:46:41 2013 +0000
@@ -101,7 +101,6 @@
 	char *nm = dv->sdev_name;
 	int rval;
 
-	ASSERT(!(dv->sdev_flags & SDEV_STALE));
 	ASSERT(dv->sdev_state == SDEV_READY);
 	ASSERT(RW_LOCK_HELD(&(dv->sdev_dotdot)->sdev_contents));
 
@@ -273,6 +272,11 @@
 		kmem_free(link, MAXPATHLEN);
 	}
 
+	if (error != 0) {
+		SDEV_RELE(sdv);
+		return;
+	}
+
 	mutex_enter(&sdv->sdev_lookup_lock);
 	SDEV_UNBLOCK_OTHERS(sdv, SDEV_LOOKUP);
 	mutex_exit(&sdv->sdev_lookup_lock);
@@ -337,6 +341,7 @@
 			SDEV_HOLD(dv);
 			(void) sdev_cache_update(ddv, &dv,
 			    dv->sdev_name, SDEV_CACHE_DELETE);
+			SDEV_RELE(dv);
 			break;
 		case SDEV_VTOR_STALE:
 			devvt_rebuild_stale_link(ddv, dv);
@@ -382,9 +387,6 @@
 		for (dv = SDEV_FIRST_ENTRY(sdvp); dv; dv = next) {
 			next = SDEV_NEXT_ENTRY(sdvp, dv);
 
-			/* skip stale nodes */
-			if (dv->sdev_flags & SDEV_STALE)
-				continue;
 			/* validate only ready nodes */
 			if (dv->sdev_state != SDEV_READY)
 				continue;
@@ -403,9 +405,6 @@
 	for (dv = SDEV_FIRST_ENTRY(sdvp); dv; dv = next) {
 		next = SDEV_NEXT_ENTRY(sdvp, dv);
 
-		/* skip stale nodes */
-		if (dv->sdev_flags & SDEV_STALE)
-			continue;
 		/* validate only ready nodes */
 		if (dv->sdev_state != SDEV_READY)
 			continue;
--- a/usr/src/uts/common/fs/dev/sdev_zvolops.c	Mon Jun 11 18:23:37 2012 +0000
+++ b/usr/src/uts/common/fs/dev/sdev_zvolops.c	Wed May 08 01:46:41 2013 +0000
@@ -464,12 +464,6 @@
 	dv = SDEV_FIRST_ENTRY(ddv);
 	while (dv) {
 		sdcmn_err13(("sdev_name '%s'", dv->sdev_name));
-		/* skip stale nodes */
-		if (dv->sdev_flags & SDEV_STALE) {
-			sdcmn_err13(("  stale"));
-			dv = SDEV_NEXT_ENTRY(ddv, dv);
-			continue;
-		}
 
 		switch (devzvol_validate(dv)) {
 		case SDEV_VTOR_VALID:
@@ -489,11 +483,10 @@
 		}
 		SDEV_HOLD(dv);
 		/* remove the cache node */
-		if (sdev_cache_update(ddv, &dv, dv->sdev_name,
-		    SDEV_CACHE_DELETE) == 0)
-			dv = SDEV_FIRST_ENTRY(ddv);
-		else
-			dv = SDEV_NEXT_ENTRY(ddv, dv);
+		sdev_cache_update(ddv, &dv, dv->sdev_name,
+		    SDEV_CACHE_DELETE);
+		SDEV_RELE(dv);
+		dv = SDEV_FIRST_ENTRY(ddv);
 	}
 	rw_downgrade(&ddv->sdev_contents);
 }
--- a/usr/src/uts/common/sys/fs/sdev_impl.h	Mon Jun 11 18:23:37 2012 +0000
+++ b/usr/src/uts/common/sys/fs/sdev_impl.h	Wed May 08 01:46:41 2013 +0000
@@ -181,35 +181,8 @@
 #define	SDEV_NEXT_ENTRY(ddv, dv)	AVL_NEXT(&(ddv)->sdev_entries, (dv))
 
 /*
- * sdev_state
- *
- * A sdev_node may go through 3 states:
- *	SDEV_INIT: When a new /dev file is first looked up, a sdev_node
- *		   is allocated, initialized and added to the directory's
- *		   sdev_node cache. A node at this state will also
- *		   have the SDEV_LOOKUP flag set.
- *
- *		   Other threads that are trying to look up a node at
- *		   this state will be blocked until the SDEV_LOOKUP flag
- *		   is cleared.
- *
- *		   When the SDEV_LOOKUP flag is cleared, the node may
- *		   transition into the SDEV_READY state for a successful
- *		   lookup or the node is removed from the directory cache
- *		   and destroyed if the named node can not be found.
- *		   An ENOENT error is returned for the second case.
- *
- *	SDEV_READY: A /dev file has been successfully looked up and
- *		    associated with a vnode. The /dev file is available
- *		    for the supported /dev filesystem operations.
- *
- *	SDEV_ZOMBIE: Deletion of a /dev file has been explicitely issued
- *		    to an SDEV_READY node. The node is transitioned into
- *		    the SDEV_ZOMBIE state if the vnode reference count
- *		    is still held. A SDEV_ZOMBIE node does not support
- *		    any of the /dev filesystem operations. A SDEV_ZOMBIE
- *		    node is removed from the directory cache and destroyed
- *		    once the reference count reaches "zero".
+ * See the big theory statement in sdev_vnops.c for an explanation of these
+ * states.
  */
 typedef enum {
 	SDEV_ZOMBIE = -1,
@@ -219,17 +192,16 @@
 
 /* sdev_flags */
 #define	SDEV_BUILD		0x0001	/* directory cache out-of-date */
-#define	SDEV_STALE		0x0002	/* stale sdev nodes */
-#define	SDEV_GLOBAL		0x0004	/* global /dev nodes */
-#define	SDEV_PERSIST		0x0008	/* backing store persisted node */
-#define	SDEV_NO_NCACHE		0x0010	/* do not include in neg. cache */
-#define	SDEV_DYNAMIC		0x0020	/* special-purpose vnode ops */
+#define	SDEV_GLOBAL		0x0002	/* global /dev nodes */
+#define	SDEV_PERSIST		0x0004	/* backing store persisted node */
+#define	SDEV_NO_NCACHE		0x0008	/* do not include in neg. cache */
+#define	SDEV_DYNAMIC		0x0010	/* special-purpose vnode ops */
 					/* (ex: pts) */
-#define	SDEV_VTOR		0x0040	/* validate sdev_nodes during search */
-#define	SDEV_ATTR_INVALID	0x0080	/* invalid node attributes, */
+#define	SDEV_VTOR		0x0020	/* validate sdev_nodes during search */
+#define	SDEV_ATTR_INVALID	0x0040	/* invalid node attributes, */
 					/* need update */
-#define	SDEV_SUBDIR		0x0100	/* match all subdirs under here */
-#define	SDEV_ZONED		0x0200  /* zoned subdir */
+#define	SDEV_SUBDIR		0x0080	/* match all subdirs under here */
+#define	SDEV_ZONED		0x0100  /* zoned subdir */
 
 /* sdev_lookup_flags */
 #define	SDEV_LOOKUP	0x0001	/* node creation in progress */
@@ -471,7 +443,7 @@
 } sdev_cache_ops_t;
 
 extern struct sdev_node *sdev_cache_lookup(struct sdev_node *, char *);
-extern int sdev_cache_update(struct sdev_node *, struct sdev_node **, char *,
+extern void sdev_cache_update(struct sdev_node *, struct sdev_node **, char *,
     sdev_cache_ops_t);
 extern void sdev_node_cache_init(void);
 extern void sdev_node_cache_fini(void);