usr/src/uts/common/fs/dev/sdev_subr.c
changeset 14223 1652c59077c6
parent 14222 c3f8a4690b1f
--- a/usr/src/uts/common/fs/dev/sdev_subr.c	Mon Jun 11 18:23:37 2012 +0000
+++ b/usr/src/uts/common/fs/dev/sdev_subr.c	Wed May 08 01:46:41 2013 +0000
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 /*
@@ -370,7 +370,8 @@
 }
 
 /*
- * transition a sdev_node into SDEV_READY state
+ * Transition a sdev_node into SDEV_READY state. If this fails, it is up to the
+ * caller to transition the node to the SDEV_ZOMBIE state.
  */
 int
 sdev_nodeready(struct sdev_node *dv, struct vattr *vap, struct vnode *avp,
@@ -433,26 +434,12 @@
 		/* transition to READY state */
 		sdev_set_nodestate(dv, SDEV_READY);
 		sdev_nc_node_exists(dv);
-	} else {
-		sdev_set_nodestate(dv, SDEV_ZOMBIE);
 	}
 	rw_exit(&dv->sdev_contents);
 	return (error);
 }
 
 /*
- * setting ZOMBIE state
- */
-static int
-sdev_nodezombied(struct sdev_node *dv)
-{
-	rw_enter(&dv->sdev_contents, RW_WRITER);
-	sdev_set_nodestate(dv, SDEV_ZOMBIE);
-	rw_exit(&dv->sdev_contents);
-	return (0);
-}
-
-/*
  * Build the VROOT sdev_node.
  */
 /*ARGSUSED*/
@@ -568,23 +555,31 @@
 	{ NULL, NULL, NULL, NULL, NULL, 0}
 };
 
+/*
+ * We need to match off of the sdev_path, not the sdev_name. We are only allowed
+ * to exist directly under /dev.
+ */
 struct sdev_vop_table *
 sdev_match(struct sdev_node *dv)
 {
 	int vlen;
 	int i;
+	const char *path;
+
+	if (strlen(dv->sdev_path) <= 5)
+		return (NULL);
+
+	if (strncmp(dv->sdev_path, "/dev/", 5) != 0)
+		return (NULL);
+	path = dv->sdev_path + 5;
 
 	for (i = 0; vtab[i].vt_name; i++) {
-		if (strcmp(vtab[i].vt_name, dv->sdev_name) == 0)
+		if (strcmp(vtab[i].vt_name, path) == 0)
 			return (&vtab[i]);
 		if (vtab[i].vt_flags & SDEV_SUBDIR) {
-			char *ptr;
-
-			ASSERT(strlen(dv->sdev_path) > 5);
-			ptr = dv->sdev_path + 5;
 			vlen = strlen(vtab[i].vt_name);
-			if ((strncmp(vtab[i].vt_name, ptr,
-			    vlen - 1) == 0) && ptr[vlen] == '/')
+			if ((strncmp(vtab[i].vt_name, path,
+			    vlen - 1) == 0) && path[vlen] == '/')
 				return (&vtab[i]);
 		}
 
@@ -610,10 +605,14 @@
 	/* gets the vtab entry it matches */
 	if ((vtp = sdev_match(dv)) != NULL) {
 		dv->sdev_flags |= vtp->vt_flags;
+		if (SDEV_IS_PERSIST(dv->sdev_dotdot) &&
+		    (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv)))
+			dv->sdev_flags |= SDEV_PERSIST;
 
 		if (vtp->vt_vops) {
 			if (vtp->vt_global_vops)
 				*(vtp->vt_global_vops) = vtp->vt_vops;
+
 			return (vtp->vt_vops);
 		}
 
@@ -631,8 +630,10 @@
 				*(vtp->vt_global_vops) = vtp->vt_vops;
 			}
 			sdev_free_vtab(templ);
+
 			return (vtp->vt_vops);
 		}
+
 		return (sdev_vnodeops);
 	}
 
@@ -743,7 +744,6 @@
 	ASSERT(linkvp->v_type == VLNK);
 	err = sdev_getlink(linkvp, &link);
 	if (err) {
-		(void) sdev_nodezombied(dv);
 		dv->sdev_symlink = NULL;
 		return (ENOENT);
 	}
@@ -839,12 +839,7 @@
 		ASSERT(dv);
 
 		/* insert into the directory cache */
-		error = sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_ADD);
-		if (error) {
-			sdcmn_err9(("sdev_mknode: node %s can not"
-			    " be added into directory cache\n", nm));
-			return (ENOENT);
-		}
+		sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_ADD);
 	}
 
 	ASSERT(dv);
@@ -878,7 +873,13 @@
 		*newdv = dv;
 		ASSERT((*newdv)->sdev_state != SDEV_ZOMBIE);
 	} else {
-		SDEV_SIMPLE_RELE(dv);
+		sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_DELETE);
+		/*
+		 * We created this node, it wasn't passed into us. Therefore it
+		 * is up to us to delete it.
+		 */
+		if (*newdv == NULL)
+			SDEV_SIMPLE_RELE(dv);
 		*newdv = NULL;
 	}
 
@@ -986,6 +987,7 @@
 	if (dv) {
 		ASSERT(dv->sdev_dotdot == ddv);
 		ASSERT(strcmp(dv->sdev_name, nm) == 0);
+		ASSERT(dv->sdev_state != SDEV_ZOMBIE);
 		SDEV_HOLD(dv);
 		return (dv);
 	}
@@ -1004,6 +1006,7 @@
 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
 	ASSERT(ddv->sdev_nlink >= 2);
 	ASSERT(dv->sdev_nlink == 0);
+	ASSERT(dv->sdev_state != SDEV_ZOMBIE);
 
 	dv->sdev_dotdot = ddv;
 	VERIFY(avl_find(&ddv->sdev_entries, dv, &where) == NULL);
@@ -1019,60 +1022,55 @@
 static void
 decr_link(struct sdev_node *dv)
 {
-	if (dv->sdev_state != SDEV_INIT)
+	VERIFY(RW_WRITE_HELD(&dv->sdev_contents));
+	if (dv->sdev_state != SDEV_INIT) {
+		VERIFY(dv->sdev_nlink >= 1);
 		dv->sdev_nlink--;
-	else
-		ASSERT(dv->sdev_nlink == 0);
+	} else {
+		VERIFY(dv->sdev_nlink == 0);
+	}
 }
 
 /*
  * Delete an existing dv from directory cache
  *
- * In the case of a node is still held by non-zero reference count,
- *     the node is put into ZOMBIE state. Once the reference count
- *     reaches "0", the node is unlinked and destroyed,
- *     in sdev_inactive().
+ * In the case of a node is still held by non-zero reference count, the node is
+ * put into ZOMBIE state. The node is always unlinked from its parent, but it is
+ * not destroyed via sdev_inactive until its reference count reaches "0".
  */
-static int
+static void
 sdev_dirdelete(struct sdev_node *ddv, struct sdev_node *dv)
 {
 	struct vnode *vp;
+	sdev_node_state_t os;
 
 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
 
 	vp = SDEVTOV(dv);
 	mutex_enter(&vp->v_lock);
-
-	/* dv is held still */
-	if (vp->v_count > 1) {
-		rw_enter(&dv->sdev_contents, RW_WRITER);
-		if (dv->sdev_state == SDEV_READY) {
-			sdcmn_err9((
-			    "sdev_dirdelete: node %s busy with count %d\n",
-			    dv->sdev_name, vp->v_count));
-			dv->sdev_state = SDEV_ZOMBIE;
-		}
-		rw_exit(&dv->sdev_contents);
-		--vp->v_count;
-		mutex_exit(&vp->v_lock);
-		return (EBUSY);
-	}
-	ASSERT(vp->v_count == 1);
-
-	/* unlink from the memory cache */
-	ddv->sdev_nlink--;	/* .. to above */
-	if (vp->v_type == VDIR) {
-		decr_link(dv);		/* . to self */
-	}
-
+	rw_enter(&dv->sdev_contents, RW_WRITER);
+	os = dv->sdev_state;
+	ASSERT(os != SDEV_ZOMBIE);
+	dv->sdev_state = SDEV_ZOMBIE;
+
+	/*
+	 * unlink ourselves from the parent directory now to take care of the ..
+	 * link. However, if we're a directory, we don't remove our reference to
+	 * ourself eg. '.' until we are torn down in the inactive callback.
+	 */
+	decr_link(ddv);
 	avl_remove(&ddv->sdev_entries, dv);
-	decr_link(dv);	/* name, back to zero */
-	vp->v_count--;
+	/*
+	 * sdev_inactive expects nodes to have a link to themselves when we're
+	 * tearing them down. If we're transitioning from the initial state to
+	 * zombie and not via ready, then we're not going to have this link that
+	 * comes from the node being ready. As a result, we need to increment
+	 * our link count by one to account for this.
+	 */
+	if (os == SDEV_INIT && dv->sdev_nlink == 0)
+		dv->sdev_nlink++;
+	rw_exit(&dv->sdev_contents);
 	mutex_exit(&vp->v_lock);
-
-	/* destroy the node */
-	sdev_nodedestroy(dv, 0);
-	return (0);
 }
 
 /*
@@ -1169,6 +1167,35 @@
 			goto err_out;
 	}
 
+	/* fix the source for a symlink */
+	if (vattr.va_type == VLNK) {
+		if (odv->sdev_symlink == NULL) {
+			error = sdev_follow_link(odv);
+			if (error) {
+				/*
+				 * The underlying symlink doesn't exist. This
+				 * node probably shouldn't even exist. While
+				 * it's a bit jarring to consumers, we're going
+				 * to remove the node from /dev.
+				 */
+				if (SDEV_IS_PERSIST((*ndvp)))
+					bkstore = 1;
+				sdev_dirdelete(oddv, odv);
+				if (bkstore) {
+					ASSERT(nddv->sdev_attrvp);
+					error = VOP_REMOVE(nddv->sdev_attrvp,
+					    nnm, cred, NULL, 0);
+					if (error)
+						goto err_out;
+				}
+				error = ENOENT;
+				goto err_out;
+			}
+		}
+		ASSERT(odv->sdev_symlink);
+		link = i_ddi_strdup(odv->sdev_symlink, KM_SLEEP);
+	}
+
 	/* destination existing */
 	if (*ndvp) {
 		nvp = SDEVTOV(*ndvp);
@@ -1205,7 +1232,12 @@
 			}
 			vn_vfsunlock(nvp);
 
-			(void) sdev_dirdelete(nddv, *ndvp);
+			/*
+			 * We did not place the hold on *ndvp, so even though
+			 * we're deleting the node, we should not get rid of our
+			 * reference.
+			 */
+			sdev_dirdelete(nddv, *ndvp);
 			*ndvp = NULL;
 			ASSERT(nddv->sdev_attrvp);
 			error = VOP_RMDIR(nddv->sdev_attrvp, nnm,
@@ -1223,11 +1255,11 @@
 			}
 
 			/*
-			 * get rid of the node from the directory cache
-			 * note, in case EBUSY is returned, the ZOMBIE
-			 * node is taken care in sdev_mknode.
+			 * Get rid of the node from the directory cache note.
+			 * Don't forget that it's not up to us to remove the vn
+			 * ref on the sdev node, as we did not place it.
 			 */
-			(void) sdev_dirdelete(nddv, *ndvp);
+			sdev_dirdelete(nddv, *ndvp);
 			*ndvp = NULL;
 			if (bkstore) {
 				ASSERT(nddv->sdev_attrvp);
@@ -1239,19 +1271,6 @@
 		}
 	}
 
-	/* fix the source for a symlink */
-	if (vattr.va_type == VLNK) {
-		if (odv->sdev_symlink == NULL) {
-			error = sdev_follow_link(odv);
-			if (error) {
-				error = ENOENT;
-				goto err_out;
-			}
-		}
-		ASSERT(odv->sdev_symlink);
-		link = i_ddi_strdup(odv->sdev_symlink, KM_SLEEP);
-	}
-
 	/*
 	 * make a fresh node from the source attrs
 	 */
@@ -1259,8 +1278,10 @@
 	error = sdev_mknode(nddv, nnm, ndvp, &vattr,
 	    NULL, (void *)link, cred, SDEV_READY);
 
-	if (link)
+	if (link != NULL) {
 		kmem_free(link, strlen(link) + 1);
+		link = NULL;
+	}
 
 	if (error)
 		goto err_out;
@@ -1271,9 +1292,11 @@
 	if (doingdir) {
 		for (idv = SDEV_FIRST_ENTRY(odv); idv;
 		    idv = SDEV_NEXT_ENTRY(odv, idv)) {
+			SDEV_HOLD(idv);
 			error = sdev_rnmnode(odv, idv,
 			    (struct sdev_node *)(*ndvp), &ndv,
 			    idv->sdev_name, cred);
+			SDEV_RELE(idv);
 			if (error)
 				goto err_out;
 			ndv = NULL;
@@ -1307,6 +1330,11 @@
 	return (error);
 
 err_out:
+	if (link != NULL) {
+		kmem_free(link, strlen(link) + 1);
+		link = NULL;
+	}
+
 	rw_exit(&nddv->sdev_contents);
 	if (!samedir)
 		rw_exit(&oddv->sdev_contents);
@@ -1465,18 +1493,8 @@
 			vp = NULLVP;
 			dv = sdev_cache_lookup(ddv, nm);
 			if (dv) {
-				if (dv->sdev_state != SDEV_ZOMBIE) {
-					SDEV_SIMPLE_RELE(dv);
-				} else {
-					/*
-					 * A ZOMBIE node may not have been
-					 * cleaned up from the backing store,
-					 * bypass this entry in this case,
-					 * and clean it up from the directory
-					 * cache if this is the last call.
-					 */
-					(void) sdev_dirdelete(ddv, dv);
-				}
+				VERIFY(dv->sdev_state != SDEV_ZOMBIE);
+				SDEV_SIMPLE_RELE(dv);
 				continue;
 			}
 
@@ -1547,11 +1565,16 @@
 	vap->va_ctime = vap->va_atime;
 	for (i = 0; vtab[i].vt_name != NULL; i++) {
 		/*
-		 * This early, we may be in a read-only /dev
-		 * environment: leave the creation of any nodes we'd
-		 * attempt to persist to devfsadm.
+		 * This early, we may be in a read-only /dev environment: leave
+		 * the creation of any nodes we'd attempt to persist to
+		 * devfsadm. Because /dev itself is normally persistent, any
+		 * node which is not marked dynamic will end up being marked
+		 * persistent. However, some nodes are both dynamic and
+		 * persistent, mostly lofi and rlofi, so we need to be careful
+		 * in our check.
 		 */
-		if (vtab[i].vt_flags & SDEV_PERSIST)
+		if ((vtab[i].vt_flags & SDEV_PERSIST) ||
+		    !(vtab[i].vt_flags & SDEV_DYNAMIC))
 			continue;
 		nm = vtab[i].vt_name;
 		ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
@@ -1614,6 +1637,8 @@
 		error = VOP_MKDIR(rdvp, nm, vap, rvp, cred, NULL, 0, NULL);
 		sdcmn_err9(("sdev_shadow_node: mkdir vp %p error %d\n",
 		    (void *)(*rvp), error));
+		if (!error)
+			VN_RELE(*rvp);
 		break;
 	case VCHR:
 	case VBLK:
@@ -1647,74 +1672,49 @@
 	return (error);
 }
 
-static int
+static void
 sdev_cache_add(struct sdev_node *ddv, struct sdev_node **dv, char *nm)
 {
-	int error = 0;
 	struct sdev_node *dup = NULL;
 
 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
 	if ((dup = sdev_findbyname(ddv, nm)) == NULL) {
 		sdev_direnter(ddv, *dv);
 	} else {
-		if (dup->sdev_state == SDEV_ZOMBIE) {
-			error = sdev_dirdelete(ddv, dup);
-			/*
-			 * The ZOMBIE node is still hanging
-			 * around with more than one reference counts.
-			 * Fail the new node creation so that
-			 * the directory cache won't have
-			 * duplicate entries for the same named node
-			 */
-			if (error == EBUSY) {
-				SDEV_SIMPLE_RELE(*dv);
-				sdev_nodedestroy(*dv, 0);
-				*dv = NULL;
-				return (error);
-			}
-			sdev_direnter(ddv, *dv);
-		} else {
-			ASSERT((*dv)->sdev_state != SDEV_ZOMBIE);
-			SDEV_SIMPLE_RELE(*dv);
-			sdev_nodedestroy(*dv, 0);
-			*dv = dup;
-		}
+		VERIFY(dup->sdev_state != SDEV_ZOMBIE);
+		SDEV_SIMPLE_RELE(*dv);
+		sdev_nodedestroy(*dv, 0);
+		*dv = dup;
 	}
-
-	return (0);
 }
 
-static int
+static void
 sdev_cache_delete(struct sdev_node *ddv, struct sdev_node **dv)
 {
 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
-	return (sdev_dirdelete(ddv, *dv));
+	sdev_dirdelete(ddv, *dv);
 }
 
 /*
  * update the in-core directory cache
  */
-int
+void
 sdev_cache_update(struct sdev_node *ddv, struct sdev_node **dv, char *nm,
     sdev_cache_ops_t ops)
 {
-	int error = 0;
-
 	ASSERT((SDEV_HELD(*dv)));
 
 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
 	switch (ops) {
 	case SDEV_CACHE_ADD:
-		error = sdev_cache_add(ddv, dv, nm);
+		sdev_cache_add(ddv, dv, nm);
 		break;
 	case SDEV_CACHE_DELETE:
-		error = sdev_cache_delete(ddv, dv);
+		sdev_cache_delete(ddv, dv);
 		break;
 	default:
 		break;
 	}
-
-	return (error);
 }
 
 /*
@@ -2157,7 +2157,6 @@
 	}
 
 found:
-	ASSERT(!(dv->sdev_flags & SDEV_STALE));
 	ASSERT(dv->sdev_state == SDEV_READY);
 	if (vtor) {
 		/*
@@ -2176,13 +2175,11 @@
 				rw_exit(&ddv->sdev_contents);
 				rw_enter(&ddv->sdev_contents, RW_WRITER);
 			}
-			error = sdev_cache_update(ddv, &dv, nm,
-			    SDEV_CACHE_DELETE);
+			sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_DELETE);
 			rw_downgrade(&ddv->sdev_contents);
-			if (error == 0) {
-				dv = NULL;
-				goto lookup_create_node;
-			}
+			SDEV_RELE(dv);
+			dv = NULL;
+			goto lookup_create_node;
 			/* FALLTHRU */
 		case SDEV_VTOR_INVALID:
 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
@@ -2229,9 +2226,10 @@
 		 * changes. Re-check.
 		 */
 		if (dv->sdev_state == SDEV_INIT) {
-			(void) sdev_dirdelete(ddv, dv);
+			sdev_dirdelete(ddv, dv);
 			rw_exit(&ddv->sdev_contents);
 			sdev_lookup_failed(ddv, nm, failed_flags);
+			SDEV_RELE(dv);
 			*vpp = NULL;
 			return (ENOENT);
 		}
@@ -2263,15 +2261,14 @@
 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
 
 	rw_enter(&ddv->sdev_contents, RW_WRITER);
-	for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = SDEV_NEXT_ENTRY(ddv, dv)) {
+	while ((dv = SDEV_FIRST_ENTRY(ddv)) != NULL) {
 		vp = SDEVTOV(dv);
+		SDEV_HOLD(dv);
 		if (vp->v_type == VDIR)
 			sdev_stale(dv);
 
-		sdcmn_err9(("sdev_stale: setting stale %s\n",
-		    dv->sdev_path));
-		dv->sdev_flags |= SDEV_STALE;
-		avl_remove(&ddv->sdev_entries, dv);
+		sdev_dirdelete(ddv, dv);
+		SDEV_RELE(dv);
 	}
 	ddv->sdev_flags |= SDEV_BUILD;
 	rw_exit(&ddv->sdev_contents);
@@ -2289,7 +2286,7 @@
 	int error = 0;
 	int busy = 0;
 	struct vnode *vp;
-	struct sdev_node *dv, *next = NULL;
+	struct sdev_node *dv;
 	int bkstore = 0;
 	int len = 0;
 	char *bks_name = NULL;
@@ -2300,8 +2297,7 @@
 	 * We try our best to destroy all unused sdev_node's
 	 */
 	rw_enter(&ddv->sdev_contents, RW_WRITER);
-	for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = next) {
-		next = SDEV_NEXT_ENTRY(ddv, dv);
+	while ((dv = SDEV_FIRST_ENTRY(ddv)) != NULL) {
 		vp = SDEVTOV(dv);
 
 		if (expr && gmatch(dv->sdev_name, expr) == 0)
@@ -2338,15 +2334,10 @@
 			bcopy(dv->sdev_name, bks_name, len);
 		}
 
-		error = sdev_dirdelete(ddv, dv);
-
-		if (error == EBUSY) {
-			sdcmn_err9(("sdev_cleandir: dir busy\n"));
-			busy++;
-		}
+		sdev_dirdelete(ddv, dv);
 
 		/* take care the backing store clean up */
-		if (bkstore && (error == 0)) {
+		if (bkstore) {
 			ASSERT(bks_name);
 			ASSERT(ddv->sdev_attrvp);
 
@@ -2370,6 +2361,9 @@
 			bks_name = NULL;
 			len = 0;
 		}
+
+		ddv->sdev_flags |= SDEV_BUILD;
+		SDEV_RELE(dv);
 	}
 
 	ddv->sdev_flags |= SDEV_BUILD;
@@ -3114,42 +3108,41 @@
 {
 	int clean;
 	struct sdev_node *dv = VTOSDEV(vp);
-	struct sdev_node *ddv = dv->sdev_dotdot;
 	int state;
 
-	rw_enter(&ddv->sdev_contents, RW_WRITER);
-	state = dv->sdev_state;
-
 	mutex_enter(&vp->v_lock);
 	ASSERT(vp->v_count >= 1);
 
+
 	if (vp->v_count == 1 && callback != NULL)
 		callback(vp);
 
+	rw_enter(&dv->sdev_contents, RW_WRITER);
+	state = dv->sdev_state;
+
 	clean = (vp->v_count == 1) && (state == SDEV_ZOMBIE);
 
 	/*
-	 * last ref count on the ZOMBIE node is released.
-	 * clean up the sdev_node, and
-	 * release the hold on the backing store node so that
-	 * the ZOMBIE backing stores also cleaned out.
+	 * sdev is a rather bad public citizen. It violates the general
+	 * agreement that in memory nodes should always have a valid reference
+	 * count on their vnode. But that's not the case here. This means that
+	 * we do actually have to distinguish between getting inactive callbacks
+	 * for zombies and otherwise. This should probably be fixed.
 	 */
 	if (clean) {
-		ASSERT(ddv);
-
-		ddv->sdev_nlink--;
+		/* Remove the . entry to ourselves */
 		if (vp->v_type == VDIR) {
-			dv->sdev_nlink--;
+			decr_link(dv);
 		}
-		if ((dv->sdev_flags & SDEV_STALE) == 0)
-			avl_remove(&ddv->sdev_entries, dv);
-		dv->sdev_nlink--;
+		VERIFY(dv->sdev_nlink == 1);
+		decr_link(dv);
 		--vp->v_count;
+		rw_exit(&dv->sdev_contents);
 		mutex_exit(&vp->v_lock);
 		sdev_nodedestroy(dv, 0);
 	} else {
 		--vp->v_count;
+		rw_exit(&dv->sdev_contents);
 		mutex_exit(&vp->v_lock);
 	}
-	rw_exit(&ddv->sdev_contents);
 }