19 * CDDL HEADER END |
19 * CDDL HEADER END |
20 */ |
20 */ |
21 /* |
21 /* |
22 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. |
22 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. |
23 */ |
23 */ |
|
24 /* |
|
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved. |
|
26 */ |
24 |
27 |
25 /* |
28 /* |
26 * vnode ops for the /dev filesystem |
29 * vnode ops for the /dev filesystem |
27 * |
30 * |
28 * - VDIR, VCHR, VBLK, and VLNK are the supported file types |
31 * - VDIR, VCHR, VBLK, and VLNK are the supported file types |
29 * - VREG and VDOOR are used for some internal implementations in |
32 * - VREG and VDOOR are used for some internal implementations in |
30 * the global zone, e.g. devname and devfsadm communication |
33 * the global zone, e.g. devname and devfsadm communication |
31 * - other file types are unusual in this namespace and |
34 * - other file types are unusual in this namespace and |
32 * not supported for now |
35 * not supported for now |
|
36 */ |
|
37 |
|
38 /* |
|
39 * sdev has a few basic goals: |
|
40 * o Provide /dev for the global zone as well as various non-global zones. |
|
41 * o Provide the basic functionality that devfsadm might need (mknod, |
|
42 * symlinks, etc.) |
|
43 * o Allow persistent permissions on files in /dev. |
|
44 * o Allow for dynamic directories and nodes for use by various services (pts, |
|
45 * zvol, net, etc.) |
|
46 * |
|
47 * The sdev file system is primarily made up of sdev_node_t's which is sdev's |
|
48 * counterpart to the vnode_t. There are two different classes of sdev_node_t's |
|
49 * that we generally care about, dynamic and otherwise. |
|
50 * |
|
51 * Persisting Information |
|
52 * ---------------------- |
|
53 * |
|
54 * When sdev is mounted, it keeps track of the underlying file system it is |
|
55 * mounted over. In certain situations, sdev will go and create entries in that |
|
56 * underlying file system. These underlying 'back end' nodes are used as proxies |
|
57 * for various changes in permissions. While specific sets of nodes, such as |
|
58 * dynamic ones, are exempt, this process stores permission changes against |
|
59 * these back end nodes. The point of all of this is to allow for these settings |
|
60 * to persist across host and zone reboots. As an example, consider the entry |
|
61 * /dev/dsk/c0t0d0 which is a character device and that / is in UFS. Upon |
|
62 * changing the permissions on c0t0d0 you'd have the following logical |
|
63 * relationships: |
|
64 * |
|
65 * +------------------+ sdev_vnode +--------------+ |
|
66 * | sdev_node_t |<---------------->| vnode_t | |
|
67 * | /dev/dsk/c0t0d0 |<---------------->| for sdev | |
|
68 * +------------------+ +--------------+ |
|
69 * | |
|
70 * | sdev_attrvp |
|
71 * | |
|
72 * | +---------------------+ |
|
73 * +--->| vnode_t for UFS|ZFS | |
|
74 * | /dev/dsk/c0t0d0 | |
|
75 * +---------------------+ |
|
76 * |
|
77 * sdev is generally in memory. Therefore when a lookup happens and there is no |
|
78 * entry already inside of a directory cache, it will next check the backing |
|
79 * store. If the backing store exists, we will reconstitute the sdev_node based |
|
80 * on the information that we persisted. When we create the backing store node, |
|
81 * we use the struct vattr information that we already have in sdev_node_t. |
|
82 * Because of this, we already know if the entry was previously a symlink, |
|
83 * directory, or some other kind of type. Note that not all types of nodes are |
|
84 * supported. Currently only VDIR, VCHR, VBLK, VREG, VDOOR, and VLNK are |
|
85 * eligible to be persisted. |
|
86 * |
|
87 * When the sdev_node is created and the lookup is done, we grab a hold on the |
|
88 * underlying vnode as part of the call to VOP_LOOKUP. That reference is held |
|
89 * until the sdev_node becomes inactive. Once its reference count reaches one |
|
90 * and the VOP_INACTIVE callback fires leading to the destruction of the node, |
|
91 * the reference on the underlying vnode will be released. |
|
92 * |
|
93 * The backing store node will be deleted only when the node itself is deleted |
|
94 * through the means of a VOP_REMOVE, VOP_RMDIR, or similar call. |
|
95 * |
|
96 * Not everything can be persisted, see The Rules section for more details. |
|
97 * |
|
98 * Dynamic Nodes |
|
99 * ------------- |
|
100 * |
|
101 * Dynamic nodes allow for specific interactions with various kernel subsystems |
|
102 * when looking up directory entries. This allows the lookup and readdir |
|
103 * functions to check against the kernel subsystems for validity, e.g. does a |
|
104 * zvol or nic still exist. |
|
105 * |
|
106 * More specifically, when we create various directories we check if the |
|
107 * directory name matches that of one of the names in the vtab[] (sdev_subr.c). |
|
108 * If it does, we swap out the vnode operations into a new set which combine the |
|
109 * normal sdev vnode operations with the dynamic set here. |
|
110 * |
|
111 * In addition, various dynamic nodes implement a verification entry point. This |
|
112 * verification entry is used as a part of lookup and readdir. The goal for |
|
113 * these dynamic nodes is to allow them to check with the underlying subsystems |
|
114 * to ensure that these devices are still present, or if they have gone away, to |
|
115 * remove them from the results. This is indicated by using the SDEV_VTOR flag |
|
116 * in vtab[]. |
|
117 * |
|
118 * Dynamic nodes have additional restrictions placed upon them. They may only |
|
119 * appear at the top level directory of the file system. In addition, users |
|
120 * cannot create dirents below any level of a dynamic node aside from its special |
|
121 * vnops. |
|
122 * |
|
123 * Profiles |
|
124 * -------- |
|
125 * |
|
126 * Profiles exist for the purpose of non-global zones. They work with the zone |
|
127 * brands and zoneadmd to set up a filter of allowed devices that can appear in |
|
128 * a non-global zone's /dev. These are sent to sdev by means of libdevinfo and a |
|
129 * modctl system call. Specifically it allows one to add patterns of device |
|
130 * paths to include and exclude. It allows for a collection of symlinks to be |
|
131 * added and it allows for remapping names. |
|
132 * |
|
133 * When operating in a non-global zone, several of the sdev vnops are redirected |
|
134 * to the profile versions. These impose additional restrictions such as |
|
135 * enforcing that a non-global zone's /dev is read only. |
|
136 * |
|
137 * sdev_node_t States |
|
138 * ------------------ |
|
139 * |
|
140 * A given sdev_node_t has a field called the sdev_state which describes where |
|
141 * in the sdev life cycle it is. There are three primary states: SDEV_INIT, |
|
142 * SDEV_READY, and SDEV_ZOMBIE. |
|
143 * |
|
144 * SDEV_INIT: When a new /dev file is first looked up, a sdev_node |
|
145 * is allocated, initialized and added to the directory's |
|
146 * sdev_node cache. A node at this state will also |
|
147 * have the SDEV_LOOKUP flag set. |
|
148 * |
|
149 * Other threads that are trying to look up a node at |
|
150 * this state will be blocked until the SDEV_LOOKUP flag |
|
151 * is cleared. |
|
152 * |
|
153 * When the SDEV_LOOKUP flag is cleared, the node may |
|
154 * transition into the SDEV_READY state for a successful |
|
155 * lookup or the node is removed from the directory cache |
|
156 * and destroyed if the named node can not be found. |
|
157 * An ENOENT error is returned for the second case. |
|
158 * |
|
159 * SDEV_READY: A /dev file has been successfully looked up and |
|
160 * associated with a vnode. The /dev file is available |
|
161 * for the supported /dev file system operations. |
|
162 * |
|
163 * SDEV_ZOMBIE: Deletion of a /dev file has been explicitly issued |
|
164 * to an SDEV_READY node. The node is transitioned into |
|
165 * the SDEV_ZOMBIE state if the vnode reference count |
|
166 * is still held. A SDEV_ZOMBIE node does not support |
|
167 * any of the /dev file system operations. A SDEV_ZOMBIE |
|
168 * node is immediately removed from the directory cache |
|
169 * and destroyed once the reference count reaches zero. |
|
170 * |
|
171 * Historically nodes that were marked SDEV_ZOMBIE were not removed from the |
|
172 * underlying directory caches. This has been the source of numerous bugs and |
|
173 * thus to better mimic what happens on a real file system, it is no longer the |
|
174 * case. |
|
175 * |
|
176 * The following state machine describes the life cycle of a given node and its |
|
177 * associated states: |
|
178 * |
|
179 * node is . . . . . |
|
180 * allocated via . +-------------+ . . . . . . . vnode_t refcount |
|
181 * sdev_nodeinit() . | Unallocated | . reaches zero and |
|
182 * +--------*-----| Memory |<--------*---+ sdev_inactive is |
|
183 * | +-------------+ | called. |
|
184 * | +------------^ | |
|
185 * v | | |
|
186 * +-----------+ * . . sdev_nodeready() +-------------+ |
|
187 * | SDEV_INIT | | or related setup | SDEV_ZOMBIE | |
|
188 * +-----------+ | failure +-------------+ |
|
189 * | | ^ |
|
190 * | | +------------+ | |
|
191 * +-*----------->| SDEV_READY |--------*-----+ |
|
192 * . +------------+ . The node is no longer |
|
193 * . . node successfully . . . . . valid or we've been |
|
194 * inserted into the asked to remove it. |
|
195 * directory cache This happens via |
|
196 * and sdev_nodeready() sdev_dirdelete(). |
|
197 * call successful. |
|
198 * |
|
199 * Adding and Removing Dirents, Zombie Nodes |
|
200 * ----------------------------------------- |
|
201 * |
|
202 * As part of doing a lookup, readdir, or an explicit creation operation like |
|
203 * mkdir or create, nodes may be created. Every directory has an avl tree which |
|
204 * contains its children, the sdev_entries tree. This is only used if the type |
|
205 * is VDIR. Access to this is controlled by the sdev_node_t's contents_lock and |
|
206 * it is managed through sdev_cache_update(). |
|
207 * |
|
208 * Every sdev_node_t has a field sdev_state, which describes the current state |
|
209 * of the node. A node is generally speaking in the SDEV_READY state. When it is |
|
210 * there, it can be looked up, accessed, and operations performed on it. When a |
|
211 * node is going to be removed from the directory cache it is marked as a |
|
212 * zombie. Once a node becomes a zombie, no other file system operations will |
|
213 * succeed and it will continue to exist as a node until the vnode count on the |
|
214 * node reaches zero. At that point, the node will be freed. However, once a |
|
215 * node has been marked as a zombie, it will be removed immediately from the |
|
216 * directory cache such that no one else may find it again. This means that |
|
217 * someone else can insert a new entry into that directory with the same name |
|
218 * and without a problem. |
|
219 * |
|
220 * To remove a node, see the section on that in The Rules. |
|
221 * |
|
222 * The Rules |
|
223 * --------- |
|
224 * These are the rules to live by when working in sdev. These are not |
|
225 * exhaustive. |
|
226 * |
|
227 * - Set 1: Working with Backing Nodes |
|
228 * o If there is a SDEV_READY sdev_node_t, it knows about its backing node. |
|
229 * o If we find a backing node when looking up an sdev_node_t for the first |
|
230 * time, we use its attributes to build our sdev_node_t. |
|
231 * o If there is a found backing node, or we create a backing node, that's |
|
232 * when we grab the hold on its vnode. |
|
233 * o If we mark an sdev_node_t a ZOMBIE, we must remove its backing node from |
|
234 * the underlying file system. It must not be searchable or findable. |
|
235 * o We release our hold on the backing node vnode when we destroy the |
|
236 * sdev_node_t. |
|
237 * |
|
238 * - Set 2: Locking rules for sdev (not exhaustive) |
|
239 * o The majority of nodes contain an sdev_contents rw lock. You must hold it |
|
240 * for read or write if manipulating its contents appropriately. |
|
241 * o You must lock your parent before yourself. |
|
242 * o If you need your vnode's v_lock and the sdev_contents rw lock, you must |
|
243 * grab the v_lock before the sdev_contents rw_lock. |
|
244 * o If you release a lock on the node as a part of upgrading it, you must |
|
245 * verify that the node has not become a zombie as a part of this process. |
|
246 * |
|
247 * - Set 3: Zombie Status and What it Means |
|
248 * o If you encounter a node that is a ZOMBIE, that means that it has been |
|
249 * unlinked from the backing store. |
|
250 * o If you release your contents lock and acquire it again (say as part of |
|
251 * trying to grab a write lock) you must check that the node has not become |
|
252 * a zombie. |
|
253 * o You should VERIFY that a looked up node is not a zombie. This follows |
|
254 * from the following logic. To mark something as a zombie means that it is |
|
255 * removed from the parents directory cache. To do that, you must have a |
|
256 * write lock on the parent's sdev_contents. To lookup through that |
|
257 * directory you must have a read lock. This then becomes a simple ordering |
|
258 * problem. If you've been granted the lock then the other operation cannot |
|
259 * be in progress or must have already succeeded. |
|
260 * |
|
261 * - Set 4: Removing Directory Entries (aka making nodes Zombies) |
|
262 * o Write lock must be held on the directory |
|
263 * o Write lock must be held on the node |
|
264 * o Remove the sdev_node_t from its parent cache |
|
265 * o Remove the corresponding backing store node, if it exists, eg. use |
|
266 * VOP_REMOVE or VOP_RMDIR. |
|
267 * o You must NOT make any change in the vnode reference count! Nodes should |
|
268 * only be cleaned up through VOP_INACTIVE callbacks. |
|
269 * o VOP_INACTIVE is the only one responsible for doing the final vn_rele of |
|
270 * the backing store vnode that was grabbed during lookup. |
|
271 * |
|
272 * - Set 5: What Nodes may be Persisted |
|
273 * o The root, /dev is always persisted |
|
274 * o Any node in vtab which is marked SDEV_DYNAMIC, may not be persisted |
|
275 * unless it is also marked SDEV_PERSIST |
|
276 * o Anything whose parent directory is marked SDEV_PERSIST will pass that |
|
277 * along to the child as long as it does not contradict the above rules |
33 */ |
278 */ |
34 |
279 |
35 #include <sys/types.h> |
280 #include <sys/types.h> |
36 #include <sys/param.h> |
281 #include <sys/param.h> |
37 #include <sys/t_lock.h> |
282 #include <sys/t_lock.h> |
714 if (odvp != ndvp) { |
954 if (odvp != ndvp) { |
715 vattr.va_mask = AT_FSID; |
955 vattr.va_mask = AT_FSID; |
716 if (error = VOP_GETATTR(odvp, &vattr, 0, cred, ct)) { |
956 if (error = VOP_GETATTR(odvp, &vattr, 0, cred, ct)) { |
717 mutex_exit(&sdev_lock); |
957 mutex_exit(&sdev_lock); |
718 VN_RELE(ovp); |
958 VN_RELE(ovp); |
|
959 if (nvp != NULL) |
|
960 VN_RELE(nvp); |
719 return (error); |
961 return (error); |
720 } |
962 } |
721 fsid = vattr.va_fsid; |
963 fsid = vattr.va_fsid; |
722 vattr.va_mask = AT_FSID; |
964 vattr.va_mask = AT_FSID; |
723 if (error = VOP_GETATTR(ndvp, &vattr, 0, cred, ct)) { |
965 if (error = VOP_GETATTR(ndvp, &vattr, 0, cred, ct)) { |
724 mutex_exit(&sdev_lock); |
966 mutex_exit(&sdev_lock); |
725 VN_RELE(ovp); |
967 VN_RELE(ovp); |
|
968 if (nvp != NULL) |
|
969 VN_RELE(nvp); |
726 return (error); |
970 return (error); |
727 } |
971 } |
728 if (fsid != vattr.va_fsid) { |
972 if (fsid != vattr.va_fsid) { |
729 mutex_exit(&sdev_lock); |
973 mutex_exit(&sdev_lock); |
730 VN_RELE(ovp); |
974 VN_RELE(ovp); |
|
975 if (nvp != NULL) |
|
976 VN_RELE(nvp); |
731 return (EXDEV); |
977 return (EXDEV); |
732 } |
978 } |
733 } |
979 } |
734 |
980 |
735 /* make sure the old entry can be deleted */ |
981 /* make sure the old entry can be deleted */ |
736 error = VOP_ACCESS(odvp, VWRITE, 0, cred, ct); |
982 error = VOP_ACCESS(odvp, VWRITE, 0, cred, ct); |
737 if (error) { |
983 if (error) { |
738 mutex_exit(&sdev_lock); |
984 mutex_exit(&sdev_lock); |
739 VN_RELE(ovp); |
985 VN_RELE(ovp); |
|
986 if (nvp != NULL) |
|
987 VN_RELE(nvp); |
740 return (error); |
988 return (error); |
741 } |
989 } |
742 |
990 |
743 /* make sure the destination allows creation */ |
991 /* make sure the destination allows creation */ |
744 samedir = (fromparent == toparent); |
992 samedir = (fromparent == toparent); |
745 if (!samedir) { |
993 if (!samedir) { |
746 error = VOP_ACCESS(ndvp, VEXEC|VWRITE, 0, cred, ct); |
994 error = VOP_ACCESS(ndvp, VEXEC|VWRITE, 0, cred, ct); |
747 if (error) { |
995 if (error) { |
748 mutex_exit(&sdev_lock); |
996 mutex_exit(&sdev_lock); |
749 VN_RELE(ovp); |
997 VN_RELE(ovp); |
|
998 if (nvp != NULL) |
|
999 VN_RELE(nvp); |
750 return (error); |
1000 return (error); |
751 } |
1001 } |
752 } |
1002 } |
753 |
1003 |
754 fromdv = VTOSDEV(ovp); |
1004 fromdv = VTOSDEV(ovp); |
755 ASSERT(fromdv); |
1005 ASSERT(fromdv); |
756 |
1006 |
757 /* destination file exists */ |
1007 /* destination file exists */ |
758 if (nvp) { |
1008 if (nvp != NULL) { |
759 todv = VTOSDEV(nvp); |
1009 todv = VTOSDEV(nvp); |
760 ASSERT(todv); |
1010 ASSERT(todv); |
761 } |
1011 } |
762 |
1012 |
|
1013 if ((fromdv->sdev_flags & SDEV_DYNAMIC) != 0 || |
|
1014 (todv != NULL && (todv->sdev_flags & SDEV_DYNAMIC) != 0)) { |
|
1015 mutex_exit(&sdev_lock); |
|
1016 if (nvp != NULL) |
|
1017 VN_RELE(nvp); |
|
1018 VN_RELE(ovp); |
|
1019 return (EACCES); |
|
1020 } |
|
1021 |
763 /* |
1022 /* |
764 * link source to new target in the memory |
1023 * link source to new target in the memory. Regardless of failure, we |
|
1024 * must rele our hold on nvp. |
765 */ |
1025 */ |
766 error = sdev_rnmnode(fromparent, fromdv, toparent, &todv, nnm, cred); |
1026 error = sdev_rnmnode(fromparent, fromdv, toparent, &todv, nnm, cred); |
|
1027 if (nvp != NULL) |
|
1028 VN_RELE(nvp); |
767 if (error) { |
1029 if (error) { |
768 sdcmn_err2(("sdev_rename: renaming %s to %s failed " |
1030 sdcmn_err2(("sdev_rename: renaming %s to %s failed " |
769 " with error %d\n", onm, nnm, error)); |
1031 " with error %d\n", onm, nnm, error)); |
770 mutex_exit(&sdev_lock); |
1032 mutex_exit(&sdev_lock); |
771 if (nvp) |
|
772 VN_RELE(nvp); |
|
773 VN_RELE(ovp); |
1033 VN_RELE(ovp); |
774 return (error); |
1034 return (error); |
775 } |
1035 } |
776 |
1036 |
777 /* |
1037 /* |