author | amw |
Thu, 25 Oct 2007 16:34:29 -0700 | |
changeset 5331 | 3047ad28a67b |
parent 4577 | ed36b0e652bc |
child 5446 | 51fbc14b301d |
permissions | -rw-r--r-- |
789 | 1 |
/* |
2 |
* CDDL HEADER START |
|
3 |
* |
|
4 |
* The contents of this file are subject to the terms of the |
|
1484
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
5 |
* Common Development and Distribution License (the "License"). |
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
6 |
* You may not use this file except in compliance with the License. |
789 | 7 |
* |
8 |
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
|
9 |
* or http://www.opensolaris.org/os/licensing. |
|
10 |
* See the License for the specific language governing permissions |
|
11 |
* and limitations under the License. |
|
12 |
* |
|
13 |
* When distributing Covered Code, include this CDDL HEADER in each |
|
14 |
* file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
|
15 |
* If applicable, add the following below this CDDL HEADER, with the |
|
16 |
* fields enclosed by brackets "[]" replaced with your own identifying |
|
17 |
* information: Portions Copyright [yyyy] [name of copyright owner] |
|
18 |
* |
|
19 |
* CDDL HEADER END |
|
20 |
*/ |
|
21 |
/* |
|
3461 | 22 |
* Copyright 2007 Sun Microsystems, Inc. All rights reserved. |
789 | 23 |
* Use is subject to license terms. |
24 |
*/ |
|
25 |
||
26 |
#pragma ident "%Z%%M% %I% %E% SMI" |
|
27 |
||
28 |
#include <sys/types.h> |
|
29 |
#include <sys/param.h> |
|
30 |
#include <sys/time.h> |
|
31 |
#include <sys/systm.h> |
|
32 |
#include <sys/sysmacros.h> |
|
33 |
#include <sys/resource.h> |
|
34 |
#include <sys/vfs.h> |
|
35 |
#include <sys/vnode.h> |
|
36 |
#include <sys/file.h> |
|
37 |
#include <sys/mode.h> |
|
38 |
#include <sys/kmem.h> |
|
39 |
#include <sys/uio.h> |
|
40 |
#include <sys/pathname.h> |
|
41 |
#include <sys/cmn_err.h> |
|
42 |
#include <sys/errno.h> |
|
43 |
#include <sys/stat.h> |
|
44 |
#include <sys/unistd.h> |
|
45 |
#include <sys/random.h> |
|
46 |
#include <sys/policy.h> |
|
47 |
#include <sys/zfs_dir.h> |
|
48 |
#include <sys/zfs_acl.h> |
|
49 |
#include <sys/fs/zfs.h> |
|
50 |
#include "fs/fs_subr.h" |
|
51 |
#include <sys/zap.h> |
|
52 |
#include <sys/dmu.h> |
|
53 |
#include <sys/atomic.h> |
|
54 |
#include <sys/zfs_ctldir.h> |
|
5331 | 55 |
#include <sys/zfs_fuid.h> |
1484
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
56 |
#include <sys/dnlc.h> |
5331 | 57 |
#include <sys/extdirent.h> |
58 |
#include <sys/zfs_i18n.h> |
|
59 |
||
60 |
/* |
|
61 |
* zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups |
|
62 |
* of names after deciding which is the appropriate lookup interface. |
|
63 |
*/ |
|
64 |
static int |
|
65 |
zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact, |
|
66 |
boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid) |
|
67 |
{ |
|
68 |
int error; |
|
69 |
||
70 |
if (zfsvfs->z_norm) { |
|
71 |
matchtype_t mt = MT_FIRST; |
|
72 |
boolean_t conflict = B_FALSE; |
|
73 |
size_t bufsz = 0; |
|
74 |
char *buf = NULL; |
|
75 |
||
76 |
if (rpnp) { |
|
77 |
buf = rpnp->pn_path; |
|
78 |
bufsz = rpnp->pn_bufsize; |
|
79 |
} |
|
80 |
if (exact) |
|
81 |
mt = MT_EXACT; |
|
82 |
/* |
|
83 |
* In the non-mixed case we only expect there would ever |
|
84 |
* be one match, but we need to use the normalizing lookup. |
|
85 |
*/ |
|
86 |
error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, |
|
87 |
zoid, mt, buf, bufsz, &conflict); |
|
88 |
if (deflags) |
|
89 |
*deflags = conflict ? ED_CASE_CONFLICT : 0; |
|
90 |
} else { |
|
91 |
error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); |
|
92 |
} |
|
93 |
*zoid = ZFS_DIRENT_OBJ(*zoid); |
|
94 |
||
95 |
if (error == ENOENT && update) |
|
96 |
dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE); |
|
97 |
||
98 |
return (error); |
|
99 |
} |
|
789 | 100 |
|
101 |
/* |
|
102 |
* Lock a directory entry. A dirlock on <dzp, name> protects that name |
|
103 |
* in dzp's directory zap object. As long as you hold a dirlock, you can |
|
104 |
* assume two things: (1) dzp cannot be reaped, and (2) no other thread |
|
105 |
* can change the zap entry for (i.e. link or unlink) this name. |
|
106 |
* |
|
107 |
* Input arguments: |
|
108 |
* dzp - znode for directory |
|
109 |
* name - name of entry to lock |
|
110 |
* flag - ZNEW: if the entry already exists, fail with EEXIST. |
|
111 |
* ZEXISTS: if the entry does not exist, fail with ENOENT. |
|
112 |
* ZSHARED: allow concurrent access with other ZSHARED callers. |
|
113 |
* ZXATTR: we want dzp's xattr directory |
|
5331 | 114 |
* ZCILOOK: On a mixed sensitivity file system, |
115 |
* this lookup should be case-insensitive. |
|
116 |
* ZCIEXACT: On a purely case-insensitive file system, |
|
117 |
* this lookup should be case-sensitive. |
|
118 |
* ZRENAMING: we are locking for renaming, force narrow locks |
|
789 | 119 |
* |
120 |
* Output arguments: |
|
121 |
* zpp - pointer to the znode for the entry (NULL if there isn't one) |
|
122 |
* dlpp - pointer to the dirlock for this entry (NULL on error) |
|
5331 | 123 |
* direntflags - (case-insensitive lookup only) |
124 |
* flags if multiple case-sensitive matches exist in directory |
|
125 |
* realpnp - (case-insensitive lookup only) |
|
126 |
* actual name matched within the directory |
|
789 | 127 |
* |
128 |
* Return value: 0 on success or errno on failure. |
|
129 |
* |
|
130 |
* NOTE: Always checks for, and rejects, '.' and '..'. |
|
5331 | 131 |
* NOTE: For case-insensitive file systems we take wide locks (see below), |
132 |
* but return znode pointers to a single match. |
|
789 | 133 |
*/ |
134 |
int |
|
135 |
zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, |
|
5331 | 136 |
int flag, int *direntflags, pathname_t *realpnp) |
789 | 137 |
{ |
138 |
zfsvfs_t *zfsvfs = dzp->z_zfsvfs; |
|
139 |
zfs_dirlock_t *dl; |
|
5331 | 140 |
boolean_t update; |
141 |
boolean_t exact; |
|
789 | 142 |
uint64_t zoid; |
5331 | 143 |
vnode_t *vp = NULL; |
144 |
int error = 0; |
|
145 |
int cmpflags; |
|
789 | 146 |
|
147 |
*zpp = NULL; |
|
148 |
*dlpp = NULL; |
|
149 |
||
150 |
/* |
|
151 |
* Verify that we are not trying to lock '.', '..', or '.zfs' |
|
152 |
*/ |
|
153 |
if (name[0] == '.' && |
|
154 |
(name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) || |
|
155 |
zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) |
|
156 |
return (EEXIST); |
|
157 |
||
158 |
/* |
|
5331 | 159 |
* Case sensitivity and normalization preferences are set when |
160 |
* the file system is created. These are stored in the |
|
161 |
* zfsvfs->z_case and zfsvfs->z_norm fields. These choices |
|
162 |
* affect what vnodes can be cached in the DNLC, how we |
|
163 |
* perform zap lookups, and the "width" of our dirlocks. |
|
164 |
* |
|
165 |
* A normal dirlock locks a single name. Note that with |
|
166 |
* normalization a name can be composed multiple ways, but |
|
167 |
* when normalized, these names all compare equal. A wide |
|
168 |
* dirlock locks multiple names. We need these when the file |
|
169 |
* system is supporting mixed-mode access. It is sometimes |
|
170 |
* necessary to lock all case permutations of file name at |
|
171 |
* once so that simultaneous case-insensitive/case-sensitive |
|
172 |
* behaves as rationally as possible. |
|
173 |
*/ |
|
174 |
||
175 |
/* |
|
176 |
* Decide if exact matches should be requested when performing |
|
177 |
* a zap lookup on file systems supporting case-insensitive |
|
178 |
* access. |
|
179 |
*/ |
|
180 |
exact = ((zfsvfs->z_case & ZFS_CI_ONLY) && (flag & ZCIEXACT)) || |
|
181 |
((zfsvfs->z_case & ZFS_CI_MIXD) && !(flag & ZCILOOK)); |
|
182 |
||
183 |
/* |
|
184 |
* Only look in or update the DNLC if we are looking for the |
|
185 |
* name on a file system that does not require normalization |
|
186 |
* or case folding. We can also look there if we happen to be |
|
187 |
* on a non-normalizing, mixed sensitivity file system IF we |
|
188 |
* are looking for the exact name. |
|
189 |
* |
|
190 |
* Maybe can add TO-UPPERed version of name to dnlc in ci-only |
|
191 |
* case for performance improvement? |
|
192 |
*/ |
|
193 |
update = !zfsvfs->z_norm || |
|
194 |
((zfsvfs->z_case & ZFS_CI_MIXD) && |
|
195 |
!(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); |
|
196 |
||
197 |
/* |
|
198 |
* ZRENAMING indicates we are in a situation where we should |
|
199 |
* take narrow locks regardless of the file system's |
|
200 |
* preferences for normalizing and case folding. This will |
|
201 |
* prevent us deadlocking trying to grab the same wide lock |
|
202 |
* twice if the two names happen to be case-insensitive |
|
203 |
* matches. |
|
204 |
*/ |
|
205 |
if (flag & ZRENAMING) |
|
206 |
cmpflags = 0; |
|
207 |
else |
|
208 |
cmpflags = zfsvfs->z_norm; |
|
209 |
||
210 |
/* |
|
789 | 211 |
* Wait until there are no locks on this name. |
212 |
*/ |
|
3897
278bade789ba
6437750 panic: db->db_buf==0||arc_referenced(db->db_buf), file: dbuf.c,line:1539
maybee
parents:
3713
diff
changeset
|
213 |
rw_enter(&dzp->z_name_lock, RW_READER); |
789 | 214 |
mutex_enter(&dzp->z_lock); |
215 |
for (;;) { |
|
3461 | 216 |
if (dzp->z_unlinked) { |
789 | 217 |
mutex_exit(&dzp->z_lock); |
3897
278bade789ba
6437750 panic: db->db_buf==0||arc_referenced(db->db_buf), file: dbuf.c,line:1539
maybee
parents:
3713
diff
changeset
|
218 |
rw_exit(&dzp->z_name_lock); |
789 | 219 |
return (ENOENT); |
220 |
} |
|
5331 | 221 |
for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { |
222 |
if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, |
|
223 |
U8_UNICODE_LATEST, &error) == 0) || error != 0) |
|
789 | 224 |
break; |
5331 | 225 |
} |
226 |
if (error != 0) { |
|
227 |
mutex_exit(&dzp->z_lock); |
|
228 |
rw_exit(&dzp->z_name_lock); |
|
229 |
return (ENOENT); |
|
230 |
} |
|
789 | 231 |
if (dl == NULL) { |
232 |
/* |
|
233 |
* Allocate a new dirlock and add it to the list. |
|
234 |
*/ |
|
235 |
dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP); |
|
236 |
cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); |
|
237 |
dl->dl_name = name; |
|
238 |
dl->dl_sharecnt = 0; |
|
239 |
dl->dl_namesize = 0; |
|
240 |
dl->dl_dzp = dzp; |
|
241 |
dl->dl_next = dzp->z_dirlocks; |
|
242 |
dzp->z_dirlocks = dl; |
|
243 |
break; |
|
244 |
} |
|
245 |
if ((flag & ZSHARED) && dl->dl_sharecnt != 0) |
|
246 |
break; |
|
247 |
cv_wait(&dl->dl_cv, &dzp->z_lock); |
|
248 |
} |
|
249 |
||
250 |
if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { |
|
251 |
/* |
|
252 |
* We're the second shared reference to dl. Make a copy of |
|
253 |
* dl_name in case the first thread goes away before we do. |
|
254 |
* Note that we initialize the new name before storing its |
|
255 |
* pointer into dl_name, because the first thread may load |
|
256 |
* dl->dl_name at any time. He'll either see the old value, |
|
257 |
* which is his, or the new shared copy; either is OK. |
|
258 |
*/ |
|
259 |
dl->dl_namesize = strlen(dl->dl_name) + 1; |
|
260 |
name = kmem_alloc(dl->dl_namesize, KM_SLEEP); |
|
261 |
bcopy(dl->dl_name, name, dl->dl_namesize); |
|
262 |
dl->dl_name = name; |
|
263 |
} |
|
264 |
||
265 |
mutex_exit(&dzp->z_lock); |
|
266 |
||
267 |
/* |
|
268 |
* We have a dirlock on the name. (Note that it is the dirlock, |
|
269 |
* not the dzp's z_lock, that protects the name in the zap object.) |
|
270 |
* See if there's an object by this name; if so, put a hold on it. |
|
271 |
*/ |
|
272 |
if (flag & ZXATTR) { |
|
273 |
zoid = dzp->z_phys->zp_xattr; |
|
274 |
error = (zoid == 0 ? ENOENT : 0); |
|
275 |
} else { |
|
5331 | 276 |
if (update) |
277 |
vp = dnlc_lookup(ZTOV(dzp), name); |
|
1484
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
278 |
if (vp == DNLC_NO_VNODE) { |
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
279 |
VN_RELE(vp); |
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
280 |
error = ENOENT; |
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
281 |
} else if (vp) { |
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
282 |
if (flag & ZNEW) { |
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
283 |
zfs_dirent_unlock(dl); |
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
284 |
VN_RELE(vp); |
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
285 |
return (EEXIST); |
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
286 |
} |
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
287 |
*dlpp = dl; |
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
288 |
*zpp = VTOZ(vp); |
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
289 |
return (0); |
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
290 |
} else { |
5331 | 291 |
error = zfs_match_find(zfsvfs, dzp, name, exact, |
292 |
update, direntflags, realpnp, &zoid); |
|
1484
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
293 |
} |
789 | 294 |
} |
295 |
if (error) { |
|
296 |
if (error != ENOENT || (flag & ZEXISTS)) { |
|
297 |
zfs_dirent_unlock(dl); |
|
298 |
return (error); |
|
299 |
} |
|
300 |
} else { |
|
301 |
if (flag & ZNEW) { |
|
302 |
zfs_dirent_unlock(dl); |
|
303 |
return (EEXIST); |
|
304 |
} |
|
305 |
error = zfs_zget(zfsvfs, zoid, zpp); |
|
306 |
if (error) { |
|
307 |
zfs_dirent_unlock(dl); |
|
308 |
return (error); |
|
309 |
} |
|
5331 | 310 |
if (!(flag & ZXATTR) && update) |
1484
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
311 |
dnlc_update(ZTOV(dzp), name, ZTOV(*zpp)); |
789 | 312 |
} |
313 |
||
314 |
*dlpp = dl; |
|
315 |
||
316 |
return (0); |
|
317 |
} |
|
318 |
||
319 |
/* |
|
320 |
* Unlock this directory entry and wake anyone who was waiting for it. |
|
321 |
*/ |
|
322 |
void |
|
323 |
zfs_dirent_unlock(zfs_dirlock_t *dl) |
|
324 |
{ |
|
325 |
znode_t *dzp = dl->dl_dzp; |
|
326 |
zfs_dirlock_t **prev_dl, *cur_dl; |
|
327 |
||
328 |
mutex_enter(&dzp->z_lock); |
|
3897
278bade789ba
6437750 panic: db->db_buf==0||arc_referenced(db->db_buf), file: dbuf.c,line:1539
maybee
parents:
3713
diff
changeset
|
329 |
rw_exit(&dzp->z_name_lock); |
789 | 330 |
if (dl->dl_sharecnt > 1) { |
331 |
dl->dl_sharecnt--; |
|
332 |
mutex_exit(&dzp->z_lock); |
|
333 |
return; |
|
334 |
} |
|
335 |
prev_dl = &dzp->z_dirlocks; |
|
336 |
while ((cur_dl = *prev_dl) != dl) |
|
337 |
prev_dl = &cur_dl->dl_next; |
|
338 |
*prev_dl = dl->dl_next; |
|
339 |
cv_broadcast(&dl->dl_cv); |
|
340 |
mutex_exit(&dzp->z_lock); |
|
341 |
||
342 |
if (dl->dl_namesize != 0) |
|
343 |
kmem_free(dl->dl_name, dl->dl_namesize); |
|
344 |
cv_destroy(&dl->dl_cv); |
|
345 |
kmem_free(dl, sizeof (*dl)); |
|
346 |
} |
|
347 |
||
348 |
/* |
|
349 |
* Look up an entry in a directory. |
|
350 |
* |
|
351 |
* NOTE: '.' and '..' are handled as special cases because |
|
352 |
* no directory entries are actually stored for them. If this is |
|
353 |
* the root of a filesystem, then '.zfs' is also treated as a |
|
354 |
* special pseudo-directory. |
|
355 |
*/ |
|
356 |
int |
|
5331 | 357 |
zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags, |
358 |
int *deflg, pathname_t *rpnp) |
|
789 | 359 |
{ |
360 |
zfs_dirlock_t *dl; |
|
361 |
znode_t *zp; |
|
362 |
int error = 0; |
|
363 |
||
364 |
if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { |
|
365 |
*vpp = ZTOV(dzp); |
|
366 |
VN_HOLD(*vpp); |
|
367 |
} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { |
|
368 |
zfsvfs_t *zfsvfs = dzp->z_zfsvfs; |
|
369 |
/* |
|
370 |
* If we are a snapshot mounted under .zfs, return |
|
371 |
* the vp for the snapshot directory. |
|
372 |
*/ |
|
1878
c22df0f5603f
6413573 deadlock between fsflush() and zfs_create()
maybee
parents:
1544
diff
changeset
|
373 |
if (dzp->z_phys->zp_parent == dzp->z_id && |
c22df0f5603f
6413573 deadlock between fsflush() and zfs_create()
maybee
parents:
1544
diff
changeset
|
374 |
zfsvfs->z_parent != zfsvfs) { |
789 | 375 |
error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, |
5331 | 376 |
"snapshot", vpp, NULL, 0, NULL, kcred, |
377 |
NULL, NULL, NULL); |
|
789 | 378 |
return (error); |
379 |
} |
|
380 |
rw_enter(&dzp->z_parent_lock, RW_READER); |
|
381 |
error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp); |
|
382 |
if (error == 0) |
|
383 |
*vpp = ZTOV(zp); |
|
384 |
rw_exit(&dzp->z_parent_lock); |
|
385 |
} else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { |
|
386 |
*vpp = zfsctl_root(dzp); |
|
387 |
} else { |
|
5331 | 388 |
int zf; |
389 |
||
390 |
zf = ZEXISTS | ZSHARED; |
|
391 |
if (flags & FIGNORECASE) |
|
392 |
zf |= ZCILOOK; |
|
393 |
||
394 |
error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); |
|
789 | 395 |
if (error == 0) { |
396 |
*vpp = ZTOV(zp); |
|
397 |
zfs_dirent_unlock(dl); |
|
869
dc133b87dfb3
6297285 znode prefetching in zfs_readdir causes 5x performance degradation for 'ls'
perrin
parents:
789
diff
changeset
|
398 |
dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ |
789 | 399 |
} |
5331 | 400 |
rpnp = NULL; |
789 | 401 |
} |
402 |
||
5331 | 403 |
if ((flags & FIGNORECASE) && rpnp) |
404 |
(void) strlcpy(rpnp->pn_path, name, rpnp->pn_bufsize); |
|
405 |
||
789 | 406 |
return (error); |
407 |
} |
|
408 |
||
409 |
/*
 * Format x as lower-case hexadecimal, without leading zeros, into namebuf.
 *
 * The digits are written right-aligned: the terminating NUL goes into
 * namebuf[16] and the digits are filled in backwards from namebuf[15].
 * The return value points at the first digit, which lies inside namebuf
 * but is not necessarily namebuf itself.  A 64-bit value needs at most
 * 16 digits, so the 17-byte buffer is always large enough.
 */
static char *
zfs_unlinked_hexname(char namebuf[17], uint64_t x)
{
	/* Shared read-only table; not rebuilt on the stack for every call. */
	static const char digits[] = "0123456789abcdef";
	char *name = &namebuf[16];

	*name = '\0';
	do {
		/* Emit the low nibble, then shift it away. */
		*--name = digits[x & 0xf];
		x >>= 4;
	} while (x != 0);

	return (name);
}
|
423 |
||
1544 | 424 |
/* |
3461 | 425 |
* unlinked Set (formerly known as the "delete queue") Error Handling |
1544 | 426 |
* |
3461 | 427 |
* When dealing with the unlinked set, we dmu_tx_hold_zap(), but we |
1544 | 428 |
* don't specify the name of the entry that we will be manipulating. We |
429 |
* also fib and say that we won't be adding any new entries to the |
|
3461 | 430 |
* unlinked set, even though we might (this is to lower the minimum file |
1544 | 431 |
* size that can be deleted in a full filesystem). So on the small |
3461 | 432 |
* chance that the nlink list is using a fat zap (ie. has more than |
1544 | 433 |
* 2000 entries), we *may* not pre-read a block that's needed. |
434 |
* Therefore it is remotely possible for some of the assertions |
|
3461 | 435 |
* regarding the unlinked set below to fail due to i/o error. On a |
1544 | 436 |
* nondebug system, this will result in the space being leaked. |
437 |
*/ |
|
789 | 438 |
void |
3461 | 439 |
zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) |
789 | 440 |
{ |
441 |
zfsvfs_t *zfsvfs = zp->z_zfsvfs; |
|
442 |
char obj_name[17]; |
|
443 |
int error; |
|
444 |
||
3461 | 445 |
ASSERT(zp->z_unlinked); |
789 | 446 |
ASSERT3U(zp->z_phys->zp_links, ==, 0); |
447 |
||
3461 | 448 |
error = zap_add(zfsvfs->z_os, zfsvfs->z_unlinkedobj, |
449 |
zfs_unlinked_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx); |
|
789 | 450 |
ASSERT3U(error, ==, 0); |
451 |
} |
|
452 |
||
453 |
/* |
|
3461 | 454 |
* Clean up any znodes that had no links when we either crashed or |
455 |
* (force) umounted the file system. |
|
456 |
*/ |
|
457 |
void |
|
458 |
zfs_unlinked_drain(zfsvfs_t *zfsvfs) |
|
459 |
{ |
|
460 |
zap_cursor_t zc; |
|
461 |
zap_attribute_t zap; |
|
462 |
dmu_object_info_t doi; |
|
463 |
znode_t *zp; |
|
464 |
int error; |
|
465 |
||
466 |
/* |
|
467 |
* Interate over the contents of the unlinked set. |
|
468 |
*/ |
|
469 |
for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); |
|
470 |
zap_cursor_retrieve(&zc, &zap) == 0; |
|
471 |
zap_cursor_advance(&zc)) { |
|
472 |
||
473 |
/* |
|
474 |
* See what kind of object we have in list |
|
475 |
*/ |
|
476 |
||
477 |
error = dmu_object_info(zfsvfs->z_os, |
|
478 |
zap.za_first_integer, &doi); |
|
479 |
if (error != 0) |
|
480 |
continue; |
|
481 |
||
482 |
ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || |
|
483 |
(doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); |
|
484 |
/* |
|
485 |
* We need to re-mark these list entries for deletion, |
|
486 |
* so we pull them back into core and set zp->z_unlinked. |
|
487 |
*/ |
|
488 |
error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); |
|
489 |
||
490 |
/* |
|
491 |
* We may pick up znodes that are already marked for deletion. |
|
492 |
* This could happen during the purge of an extended attribute |
|
493 |
* directory. All we need to do is skip over them, since they |
|
494 |
* are already in the system marked z_unlinked. |
|
495 |
*/ |
|
496 |
if (error != 0) |
|
497 |
continue; |
|
498 |
||
499 |
zp->z_unlinked = B_TRUE; |
|
500 |
VN_RELE(ZTOV(zp)); |
|
501 |
} |
|
502 |
zap_cursor_fini(&zc); |
|
503 |
} |
|
504 |
||
505 |
/* |
|
789 | 506 |
* Delete the entire contents of a directory. Return a count |
507 |
* of the number of entries that could not be deleted. |
|
508 |
* |
|
509 |
* NOTE: this function assumes that the directory is inactive, |
|
510 |
* so there is no need to lock its entries before deletion. |
|
511 |
* Also, it assumes the directory contents is *only* regular |
|
512 |
* files. |
|
513 |
*/ |
|
514 |
static int |
|
515 |
zfs_purgedir(znode_t *dzp) |
|
516 |
{ |
|
517 |
zap_cursor_t zc; |
|
518 |
zap_attribute_t zap; |
|
519 |
znode_t *xzp; |
|
520 |
dmu_tx_t *tx; |
|
521 |
zfsvfs_t *zfsvfs = dzp->z_zfsvfs; |
|
522 |
zfs_dirlock_t dl; |
|
523 |
int skipped = 0; |
|
524 |
int error; |
|
525 |
||
526 |
for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); |
|
527 |
(error = zap_cursor_retrieve(&zc, &zap)) == 0; |
|
528 |
zap_cursor_advance(&zc)) { |
|
3912 | 529 |
error = zfs_zget(zfsvfs, |
530 |
ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); |
|
789 | 531 |
ASSERT3U(error, ==, 0); |
532 |
||
533 |
ASSERT((ZTOV(xzp)->v_type == VREG) || |
|
534 |
(ZTOV(xzp)->v_type == VLNK)); |
|
535 |
||
536 |
tx = dmu_tx_create(zfsvfs->z_os); |
|
537 |
dmu_tx_hold_bonus(tx, dzp->z_id); |
|
1544 | 538 |
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); |
789 | 539 |
dmu_tx_hold_bonus(tx, xzp->z_id); |
3461 | 540 |
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); |
789 | 541 |
error = dmu_tx_assign(tx, TXG_WAIT); |
542 |
if (error) { |
|
543 |
dmu_tx_abort(tx); |
|
544 |
VN_RELE(ZTOV(xzp)); |
|
545 |
skipped += 1; |
|
546 |
continue; |
|
547 |
} |
|
548 |
bzero(&dl, sizeof (dl)); |
|
549 |
dl.dl_dzp = dzp; |
|
550 |
dl.dl_name = zap.za_name; |
|
551 |
||
552 |
error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); |
|
553 |
ASSERT3U(error, ==, 0); |
|
554 |
dmu_tx_commit(tx); |
|
555 |
||
556 |
VN_RELE(ZTOV(xzp)); |
|
557 |
} |
|
885
d925b21dba78
6347493 tar of 25K empty directory entries in ZFS takes 30+ seconds ...
ahrens
parents:
869
diff
changeset
|
558 |
zap_cursor_fini(&zc); |
789 | 559 |
ASSERT(error == ENOENT); |
560 |
return (skipped); |
|
561 |
} |
|
562 |
||
563 |
void |
|
564 |
zfs_rmnode(znode_t *zp) |
|
565 |
{ |
|
566 |
zfsvfs_t *zfsvfs = zp->z_zfsvfs; |
|
567 |
objset_t *os = zfsvfs->z_os; |
|
568 |
znode_t *xzp = NULL; |
|
569 |
char obj_name[17]; |
|
570 |
dmu_tx_t *tx; |
|
571 |
uint64_t acl_obj; |
|
572 |
int error; |
|
573 |
||
574 |
ASSERT(ZTOV(zp)->v_count == 0); |
|
575 |
ASSERT(zp->z_phys->zp_links == 0); |
|
576 |
||
577 |
/* |
|
578 |
* If this is an attribute directory, purge its contents. |
|
579 |
*/ |
|
3461 | 580 |
if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) { |
789 | 581 |
if (zfs_purgedir(zp) != 0) { |
582 |
/* |
|
3461 | 583 |
* Not enough space to delete some xattrs. |
584 |
* Leave it on the unlinked set. |
|
789 | 585 |
*/ |
586 |
return; |
|
587 |
} |
|
3461 | 588 |
} |
789 | 589 |
|
590 |
/* |
|
3461 | 591 |
* If the file has extended attributes, we're going to unlink |
592 |
* the xattr dir. |
|
789 | 593 |
*/ |
594 |
if (zp->z_phys->zp_xattr) { |
|
595 |
error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); |
|
596 |
ASSERT(error == 0); |
|
597 |
} |
|
598 |
||
599 |
acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; |
|
600 |
||
601 |
/* |
|
602 |
* Set up the transaction. |
|
603 |
*/ |
|
604 |
tx = dmu_tx_create(os); |
|
605 |
dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); |
|
3461 | 606 |
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); |
789 | 607 |
if (xzp) { |
608 |
dmu_tx_hold_bonus(tx, xzp->z_id); |
|
3461 | 609 |
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); |
789 | 610 |
} |
611 |
if (acl_obj) |
|
612 |
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); |
|
613 |
error = dmu_tx_assign(tx, TXG_WAIT); |
|
614 |
if (error) { |
|
3461 | 615 |
/* |
616 |
* Not enough space to delete the file. Leave it in the |
|
617 |
* unlinked set, leaking it until the fs is remounted (at |
|
618 |
* which point we'll call zfs_unlinked_drain() to process it). |
|
619 |
*/ |
|
789 | 620 |
dmu_tx_abort(tx); |
621 |
return; |
|
622 |
} |
|
623 |
||
624 |
if (xzp) { |
|
625 |
dmu_buf_will_dirty(xzp->z_dbuf, tx); |
|
626 |
mutex_enter(&xzp->z_lock); |
|
3461 | 627 |
xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ |
789 | 628 |
xzp->z_phys->zp_links = 0; /* no more links to it */ |
629 |
mutex_exit(&xzp->z_lock); |
|
3461 | 630 |
zfs_unlinked_add(xzp, tx); |
789 | 631 |
} |
632 |
||
3461 | 633 |
/* Remove this znode from the unlinked set */ |
634 |
error = zap_remove(os, zfsvfs->z_unlinkedobj, |
|
635 |
zfs_unlinked_hexname(obj_name, zp->z_id), tx); |
|
789 | 636 |
ASSERT3U(error, ==, 0); |
637 |
||
638 |
zfs_znode_delete(zp, tx); |
|
639 |
||
640 |
dmu_tx_commit(tx); |
|
641 |
||
642 |
if (xzp) |
|
643 |
VN_RELE(ZTOV(xzp)); |
|
644 |
} |
|
645 |
||
4577 | 646 |
static uint64_t |
647 |
zfs_dirent(znode_t *zp) |
|
648 |
{ |
|
649 |
uint64_t de = zp->z_id; |
|
650 |
if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) |
|
651 |
de |= IFTODT((zp)->z_phys->zp_mode) << 60; |
|
652 |
return (de); |
|
653 |
} |
|
654 |
||
789 | 655 |
/* |
3461 | 656 |
* Link zp into dl. Can only fail if zp has been unlinked. |
789 | 657 |
*/ |
658 |
int |
|
659 |
zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) |
|
660 |
{ |
|
661 |
znode_t *dzp = dl->dl_dzp; |
|
662 |
vnode_t *vp = ZTOV(zp); |
|
3912 | 663 |
uint64_t value; |
789 | 664 |
int zp_is_dir = (vp->v_type == VDIR); |
665 |
int error; |
|
666 |
||
667 |
dmu_buf_will_dirty(zp->z_dbuf, tx); |
|
668 |
mutex_enter(&zp->z_lock); |
|
669 |
||
670 |
if (!(flag & ZRENAMING)) { |
|
3461 | 671 |
if (zp->z_unlinked) { /* no new links to unlinked zp */ |
789 | 672 |
ASSERT(!(flag & (ZNEW | ZEXISTS))); |
673 |
mutex_exit(&zp->z_lock); |
|
674 |
return (ENOENT); |
|
675 |
} |
|
676 |
zp->z_phys->zp_links++; |
|
677 |
} |
|
678 |
zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */ |
|
679 |
||
680 |
if (!(flag & ZNEW)) |
|
681 |
zfs_time_stamper_locked(zp, STATE_CHANGED, tx); |
|
682 |
mutex_exit(&zp->z_lock); |
|
683 |
||
684 |
dmu_buf_will_dirty(dzp->z_dbuf, tx); |
|
685 |
mutex_enter(&dzp->z_lock); |
|
686 |
dzp->z_phys->zp_size++; /* one dirent added */ |
|
687 |
dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */ |
|
688 |
zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); |
|
689 |
mutex_exit(&dzp->z_lock); |
|
690 |
||
4577 | 691 |
value = zfs_dirent(zp); |
789 | 692 |
error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, |
3912 | 693 |
8, 1, &value, tx); |
789 | 694 |
ASSERT(error == 0); |
695 |
||
1484
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
696 |
dnlc_update(ZTOV(dzp), dl->dl_name, vp); |
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
697 |
|
789 | 698 |
return (0); |
699 |
} |
|
700 |
||
701 |
/* |
|
3461 | 702 |
* Unlink zp from dl, and mark zp for deletion if this was the last link. |
789 | 703 |
* Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). |
3461 | 704 |
* If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. |
705 |
* If it's non-NULL, we use it to indicate whether the znode needs deletion, |
|
789 | 706 |
* and it's the caller's job to do it. |
707 |
*/ |
|
708 |
int |
|
709 |
zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, |
|
3461 | 710 |
boolean_t *unlinkedp) |
789 | 711 |
{ |
712 |
znode_t *dzp = dl->dl_dzp; |
|
713 |
vnode_t *vp = ZTOV(zp); |
|
714 |
int zp_is_dir = (vp->v_type == VDIR); |
|
3461 | 715 |
boolean_t unlinked = B_FALSE; |
789 | 716 |
int error; |
717 |
||
1484
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
718 |
dnlc_remove(ZTOV(dzp), dl->dl_name); |
d330e98f8ed7
6350001 ZFS lookup performance still much slower than UFS : help tar : help spec SFS
ek110237
parents:
1231
diff
changeset
|
719 |
|
789 | 720 |
if (!(flag & ZRENAMING)) { |
721 |
dmu_buf_will_dirty(zp->z_dbuf, tx); |
|
722 |
||
723 |
if (vn_vfswlock(vp)) /* prevent new mounts on zp */ |
|
724 |
return (EBUSY); |
|
725 |
||
726 |
if (vn_ismntpt(vp)) { /* don't remove mount point */ |
|
727 |
vn_vfsunlock(vp); |
|
728 |
return (EBUSY); |
|
729 |
} |
|
730 |
||
731 |
mutex_enter(&zp->z_lock); |
|
732 |
if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */ |
|
733 |
mutex_exit(&zp->z_lock); |
|
734 |
vn_vfsunlock(vp); |
|
735 |
return (EEXIST); |
|
736 |
} |
|
3713 | 737 |
if (zp->z_phys->zp_links <= zp_is_dir) { |
738 |
zfs_panic_recover("zfs: link count on %s is %u, " |
|
739 |
"should be at least %u", |
|
740 |
zp->z_vnode->v_path ? zp->z_vnode->v_path : |
|
741 |
"<unknown>", (int)zp->z_phys->zp_links, |
|
742 |
zp_is_dir + 1); |
|
743 |
zp->z_phys->zp_links = zp_is_dir + 1; |
|
744 |
} |
|
789 | 745 |
if (--zp->z_phys->zp_links == zp_is_dir) { |
3461 | 746 |
zp->z_unlinked = B_TRUE; |
789 | 747 |
zp->z_phys->zp_links = 0; |
3461 | 748 |
unlinked = B_TRUE; |
789 | 749 |
} else { |
750 |
zfs_time_stamper_locked(zp, STATE_CHANGED, tx); |
|
751 |
} |
|
752 |
mutex_exit(&zp->z_lock); |
|
753 |
vn_vfsunlock(vp); |
|
754 |
} |
|
755 |
||
756 |
dmu_buf_will_dirty(dzp->z_dbuf, tx); |
|
757 |
mutex_enter(&dzp->z_lock); |
|
758 |
dzp->z_phys->zp_size--; /* one dirent removed */ |
|
759 |
dzp->z_phys->zp_links -= zp_is_dir; /* ".." link from zp */ |
|
760 |
zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); |
|
761 |
mutex_exit(&dzp->z_lock); |
|
762 |
||
5331 | 763 |
if (zp->z_zfsvfs->z_norm) { |
764 |
if (((zp->z_zfsvfs->z_case & ZFS_CI_ONLY) && |
|
765 |
(flag & ZCIEXACT)) || |
|
766 |
((zp->z_zfsvfs->z_case & ZFS_CI_MIXD) && |
|
767 |
!(flag & ZCILOOK))) |
|
768 |
error = zap_remove_norm(zp->z_zfsvfs->z_os, |
|
769 |
dzp->z_id, dl->dl_name, MT_EXACT, tx); |
|
770 |
else |
|
771 |
error = zap_remove_norm(zp->z_zfsvfs->z_os, |
|
772 |
dzp->z_id, dl->dl_name, MT_FIRST, tx); |
|
773 |
} else { |
|
774 |
error = zap_remove(zp->z_zfsvfs->z_os, |
|
775 |
dzp->z_id, dl->dl_name, tx); |
|
776 |
} |
|
789 | 777 |
ASSERT(error == 0); |
778 |
||
3461 | 779 |
if (unlinkedp != NULL) |
780 |
*unlinkedp = unlinked; |
|
781 |
else if (unlinked) |
|
782 |
zfs_unlinked_add(zp, tx); |
|
789 | 783 |
|
784 |
return (0); |
|
785 |
} |
|
786 |
||
787 |
/* |
|
788 |
* Indicate whether the directory is empty. Works with or without z_lock |
|
789 |
* held, but can only be considered a hint in the latter case. Returns true |
|
790 |
* if only "." and ".." remain and there's no work in progress. |
|
791 |
*/ |
|
792 |
boolean_t |
|
793 |
zfs_dirempty(znode_t *dzp) |
|
794 |
{ |
|
795 |
return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0); |
|
796 |
} |
|
797 |
||
798 |
int |
|
799 |
zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) |
|
800 |
{ |
|
801 |
zfsvfs_t *zfsvfs = zp->z_zfsvfs; |
|
802 |
znode_t *xzp; |
|
803 |
dmu_tx_t *tx; |
|
804 |
uint64_t xoid; |
|
805 |
int error; |
|
5331 | 806 |
zfs_fuid_info_t *fuidp = NULL; |
789 | 807 |
|
808 |
*xvpp = NULL; |
|
809 |
||
5331 | 810 |
if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)) |
789 | 811 |
return (error); |
812 |
||
813 |
tx = dmu_tx_create(zfsvfs->z_os); |
|
814 |
dmu_tx_hold_bonus(tx, zp->z_id); |
|
1544 | 815 |
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); |
5331 | 816 |
if (zfsvfs->z_fuid_obj == 0) { |
817 |
dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); |
|
818 |
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); |
|
819 |
dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); |
|
820 |
} else { |
|
821 |
dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); |
|
822 |
dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, SPA_MAXBLOCKSIZE); |
|
823 |
} |
|
789 | 824 |
error = dmu_tx_assign(tx, zfsvfs->z_assign); |
825 |
if (error) { |
|
2113
0510bb40c993
6430121 3-way deadlock involving tc_lock within zfs
ahrens
parents:
1878
diff
changeset
|
826 |
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) |
0510bb40c993
6430121 3-way deadlock involving tc_lock within zfs
ahrens
parents:
1878
diff
changeset
|
827 |
dmu_tx_wait(tx); |
789 | 828 |
dmu_tx_abort(tx); |
829 |
return (error); |
|
830 |
} |
|
5331 | 831 |
zfs_mknode(zp, vap, &xoid, tx, cr, IS_XATTR, &xzp, 0, NULL, &fuidp); |
789 | 832 |
ASSERT(xzp->z_id == xoid); |
833 |
ASSERT(xzp->z_phys->zp_parent == zp->z_id); |
|
834 |
dmu_buf_will_dirty(zp->z_dbuf, tx); |
|
835 |
zp->z_phys->zp_xattr = xoid; |
|
836 |
||
5331 | 837 |
(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, |
838 |
xzp, "", NULL, fuidp, vap); |
|
839 |
if (fuidp) |
|
840 |
zfs_fuid_info_free(fuidp); |
|
789 | 841 |
dmu_tx_commit(tx); |
842 |
||
843 |
*xvpp = ZTOV(xzp); |
|
844 |
||
845 |
return (0); |
|
846 |
} |
|
847 |
||
848 |
/* |
|
849 |
* Return a znode for the extended attribute directory for zp. |
|
850 |
* ** If the directory does not already exist, it is created ** |
|
851 |
* |
|
852 |
* IN: zp - znode to obtain attribute directory from |
|
853 |
* cr - credentials of caller |
|
3280
e93ccc27c51d
6492686 NFSv4 client got EACCES over ZFS when trying to OPENATTR without createdir
ck153898
parents:
2597
diff
changeset
|
854 |
* flags - flags from the VOP_LOOKUP call |
789 | 855 |
* |
856 |
* OUT: xzpp - pointer to extended attribute znode |
|
857 |
* |
|
858 |
* RETURN: 0 on success |
|
859 |
* error number on failure |
|
860 |
*/ |
|
861 |
int |
|
3280
e93ccc27c51d
6492686 NFSv4 client got EACCES over ZFS when trying to OPENATTR without createdir
ck153898
parents:
2597
diff
changeset
|
862 |
zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags) |
789 | 863 |
{ |
864 |
zfsvfs_t *zfsvfs = zp->z_zfsvfs; |
|
865 |
znode_t *xzp; |
|
866 |
zfs_dirlock_t *dl; |
|
867 |
vattr_t va; |
|
868 |
int error; |
|
869 |
top: |
|
5331 | 870 |
error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); |
789 | 871 |
if (error) |
872 |
return (error); |
|
873 |
||
874 |
if (xzp != NULL) { |
|
875 |
*xvpp = ZTOV(xzp); |
|
876 |
zfs_dirent_unlock(dl); |
|
877 |
return (0); |
|
878 |
} |
|
879 |
||
880 |
ASSERT(zp->z_phys->zp_xattr == 0); |
|
881 |
||
3280
e93ccc27c51d
6492686 NFSv4 client got EACCES over ZFS when trying to OPENATTR without createdir
ck153898
parents:
2597
diff
changeset
|
882 |
if (!(flags & CREATE_XATTR_DIR)) { |
e93ccc27c51d
6492686 NFSv4 client got EACCES over ZFS when trying to OPENATTR without createdir
ck153898
parents:
2597
diff
changeset
|
883 |
zfs_dirent_unlock(dl); |
e93ccc27c51d
6492686 NFSv4 client got EACCES over ZFS when trying to OPENATTR without createdir
ck153898
parents:
2597
diff
changeset
|
884 |
return (ENOENT); |
e93ccc27c51d
6492686 NFSv4 client got EACCES over ZFS when trying to OPENATTR without createdir
ck153898
parents:
2597
diff
changeset
|
885 |
} |
e93ccc27c51d
6492686 NFSv4 client got EACCES over ZFS when trying to OPENATTR without createdir
ck153898
parents:
2597
diff
changeset
|
886 |
|
789 | 887 |
if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { |
888 |
zfs_dirent_unlock(dl); |
|
889 |
return (EROFS); |
|
890 |
} |
|
891 |
||
892 |
/* |
|
893 |
* The ability to 'create' files in an attribute |
|
894 |
* directory comes from the write_xattr permission on the base file. |
|
895 |
* |
|
896 |
* The ability to 'search' an attribute directory requires |
|
897 |
* read_xattr permission on the base file. |
|
898 |
* |
|
899 |
* Once in a directory the ability to read/write attributes |
|
900 |
* is controlled by the permissions on the attribute file. |
|
901 |
*/ |
|
902 |
va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID; |
|
903 |
va.va_type = VDIR; |
|
1231
64215f768e86
6354804 The file's ACL was changed when cp it from one ZFS file system to another one.
marks
parents:
885
diff
changeset
|
904 |
va.va_mode = S_IFDIR | S_ISVTX | 0777; |
5331 | 905 |
zfs_fuid_map_ids(zp, &va.va_uid, &va.va_gid); |
789 | 906 |
|
907 |
error = zfs_make_xattrdir(zp, &va, xvpp, cr); |
|
908 |
zfs_dirent_unlock(dl); |
|
909 |
||
910 |
if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { |
|
2113
0510bb40c993
6430121 3-way deadlock involving tc_lock within zfs
ahrens
parents:
1878
diff
changeset
|
911 |
/* NB: we already did dmu_tx_wait() if necessary */ |
789 | 912 |
goto top; |
913 |
} |
|
914 |
||
915 |
return (error); |
|
916 |
} |
|
917 |
||
918 |
/* |
|
919 |
* Decide whether it is okay to remove within a sticky directory. |
|
920 |
* |
|
921 |
* In sticky directories, write access is not sufficient; |
|
922 |
* you can remove entries from a directory only if: |
|
923 |
* |
|
924 |
* you own the directory, |
|
925 |
* you own the entry, |
|
926 |
* the entry is a plain file and you have write access, |
|
927 |
* or you are privileged (checked in secpolicy...). |
|
928 |
* |
|
929 |
* The function returns 0 if remove access is granted. |
|
930 |
*/ |
|
931 |
int |
|
932 |
zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) |
|
933 |
{ |
|
934 |
uid_t uid; |
|
5331 | 935 |
uid_t downer; |
936 |
uid_t fowner; |
|
937 |
zfsvfs_t *zfsvfs = zdp->z_zfsvfs; |
|
789 | 938 |
|
939 |
if (zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */ |
|
940 |
return (0); |
|
941 |
||
5331 | 942 |
if ((zdp->z_phys->zp_mode & S_ISVTX) == 0) |
943 |
return (0); |
|
944 |
||
945 |
zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, ZFS_OWNER, &downer); |
|
946 |
zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, ZFS_OWNER, &fowner); |
|
947 |
||
948 |
if ((uid = crgetuid(cr)) == downer || uid == fowner || |
|
789 | 949 |
(ZTOV(zp)->v_type == VREG && |
5331 | 950 |
zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) |
789 | 951 |
return (0); |
952 |
else |
|
953 |
return (secpolicy_vnode_remove(cr)); |
|
954 |
} |