6775100 stat() performance on files on zfs should be improved
6827779 rrwlock is overly protective of its counters
--- a/usr/src/uts/common/fs/zfs/rrwlock.c Fri Jun 26 17:26:34 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/rrwlock.c Sat Jun 27 19:22:00 2009 -0600
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/refcount.h>
#include <sys/rrwlock.h>
@@ -118,7 +116,7 @@
rrw_node_t *prev = NULL;
if (refcount_count(&rrl->rr_linked_rcount) == 0)
- return (NULL);
+ return (B_FALSE);
for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
if (rn->rn_rrl == rrl) {
@@ -159,6 +157,14 @@
rrw_enter_read(rrwlock_t *rrl, void *tag)
{
mutex_enter(&rrl->rr_lock);
+#if !defined(DEBUG) && defined(_KERNEL)
+ if (!rrl->rr_writer && !rrl->rr_writer_wanted) {
+ rrl->rr_anon_rcount.rc_count++;
+ mutex_exit(&rrl->rr_lock);
+ return;
+ }
+ DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
+#endif
ASSERT(rrl->rr_writer != curthread);
ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
@@ -208,19 +214,28 @@
rrw_exit(rrwlock_t *rrl, void *tag)
{
mutex_enter(&rrl->rr_lock);
+#if !defined(DEBUG) && defined(_KERNEL)
+ if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
+ rrl->rr_anon_rcount.rc_count--;
+ if (rrl->rr_anon_rcount.rc_count == 0)
+ cv_broadcast(&rrl->rr_cv);
+ mutex_exit(&rrl->rr_lock);
+ return;
+ }
+ DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
+#endif
ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
!refcount_is_zero(&rrl->rr_linked_rcount) ||
rrl->rr_writer != NULL);
if (rrl->rr_writer == NULL) {
- if (rrn_find_and_remove(rrl)) {
- if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0)
- cv_broadcast(&rrl->rr_cv);
-
- } else {
- if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0)
- cv_broadcast(&rrl->rr_cv);
- }
+ int64_t count;
+ if (rrn_find_and_remove(rrl))
+ count = refcount_remove(&rrl->rr_linked_rcount, tag);
+ else
+ count = refcount_remove(&rrl->rr_anon_rcount, tag);
+ if (count == 0)
+ cv_broadcast(&rrl->rr_cv);
} else {
ASSERT(rrl->rr_writer == curthread);
ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
--- a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h Fri Jun 26 17:26:34 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h Sat Jun 27 19:22:00 2009 -0600
@@ -203,6 +203,7 @@
void zfs_ace_byteswap(void *, size_t, boolean_t);
extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr);
extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *);
+int zfs_fastaccesschk_execute(struct znode *, cred_t *);
extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *);
extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *);
extern int zfs_acl_access(struct znode *, int, cred_t *);
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h Fri Jun 26 17:26:34 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h Sat Jun 27 19:22:00 2009 -0600
@@ -77,6 +77,7 @@
#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */
#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */
#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */
+#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */
/*
* Is ID ephemeral?
@@ -200,6 +201,7 @@
uint64_t z_gen; /* generation (same as zp_gen) */
uint32_t z_sync_cnt; /* synchronous open count */
kmutex_t z_acl_lock; /* acl data lock */
+ zfs_acl_t *z_acl_cached; /* cached acl */
list_node_t z_link_node; /* all znodes in fs link */
/*
* These are dmu managed fields.
--- a/usr/src/uts/common/fs/zfs/zfs_acl.c Fri Jun 26 17:26:34 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_acl.c Sat Jun 27 19:22:00 2009 -0600
@@ -781,6 +781,7 @@
uint64_t who;
uint16_t iflags, type;
uint32_t access_mask;
+ boolean_t an_exec_denied = B_FALSE;
mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
@@ -905,8 +906,26 @@
}
}
}
+ } else {
+ /*
+ * This is an IDENTIFIER_GROUP or USER ACE, which
+ * does not affect the mode; we only care whether
+ * it denies execute access to someone.
+ */
+ if ((access_mask & ACE_EXECUTE) && type == DENY)
+ an_exec_denied = B_TRUE;
}
}
+
+ if (!an_exec_denied && !(seen & (S_IXUSR | S_IXGRP | S_IXOTH)) ||
+ !(mode & (S_IXUSR | S_IXGRP | S_IXOTH)))
+ an_exec_denied = B_TRUE;
+
+ if (an_exec_denied)
+ zp->z_phys->zp_flags &= ~ZFS_NO_EXECS_DENIED;
+ else
+ zp->z_phys->zp_flags |= ZFS_NO_EXECS_DENIED;
+
return (mode);
}
@@ -960,8 +979,14 @@
ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ if (zp->z_acl_cached) {
+ *aclpp = zp->z_acl_cached;
+ return (0);
+ }
+
if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) {
*aclpp = zfs_acl_node_read_internal(zp, will_modify);
+ zp->z_acl_cached = *aclpp;
return (0);
}
@@ -994,7 +1019,7 @@
return (error);
}
- *aclpp = aclp;
+ zp->z_acl_cached = *aclpp = aclp;
return (0);
}
@@ -1019,6 +1044,11 @@
dmu_buf_will_dirty(zp->z_dbuf, tx);
+ if (zp->z_acl_cached != aclp && zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
zphys->zp_mode = zfs_mode_compute(zp, aclp);
/*
@@ -1606,6 +1636,7 @@
if (error == 0) {
(*aclp)->z_hints = zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS;
zfs_acl_chmod(zp->z_zfsvfs, zp->z_phys->zp_uid, mode, *aclp);
+ zp->z_acl_cached = *aclp;
}
mutex_exit(&zp->z_acl_lock);
mutex_exit(&zp->z_lock);
@@ -1869,7 +1900,6 @@
mutex_exit(&dzp->z_acl_lock);
acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
- zfs_acl_free(paclp);
} else {
acl_ids->z_aclp =
zfs_acl_alloc(zfs_acl_version_zp(dzp));
@@ -1998,8 +2028,6 @@
mutex_exit(&zp->z_acl_lock);
- zfs_acl_free(aclp);
-
return (0);
}
@@ -2095,11 +2123,6 @@
aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS);
}
top:
- if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) {
- zfs_acl_free(aclp);
- return (error);
- }
-
mutex_enter(&zp->z_lock);
mutex_enter(&zp->z_acl_lock);
@@ -2154,7 +2177,7 @@
if (fuidp)
zfs_fuid_info_free(fuidp);
- zfs_acl_free(aclp);
+ zp->z_acl_cached = aclp;
dmu_tx_commit(tx);
done:
mutex_exit(&zp->z_acl_lock);
@@ -2301,7 +2324,6 @@
checkit = B_TRUE;
break;
} else {
- zfs_acl_free(aclp);
mutex_exit(&zp->z_acl_lock);
return (EIO);
}
@@ -2334,7 +2356,6 @@
}
mutex_exit(&zp->z_acl_lock);
- zfs_acl_free(aclp);
/* Put the found 'denies' back on the working mode */
if (deny_mask) {
@@ -2420,6 +2441,72 @@
check_privs, B_FALSE, cr));
}
+/*
+ * Fast-path execute (search) check on zdp for cr.  Returns 0 when the
+ * znode flags or mode bits grant access, EACCES when the znode is
+ * flagged ZFS_AV_QUARANTINED, else the result of the full zfs_zaccess().
+ */
+int
+zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
+{
+	boolean_t is_attr;
+	boolean_t granted;
+	uid_t fowner;
+	gid_t gowner;
+	uid_t uid = crgetuid(cr);
+	int error;
+
+	if (zdp->z_phys->zp_flags & ZFS_AV_QUARANTINED)
+		return (EACCES);
+
+	/* A directory znode flagged ZFS_XATTR always takes the full check. */
+	is_attr = ((zdp->z_phys->zp_flags & ZFS_XATTR) &&
+	    (ZTOV(zdp)->v_type == VDIR));
+	if (is_attr)
+		goto slow;
+
+	mutex_enter(&zdp->z_acl_lock);
+
+	if (zdp->z_phys->zp_flags & ZFS_NO_EXECS_DENIED) {
+		mutex_exit(&zdp->z_acl_lock);
+		return (0);
+	}
+
+	/* Owner/group ids with a non-zero FUID index take the full check. */
+	if (FUID_INDEX(zdp->z_phys->zp_uid) != 0 ||
+	    FUID_INDEX(zdp->z_phys->zp_gid) != 0) {
+		mutex_exit(&zdp->z_acl_lock);
+		goto slow;
+	}
+
+	fowner = (uid_t)zdp->z_phys->zp_uid;
+	gowner = (gid_t)zdp->z_phys->zp_gid;
+
+	/*
+	 * Exactly one class applies to the caller: owner, then owning group,
+	 * then other.  An owner lacking S_IXUSR must not be granted access
+	 * through S_IXGRP here; any miss is left to the full check below.
+	 */
+	if (uid == fowner)
+		granted = ((zdp->z_phys->zp_mode & S_IXUSR) != 0);
+	else if (groupmember(gowner, cr))
+		granted = ((zdp->z_phys->zp_mode & S_IXGRP) != 0);
+	else
+		granted = ((zdp->z_phys->zp_mode & S_IXOTH) != 0);
+
+	mutex_exit(&zdp->z_acl_lock);
+
+	if (granted)
+		return (0);
+
+slow:
+	DTRACE_PROBE(zfs__fastpath__execute__access__miss);
+	ZFS_ENTER(zdp->z_zfsvfs);
+	error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr);
+	ZFS_EXIT(zdp->z_zfsvfs);
+	return (error);
+}
+
/*
* Determine whether Access should be granted/denied, invoking least
* priv subsytem when a deny is determined.
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c Fri Jun 26 17:26:34 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c Sat Jun 27 19:22:00 2009 -0600
@@ -988,6 +988,27 @@
}
/*
+ * If vnode is for a device return a specfs vnode instead.
+ */
+static int
+specvp_check(vnode_t **vpp, cred_t *cr)
+{
+	vnode_t *devvp = *vpp;
+	vnode_t *specp;
+
+	/* Only device vnodes are swapped; anything else is left as-is. */
+	if (!IS_DEVVP(devvp))
+		return (0);
+
+	specp = specvp(devvp, devvp->v_rdev, devvp->v_type, cr);
+	/* Release the original; a NULL specvp() result becomes ENOSYS. */
+	VN_RELE(devvp);
+	*vpp = specp;
+	return (specp == NULL ? ENOSYS : 0);
+}
+
+
+/*
* Lookup an entry in a directory, or an extended attribute directory.
* If it exists, return a held vnode reference for it.
*
@@ -1017,7 +1038,46 @@
{
znode_t *zdp = VTOZ(dvp);
zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
- int error;
+ int error = 0;
+
+ /* fast path */
+ if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
+
+ if (dvp->v_type != VDIR) {
+ return (ENOTDIR);
+ } else if (zdp->z_dbuf == NULL) {
+ return (EIO);
+ }
+
+ if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
+ error = zfs_fastaccesschk_execute(zdp, cr);
+ if (!error) {
+ *vpp = dvp;
+ VN_HOLD(*vpp);
+ return (0);
+ }
+ return (error);
+ } else {
+ vnode_t *tvp = dnlc_lookup(dvp, nm);
+
+ if (tvp) {
+ error = zfs_fastaccesschk_execute(zdp, cr);
+ if (error) {
+ VN_RELE(tvp);
+ return (error);
+ }
+ if (tvp == DNLC_NO_VNODE) {
+ VN_RELE(tvp);
+ return (ENOENT);
+ } else {
+ *vpp = tvp;
+ return (specvp_check(vpp, cr));
+ }
+ }
+ }
+ }
+
+ DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zdp);
@@ -1082,21 +1142,8 @@
}
error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
- if (error == 0) {
- /*
- * Convert device special files
- */
- if (IS_DEVVP(*vpp)) {
- vnode_t *svp;
-
- svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
- VN_RELE(*vpp);
- if (svp == NULL)
- error = ENOSYS;
- else
- *vpp = svp;
- }
- }
+ if (error == 0)
+ error = specvp_check(vpp, cr);
ZFS_EXIT(zfsvfs);
return (error);
@@ -1332,19 +1379,7 @@
VN_RELE(ZTOV(zp));
} else {
*vpp = ZTOV(zp);
- /*
- * If vnode is for a device return a specfs vnode instead.
- */
- if (IS_DEVVP(*vpp)) {
- struct vnode *svp;
-
- svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
- VN_RELE(*vpp);
- if (svp == NULL) {
- error = ENOSYS;
- }
- *vpp = svp;
- }
+ error = specvp_check(vpp, cr);
}
ZFS_EXIT(zfsvfs);
@@ -2456,6 +2491,7 @@
top:
attrzp = NULL;
+ /* Can this be moved to before the top label? */
if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
ZFS_EXIT(zfsvfs);
return (EROFS);
@@ -2856,11 +2892,6 @@
if (attrzp)
VN_RELE(ZTOV(attrzp));
- if (aclp) {
- zfs_acl_free(aclp);
- aclp = NULL;
- }
-
if (fuidp) {
zfs_fuid_info_free(fuidp);
fuidp = NULL;
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c Fri Jun 26 17:26:34 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c Sat Jun 27 19:22:00 2009 -0600
@@ -133,6 +133,7 @@
zp->z_dbuf = NULL;
zp->z_dirlocks = NULL;
+ zp->z_acl_cached = NULL;
return (0);
}
@@ -1081,6 +1082,11 @@
list_remove(&zfsvfs->z_all_znodes, zp);
mutex_exit(&zfsvfs->z_znodes_lock);
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
kmem_cache_free(znode_cache, zp);
VFS_RELE(zfsvfs->z_vfs);