usr/src/uts/common/fs/ufs/ufs_inode.c
author Casper H.S. Dik <Casper.Dik@Sun.COM>
Wed, 28 Apr 2010 10:01:37 +0200
changeset 12273 63678502e95e
parent 11066 cebb50cbe4f9
permissions -rw-r--r--
PSARC 2009/377 In-kernel pfexec implementation.
PSARC 2009/378 Basic File Privileges
PSARC 2010/072 RBAC update: user attrs from profiles
4912090 pfzsh(1) should exist
4912093 pfbash(1) should exist
4912096 pftcsh(1) should exist
6440298 Expand the basic privilege set in order to restrict file access
6859862 Move pfexec into the kernel
6919171 cred_t sidesteps kmem_debug; we need to be able to detect bad hold/free when they occur
6923721 The new SYS_SMB privilege is not backward compatible
6937562 autofs doesn't remove its door when the zone shuts down
6937727 Zones stuck on deathrow; netstack_zone keeps a credential reference to the zone
6940159 Implement PSARC 2010/072

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bitmap.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/dnlc.h>
#include <sys/mode.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/acl.h>
#include <sys/var.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <sys/swap.h>
#include <sys/cpuvar.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <fs/fs_subr.h>
#include <sys/policy.h>

struct kmem_cache *inode_cache;		/* cache of free inodes */

/* UFS Inode Cache Stats -- Not protected */
struct	instats ins = {
	{ "size",		KSTAT_DATA_ULONG },
	{ "maxsize",		KSTAT_DATA_ULONG },
	{ "hits",		KSTAT_DATA_ULONG },
	{ "misses",		KSTAT_DATA_ULONG },
	{ "kmem allocs",	KSTAT_DATA_ULONG },
	{ "kmem frees",		KSTAT_DATA_ULONG },
	{ "maxsize reached",	KSTAT_DATA_ULONG },
	{ "puts at frontlist",	KSTAT_DATA_ULONG },
	{ "puts at backlist",	KSTAT_DATA_ULONG },
	{ "queues to free",	KSTAT_DATA_ULONG },
	{ "scans",		KSTAT_DATA_ULONG },
	{ "thread idles",	KSTAT_DATA_ULONG },
	{ "lookup idles",	KSTAT_DATA_ULONG },
	{ "vget idles",		KSTAT_DATA_ULONG },
	{ "cache allocs",	KSTAT_DATA_ULONG },
	{ "cache frees",	KSTAT_DATA_ULONG },
	{ "pushes at close",	KSTAT_DATA_ULONG }
};
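
/*
 * These counters are exported through the "inode_cache" kstat created
 * in ufs_iinit() below (module "ufs", instance 0), so they can be
 * inspected from userland, e.g. (illustrative):
 *
 *	$ kstat -m ufs -n inode_cache
 */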

/* kstat data */
static kstat_t		*ufs_inode_kstat = NULL;

union ihead *ihead;	/* inode LRU cache, Chris Maltby */
kmutex_t *ih_lock;	/* protect inode cache hash table */
static int ino_hashlen = 4;	/* desired average hash chain length */
int inohsz;		/* number of buckets in the hash table */

kmutex_t	ufs_scan_lock;	/* stop racing multiple ufs_scan_inodes() */
kmutex_t	ufs_iuniqtime_lock; /* protect iuniqtime */
kmutex_t	ufsvfs_mutex;
struct ufsvfs	*oldufsvfslist, *ufsvfslist;

/*
 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
 * I/Os are going on.
 */
clock_t	ufs_iowait;

/*
 * the threads that process idle inodes and free (deleted) inodes
 * have high water marks that are set in ufs_iinit().
 * These values can be no lower than the minimum shown below.
 */
int	ufs_idle_max;	/* # of allowable idle inodes */
ulong_t	ufs_inode_max;	/* hard limit of allowable idle inodes */
#define	UFS_IDLE_MAX	(16)	/* min # of allowable idle inodes */

/*
 * Tunables for ufs write throttling.
 * These are validated in ufs_iinit() since improper settings
 * can lead to filesystem hangs.
 */
#define	UFS_HW_DEFAULT	(16 * 1024 * 1024)
#define	UFS_LW_DEFAULT	(8 * 1024 * 1024)
int	ufs_HW = UFS_HW_DEFAULT;
int	ufs_LW = UFS_LW_DEFAULT;
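
/*
 * Illustrative /etc/system settings (a sketch, not a recommendation;
 * the values below are arbitrary examples):
 *
 *	set ufs:ufs_HW = 33554432
 *	set ufs:ufs_LW = 16777216
 *
 * Whatever is chosen, ufs_HW must remain strictly greater than ufs_LW
 * or ufs_iinit() will fall back to the defaults above.
 */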

static void ihinit(void);
extern int hash2ints(int, int);

static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
    struct cred *, int);

/* ARGSUSED */
static int
ufs_inode_kstat_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE)
		return (EACCES);

	ins.in_malloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_alloc");
	ins.in_mfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_free");
	ins.in_kcalloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "alloc");
	ins.in_kcfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "free");
	ins.in_size.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_inuse");
	ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_max");
	ins.in_misses.value.ul = ins.in_kcalloc.value.ul;

	return (0);
}

void
ufs_iinit(void)
{
	/*
	 * Validate that ufs_HW > ufs_LW.
	 * The default values for these two tunables have been increased,
	 * so there is now a range of ufs_HW values that was legal on
	 * previous Solaris versions but no longer is.
	 * Upgrading a machine which has an /etc/system setting for ufs_HW
	 * from that range can lead to filesystem hangs unless the values
	 * are checked here.
	 */
	if (ufs_HW <= ufs_LW) {
		cmn_err(CE_WARN,
		    "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
		    ufs_HW, ufs_LW);
		ufs_LW = UFS_LW_DEFAULT;
		ufs_HW = UFS_HW_DEFAULT;
		cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
		    ufs_HW, ufs_LW);
	}

	/*
	 * Adjust the tunable `ufs_ninode' to a reasonable value
	 */
	if (ufs_ninode <= 0)
		ufs_ninode = ncsize;
	if (ufs_inode_max == 0)
		ufs_inode_max =
		    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode));
	if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
		cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
		    ufs_inode_max);
		ufs_ninode = ufs_inode_max;
	}
	/*
	 * Wait till third call of ufs_update to declare that no I/Os are
	 * going on. This allows deferred access times to be flushed to disk.
	 */
	ufs_iowait = v.v_autoup * hz * 2;
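
	/*
	 * For example (illustrative): with the typical v_autoup of 30
	 * seconds and hz = 100, ufs_iowait = 30 * 100 * 2 = 6000 ticks,
	 * i.e. 60 seconds.
	 */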

	/*
	 * idle thread runs when 25% of ufs_ninode entries are on the queue
	 */
	if (ufs_idle_max == 0)
		ufs_idle_max = ufs_ninode >> 2;
	if (ufs_idle_max < UFS_IDLE_MAX)
		ufs_idle_max = UFS_IDLE_MAX;
	if (ufs_idle_max > ufs_ninode)
		ufs_idle_max = ufs_ninode;
	/*
	 * This is really a misnomer; it is ufs_queue_init.
	 */
	ufs_thread_init(&ufs_idle_q, ufs_idle_max);
	ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);

	/*
	 * global hlock thread
	 */
	ufs_thread_init(&ufs_hlock, 1);
	ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);

	ihinit();
	qtinit();
	ins.in_maxsize.value.ul = ufs_ninode;
	if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
	    KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL)) != NULL) {
		ufs_inode_kstat->ks_data = (void *)&ins;
		ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
		kstat_install(ufs_inode_kstat);
	}
	ufsfx_init();		/* fix-on-panic initialization */
	si_cache_init();
	ufs_directio_init();
	lufs_init();
	mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
}

/* ARGSUSED */
static int
ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ip->i_vnode = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);
	}
	vn_setops(vp, ufs_vnodeops);
	vp->v_data = ip;

	rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
	rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
	mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
	dnlc_dir_init(&ip->i_danchor);

	cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);

	return (0);
}

/* ARGSUSED */
static void
ufs_inode_cache_destructor(void *buf, void *cdrarg)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ITOV(ip);

	rw_destroy(&ip->i_rwlock);
	rw_destroy(&ip->i_contents);
	mutex_destroy(&ip->i_tlock);
	if (vp->v_type == VDIR) {
		dnlc_dir_fini(&ip->i_danchor);
	}

	cv_destroy(&ip->i_wrcv);

	vn_free(vp);
}

/*
 * Initialize hash links for inodes
 * and build inode free list.
 */
void
ihinit(void)
{
	int i;
	union	ihead *ih = ihead;

	mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);

	inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
	ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
	ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);

	for (i = 0, ih = ihead; i < inohsz; i++,  ih++) {
		ih->ih_head[0] = ih;
		ih->ih_head[1] = ih;
		mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
	}
	inode_cache = kmem_cache_create("ufs_inode_cache",
	    sizeof (struct inode), 0, ufs_inode_cache_constructor,
	    ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
	    NULL, NULL, 0);
}
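
/*
 * Hash sizing sketch (illustrative): inohsz is the power of two just
 * above ufs_ninode / ino_hashlen.  With ufs_ninode = 4096 and
 * ino_hashlen = 4, ufs_ninode / ino_hashlen = 1024 and
 * inohsz = 1 << highbit(1024) = 2048, so a full cache averages about
 * two inodes per hash chain.
 */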

/*
 * Free an inode structure
 */
void
ufs_free_inode(struct inode *ip)
{
	vn_invalid(ITOV(ip));
	kmem_cache_free(inode_cache, ip);
}

/*
 * Allocate an inode structure
 */
struct inode *
ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
{
	struct inode *ip;
	vnode_t *vp;

	ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
	/*
	 * at this point we have a newly allocated inode
	 */
	ip->i_freef = ip;
	ip->i_freeb = ip;
	ip->i_flag = IREF;
	ip->i_seq = 0xFF;	/* Unique initial value */
	ip->i_dev = ufsvfsp->vfs_dev;
	ip->i_ufsvfs = ufsvfsp;
	ip->i_devvp = ufsvfsp->vfs_devvp;
	ip->i_number = ino;
	ip->i_diroff = 0;
	ip->i_nextr = 0;
	ip->i_map = NULL;
	ip->i_rdev = 0;
	ip->i_writes = 0;
	ip->i_mode = 0;
	ip->i_delaylen = 0;
	ip->i_delayoff = 0;
	ip->i_nextrio = 0;
	ip->i_ufs_acl = NULL;
	ip->i_cflags = 0;
	ip->i_mapcnt = 0;
	ip->i_dquot = NULL;
	ip->i_cachedir = CD_ENABLED;
	ip->i_writer = NULL;

	/*
	 * the vnode for this inode was allocated by the constructor
	 */
	vp = ITOV(ip);
	vn_reinit(vp);
	if (ino == (ino_t)UFSROOTINO)
		vp->v_flag = VROOT;
	vp->v_vfsp = ufsvfsp->vfs_vfs;
	vn_exists(vp);
	return (ip);
}

/*
 * Look up an inode by device, inumber.  If it is in core (in the
 * inode structure), honor the locking protocol.  If it is not in
 * core, read it in from the specified device after freeing any pages.
 * In all cases, a pointer to a VN_HELD inode structure is returned.
 */
int
ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
}
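
/*
 * Illustrative caller sketch: since the returned inode is VN_HELD, the
 * caller owns a vnode reference and must release it when done, e.g.:
 *
 *	struct inode *ip;
 *	int err = ufs_iget(vfsp, ino, &ip, cr);
 *	if (err == 0) {
 *		...
 *		VN_RELE(ITOV(ip));
 *	}
 */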

/*
 * A version of ufs_iget which returns only allocated, linked inodes.
 * This is appropriate for any callers who do not expect a free inode.
 */
int
ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
}

/*
 * Set vnode attributes based on v_type; this should be called whenever
 * an inode's i_mode is changed.
 */
void
ufs_reset_vnode(vnode_t *vp)
{
	/*
	 * an old DBE hack
	 */
	if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
		vp->v_flag |= VSWAPLIKE;
	else
		vp->v_flag &= ~VSWAPLIKE;

	/*
	 * if not swap like and it's just a regular file, we want
	 * to maintain the vnode's pages sorted by clean/modified
	 * for faster sync'ing to disk
	 */
	if (vp->v_type == VREG)
		vp->v_flag |= VMODSORT;
	else
		vp->v_flag &= ~VMODSORT;

	/*
	 * Is this an attribute hidden dir?
	 */
	if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
		vp->v_flag |= V_XATTRDIR;
	else
		vp->v_flag &= ~V_XATTRDIR;
}
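
/*
 * For example (illustrative): a regular file with mode 01000 (ISVTX set,
 * no execute bits) tests as VSWAPLIKE above, while a mode 01755 file or
 * any directory does not, since IEXEC and IFDIR are part of the mask.
 */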

/*
 * Shared implementation of ufs_iget and ufs_iget_alloced.  The 'validate'
 * flag is used to distinguish the two; when true, we validate that the inode
 * being retrieved looks like a linked and allocated inode.
 */
/* ARGSUSED */
static int
ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr, int validate)
{
	struct inode *ip, *sp;
	union ihead *ih;
	kmutex_t *ihm;
	struct buf *bp;
	struct dinode *dp;
	struct vnode *vp;
	extern vfs_t EIO_vfs;
	int error;
	int ftype;	/* XXX - Remove later on */
	dev_t vfs_dev;
	struct ufsvfs *ufsvfsp;
	struct fs *fs;
	int hno;
	daddr_t bno;
	ulong_t ioff;

	CPU_STATS_ADD_K(sys, ufsiget, 1);

	/*
	 * Lookup inode in cache.
	 */
	vfs_dev = vfsp->vfs_dev;
	hno = INOHASH(ino);
	ih = &ihead[hno];
	ihm = &ih_lock[hno];

again:
	mutex_enter(ihm);
	for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
		if (ino != ip->i_number || vfs_dev != ip->i_dev ||
		    (ip->i_flag & ISTALE))
			continue;

		/*
		 * Found the interesting inode; hold it and drop the cache lock
		 */
		vp = ITOV(ip);	/* for locknest */
		VN_HOLD(vp);
		mutex_exit(ihm);
		rw_enter(&ip->i_contents, RW_READER);

		/*
		 * if necessary, remove from idle list
		 */
		if ((ip->i_flag & IREF) == 0) {
			if (ufs_rmidle(ip))
				VN_RELE(vp);
		}

		/*
		 * Could the inode be read from disk?
		 */
		if (ip->i_flag & ISTALE) {
			rw_exit(&ip->i_contents);
			VN_RELE(vp);
			goto again;
		}

		ins.in_hits.value.ul++;
		*ipp = ip;

		/*
		 * Reset the vnode's attribute flags
		 */
		mutex_enter(&vp->v_lock);
		ufs_reset_vnode(vp);
		mutex_exit(&vp->v_lock);

		rw_exit(&ip->i_contents);

		return (0);
	}
	mutex_exit(ihm);

	/*
	 * Inode was not in cache.
	 *
	 * Allocate a new entry
	 */
	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	fs = ufsvfsp->vfs_fs;

	ip = ufs_alloc_inode(ufsvfsp, ino);
	vp = ITOV(ip);

	bno = fsbtodb(fs, itod(fs, ino));
	ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
	ip->i_doff = (offset_t)ioff + ldbtob(bno);

	/*
	 * put a place holder in the cache (if not already there)
	 */
	mutex_enter(ihm);
	for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
		if (ino == sp->i_number && vfs_dev == sp->i_dev &&
		    ((sp->i_flag & ISTALE) == 0)) {
			mutex_exit(ihm);
			ufs_free_inode(ip);
			goto again;
		}
	/*
	 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
	 * here, but if we do, then shadow inode allocations panic the
	 * system.  We don't have to hold vfs_dqrwlock for shadow inodes
	 * and the ufs_iget() parameters don't tell us what we are getting
	 * so we have no way of knowing this is a ufs_iget() call from
	 * a ufs_ialloc() call for a shadow inode.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	insque(ip, ih);
	mutex_exit(ihm);
	/*
	 * read the dinode
	 */
	bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);

	/*
	 * Check I/O errors
	 */
	error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
	if (error) {
		brelse(bp);
		ip->i_flag |= ISTALE;	/* in case someone is looking it up */
		rw_exit(&ip->i_contents);
		vp->v_vfsp = &EIO_vfs;
		VN_RELE(vp);
		return (error);
	}
	/*
	 * initialize the inode's dinode
	 */
	dp = (struct dinode *)(ioff + bp->b_un.b_addr);
	ip->i_ic = dp->di_ic;			/* structure assignment */
	brelse(bp);

	/*
	 * Maintain compatibility with Solaris 1.x UFS
	 */
	if (ip->i_suid != UID_LONG)
		ip->i_uid = ip->i_suid;
	if (ip->i_sgid != GID_LONG)
		ip->i_gid = ip->i_sgid;

	ftype = ip->i_mode & IFMT;
	if (ftype == IFBLK || ftype == IFCHR) {
		dev_t dv;
		uint_t top16 = ip->i_ordev & 0xffff0000u;

		if (top16 == 0 || top16 == 0xffff0000u)
			dv = expdev(ip->i_ordev);
		else
			dv = expldev(ip->i_ordev);
		vp->v_rdev = ip->i_rdev = dv;
	}

	/*
	 * if our caller only expects allocated inodes, verify that
	 * this inode looks good; throw it out if it's bad.
	 */
	if (validate) {
		if ((ftype == 0) || (ip->i_nlink <= 0)) {
			ip->i_flag |= ISTALE;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			cmn_err(CE_NOTE,
			    "%s: unexpected free inode %d, run fsck(1M)%s",
			    fs->fs_fsmnt, (int)ino,
			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
			return (EIO);
		}
	}

	/*
	 * Finish initializing the vnode.  Shadow inodes need special
	 * handling because IFTOVT() produces a v_type of VNON, which is
	 * not what we want; set v_type to VREG explicitly in that case.
	 */
	if (ftype == IFSHAD) {
		vp->v_type = VREG;
	} else {
		vp->v_type = IFTOVT((mode_t)ip->i_mode);
	}

	ufs_reset_vnode(vp);

	/*
	 * read the shadow
	 */
	if (ftype != 0 && ip->i_shadow != 0) {
		if ((error = ufs_si_load(ip, cr)) != 0) {
			ip->i_flag |= ISTALE;
			ip->i_ufs_acl = NULL;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			return (error);
		}
	}

	/*
	 * Only attach quota information if the inode has a type and if
	 * that type is neither a shadow inode nor an extended attribute
	 * directory.
	 */
	if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
	    ((ip->i_mode & IFMT) != IFATTRDIR)) {
		ip->i_dquot = getinoquota(ip);
	}
	TRANS_MATA_IGET(ufsvfsp, ip);
	*ipp = ip;
	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * Vnode is no longer referenced, write the inode out
 * and if necessary, truncate and deallocate the file.
 */
void
ufs_iinactive(struct inode *ip)
{
	int		front;
	struct inode	*iq;
	struct inode	*hip;
	struct ufs_q	*uq;
	struct vnode	*vp = ITOV(ip);
	struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

	/*
	 * Because the vnode type might have been changed,
	 * the dnlc_dir_purge must be called unconditionally.
	 */
	dnlc_dir_purge(&ip->i_danchor);

	/*
	 * Get exclusive access to inode data.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ASSERT(ip->i_flag & IREF);

	/*
	 * Make sure no one reclaimed the inode before we put it on
	 * the freelist or destroy it. We keep our 'hold' on the vnode
	 * from vn_rele until we are ready to do something with the inode.
	 *
	 * Pageout may put a VN_HOLD/VN_RELE at anytime during this
	 * operation via an async putpage, so we must make sure
	 * we don't free/destroy the inode more than once. ufs_iget
	 * may also put a VN_HOLD on the inode before it grabs
	 * the i_contents lock. This is done so we don't free
	 * an inode that a thread is waiting on.
	 */
	mutex_enter(&vp->v_lock);

	if (vp->v_count > 1) {
		vp->v_count--;  /* release our hold from vn_rele */
		mutex_exit(&vp->v_lock);
		rw_exit(&ip->i_contents);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
	 * and clean.  It can be safely destroyed (cyf).
	 */
	if (ip->i_ufsvfs == NULL) {
		rw_exit(&ip->i_contents);
		ufs_si_del(ip);
		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
		ufs_free_inode(ip);
		return;
	}

	/*
	 * queue idle inode to the appropriate thread; v_count == 1 is
	 * checked again before the inode is put on that queue.
	 * Stale inodes will be unhashed and freed by the ufs idle thread
	 * in ufs_idle_free()
	 */
	front = 1;
	if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
	    ip->i_mode && ip->i_nlink <= 0) {
		/*
		 * Mark the i_flag to indicate that inode is being deleted.
		 * This flag will be cleared when the deletion is complete.
		 * This prevents nfs from sneaking in via ufs_vget() while
		 * the delete is in progress (bugid 1242481).
		 */
		ip->i_flag |= IDEL;

		/*
		 * NOIDEL means that deletes are not allowed at this time;
		 * whoever resets NOIDEL will also send this inode back
		 * through ufs_iinactive.  IREF remains set.
		 */
		if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
			mutex_enter(&vp->v_lock);
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
			rw_exit(&ip->i_contents);
			ufs_delete(ip->i_ufsvfs, ip, 0);
			return;
		}

		/* queue to delete thread; IREF remains set */
		ins.in_qfree.value.ul++;
		uq = &ip->i_ufsvfs->vfs_delete;

		mutex_enter(&uq->uq_mutex);

		/* add to q */
		if ((iq = uq->uq_ihead) != 0) {
			ip->i_freef = iq;
			ip->i_freeb = iq->i_freeb;
			iq->i_freeb->i_freef = ip;
			iq->i_freeb = ip;
			if (front)
				uq->uq_ihead = ip;
		} else {
			uq->uq_ihead = ip;
			ip->i_freef = ip;
			ip->i_freeb = ip;
		}

		delq_info->delq_unreclaimed_files += 1;
		delq_info->delq_unreclaimed_blocks += ip->i_blocks;
	} else {
		/*
		 * queue to idle thread; check v_count == 1 again
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;  /* release our hold from vn_rele */
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		mutex_exit(&vp->v_lock);
		uq = &ufs_idle_q;

		/*
		 * useful iff it has pages or is a fastsymlink; otherwise junk
		 */
		mutex_enter(&uq->uq_mutex);

		/* clear IREF means `on idle list' */
		ip->i_flag &= ~(IREF | IDIRECTIO);

		if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
			ins.in_frback.value.ul++;
			hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
			ufs_nuseful_iq++;
		} else {
			ins.in_frfront.value.ul++;
			hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
			ip->i_flag |= IJUNKIQ;
			ufs_njunk_iq++;
		}
		ip->i_freef = hip;
		ip->i_freeb = hip->i_freeb;
		hip->i_freeb->i_freef = ip;
		hip->i_freeb = ip;
	}

	/* wakeup thread(s) if q is overfull */
	if (++uq->uq_ne == uq->uq_lowat)
		cv_broadcast(&uq->uq_cv);

	/* all done, release the q and inode */
	mutex_exit(&uq->uq_mutex);
	rw_exit(&ip->i_contents);
}

/*
 * Check accessed and update flags on an inode structure.
 * If any are on, update the inode with the (unique) current time.
 * If waitfor is set, ensure I/O ordering by waiting for the write to
 * complete.
 */
void
ufs_iupdat(struct inode *ip, int waitfor)
{
	struct buf	*bp;
	struct fs	*fp;
	struct dinode	*dp;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	int		i;
	int		do_trans_times;
	ushort_t	flag;
	o_uid_t		suid;
	o_gid_t		sgid;

	/*
	 * This function is now safe to be called with either the reader
	 * or writer i_contents lock.
	 */
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	/*
	 * Return if file system has been forcibly umounted.
	 */
	if (ufsvfsp == NULL)
		return;

	flag = ip->i_flag;	/* Atomic read */
	/*
	 * We better not update the disk inode from a stale inode.
	 */
	if (flag & ISTALE)
		return;

	fp = ip->i_fs;

	if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
		if (fp->fs_ronly) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			return;
		}
		/*
		 * fs is active while metadata is being written
		 */
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_notclean(ufsvfsp);
		/*
		 * get the dinode
		 */
		bp = UFS_BREAD(ufsvfsp, ip->i_dev,
		    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
		    (int)fp->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &=
			    ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			brelse(bp);
			return;
		}
		/*
		 * munge inode fields
		 */
		mutex_enter(&ip->i_tlock);
		ITIMES_NOLOCK(ip);
		do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
		ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
		mutex_exit(&ip->i_tlock);

		/*
		 * For reads and concurrent re-writes, no deltas were
		 * entered for the access time changes - do it now.
		 */
		if (do_trans_times) {
			TRANS_INODE_TIMES(ufsvfsp, ip);
		}

		/*
		 * For SunOS 5.0->5.4, these lines below read:
		 *
		 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
		 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
		 *
		 * where MAXUID was set to 60002.  This was incorrect -
		 * the uids should have been constrained to what fitted into
		 * a 16-bit word.
		 *
		 * This means that files from 4.x filesystems that have an
		 * i_suid field larger than 60002 will have that field
		 * changed to 65535.
		 *
		 * Security note: 4.x UFS could never create an i_suid of
		 * UID_LONG since that would've corresponded to -1.
		 */
		suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
		    UID_LONG : ip->i_uid;
		sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
		    GID_LONG : ip->i_gid;

		if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
			ip->i_suid = suid;
			ip->i_sgid = sgid;
			TRANS_INODE(ufsvfsp, ip);
		}

		if ((ip->i_mode & IFMT) == IFBLK ||
		    (ip->i_mode & IFMT) == IFCHR) {
			dev_t d = ip->i_rdev;
			dev32_t dev32;

			/*
			 * load first direct block only if special device
			 */
			if (!cmpldev(&dev32, d)) {
				/*
				 * We panic here because there's "no way"
				 * we should have been able to create a large
				 * inode with a large dev_t.  Earlier layers
				 * should've caught this.
				 */
				panic("ip %p: i_rdev too big", (void *)ip);
			}

			if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
				ip->i_ordev = dev32;	/* can't use old fmt. */
			} else {
				ip->i_ordev = cmpdev(d);
			}
		}

		/*
		 * copy inode to dinode (zero fastsymlnk in dinode)
		 */
		dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
		dp->di_ic = ip->i_ic;	/* structure assignment */
		if (flag & IFASTSYMLNK) {
			for (i = 1; i < NDADDR; i++)
				dp->di_db[i] = 0;
			for (i = 0; i < NIADDR; i++)
				dp->di_ib[i] = 0;
		}
		if (TRANS_ISTRANS(ufsvfsp)) {
			/*
			 * Pass only a sector-sized buffer containing the
			 * inode; otherwise, when the buffer is copied into
			 * a cached roll buffer, too much memory is consumed
			 * if 8KB inode buffers are passed.
			 */
			TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
			    sizeof (struct dinode),
			    (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
			    DEV_BSIZE);

			brelse(bp);
		} else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
			UFS_BRWRITE(ufsvfsp, bp);

			/*
			 * Synchronous write has guaranteed that inode
			 * has been written on disk so clear the flag
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		} else {
			bdrwrite(bp);

			/*
			 * This write does not guarantee that the inode has
			 * been written to disk.  Since all update flags on
			 * the inode are cleared, we must remember this
			 * condition in case the inode is to be updated
			 * synchronously later (e.g. fsync()/fdatasync())
			 * and has not been modified in the meantime.
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag |= IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	} else {
		/*
		 * In case previous inode update was done asynchronously
		 * (IBDWRITE) and this inode update request wants guaranteed
		 * (synchronous) disk update, flush the inode.
		 */
		if (waitfor && (flag & IBDWRITE)) {
			blkflush(ip->i_dev,
			    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	}
}

#define	SINGLE	0	/* index of single indirect block */
#define	DOUBLE	1	/* index of double indirect block */
#define	TRIPLE	2	/* index of triple indirect block */
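
/*
 * Coverage sketch (illustrative): the first NDADDR file blocks are
 * direct; the single indirect block maps the next NINDIR(fs) blocks,
 * the double indirect the next NINDIR(fs)^2, and the triple indirect
 * the next NINDIR(fs)^3.  With an 8 KB block size and 4-byte disk
 * addresses, NINDIR(fs) = 8192 / 4 = 2048, so one single indirect
 * block maps 2048 * 8 KB = 16 MB of file data.
 */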

/*
 * Release blocks associated with the inode ip and
 * stored in the indirect block bn.  Blocks are free'd
 * in LIFO order up to (but not including) lastbn.  If
 * level is greater than SINGLE, the block is an indirect
 * block and recursive calls to indirtrunc must be used to
 * cleanse other indirect blocks.
 *
 * N.B.: triple indirect blocks are untested.
 */
static long
indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
{
	int i;
	struct buf *bp, *copy;
	daddr32_t *bap;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	daddr_t nb, last;
	long factor;
	int blocksreleased = 0, nblocks;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	/*
	 * Calculate index in current block of last
	 * block to be kept.  -1 indicates the entire
	 * block so we need not calculate the index.
	 */
	factor = 1;
	for (i = SINGLE; i < level; i++)
		factor *= NINDIR(fs);
	last = lastbn;
	if (lastbn > 0)
		last /= factor;
	nblocks = btodb(fs->fs_bsize);
	/*
	 * Get the buffer of block pointers, zero those
	 * entries corresponding to blocks to be freed,
	 * and update the on-disk copy first.
	 * *Unless* the root pointer has already been synchronously
	 * written to disk: if nothing points to this
	 * indirect block then don't bother zeroing and
	 * writing it.
	 */
	bp = UFS_BREAD(ufsvfsp,
	    ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (0);
	}
	bap = bp->b_un.b_daddr;
	if ((flags & I_CHEAP) == 0) {
		uint_t	zb;

		zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));

		if (zb) {
			/*
			 * push any data into the log before we zero it
			 */
			if (bp->b_flags & B_DELWRI)
				TRANS_LOG(ufsvfsp, (caddr_t)bap,
				    ldbtob(bp->b_blkno), bp->b_bcount,
				    bp->b_un.b_addr, bp->b_bcount);
			copy = ngeteblk(fs->fs_bsize);
			bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
			    (uint_t)fs->fs_bsize);
			bzero((caddr_t)&bap[last + 1], zb);

			TRANS_BUF(ufsvfsp,
			    (caddr_t)&bap[last + 1] - (caddr_t)bap,
			    zb, bp, DT_ABZERO);

			UFS_BRWRITE(ufsvfsp, bp);
			bp = copy, bap = bp->b_un.b_daddr;
		}
	} else {
		/* make sure write retries are also cleared */
		bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
		bp->b_flags |= B_STALE | B_AGE;
	}

	/*
	 * Recursively free totally unused blocks.
	 */
	flags |= I_CHEAP;
	for (i = NINDIR(fs) - 1; i > last; i--) {
		nb = bap[i];
		if (nb == 0)
			continue;
		if (level > SINGLE) {
			blocksreleased +=
			    indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
			free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
		} else
			free(ip, nb, (off_t)fs->fs_bsize, flags);
		blocksreleased += nblocks;
	}
	flags &= ~I_CHEAP;

	/*
	 * Recursively free last partial block.
	 */
	if (level > SINGLE && lastbn >= 0) {
		last = lastbn % factor;
		nb = bap[i];
		if (nb != 0)
			blocksreleased +=
			    indirtrunc(ip, nb, last, level - 1, flags);
	}
	brelse(bp);
	return (blocksreleased);
}

/*
 * Truncate the inode ip to at most length size.
 * Free affected disk blocks -- the blocks of the
 * file are removed in reverse order.
 *
 * N.B.: triple indirect blocks are untested.
 */
static int i_genrand = 1234;
int
ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
{
	struct fs *fs = oip->i_fs;
	struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
	struct inode *ip;
	daddr_t lastblock;
	off_t bsize;
	int boff;
	daddr_t bn, lastiblock[NIADDR];
	int level;
	long nblocks, blocksreleased = 0;
	int i;
	ushort_t mode;
	struct inode tip;
	int err;
	u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
	    (UFS_MAXOFFSET_T) : (MAXOFF32_T);

	/*
	 * Shadow inodes do not need to hold the vfs_dqrwlock lock. Most
	 * other uses need the reader lock. opendq() holds the writer lock.
	 */
	ASSERT((oip->i_mode & IFMT) == IFSHAD ||
	    RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
	ASSERT(RW_WRITE_HELD(&oip->i_contents));
	/*
	 * We only allow truncation of regular files and directories
	 * to arbitrary lengths here.  In addition, we allow symbolic
	 * links to be truncated only to zero length.  Other inode
	 * types cannot have their length set here.  Disk blocks are
	 * being dealt with - especially device inodes where
	 * ip->i_ordev is actually being stored in ip->i_db[0]!
	 */
	TRANS_INODE(ufsvfsp, oip);
	mode = oip->i_mode & IFMT;
	if (flags & I_FREE) {
		i_genrand *= 16843009;  /* turns into shift and adds */
		i_genrand++;
		oip->i_gen += ((i_genrand + ddi_get_lbolt()) & 0xffff) + 1;
		oip->i_flag |= ICHG|IUPD;
		oip->i_seq++;
		if (length == oip->i_size)
			return (0);
		flags |= I_CHEAP;
	}
	if (mode == IFIFO)
		return (0);
	if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
	    !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
		return (EINVAL);
	if (length > maxoffset)
		return (EFBIG);
	if ((mode == IFDIR) || (mode == IFATTRDIR))
		flags |= I_DIR;
	if (mode == IFSHAD)
		flags |= I_SHAD;
	if (oip == ufsvfsp->vfs_qinod)
		flags |= I_QUOTA;
	if (length == oip->i_size) {
		/* update ctime and mtime to please POSIX tests */
		oip->i_flag |= ICHG|IUPD;
		oip->i_seq++;
		if (length == 0) {
			/* nothing to cache so clear the flag */
			oip->i_flag &= ~IFASTSYMLNK;
		}
		return (0);
	}
	/* wipe out fast symlink till next access */
	if (oip->i_flag & IFASTSYMLNK) {
		int j;

		ASSERT(ITOV(oip)->v_type == VLNK);

		oip->i_flag &= ~IFASTSYMLNK;

		for (j = 1; j < NDADDR; j++)
			oip->i_db[j] = 0;
		for (j = 0; j < NIADDR; j++)
			oip->i_ib[j] = 0;
	}

	boff = (int)blkoff(fs, length);

	if (length > oip->i_size) {
		/*
		 * Trunc up case.  BMAPALLOC will ensure that the right blocks
		 * are allocated.  This includes extending the old frag to a
		 * full block (if needed) in addition to doing any work
		 * needed for allocating the last block.
		 */
		if (boff == 0)
			err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
		else
			err = BMAPALLOC(oip, length - 1, boff, cr);

		if (err == 0) {
			/*
			 * Save old size and set inode's size now
			 * so that we don't cause too much of the
			 * file to be zero'd and pushed.
			 */
			u_offset_t osize = oip->i_size;
			oip->i_size  = length;
			/*
			 * Make sure we zero out the remaining bytes of
			 * the page in case a mmap scribbled on it. We
			 * can't prevent a mmap from writing beyond EOF
			 * on the last page of a file.
			 *
			 */
			if ((boff = (int)blkoff(fs, osize)) != 0) {
				bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
				    fs->fs_bsize : fragroundup(fs, boff);
				pvn_vpzero(ITOV(oip), osize,
				    (size_t)(bsize - boff));
			}
			oip->i_flag |= ICHG|IATTCHG;
			oip->i_seq++;
			ITIMES_NOLOCK(oip);
			/*
			 * MAXOFF32_T is old 2GB size limit. If
			 * this operation caused a large file to be
			 * created, turn on the superblock flag
			 * and update the superblock, if the flag
			 * is not already on.
			 */
			if ((length > (u_offset_t)MAXOFF32_T) &&
			    !(fs->fs_flags & FSLARGEFILES)) {
				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
				mutex_enter(&ufsvfsp->vfs_lock);
				fs->fs_flags |= FSLARGEFILES;
				ufs_sbwrite(ufsvfsp);
				mutex_exit(&ufsvfsp->vfs_lock);
			}
		}

		return (err);
	}

	/*
	 * Update the pages of the file.  If the file is not being
	 * truncated to a block boundary, the contents of the
	 * pages following the end of the file must be zero'ed
	 * in case it ever becomes accessible again because
	 * of subsequent file growth.
	 */
	if (boff == 0) {
		(void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
		    B_INVAL | B_TRUNC, CRED());
	} else {
		/*
		 * Make sure that the last block is properly allocated.
		 * We only really have to do this if the last block is
		 * actually allocated since ufs_bmap will now handle the case
		 * of a fragment which has no block allocated.  Just to
		 * be sure, we do it now independent of current allocation.
		 */
		err = BMAPALLOC(oip, length - 1, boff, cr);
		if (err)
			return (err);

		/*
		 * BMAPALLOC will call bmap_write which defers i_seq
		 * processing.  If the timestamps were changed, update
		 * i_seq before rdip drops i_contents or syncs the inode.
		 */
		if (oip->i_flag & (ICHG|IUPD))
			oip->i_seq++;

		/*
		 * BugId 4069932
		 * Make sure that the relevant partial page appears in
		 * the v_pages list, so that pvn_vpzero() will do its
		 * job.  Since doing this correctly requires everything
		 * in rdip() except for the uiomove(), it's easier and
		 * safer to do the uiomove() rather than duplicate the
		 * rest of rdip() here.
		 *
		 * To get here, we know that length indicates a byte
		 * that is not the first byte of a block.  (length - 1)
		 * is the last actual byte known to exist.  Deduction
		 * shows it is in the same block as byte (length).
		 * Thus, this rdip() invocation should always succeed
		 * except in the face of i/o errors, and give us the
		 * block we care about.
		 *
		 * rdip() makes the same locking assertions and
		 * assumptions as we do.  We do not acquire any locks
		 * before calling it, so we have not changed the locking
		 * situation.  Finally, there do not appear to be any
		 * paths whereby rdip() ends up invoking us again.
		 * Thus, infinite recursion is avoided.
		 */
		{
			uio_t uio;
			iovec_t iov[1];
			char buffer;

			uio.uio_iov = iov;
			uio.uio_iovcnt = 1;
			uio.uio_loffset = length - 1;
			uio.uio_resid = 1;
			uio.uio_segflg = UIO_SYSSPACE;
			uio.uio_extflg = UIO_COPY_CACHED;

			iov[0].iov_base = &buffer;
			iov[0].iov_len = 1;

			err = rdip(oip, &uio, UIO_READ, NULL);
			if (err)
				return (err);
		}

		bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
		    fs->fs_bsize : fragroundup(fs, boff);
		pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
		/*
		 * Ensure full fs block is marked as dirty.
		 */
		(void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
		    ufs_putapage, B_INVAL | B_TRUNC, CRED());
	}

	/*
	 * Calculate index into inode's block list of
	 * last direct and indirect blocks (if any)
	 * which we want to keep.  Lastblock is -1 when
	 * the file is truncated to 0.
	 */
	lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
	lastiblock[SINGLE] = lastblock - NDADDR;
	lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
	lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
	nblocks = btodb(fs->fs_bsize);

	/*
	 * Update file and block pointers
	 * on disk before we start freeing blocks.
	 * If we crash before free'ing blocks below,
	 * the blocks will be returned to the free list.
	 * lastiblock values are also normalized to -1
	 * for calls to indirtrunc below.
	 */
	tip = *oip;			/* structure copy */
	ip = &tip;

	for (level = TRIPLE; level >= SINGLE; level--)
		if (lastiblock[level] < 0) {
			oip->i_ib[level] = 0;
			lastiblock[level] = -1;
		}
	for (i = NDADDR - 1; i > lastblock; i--) {
		oip->i_db[i] = 0;
		flags |= I_CHEAP;
	}
	oip->i_size = length;
	oip->i_flag |= ICHG|IUPD|IATTCHG;
	oip->i_seq++;
	if (!TRANS_ISTRANS(ufsvfsp))
		ufs_iupdat(oip, I_SYNC);	/* do sync inode update */

	/*
	 * Indirect blocks first.
	 */
	for (level = TRIPLE; level >= SINGLE; level--) {
		bn = ip->i_ib[level];
		if (bn != 0) {
			blocksreleased +=
			    indirtrunc(ip, bn, lastiblock[level], level, flags);
			if (lastiblock[level] < 0) {
				ip->i_ib[level] = 0;
				free(ip, bn, (off_t)fs->fs_bsize,
				    flags | I_IBLK);
				blocksreleased += nblocks;
			}
		}
		if (lastiblock[level] >= 0)
			goto done;
	}

	/*
	 * All whole direct blocks or frags.
	 */
	for (i = NDADDR - 1; i > lastblock; i--) {
		bn = ip->i_db[i];
		if (bn == 0)
			continue;
		ip->i_db[i] = 0;
		bsize = (off_t)blksize(fs, ip, i);
		free(ip, bn, bsize, flags);
		blocksreleased += btodb(bsize);
	}
	if (lastblock < 0)
		goto done;

	/*
	 * Finally, look for a change in size of the
	 * last direct block; release any frags.
	 */
	bn = ip->i_db[lastblock];
	if (bn != 0) {
		off_t oldspace, newspace;

		/*
		 * Calculate amount of space we're giving
		 * back as old block size minus new block size.
		 */
		oldspace = blksize(fs, ip, lastblock);
		UFS_SET_ISIZE(length, ip);
		newspace = blksize(fs, ip, lastblock);
		if (newspace == 0) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
			return (err);
		}
		if (oldspace - newspace > 0) {
			/*
			 * Block number of space to be free'd is
			 * the old block # plus the number of frags
			 * required for the storage we're keeping.
			 */
			bn += numfrags(fs, newspace);
			free(ip, bn, oldspace - newspace, flags);
			blocksreleased += btodb(oldspace - newspace);
		}
	}
done:
/* BEGIN PARANOIA */
	for (level = SINGLE; level <= TRIPLE; level++)
		if (ip->i_ib[level] != oip->i_ib[level]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
			return (err);
		}

	for (i = 0; i < NDADDR; i++)
		if (ip->i_db[i] != oip->i_db[i]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
			return (err);
		}
/* END PARANOIA */
	oip->i_blocks -= blocksreleased;

	if (oip->i_blocks < 0) {		/* sanity */
		cmn_err(CE_NOTE,
		    "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
		    fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
		    (int)oip->i_blocks);
		oip->i_blocks = 0;
	}
	oip->i_flag |= ICHG|IATTCHG;
	oip->i_seq++;
	/* blocksreleased is >= zero, so this can not fail */
	(void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
	    (size_t *)NULL);
	return (0);
}

/*
 * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
 * In the case of WRITE, the read-only status of the file system
 * is checked.  Depending on the calling user, the appropriate
 * mode bits are selected; privileges to override missing permission
 * bits are checked through secpolicy_vnode_access().
 * The i_contents lock must be held as reader here to prevent racing with
 * the acl subsystem removing/setting/changing acls on this inode.
 * The caller indicates, via the dolock argument, whether the i_contents
 * lock must be acquired here or is already held.
 */
int
ufs_iaccess(struct inode *ip, int mode, struct cred *cr, int dolock)
{
	int shift = 0;
	int ret = 0;

	if (dolock)
		rw_enter(&ip->i_contents, RW_READER);
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	if (mode & IWRITE) {
		/*
		 * Disallow write attempts on read-only
		 * file systems, unless the file is a block
		 * or character device or a FIFO.
		 */
		if (ip->i_fs->fs_ronly != 0) {
			if ((ip->i_mode & IFMT) != IFCHR &&
			    (ip->i_mode & IFMT) != IFBLK &&
			    (ip->i_mode & IFMT) != IFIFO) {
				ret = EROFS;
				goto out;
			}
		}
	}
	/*
	 * If there is an acl, check the acl and return.
	 */
	if (ip->i_ufs_acl && ip->i_ufs_acl->aowner) {
		ret = ufs_acl_access(ip, mode, cr);
		goto out;
	}

	/*
	 * Access check is based on only one of owner, group, public.
	 * If not owner, then check group.
	 * If not a member of the group, then check public access.
	 */
	if (crgetuid(cr) != ip->i_uid) {
		shift += 3;
		if (!groupmember((uid_t)ip->i_gid, cr))
			shift += 3;
	}

	/* test missing privilege bits */
	ret = secpolicy_vnode_access2(cr, ITOV(ip), ip->i_uid,
	    ip->i_mode << shift, mode);
out:
	if (dolock)
		rw_exit(&ip->i_contents);
	return (ret);
}
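
/*
 * Shift sketch (illustrative): the rwx bits for owner, group and other
 * occupy successively lower octal digits of i_mode, and the
 * IREAD/IWRITE/IEXEC bits tested above sit in the owner digit.
 * Shifting i_mode left by 3 (group) or 6 (other) moves the digit being
 * tested into the owner position examined by secpolicy_vnode_access2();
 * e.g. for "other" access to a 0750 file, 0750 << 6 leaves the owner
 * digit empty, so access must come from a privilege.
 */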

/*
 * if necessary, remove an inode from the free list
 *	i_contents is held except at unmount
 *
 * Return 1 if the inode is taken off of the ufs_idle_q,
 * and the caller is expected to call VN_RELE.
 *
 * Return 0 otherwise.
 */
int
ufs_rmidle(struct inode *ip)
{
	int rval = 0;

	mutex_enter(&ip->i_tlock);
	if ((ip->i_flag & IREF) == 0) {
		mutex_enter(&ufs_idle_q.uq_mutex);
		ip->i_freef->i_freeb = ip->i_freeb;
		ip->i_freeb->i_freef = ip->i_freef;
		ip->i_freef = ip;
		ip->i_freeb = ip;
		ip->i_flag |= IREF;
		ufs_idle_q.uq_ne--;
		if (ip->i_flag & IJUNKIQ) {
			ufs_njunk_iq--;
			ip->i_flag &= ~IJUNKIQ;
		} else {
			ufs_nuseful_iq--;
		}
		mutex_exit(&ufs_idle_q.uq_mutex);
		rval = 1;
	}
	mutex_exit(&ip->i_tlock);
	return (rval);
}

/*
 * scan the hash of inodes and call func on each inode; the inode is
 * held (via VN_HOLD) across the call but i_contents is not held when
 * func runs
 */
int
ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
		struct ufsvfs *ufsvfsp)
{
	struct inode		*ip;		/* current inode */
	struct inode		*lip = NULL;	/* last/previous inode */
	union ihead		*ih;		/* current hash chain */
	int			error, i;
	int			saverror = 0;
	int			lip_held;	/* lip needs a VN_RELE() */

	/*
	 * If ufsvfsp is NULL, then our caller should be holding
	 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
	 * ufs_update().  Otherwise, to avoid false-positives in
	 * ufs_unmount()'s v_count-based EBUSY check, we only hold
	 * those inodes that are in the file system our caller cares
	 * about.
	 *
	 * We know that ip is a valid inode in the hash chain (and thus
	 * we can trust i_ufsvfs) because the inode we chained from
	 * (lip) is still in the hash chain.  This is true because either:
	 *
	 * 1. We did not drop the hash chain lock since the last
	 *    iteration (because we were not interested in the last inode),
	 * or
	 * 2. We maintained a hold on the last inode while we
	 *    were processing it, so it could not be removed
	 *    from the hash chain.
	 *
	 * The whole reason we're dropping and re-grabbing the chain
	 * lock on every inode is so that we don't present a major
	 * choke point on throughput, particularly when we've been
	 * called on behalf of fsflush.
	 */

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0], lip_held = 0;
		    ip != (struct inode *)ih;
		    ip = lip->i_forw) {

			ins.in_scan.value.ul++;

			/*
			 * Undo the previous iteration's VN_HOLD(), but
			 * only if one was done.
			 */
			if (lip_held)
				VN_RELE(ITOV(lip));

			lip = ip;
			if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
				/*
				 * We're not processing all inodes, and
				 * this inode is not in the filesystem of
				 * interest, so skip it.  No need to do a
				 * VN_HOLD() since we're not dropping the
				 * hash chain lock until after we've
				 * done the i_forw traversal above.
				 */
				lip_held = 0;
				continue;
			}
			VN_HOLD(ITOV(ip));
			lip_held = 1;
			mutex_exit(&ih_lock[i]);

			/*
			 * Acquire the contents lock as writer to make
			 * sure that the inode has been initialized in
			 * the cache or removed from the idle list by
			 * ufs_iget().  This works because ufs_iget()
			 * acquires the contents lock before putting
			 * the inode into the cache.  If we can lock it,
			 * then ufs_iget() is done with it.
			 */

			if (rwtry) {
				if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
					mutex_enter(&ih_lock[i]);
					continue;
				}
			} else {
				rw_enter(&ip->i_contents, RW_WRITER);
			}

			rw_exit(&ip->i_contents);

			/*
			 * ISTALE means the inode couldn't be read
			 *
			 * We don't have to hold the i_contents lock
			 * for this check for a couple of
			 * reasons. First, if ISTALE is set then the
			 * flag cannot be cleared until the inode is
			 * removed from the cache and that cannot
			 * happen until after we VN_RELE() it.
			 * Second, if ISTALE is not set, then the
			 * inode is in the cache and does not need to
			 * be read from disk so ISTALE cannot be set
			 * while we are not looking.
			 */
			if ((ip->i_flag & ISTALE) == 0) {
				if ((error = (*func)(ip, arg)) != 0)
					saverror = error;
			}

			mutex_enter(&ih_lock[i]);
		}
		if (lip_held)
			VN_RELE(ITOV(lip));
		mutex_exit(&ih_lock[i]);
	}
	return (saverror);
}

/*
 * Mark inode with the current time, plus a unique increment.
 *
 * Since we only keep 32-bit time on disk, if UFS is still alive
 * beyond 2038, filesystem times will simply stick at the last
 * possible second of 32-bit time. Not ideal, but probably better
 * than going into the remote past, or confusing applications with
 * negative time.
 */
void
ufs_imark(struct inode *ip)
{
	timestruc_t now;
	int32_t usec, nsec;

	/*
	 * The update of i_seq may have been deferred; increase i_seq here
	 * to make sure it is in sync with the timestamps.
	 */
	if (ip->i_flag & ISEQ) {
		ASSERT(ip->i_flag & (IUPD|ICHG));
		ip->i_seq++;
		ip->i_flag &= ~ISEQ;
	}

	gethrestime(&now);

	/*
	 * Fast algorithm to convert nsec to usec -- see hrt2ts()
	 * in common/os/timers.c for a full description.
	 */
	nsec = now.tv_nsec;
	usec = nsec + (nsec >> 2);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 4);
	usec = nsec - (usec >> 3);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 3);
	usec = nsec + (usec >> 4);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 6);
	usec = usec >> 10;
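
	/*
	 * Sanity check (illustrative): the shift sequence above scales
	 * nsec by ~1.024 and the final >> 10 divides by 1024, a net
	 * divide by ~1000 with truncation; nsec = 1000000 gives
	 * usec ~= 1000, and the maximum tv_nsec of 999999999 stays
	 * below MICROSEC.
	 */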

	mutex_enter(&ufs_iuniqtime_lock);
	if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
	    usec > iuniqtime.tv_usec) {
		if (now.tv_sec < TIME32_MAX) {
			iuniqtime.tv_sec = (time32_t)now.tv_sec;
			iuniqtime.tv_usec = usec;
		}
	} else {
		if (iuniqtime.tv_sec < TIME32_MAX) {
			iuniqtime.tv_usec++;
			/* Check for usec overflow */
			if (iuniqtime.tv_usec >= MICROSEC) {
				iuniqtime.tv_sec++;
				iuniqtime.tv_usec = 0;
			}
		}
	}

	if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
		ip->i_atime = iuniqtime;
	}
	if (ip->i_flag & IUPD) {
		ip->i_mtime = iuniqtime;
		ip->i_flag |= IMODTIME;
	}
	if (ip->i_flag & ICHG) {
		ip->i_diroff = 0;
		ip->i_ctime = iuniqtime;
	}
	mutex_exit(&ufs_iuniqtime_lock);
}

/*
 * Update timestamps in inode.
 */
void
ufs_itimes_nolock(struct inode *ip)
{

	/*
	 * if noatime is set and the inode access time is the only field that
	 * must be changed, exit immediately.
	 */
	if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
	    (ip->i_ufsvfs->vfs_noatime)) {
		return;
	}

	if (ip->i_flag & (IUPD|IACC|ICHG)) {
		if (ip->i_flag & ICHG)
			ip->i_flag |= IMOD;
		else
			ip->i_flag |= IMODACC;
		ufs_imark(ip);
		ip->i_flag &= ~(IACC|IUPD|ICHG);
	}
}