usr/src/uts/common/fs/ufs/ufs_inode.c
author Casper H.S. Dik <Casper.Dik@Sun.COM>
Wed, 28 Apr 2010 10:01:37 +0200
changeset 12273 63678502e95e
parent 11066 cebb50cbe4f9
permissions -rw-r--r--
PSARC 2009/377 In-kernel pfexec implementation.
PSARC 2009/378 Basic File Privileges
PSARC 2010/072 RBAC update: user attrs from profiles
4912090 pfzsh(1) should exist
4912093 pfbash(1) should exist
4912096 pftcsh(1) should exist
6440298 Expand the basic privilege set in order to restrict file access
6859862 Move pfexec into the kernel
6919171 cred_t sidesteps kmem_debug; we need to be able to detect bad hold/free when they occur
6923721 The new SYS_SMB privilege is not backward compatible
6937562 autofs doesn't remove its door when the zone shuts down
6937727 Zones stuck on deathrow; netstack_zone keeps a credential reference to the zone
6940159 Implement PSARC 2010/072

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bitmap.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/dnlc.h>
#include <sys/mode.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/acl.h>
#include <sys/var.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <sys/swap.h>
#include <sys/cpuvar.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <fs/fs_subr.h>
#include <sys/policy.h>

struct kmem_cache *inode_cache;		/* cache of free inodes */

/* UFS Inode Cache Stats -- Not protected */
struct	instats ins = {
	{ "size",		KSTAT_DATA_ULONG },
	{ "maxsize",		KSTAT_DATA_ULONG },
	{ "hits",		KSTAT_DATA_ULONG },
	{ "misses",		KSTAT_DATA_ULONG },
	{ "kmem allocs",	KSTAT_DATA_ULONG },
	{ "kmem frees",		KSTAT_DATA_ULONG },
	{ "maxsize reached",	KSTAT_DATA_ULONG },
	{ "puts at frontlist",	KSTAT_DATA_ULONG },
	{ "puts at backlist",	KSTAT_DATA_ULONG },
	{ "queues to free",	KSTAT_DATA_ULONG },
	{ "scans",		KSTAT_DATA_ULONG },
	{ "thread idles",	KSTAT_DATA_ULONG },
	{ "lookup idles",	KSTAT_DATA_ULONG },
	{ "vget idles",		KSTAT_DATA_ULONG },
	{ "cache allocs",	KSTAT_DATA_ULONG },
	{ "cache frees",	KSTAT_DATA_ULONG },
	{ "pushes at close",	KSTAT_DATA_ULONG }
};
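
/*
 * These counters are exported through the "inode_cache" kstat created
 * in ufs_iinit() below (module "ufs", instance 0), so they can be
 * inspected from userland, e.g. (illustrative):
 *
 *	$ kstat -m ufs -n inode_cache
 */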

/* kstat data */
static kstat_t		*ufs_inode_kstat = NULL;

union ihead *ihead;	/* inode LRU cache, Chris Maltby */
kmutex_t *ih_lock;	/* protect inode cache hash table */
static int ino_hashlen = 4;	/* desired average hash chain length */
int inohsz;		/* number of buckets in the hash table */

kmutex_t	ufs_scan_lock;	/* stop racing multiple ufs_scan_inodes() */
kmutex_t	ufs_iuniqtime_lock; /* protect iuniqtime */
kmutex_t	ufsvfs_mutex;
struct ufsvfs	*oldufsvfslist, *ufsvfslist;

/*
 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
 * I/Os are going on.
 */
clock_t	ufs_iowait;

/*
 * the threads that process idle inodes and free (deleted) inodes
 * have high water marks that are set in ufs_iinit().
 * These values can be no lower than the minimum shown below.
 */
int	ufs_idle_max;	/* # of allowable idle inodes */
ulong_t	ufs_inode_max;	/* hard limit of allowable idle inodes */
#define	UFS_IDLE_MAX	(16)	/* min # of allowable idle inodes */

/*
 * Tunables for ufs write throttling.
 * These are validated in ufs_iinit() since improper settings
 * can lead to filesystem hangs.
 */
#define	UFS_HW_DEFAULT	(16 * 1024 * 1024)
#define	UFS_LW_DEFAULT	(8 * 1024 * 1024)
int	ufs_HW = UFS_HW_DEFAULT;
int	ufs_LW = UFS_LW_DEFAULT;
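
/*
 * Illustrative /etc/system settings (a sketch, not a recommendation;
 * the values below are arbitrary examples):
 *
 *	set ufs:ufs_HW = 33554432
 *	set ufs:ufs_LW = 16777216
 *
 * Whatever is chosen, ufs_HW must remain strictly greater than ufs_LW
 * or ufs_iinit() will fall back to the defaults above.
 */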

static void ihinit(void);
extern int hash2ints(int, int);

static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
    struct cred *, int);

/* ARGSUSED */
static int
ufs_inode_kstat_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE)
		return (EACCES);

	ins.in_malloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_alloc");
	ins.in_mfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_free");
	ins.in_kcalloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "alloc");
	ins.in_kcfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "free");
	ins.in_size.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_inuse");
	ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_max");
	ins.in_misses.value.ul = ins.in_kcalloc.value.ul;

	return (0);
}

void
ufs_iinit(void)
{
	/*
	 * Validate that ufs_HW > ufs_LW.
	 * The default values for these two tunables have been increased,
	 * so there is now a range of ufs_HW values that was legal on
	 * previous Solaris versions but no longer is.
	 * Upgrading a machine which has an /etc/system setting for ufs_HW
	 * from that range can lead to filesystem hangs unless the values
	 * are checked here.
	 */
	if (ufs_HW <= ufs_LW) {
		cmn_err(CE_WARN,
		    "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
		    ufs_HW, ufs_LW);
		ufs_LW = UFS_LW_DEFAULT;
		ufs_HW = UFS_HW_DEFAULT;
		cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
		    ufs_HW, ufs_LW);
	}

	/*
	 * Adjust the tunable `ufs_ninode' to a reasonable value
	 */
	if (ufs_ninode <= 0)
		ufs_ninode = ncsize;
	if (ufs_inode_max == 0)
		ufs_inode_max =
		    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode));
	if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
		cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
		    ufs_inode_max);
		ufs_ninode = ufs_inode_max;
	}
	/*
	 * Wait till third call of ufs_update to declare that no I/Os are
	 * going on. This allows deferred access times to be flushed to disk.
	 */
	ufs_iowait = v.v_autoup * hz * 2;
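
	/*
	 * For example (illustrative): with the typical v_autoup of 30
	 * seconds and hz = 100, ufs_iowait = 30 * 100 * 2 = 6000 ticks,
	 * i.e. 60 seconds.
	 */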

	/*
	 * idle thread runs when 25% of ufs_ninode entries are on the queue
	 */
	if (ufs_idle_max == 0)
		ufs_idle_max = ufs_ninode >> 2;
	if (ufs_idle_max < UFS_IDLE_MAX)
		ufs_idle_max = UFS_IDLE_MAX;
	if (ufs_idle_max > ufs_ninode)
		ufs_idle_max = ufs_ninode;
	/*
	 * This is really a misnomer; it is ufs_queue_init.
	 */
	ufs_thread_init(&ufs_idle_q, ufs_idle_max);
	ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);

	/*
	 * global hlock thread
	 */
	ufs_thread_init(&ufs_hlock, 1);
	ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);

	ihinit();
	qtinit();
	ins.in_maxsize.value.ul = ufs_ninode;
	if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
	    KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL)) != NULL) {
		ufs_inode_kstat->ks_data = (void *)&ins;
		ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
		kstat_install(ufs_inode_kstat);
	}
	ufsfx_init();		/* fix-on-panic initialization */
	si_cache_init();
	ufs_directio_init();
	lufs_init();
	mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
}

/* ARGSUSED */
static int
ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ip->i_vnode = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);
	}
	vn_setops(vp, ufs_vnodeops);
	vp->v_data = ip;

	rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
	rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
	mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
	dnlc_dir_init(&ip->i_danchor);

	cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);

	return (0);
}

/* ARGSUSED */
static void
ufs_inode_cache_destructor(void *buf, void *cdrarg)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ITOV(ip);

	rw_destroy(&ip->i_rwlock);
	rw_destroy(&ip->i_contents);
	mutex_destroy(&ip->i_tlock);
	if (vp->v_type == VDIR) {
		dnlc_dir_fini(&ip->i_danchor);
	}

	cv_destroy(&ip->i_wrcv);

	vn_free(vp);
}

/*
 * Initialize hash links for inodes
 * and build inode free list.
 */
void
ihinit(void)
{
	int i;
	union	ihead *ih = ihead;

	mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);

	inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
	ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
	ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);

	for (i = 0, ih = ihead; i < inohsz; i++,  ih++) {
		ih->ih_head[0] = ih;
		ih->ih_head[1] = ih;
		mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
	}
	inode_cache = kmem_cache_create("ufs_inode_cache",
	    sizeof (struct inode), 0, ufs_inode_cache_constructor,
	    ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
	    NULL, NULL, 0);
}
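
/*
 * Hash sizing sketch (illustrative): inohsz is the power of two just
 * above ufs_ninode / ino_hashlen.  With ufs_ninode = 4096 and
 * ino_hashlen = 4, ufs_ninode / ino_hashlen = 1024 and
 * inohsz = 1 << highbit(1024) = 2048, so a full cache averages about
 * two inodes per hash chain.
 */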

/*
 * Free an inode structure
 */
void
ufs_free_inode(struct inode *ip)
{
	vn_invalid(ITOV(ip));
	kmem_cache_free(inode_cache, ip);
}

/*
 * Allocate an inode structure
 */
struct inode *
ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
{
	struct inode *ip;
	vnode_t *vp;

	ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
	/*
	 * at this point we have a newly allocated inode
	 */
	ip->i_freef = ip;
	ip->i_freeb = ip;
	ip->i_flag = IREF;
	ip->i_seq = 0xFF;	/* Unique initial value */
	ip->i_dev = ufsvfsp->vfs_dev;
	ip->i_ufsvfs = ufsvfsp;
	ip->i_devvp = ufsvfsp->vfs_devvp;
	ip->i_number = ino;
	ip->i_diroff = 0;
	ip->i_nextr = 0;
	ip->i_map = NULL;
	ip->i_rdev = 0;
	ip->i_writes = 0;
	ip->i_mode = 0;
	ip->i_delaylen = 0;
	ip->i_delayoff = 0;
	ip->i_nextrio = 0;
	ip->i_ufs_acl = NULL;
	ip->i_cflags = 0;
	ip->i_mapcnt = 0;
	ip->i_dquot = NULL;
	ip->i_cachedir = CD_ENABLED;
	ip->i_writer = NULL;

	/*
	 * the vnode for this inode was allocated by the constructor
	 */
	vp = ITOV(ip);
	vn_reinit(vp);
	if (ino == (ino_t)UFSROOTINO)
		vp->v_flag = VROOT;
	vp->v_vfsp = ufsvfsp->vfs_vfs;
	vn_exists(vp);
	return (ip);
}

/*
 * Look up an inode by device, inumber.  If it is in core (in the
 * inode structure), honor the locking protocol.  If it is not in
 * core, read it in from the specified device after freeing any pages.
 * In all cases, a pointer to a VN_HELD inode structure is returned.
 */
int
ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
}
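
/*
 * Illustrative caller sketch: since the returned inode is VN_HELD, the
 * caller owns a vnode reference and must release it when done, e.g.:
 *
 *	struct inode *ip;
 *	int err = ufs_iget(vfsp, ino, &ip, cr);
 *	if (err == 0) {
 *		...
 *		VN_RELE(ITOV(ip));
 *	}
 */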

/*
 * A version of ufs_iget which returns only allocated, linked inodes.
 * This is appropriate for any callers who do not expect a free inode.
 */
int
ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
}

/*
 * Set vnode attributes based on v_type; this should be called whenever
 * an inode's i_mode is changed.
 */
void
ufs_reset_vnode(vnode_t *vp)
{
	/*
	 * an old DBE hack
	 */
	if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
		vp->v_flag |= VSWAPLIKE;
	else
		vp->v_flag &= ~VSWAPLIKE;

	/*
	 * if not swap like and it's just a regular file, we want
	 * to maintain the vnode's pages sorted by clean/modified
	 * for faster sync'ing to disk
	 */
	if (vp->v_type == VREG)
		vp->v_flag |= VMODSORT;
	else
		vp->v_flag &= ~VMODSORT;

	/*
	 * Is this an attribute hidden dir?
	 */
	if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
		vp->v_flag |= V_XATTRDIR;
	else
		vp->v_flag &= ~V_XATTRDIR;
}
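
/*
 * For example (illustrative): a regular file with mode 01000 (ISVTX set,
 * no execute bits) tests as VSWAPLIKE above, while a mode 01755 file or
 * any directory does not, since IEXEC and IFDIR are part of the mask.
 */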

/*
 * Shared implementation of ufs_iget and ufs_iget_alloced.  The 'validate'
 * flag is used to distinguish the two; when true, we validate that the inode
 * being retrieved looks like a linked and allocated inode.
 */
/* ARGSUSED */
static int
ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr, int validate)
{
	struct inode *ip, *sp;
	union ihead *ih;
	kmutex_t *ihm;
	struct buf *bp;
	struct dinode *dp;
	struct vnode *vp;
	extern vfs_t EIO_vfs;
	int error;
	int ftype;	/* XXX - Remove later on */
	dev_t vfs_dev;
	struct ufsvfs *ufsvfsp;
	struct fs *fs;
	int hno;
	daddr_t bno;
	ulong_t ioff;

	CPU_STATS_ADD_K(sys, ufsiget, 1);

	/*
	 * Lookup inode in cache.
	 */
	vfs_dev = vfsp->vfs_dev;
	hno = INOHASH(ino);
	ih = &ihead[hno];
	ihm = &ih_lock[hno];

again:
	mutex_enter(ihm);
	for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
		if (ino != ip->i_number || vfs_dev != ip->i_dev ||
		    (ip->i_flag & ISTALE))
			continue;

		/*
		 * Found the interesting inode; hold it and drop the cache lock
		 */
		vp = ITOV(ip);	/* for locknest */
		VN_HOLD(vp);
		mutex_exit(ihm);
		rw_enter(&ip->i_contents, RW_READER);

		/*
		 * if necessary, remove from idle list
		 */
		if ((ip->i_flag & IREF) == 0) {
			if (ufs_rmidle(ip))
				VN_RELE(vp);
		}

		/*
		 * Could the inode be read from disk?
		 */
		if (ip->i_flag & ISTALE) {
			rw_exit(&ip->i_contents);
			VN_RELE(vp);
			goto again;
		}

		ins.in_hits.value.ul++;
		*ipp = ip;

		/*
		 * Reset the vnode's attribute flags
		 */
		mutex_enter(&vp->v_lock);
		ufs_reset_vnode(vp);
		mutex_exit(&vp->v_lock);

		rw_exit(&ip->i_contents);

		return (0);
	}
	mutex_exit(ihm);

	/*
	 * Inode was not in cache.
	 *
	 * Allocate a new entry
	 */
	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	fs = ufsvfsp->vfs_fs;

	ip = ufs_alloc_inode(ufsvfsp, ino);
	vp = ITOV(ip);

	bno = fsbtodb(fs, itod(fs, ino));
	ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
	ip->i_doff = (offset_t)ioff + ldbtob(bno);

	/*
	 * put a place holder in the cache (if not already there)
	 */
	mutex_enter(ihm);
	for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
		if (ino == sp->i_number && vfs_dev == sp->i_dev &&
		    ((sp->i_flag & ISTALE) == 0)) {
			mutex_exit(ihm);
			ufs_free_inode(ip);
			goto again;
		}
	/*
	 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
	 * here, but if we do, then shadow inode allocations panic the
	 * system.  We don't have to hold vfs_dqrwlock for shadow inodes
	 * and the ufs_iget() parameters don't tell us what we are getting
	 * so we have no way of knowing this is a ufs_iget() call from
	 * a ufs_ialloc() call for a shadow inode.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	insque(ip, ih);
	mutex_exit(ihm);
	/*
	 * read the dinode
	 */
	bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);

	/*
	 * Check I/O errors
	 */
	error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
	if (error) {
		brelse(bp);
		ip->i_flag |= ISTALE;	/* in case someone is looking it up */
		rw_exit(&ip->i_contents);
		vp->v_vfsp = &EIO_vfs;
		VN_RELE(vp);
		return (error);
	}
	/*
	 * initialize the inode's dinode
	 */
	dp = (struct dinode *)(ioff + bp->b_un.b_addr);
	ip->i_ic = dp->di_ic;			/* structure assignment */
	brelse(bp);

	/*
	 * Maintain compatibility with Solaris 1.x UFS
	 */
	if (ip->i_suid != UID_LONG)
		ip->i_uid = ip->i_suid;
	if (ip->i_sgid != GID_LONG)
		ip->i_gid = ip->i_sgid;

	ftype = ip->i_mode & IFMT;
	if (ftype == IFBLK || ftype == IFCHR) {
		dev_t dv;
		uint_t top16 = ip->i_ordev & 0xffff0000u;

		if (top16 == 0 || top16 == 0xffff0000u)
			dv = expdev(ip->i_ordev);
		else
			dv = expldev(ip->i_ordev);
		vp->v_rdev = ip->i_rdev = dv;
	}

	/*
	 * if our caller only expects allocated inodes, verify that
	 * this inode looks good; throw it out if it's bad.
	 */
	if (validate) {
		if ((ftype == 0) || (ip->i_nlink <= 0)) {
			ip->i_flag |= ISTALE;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			cmn_err(CE_NOTE,
			    "%s: unexpected free inode %d, run fsck(1M)%s",
			    fs->fs_fsmnt, (int)ino,
			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
			return (EIO);
		}
	}

	/*
	 * Finish initializing the vnode.  Shadow inodes need special
	 * handling because IFTOVT() produces a v_type of VNON, which is
	 * not what we want; set v_type to VREG explicitly in that case.
	 */
	if (ftype == IFSHAD) {
		vp->v_type = VREG;
	} else {
		vp->v_type = IFTOVT((mode_t)ip->i_mode);
	}

	ufs_reset_vnode(vp);

	/*
	 * read the shadow
	 */
	if (ftype != 0 && ip->i_shadow != 0) {
		if ((error = ufs_si_load(ip, cr)) != 0) {
			ip->i_flag |= ISTALE;
			ip->i_ufs_acl = NULL;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			return (error);
		}
	}

	/*
	 * Only attach quota information if the inode has a type and if
	 * that type is neither a shadow inode nor an extended attribute
	 * directory.
	 */
	if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
	    ((ip->i_mode & IFMT) != IFATTRDIR)) {
		ip->i_dquot = getinoquota(ip);
	}
	TRANS_MATA_IGET(ufsvfsp, ip);
	*ipp = ip;
	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * Vnode is no longer referenced, write the inode out
 * and if necessary, truncate and deallocate the file.
 */
void
ufs_iinactive(struct inode *ip)
{
	int		front;
	struct inode	*iq;
	struct inode	*hip;
	struct ufs_q	*uq;
	struct vnode	*vp = ITOV(ip);
	struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

	/*
	 * Because the vnode type might have been changed,
	 * the dnlc_dir_purge must be called unconditionally.
	 */
	dnlc_dir_purge(&ip->i_danchor);

	/*
	 * Get exclusive access to inode data.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ASSERT(ip->i_flag & IREF);

	/*
	 * Make sure no one reclaimed the inode before we put it on
	 * the freelist or destroy it. We keep our 'hold' on the vnode
	 * from vn_rele until we are ready to do something with the inode.
	 *
	 * Pageout may put a VN_HOLD/VN_RELE at anytime during this
	 * operation via an async putpage, so we must make sure
	 * we don't free/destroy the inode more than once. ufs_iget
	 * may also put a VN_HOLD on the inode before it grabs
	 * the i_contents lock. This is done so we don't free
	 * an inode that a thread is waiting on.
	 */
	mutex_enter(&vp->v_lock);

	if (vp->v_count > 1) {
		vp->v_count--;  /* release our hold from vn_rele */
		mutex_exit(&vp->v_lock);
		rw_exit(&ip->i_contents);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
	 * and clean.  It can be safely destroyed (cyf).
	 */
	if (ip->i_ufsvfs == NULL) {
		rw_exit(&ip->i_contents);
		ufs_si_del(ip);
		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
		ufs_free_inode(ip);
		return;
	}

	/*
	 * queue idle inode to the appropriate thread; v_count == 1 is
	 * checked again before the inode is put on that queue.
	 * Stale inodes will be unhashed and freed by the ufs idle thread
	 * in ufs_idle_free()
	 */
	front = 1;
	if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
	    ip->i_mode && ip->i_nlink <= 0) {
		/*
		 * Mark the i_flag to indicate that inode is being deleted.
		 * This flag will be cleared when the deletion is complete.
		 * This prevents nfs from sneaking in via ufs_vget() while
		 * the delete is in progress (bugid 1242481).
		 */
		ip->i_flag |= IDEL;

		/*
		 * NOIDEL means that deletes are not allowed at this time;
		 * whoever resets NOIDEL will also send this inode back
		 * through ufs_iinactive.  IREF remains set.
		 */
		if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
			mutex_enter(&vp->v_lock);
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
			rw_exit(&ip->i_contents);
			ufs_delete(ip->i_ufsvfs, ip, 0);
			return;
		}

		/* queue to delete thread; IREF remains set */
		ins.in_qfree.value.ul++;
		uq = &ip->i_ufsvfs->vfs_delete;

		mutex_enter(&uq->uq_mutex);

		/* add to q */
		if ((iq = uq->uq_ihead) != 0) {
			ip->i_freef = iq;
			ip->i_freeb = iq->i_freeb;
			iq->i_freeb->i_freef = ip;
			iq->i_freeb = ip;
			if (front)
				uq->uq_ihead = ip;
		} else {
			uq->uq_ihead = ip;
			ip->i_freef = ip;
			ip->i_freeb = ip;
		}

		delq_info->delq_unreclaimed_files += 1;
		delq_info->delq_unreclaimed_blocks += ip->i_blocks;
	} else {
		/*
		 * queue to idle thread; check v_count == 1 again
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;  /* release our hold from vn_rele */
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		mutex_exit(&vp->v_lock);
		uq = &ufs_idle_q;

		/*
		 * useful iff it has pages or is a fastsymlink; otherwise junk
		 */
		mutex_enter(&uq->uq_mutex);

		/* clear IREF means `on idle list' */
		ip->i_flag &= ~(IREF | IDIRECTIO);

		if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
			ins.in_frback.value.ul++;
			hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
			ufs_nuseful_iq++;
		} else {
			ins.in_frfront.value.ul++;
			hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
			ip->i_flag |= IJUNKIQ;
			ufs_njunk_iq++;
		}
		ip->i_freef = hip;
		ip->i_freeb = hip->i_freeb;
		hip->i_freeb->i_freef = ip;
		hip->i_freeb = ip;
	}

	/* wakeup thread(s) if q is overfull */
	if (++uq->uq_ne == uq->uq_lowat)
		cv_broadcast(&uq->uq_cv);

	/* all done, release the q and inode */
	mutex_exit(&uq->uq_mutex);
	rw_exit(&ip->i_contents);
}

/*
 * Check accessed and update flags on an inode structure.
 * If any are on, update the inode with the (unique) current time.
 * If waitfor is set, ensure I/O ordering by waiting for the write to
 * complete.
 */
void
ufs_iupdat(struct inode *ip, int waitfor)
{
	struct buf	*bp;
	struct fs	*fp;
	struct dinode	*dp;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	int		i;
	int		do_trans_times;
	ushort_t	flag;
	o_uid_t		suid;
	o_gid_t		sgid;

	/*
	 * This function is now safe to be called with either the reader
	 * or writer i_contents lock.
	 */
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	/*
	 * Return if file system has been forcibly umounted.
	 */
	if (ufsvfsp == NULL)
		return;

	flag = ip->i_flag;	/* Atomic read */
	/*
	 * We better not update the disk inode from a stale inode.
	 */
	if (flag & ISTALE)
		return;

	fp = ip->i_fs;

	if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
		if (fp->fs_ronly) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			return;
		}
		/*
		 * fs is active while metadata is being written
		 */
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_notclean(ufsvfsp);
		/*
		 * get the dinode
		 */
		bp = UFS_BREAD(ufsvfsp, ip->i_dev,
		    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
		    (int)fp->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &=
			    ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			brelse(bp);
			return;
		}
		/*
		 * munge inode fields
		 */
		mutex_enter(&ip->i_tlock);
		ITIMES_NOLOCK(ip);
		do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
		ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
		mutex_exit(&ip->i_tlock);

		/*
		 * For reads and concurrent re-writes, no deltas were
		 * entered for the access time changes - do it now.
		 */
		if (do_trans_times) {
			TRANS_INODE_TIMES(ufsvfsp, ip);
		}

		/*
		 * For SunOS 5.0->5.4, these lines below read:
		 *
		 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
		 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
		 *
		 * where MAXUID was set to 60002.  This was incorrect -
		 * the uids should have been constrained to what fitted into
		 * a 16-bit word.
		 *
		 * This means that files from 4.x filesystems that have an
		 * i_suid field larger than 60002 will have that field
		 * changed to 65535.
		 *
		 * Security note: 4.x UFS could never create an i_suid of
		 * UID_LONG since that would've corresponded to -1.
		 */
		suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
		    UID_LONG : ip->i_uid;
		sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
		    GID_LONG : ip->i_gid;

		if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
			ip->i_suid = suid;
			ip->i_sgid = sgid;
			TRANS_INODE(ufsvfsp, ip);
		}

		if ((ip->i_mode & IFMT) == IFBLK ||
		    (ip->i_mode & IFMT) == IFCHR) {
			dev_t d = ip->i_rdev;
			dev32_t dev32;

			/*
			 * load first direct block only if special device
			 */
			if (!cmpldev(&dev32, d)) {
				/*
				 * We panic here because there's "no way"
				 * we should have been able to create a large
				 * inode with a large dev_t.  Earlier layers
				 * should've caught this.
				 */
				panic("ip %p: i_rdev too big", (void *)ip);
			}

			if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
				ip->i_ordev = dev32;	/* can't use old fmt. */
			} else {
				ip->i_ordev = cmpdev(d);
			}
		}

		/*
		 * copy inode to dinode (zero fastsymlnk in dinode)
		 */
		dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
		dp->di_ic = ip->i_ic;	/* structure assignment */
		if (flag & IFASTSYMLNK) {
			for (i = 1; i < NDADDR; i++)
				dp->di_db[i] = 0;
			for (i = 0; i < NIADDR; i++)
				dp->di_ib[i] = 0;
		}
		if (TRANS_ISTRANS(ufsvfsp)) {
			/*
			 * Pass only a sector-sized buffer containing the
			 * inode; otherwise, when the buffer is copied into
			 * a cached roll buffer, too much memory is consumed
			 * if 8KB inode buffers are passed.
			 */
			TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
			    sizeof (struct dinode),
			    (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
			    DEV_BSIZE);

			brelse(bp);
		} else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
			UFS_BRWRITE(ufsvfsp, bp);

			/*
			 * Synchronous write has guaranteed that inode
			 * has been written on disk so clear the flag
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		} else {
			bdrwrite(bp);

			/*
			 * This write does not guarantee that the inode has
			 * been written to disk.  Since all update flags on
			 * the inode are cleared, we must remember this
			 * condition in case the inode is to be updated
			 * synchronously later (e.g. fsync()/fdatasync())
			 * and has not been modified in the meantime.
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag |= IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	} else {
		/*
		 * In case previous inode update was done asynchronously
		 * (IBDWRITE) and this inode update request wants guaranteed
		 * (synchronous) disk update, flush the inode.
		 */
		if (waitfor && (flag & IBDWRITE)) {
			blkflush(ip->i_dev,
			    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	}
}

#define	SINGLE	0	/* index of single indirect block */
#define	DOUBLE	1	/* index of double indirect block */
#define	TRIPLE	2	/* index of triple indirect block */
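
/*
 * Coverage sketch (illustrative): the first NDADDR file blocks are
 * direct; the single indirect block maps the next NINDIR(fs) blocks,
 * the double indirect the next NINDIR(fs)^2, and the triple indirect
 * the next NINDIR(fs)^3.  With an 8 KB block size and 4-byte disk
 * addresses, NINDIR(fs) = 8192 / 4 = 2048, so one single indirect
 * block maps 2048 * 8 KB = 16 MB of file data.
 */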

/*
 * Release blocks associated with the inode ip and
 * stored in the indirect block bn.  Blocks are free'd
 * in LIFO order up to (but not including) lastbn.  If
 * level is greater than SINGLE, the block is an indirect
 * block and recursive calls to indirtrunc must be used to
 * cleanse other indirect blocks.
 *
 * N.B.: triple indirect blocks are untested.
 */
static long
indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
{
	int i;
	struct buf *bp, *copy;
	daddr32_t *bap;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	daddr_t nb, last;
	long factor;
	int blocksreleased = 0, nblocks;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	/*
	 * Calculate index in current block of last
	 * block to be kept.  -1 indicates the entire
	 * block so we need not calculate the index.
	 */
	factor = 1;
	for (i = SINGLE; i < level; i++)
		factor *= NINDIR(fs);
	last = lastbn;
	if (lastbn > 0)
		last /= factor;
	nblocks = btodb(fs->fs_bsize);
	/*
	 * Get the buffer of block pointers, zero those
	 * entries corresponding to blocks to be freed,
	 * and update the on-disk copy first.
	 * *Unless* the root pointer has already been synchronously
	 * written to disk: if nothing points to this
	 * indirect block then don't bother zeroing and
	 * writing it.
	 */
	bp = UFS_BREAD(ufsvfsp,
	    ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (0);
	}
	bap = bp->b_un.b_daddr;
	if ((flags & I_CHEAP) == 0) {
		uint_t	zb;

		zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));

		if (zb) {
			/*
			 * push any data into the log before we zero it
			 */
			if (bp->b_flags & B_DELWRI)
				TRANS_LOG(ufsvfsp, (caddr_t)bap,
				    ldbtob(bp->b_blkno), bp->b_bcount,
				    bp->b_un.b_addr, bp->b_bcount);
			copy = ngeteblk(fs->fs_bsize);
			bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
			    (uint_t)fs->fs_bsize);
			bzero((caddr_t)&bap[last + 1], zb);

			TRANS_BUF(ufsvfsp,
			    (caddr_t)&bap[last + 1] - (caddr_t)bap,
			    zb, bp, DT_ABZERO);

			UFS_BRWRITE(ufsvfsp, bp);
			bp = copy, bap = bp->b_un.b_daddr;
		}
	} else {
		/* make sure write retries are also cleared */
		bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
		bp->b_flags |= B_STALE | B_AGE;
	}

	/*
	 * Recursively free totally unused blocks.
	 */
	flags |= I_CHEAP;
	for (i = NINDIR(fs) - 1; i > last; i--) {
		nb = bap[i];
		if (nb == 0)
			continue;
		if (level > SINGLE) {
			blocksreleased +=
			    indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
			free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
		} else
			free(ip, nb, (off_t)fs->fs_bsize, flags);
		blocksreleased += nblocks;
	}
	flags &= ~I_CHEAP;

	/*
	 * Recursively free last partial block.
	 */
	if (level > SINGLE && lastbn >= 0) {
		last = lastbn % factor;
		nb = bap[i];
		if (nb != 0)
			blocksreleased +=
			    indirtrunc(ip, nb, last, level - 1, flags);
	}
	brelse(bp);
	return (blocksreleased);
}

/*
 * Truncate the inode ip to at most length size.
 * Free affected disk blocks -- the blocks of the
 * file are removed in reverse order.
 *
 * N.B.: triple indirect blocks are untested.
 */
static int i_genrand = 1234;
int
ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
{
	struct fs *fs = oip->i_fs;
	struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
	struct inode *ip;
	daddr_t lastblock;
	off_t bsize;
	int boff;
	daddr_t bn, lastiblock[NIADDR];
	int level;
	long nblocks, blocksreleased = 0;
	int i;
	ushort_t mode;
	struct inode tip;
	int err;
	u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
	    (UFS_MAXOFFSET_T) : (MAXOFF32_T);

	/*
	 * Shadow inodes do not need to hold the vfs_dqrwlock lock. Most
	 * other uses need the reader lock. opendq() holds the writer lock.
	 */
	ASSERT((oip->i_mode & IFMT) == IFSHAD ||
	    RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
	ASSERT(RW_WRITE_HELD(&oip->i_contents));
	/*
	 * We only allow truncation of regular files and directories
	 * to arbitrary lengths here.  In addition, we allow symbolic
	 * links to be truncated only to zero length.  Other inode
	 * types cannot have their length set here.  Disk blocks are
	 * being dealt with - especially device inodes where
	 * ip->i_ordev is actually being stored in ip->i_db[0]!
	 */
	TRANS_INODE(ufsvfsp, oip);
	mode = oip->i_mode & IFMT;
	if (flags & I_FREE) {
		i_genrand *= 16843009;  /* turns into shift and adds */
		i_genrand++;
		oip->i_gen += ((i_genrand + ddi_get_lbolt()) & 0xffff) + 1;
		oip->i_flag |= ICHG|IUPD;
		oip->i_seq++;
		if (length == oip->i_size)
			return (0);
		flags |= I_CHEAP;
	}
	if (mode == IFIFO)
		return (0);
	if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
	    !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
		return (EINVAL);
	if (length > maxoffset)
		return (EFBIG);
	if ((mode == IFDIR) || (mode == IFATTRDIR))
		flags |= I_DIR;
	if (mode == IFSHAD)
		flags |= I_SHAD;
	if (oip == ufsvfsp->vfs_qinod)
		flags |= I_QUOTA;
	if (length == oip->i_size) {
		/* update ctime and mtime to please POSIX tests */
		oip->i_flag |= ICHG|IUPD;
		oip->i_seq++;
		if (length == 0) {
			/* nothing to cache so clear the flag */
			oip->i_flag &= ~IFASTSYMLNK;
		}
		return (0);
	}
	/* wipe out fast symlink till next access */
	if (oip->i_flag & IFASTSYMLNK) {
		int j;

		ASSERT(ITOV(oip)->v_type == VLNK);

		oip->i_flag &= ~IFASTSYMLNK;

		for (j = 1; j < NDADDR; j++)
			oip->i_db[j] = 0;
		for (j = 0; j < NIADDR; j++)
			oip->i_ib[j] = 0;
	}

	boff = (int)blkoff(fs, length);

	if (length > oip->i_size) {
		/*
		 * Trunc up case.  BMAPALLOC will ensure that the right blocks
		 * are allocated.  This includes extending the old frag to a
		 * full block (if needed) in addition to doing any work
		 * needed for allocating the last block.
		 */
		if (boff == 0)
			err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
		else
			err = BMAPALLOC(oip, length - 1, boff, cr);

		if (err == 0) {
			/*
			 * Save old size and set inode's size now
			 * so that we don't cause too much of the
			 * file to be zero'd and pushed.
			 */
			u_offset_t osize = oip->i_size;
			oip->i_size  = length;
			/*
			 * Make sure we zero out the remaining bytes of
			 * the page in case a mmap scribbled on it. We
			 * can't prevent a mmap from writing beyond EOF
			 * on the last page of a file.
			 *
			 */
			if ((boff = (int)blkoff(fs, osize)) != 0) {
				bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
				    fs->fs_bsize : fragroundup(fs, boff);
				pvn_vpzero(ITOV(oip), osize,
				    (size_t)(bsize - boff));
			}
			oip->i_flag |= ICHG|IATTCHG;
			oip->i_seq++;
			ITIMES_NOLOCK(oip);
			/*
			 * MAXOFF32_T is old 2GB size limit. If
			 * this operation caused a large file to be
			 * created, turn on the superblock flag
			 * and update the superblock, if the flag
			 * is not already on.
			 */
			if ((length > (u_offset_t)MAXOFF32_T) &&
			    !(fs->fs_flags & FSLARGEFILES)) {
				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
				mutex_enter(&ufsvfsp->vfs_lock);
				fs->fs_flags |= FSLARGEFILES;
				ufs_sbwrite(ufsvfsp);
				mutex_exit(&ufsvfsp->vfs_lock);
			}
		}

		return (err);
	}

	/*
	 * Update the pages of the file.  If the file is not being
	 * truncated to a block boundary, the contents of the
	 * pages following the end of the file must be zero'ed
	 * in case it ever becomes accessible again because
	 * of subsequent file growth.
	 */
	if (boff == 0) {
		(void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
		    B_INVAL | B_TRUNC, CRED());
	} else {
		/*
		 * Make sure that the last block is properly allocated.
		 * We only really have to do this if the last block is
		 * actually allocated since ufs_bmap will now handle the case
		 * of a fragment which has no block allocated.  Just to
		 * be sure, we do it now independent of current allocation.
		 */
		err = BMAPALLOC(oip, length - 1, boff, cr);
		if (err)
			return (err);

		/*
		 * BMAPALLOC will call bmap_write which defers i_seq
		 * processing.  If the timestamps were changed, update
		 * i_seq before rdip drops i_contents or syncs the inode.
		 */
		if (oip->i_flag & (ICHG|IUPD))
			oip->i_seq++;

		/*
		 * BugId 4069932
		 * Make sure that the relevant partial page appears in
		 * the v_pages list, so that pvn_vpzero() will do its
		 * job.  Since doing this correctly requires everything
		 * in rdip() except for the uiomove(), it's easier and
		 * safer to do the uiomove() rather than duplicate the
		 * rest of rdip() here.
		 *
		 * To get here, we know that length indicates a byte
		 * that is not the first byte of a block.  (length - 1)
		 * is the last actual byte known to exist.  Deduction
		 * shows it is in the same block as byte (length).
		 * Thus, this rdip() invocation should always succeed
		 * except in the face of i/o errors, and give us the
		 * block we care about.
		 *
		 * rdip() makes the same locking assertions and
		 * assumptions as we do.  We do not acquire any locks
		 * before calling it, so we have not changed the locking
		 * situation.  Finally, there do not appear to be any
		 * paths whereby rdip() ends up invoking us again.
		 * Thus, infinite recursion is avoided.
		 */
		{
			uio_t uio;
			iovec_t iov[1];
			char buffer;

			uio.uio_iov = iov;
			uio.uio_iovcnt = 1;
			uio.uio_loffset = length - 1;
			uio.uio_resid = 1;
			uio.uio_segflg = UIO_SYSSPACE;
			uio.uio_extflg = UIO_COPY_CACHED;

			iov[0].iov_base = &buffer;
			iov[0].iov_len = 1;

			err = rdip(oip, &uio, UIO_READ, NULL);
			if (err)
				return (err);
		}

		bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
		    fs->fs_bsize : fragroundup(fs, boff);
		pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
		/*
		 * Ensure full fs block is marked as dirty.
		 */
		(void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
		    ufs_putapage, B_INVAL | B_TRUNC, CRED());
	}

	/*
	 * Calculate index into inode's block list of
	 * last direct and indirect blocks (if any)
	 * which we want to keep.  Lastblock is -1 when
	 * the file is truncated to 0.
	 */
	lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
	lastiblock[SINGLE] = lastblock - NDADDR;
	lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
	lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
	nblocks = btodb(fs->fs_bsize);

	/*
	 * Update file and block pointers
	 * on disk before we start freeing blocks.
	 * If we crash before free'ing blocks below,
	 * the blocks will be returned to the free list.
	 * lastiblock values are also normalized to -1
	 * for calls to indirtrunc below.
	 */
	tip = *oip;			/* structure copy */
	ip = &tip;

	for (level = TRIPLE; level >= SINGLE; level--)
		if (lastiblock[level] < 0) {
			oip->i_ib[level] = 0;
			lastiblock[level] = -1;
		}
	for (i = NDADDR - 1; i > lastblock; i--) {
		oip->i_db[i] = 0;
		flags |= I_CHEAP;
	}
	oip->i_size = length;
	oip->i_flag |= ICHG|IUPD|IATTCHG;
	oip->i_seq++;
	if (!TRANS_ISTRANS(ufsvfsp))
		ufs_iupdat(oip, I_SYNC);	/* do sync inode update */

	/*
	 * Indirect blocks first.
	 */
	for (level = TRIPLE; level >= SINGLE; level--) {
		bn = ip->i_ib[level];
		if (bn != 0) {
			blocksreleased +=
			    indirtrunc(ip, bn, lastiblock[level], level, flags);
			if (lastiblock[level] < 0) {
				ip->i_ib[level] = 0;
				free(ip, bn, (off_t)fs->fs_bsize,
				    flags | I_IBLK);
				blocksreleased += nblocks;
			}
		}
		if (lastiblock[level] >= 0)
			goto done;
	}

	/*
	 * All whole direct blocks or frags.
	 */
	for (i = NDADDR - 1; i > lastblock; i--) {
		bn = ip->i_db[i];
		if (bn == 0)
			continue;
		ip->i_db[i] = 0;
		bsize = (off_t)blksize(fs, ip, i);
		free(ip, bn, bsize, flags);
		blocksreleased += btodb(bsize);
	}
	if (lastblock < 0)
		goto done;

	/*
	 * Finally, look for a change in size of the
	 * last direct block; release any frags.
	 */
	bn = ip->i_db[lastblock];
	if (bn != 0) {
		off_t oldspace, newspace;

		/*
		 * Calculate amount of space we're giving
		 * back as old block size minus new block size.
		 */
		oldspace = blksize(fs, ip, lastblock);
		UFS_SET_ISIZE(length, ip);
		newspace = blksize(fs, ip, lastblock);
		if (newspace == 0) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
			return (err);
		}
		if (oldspace - newspace > 0) {
			/*
			 * Block number of space to be free'd is
			 * the old block # plus the number of frags
			 * required for the storage we're keeping.
			 */
			bn += numfrags(fs, newspace);
			free(ip, bn, oldspace - newspace, flags);
			blocksreleased += btodb(oldspace - newspace);
		}
	}
done:
/* BEGIN PARANOIA */
	for (level = SINGLE; level <= TRIPLE; level++)
		if (ip->i_ib[level] != oip->i_ib[level]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
			return (err);
		}

	for (i = 0; i < NDADDR; i++)
		if (ip->i_db[i] != oip->i_db[i]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
			return (err);
		}
/* END PARANOIA */
	oip->i_blocks -= blocksreleased;

	if (oip->i_blocks < 0) {		/* sanity */
		cmn_err(CE_NOTE,
		    "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
		    fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
		    (int)oip->i_blocks);
		oip->i_blocks = 0;
	}
	oip->i_flag |= ICHG|IATTCHG;
	oip->i_seq++;
	/* blocksreleased is >= zero, so this can not fail */
	(void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
	    (size_t *)NULL);
	return (0);
}

/*
 * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
 * In the case of WRITE, the read-only status of the file system
 * is checked.  Depending on the calling user, the appropriate
 * mode bits are selected; privileges to override missing permission
 * bits are checked through secpolicy_vnode_access().
 * The i_contents lock must be held as reader here to prevent racing with
 * the acl subsystem removing/setting/changing acls on this inode.
 * The caller indicates, via the dolock argument, whether the i_contents
 * lock must be acquired here or is already held.
 */
int
ufs_iaccess(struct inode *ip, int mode, struct cred *cr, int dolock)
{
	int shift = 0;
	int ret = 0;

	if (dolock)
		rw_enter(&ip->i_contents, RW_READER);
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	if (mode & IWRITE) {
		/*
		 * Disallow write attempts on read-only
		 * file systems, unless the file is a block
		 * or character device or a FIFO.
		 */
		if (ip->i_fs->fs_ronly != 0) {
			if ((ip->i_mode & IFMT) != IFCHR &&
			    (ip->i_mode & IFMT) != IFBLK &&
			    (ip->i_mode & IFMT) != IFIFO) {
				ret = EROFS;
				goto out;
			}
		}
	}
	/*
	 * If there is an acl, check the acl and return.
	 */
	if (ip->i_ufs_acl && ip->i_ufs_acl->aowner) {
		ret = ufs_acl_access(ip, mode, cr);
		goto out;
	}

	/*
	 * Access check is based on only one of owner, group, public.
	 * If not owner, then check group.
	 * If not a member of the group, then check public access.
	 */
	if (crgetuid(cr) != ip->i_uid) {
		shift += 3;
		if (!groupmember((uid_t)ip->i_gid, cr))
			shift += 3;
	}

	/* test missing privilege bits */
	ret = secpolicy_vnode_access2(cr, ITOV(ip), ip->i_uid,
	    ip->i_mode << shift, mode);
out:
	if (dolock)
		rw_exit(&ip->i_contents);
	return (ret);
}
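
/*
 * Shift sketch (illustrative): the rwx bits for owner, group and other
 * occupy successively lower octal digits of i_mode, and the
 * IREAD/IWRITE/IEXEC bits tested above sit in the owner digit.
 * Shifting i_mode left by 3 (group) or 6 (other) moves the digit being
 * tested into the owner position examined by secpolicy_vnode_access2();
 * e.g. for "other" access to a 0750 file, 0750 << 6 leaves the owner
 * digit empty, so access must come from a privilege.
 */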

/*
 * if necessary, remove an inode from the free list
 *	i_contents is held except at unmount
 *
 * Return 1 if the inode is taken off of the ufs_idle_q,
 * and the caller is expected to call VN_RELE.
 *
 * Return 0 otherwise.
 */
int
ufs_rmidle(struct inode *ip)
{
	int rval = 0;

	mutex_enter(&ip->i_tlock);
	if ((ip->i_flag & IREF) == 0) {
		mutex_enter(&ufs_idle_q.uq_mutex);
		ip->i_freef->i_freeb = ip->i_freeb;
		ip->i_freeb->i_freef = ip->i_freef;
		ip->i_freef = ip;
		ip->i_freeb = ip;
		ip->i_flag |= IREF;
		ufs_idle_q.uq_ne--;
		if (ip->i_flag & IJUNKIQ) {
			ufs_njunk_iq--;
			ip->i_flag &= ~IJUNKIQ;
		} else {
			ufs_nuseful_iq--;
		}
		mutex_exit(&ufs_idle_q.uq_mutex);
		rval = 1;
	}
	mutex_exit(&ip->i_tlock);
	return (rval);
}

/*
 * scan the hash of inodes and call func on each inode; the inode is
 * held (via VN_HOLD) across the call but i_contents is not held when
 * func runs
 */
int
ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
		struct ufsvfs *ufsvfsp)
{
	struct inode		*ip;		/* current inode */
	struct inode		*lip = NULL;	/* last/previous inode */
	union ihead		*ih;		/* current hash chain */
	int			error, i;
	int			saverror = 0;
	int			lip_held;	/* lip needs a VN_RELE() */

	/*
	 * If ufsvfsp is NULL, then our caller should be holding
	 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
	 * ufs_update().  Otherwise, to avoid false-positives in
	 * ufs_unmount()'s v_count-based EBUSY check, we only hold
	 * those inodes that are in the file system our caller cares
	 * about.
	 *
	 * We know that ip is a valid inode in the hash chain (and thus
	 * we can trust i_ufsvfs) because the inode we chained from
	 * (lip) is still in the hash chain.  This is true because either:
	 *
	 * 1. We did not drop the hash chain lock since the last
	 *    iteration (because we were not interested in the last inode),
	 * or
	 * 2. We maintained a hold on the last inode while we
	 *    were processing it, so it could not be removed
	 *    from the hash chain.
	 *
	 * The whole reason we're dropping and re-grabbing the chain
	 * lock on every inode is so that we don't present a major
	 * choke point on throughput, particularly when we've been
	 * called on behalf of fsflush.
	 */

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0], lip_held = 0;
		    ip != (struct inode *)ih;
		    ip = lip->i_forw) {

			ins.in_scan.value.ul++;

			/*
			 * Undo the previous iteration's VN_HOLD(), but
			 * only if one was done.
			 */
			if (lip_held)
				VN_RELE(ITOV(lip));

			lip = ip;
			if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
				/*
				 * We're not processing all inodes, and
				 * this inode is not in the filesystem of
				 * interest, so skip it.  No need to do a
				 * VN_HOLD() since we're not dropping the
				 * hash chain lock until after we've
				 * done the i_forw traversal above.
				 */
				lip_held = 0;
				continue;
			}
			VN_HOLD(ITOV(ip));
			lip_held = 1;
			mutex_exit(&ih_lock[i]);

			/*
			 * Acquire the contents lock as writer to make
			 * sure that the inode has been initialized in
			 * the cache or removed from the idle list by
			 * ufs_iget().  This works because ufs_iget()
			 * acquires the contents lock before putting
			 * the inode into the cache.  If we can lock it,
			 * then ufs_iget() is done with it.
			 */

			if (rwtry) {
				if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
					mutex_enter(&ih_lock[i]);
					continue;
				}
			} else {
				rw_enter(&ip->i_contents, RW_WRITER);
			}

			rw_exit(&ip->i_contents);

			/*
			 * ISTALE means the inode couldn't be read
			 *
			 * We don't have to hold the i_contents lock
			 * for this check for a couple of
			 * reasons. First, if ISTALE is set then the
			 * flag cannot be cleared until the inode is
			 * removed from the cache and that cannot
			 * happen until after we VN_RELE() it.
			 * Second, if ISTALE is not set, then the
			 * inode is in the cache and does not need to
			 * be read from disk so ISTALE cannot be set
			 * while we are not looking.
			 */
			if ((ip->i_flag & ISTALE) == 0) {
				if ((error = (*func)(ip, arg)) != 0)
					saverror = error;
			}

			mutex_enter(&ih_lock[i]);
		}
		if (lip_held)
			VN_RELE(ITOV(lip));
		mutex_exit(&ih_lock[i]);
	}
	return (saverror);
}

/*
 * Mark inode with the current time, plus a unique increment.
 *
 * Since we only keep 32-bit time on disk, if UFS is still alive
 * beyond 2038, filesystem times will simply stick at the last
 * possible second of 32-bit time. Not ideal, but probably better
 * than going into the remote past, or confusing applications with
 * negative time.
 */
void
ufs_imark(struct inode *ip)
{
	timestruc_t now;
	int32_t usec, nsec;

	/*
	 * The update of i_seq may have been deferred; increase i_seq here
	 * to make sure it is in sync with the timestamps.
	 */
	if (ip->i_flag & ISEQ) {
		ASSERT(ip->i_flag & (IUPD|ICHG));
		ip->i_seq++;
		ip->i_flag &= ~ISEQ;
	}

	gethrestime(&now);

	/*
	 * Fast algorithm to convert nsec to usec -- see hrt2ts()
	 * in common/os/timers.c for a full description.
	 */
	nsec = now.tv_nsec;
	usec = nsec + (nsec >> 2);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 4);
	usec = nsec - (usec >> 3);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 3);
	usec = nsec + (usec >> 4);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 6);
	usec = usec >> 10;
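
	/*
	 * Sanity check (illustrative): the shift sequence above scales
	 * nsec by ~1.024 and the final >> 10 divides by 1024, a net
	 * divide by ~1000 with truncation; nsec = 1000000 gives
	 * usec ~= 1000, and the maximum tv_nsec of 999999999 stays
	 * below MICROSEC.
	 */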

	mutex_enter(&ufs_iuniqtime_lock);
	if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
	    usec > iuniqtime.tv_usec) {
		if (now.tv_sec < TIME32_MAX) {
			iuniqtime.tv_sec = (time32_t)now.tv_sec;
			iuniqtime.tv_usec = usec;
		}
	} else {
		if (iuniqtime.tv_sec < TIME32_MAX) {
			iuniqtime.tv_usec++;
			/* Check for usec overflow */
			if (iuniqtime.tv_usec >= MICROSEC) {
				iuniqtime.tv_sec++;
				iuniqtime.tv_usec = 0;
			}
		}
	}

	if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
		ip->i_atime = iuniqtime;
	}
	if (ip->i_flag & IUPD) {
		ip->i_mtime = iuniqtime;
		ip->i_flag |= IMODTIME;
	}
	if (ip->i_flag & ICHG) {
		ip->i_diroff = 0;
		ip->i_ctime = iuniqtime;
	}
	mutex_exit(&ufs_iuniqtime_lock);
}

/*
 * Update timestamps in inode.
 */
void
ufs_itimes_nolock(struct inode *ip)
{

	/*
	 * if noatime is set and the inode access time is the only field that
	 * must be changed, exit immediately.
	 */
	if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
	    (ip->i_ufsvfs->vfs_noatime)) {
		return;
	}

	if (ip->i_flag & (IUPD|IACC|ICHG)) {
		if (ip->i_flag & ICHG)
			ip->i_flag |= IMOD;
		else
			ip->i_flag |= IMODACC;
		ufs_imark(ip);
		ip->i_flag &= ~(IACC|IUPD|ICHG);
	}
}