usr/src/uts/common/cpr/cpr_misc.c
author randyf
Sat, 20 Oct 2007 16:00:42 -0700
changeset 5295 a21f2449e5f9
parent 4582 da3c7347dfa5
child 5331 3047ad28a67b
permissions -rw-r--r--
PSARC/2005/469 X86 Energy Star compliance PSARC/2006/632 PSMI extension for state save and restore 6330209 nge needs to support DDI_SUSPEND/DDI_RESUME 6381827 Suspend to RAM on x86 6393154 audio810 needs to support DDI_SUSPEND/DDI_RESUME 6397047 fd, fdc needs to support Suspend/Resume 6401974 cannot enter S3 with ohci PME enable set on Tyan 2865 with Sun or Tyan 2.01 BIOS 6422613 memscrubber doesn't re-acquire lock before CALLB_CPR_EXIT 6455736 ata/dadk/cmdk should support DDI_SUSPEND/DDI_RESUME 6511370 CPR on SPARC regression 6586018 TODOP Macros in i86pc/sys/machclock.h not in sun4u/sun4v equivilent (Sparc only) 6610124 It takes more than 3 minutes after printing "pci_pre_resume nv_sata:0" 6617143 powerd/pmconfig emits a different default message for an existing on or off action.

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/cpuvar.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/pathname.h>
#include <sys/callb.h>
#include <sys/fs/ufs_inode.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>	/* for swapfs_minfree */
#include <sys/kmem.h>
#include <sys/cpr.h>
#include <sys/conf.h>
#include <sys/machclock.h>

/*
 * CPR miscellaneous support routines
 */
#define	cpr_open(path, mode,  vpp)	(vn_open(path, UIO_SYSSPACE, \
		mode, 0600, vpp, CRCREAT, 0))
#define	cpr_rdwr(rw, vp, basep, cnt)	(vn_rdwr(rw, vp,  (caddr_t)(basep), \
		cnt, 0LL, UIO_SYSSPACE, 0, (rlim64_t)MAXOFF_T, CRED(), \
		(ssize_t *)NULL))

extern void clkset(time_t);
extern cpu_t *i_cpr_bootcpu(void);
extern caddr_t i_cpr_map_setup(void);
extern void i_cpr_free_memory_resources(void);

extern kmutex_t cpr_slock;
extern size_t cpr_buf_size;
extern char *cpr_buf;
extern size_t cpr_pagedata_size;
extern char *cpr_pagedata;
extern int cpr_bufs_allocated;
extern int cpr_bitmaps_allocated;

#if defined(__sparc)
static struct cprconfig cprconfig;
static int cprconfig_loaded = 0;
static int cpr_statefile_ok(vnode_t *, int);
static int cpr_p_online(cpu_t *, int);
static void cpr_save_mp_state(void);
#endif

int cpr_is_ufs(struct vfs *);

char cpr_default_path[] = CPR_DEFAULT;

#define	COMPRESS_PERCENT 40	/* approx compression ratio in percent */
#define	SIZE_RATE	115	/* increase size by 15% */
#define	INTEGRAL	100	/* for integer math */


/*
 * cmn_err() followed by a 1/4 second delay; this gives the
 * logging service a chance to flush messages and helps avoid
 * intermixing output from prom_printf().
 */
/*PRINTFLIKE2*/
void
cpr_err(int ce, const char *fmt, ...)
{
	va_list adx;

	va_start(adx, fmt);
	vcmn_err(ce, fmt, adx);
	va_end(adx);
	drv_usecwait(MICROSEC >> 2);
}


int
cpr_init(int fcn)
{
	/*
	 * Allow only one suspend/resume process.
	 */
	if (mutex_tryenter(&cpr_slock) == 0)
		return (EBUSY);

	CPR->c_flags = 0;
	CPR->c_substate = 0;
	CPR->c_cprboot_magic = 0;
	CPR->c_alloc_cnt = 0;

	CPR->c_fcn = fcn;
	if (fcn == AD_CPR_REUSABLE)
		CPR->c_flags |= C_REUSABLE;
	else
		CPR->c_flags |= C_SUSPENDING;
	if (fcn == AD_SUSPEND_TO_RAM || fcn == DEV_SUSPEND_TO_RAM) {
		return (0);
	}
#if defined(__sparc)
	if (fcn != AD_CPR_NOCOMPRESS && fcn != AD_CPR_TESTNOZ)
		CPR->c_flags |= C_COMPRESSING;
	/*
	 * reserve CPR_MAXCONTIG virtual pages for cpr_dump()
	 */
	CPR->c_mapping_area = i_cpr_map_setup();
	if (CPR->c_mapping_area == 0) {		/* no space in kernelmap */
		cpr_err(CE_CONT, "Unable to alloc from kernelmap.\n");
		mutex_exit(&cpr_slock);
		return (EAGAIN);
	}
	if (cpr_debug & CPR_DEBUG3)
		cpr_err(CE_CONT, "Reserved virtual range from 0x%p for writing "
		    "kas\n", (void *)CPR->c_mapping_area);
#endif

	return (0);
}

/*
 * This routine releases any resources used during the checkpoint.
 */
void
cpr_done(void)
{
	cpr_stat_cleanup();
	i_cpr_bitmap_cleanup();

	/*
	 * Free pages used by cpr buffers.
	 */
	if (cpr_buf) {
		kmem_free(cpr_buf, cpr_buf_size);
		cpr_buf = NULL;
	}
	if (cpr_pagedata) {
		kmem_free(cpr_pagedata, cpr_pagedata_size);
		cpr_pagedata = NULL;
	}

	i_cpr_free_memory_resources();
	mutex_exit(&cpr_slock);
	cpr_err(CE_CONT, "System has been resumed.\n");
}


#if defined(__sparc)
/*
 * reads config data into cprconfig
 */
static int
cpr_get_config(void)
{
	static char config_path[] = CPR_CONFIG;
	struct cprconfig *cf = &cprconfig;
	struct vnode *vp;
	char *fmt;
	int err;

	if (cprconfig_loaded)
		return (0);

	fmt = "cannot %s config file \"%s\", error %d\n";
	if (err = vn_open(config_path, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0)) {
		cpr_err(CE_CONT, fmt, "open", config_path, err);
		return (err);
	}

	err = cpr_rdwr(UIO_READ, vp, cf, sizeof (*cf));
	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
	VN_RELE(vp);
	if (err) {
		cpr_err(CE_CONT, fmt, "read", config_path, err);
		return (err);
	}

	if (cf->cf_magic == CPR_CONFIG_MAGIC)
		cprconfig_loaded = 1;
	else {
		cpr_err(CE_CONT, "invalid config file \"%s\", "
		    "rerun pmconfig(1M)\n", config_path);
		err = EINVAL;
	}

	return (err);
}


/*
 * concat fs and path fields of the cprconfig structure;
 * returns pointer to the base of static data
 */
static char *
cpr_cprconfig_to_path(void)
{
	static char full_path[MAXNAMELEN];
	struct cprconfig *cf = &cprconfig;
	char *ptr;

	/*
	 * build /fs/path without extra '/'
	 */
	(void) strcpy(full_path, cf->cf_fs);
	if (strcmp(cf->cf_fs, "/"))
		(void) strcat(full_path, "/");
	ptr = cf->cf_path;
	if (*ptr == '/')
		ptr++;
	(void) strcat(full_path, ptr);
	return (full_path);
}


/*
 * Verify that the information in the configuration file regarding the
 * location for the statefile is still valid, depending on cf_type.
 * for CFT_UFS, cf_fs must still be a mounted filesystem, it must be
 *	mounted on the same device as when pmconfig was last run,
 *	and the translation of that device to a node in the prom's
 *	device tree must be the same as when pmconfig was last run.
 * for CFT_SPEC, cf_path must be the path to a block special file,
 *	it must have no file system mounted on it,
 *	and the translation of that device to a node in the prom's
 *	device tree must be the same as when pmconfig was last run.
 */
static int
cpr_verify_statefile_path(void)
{
	struct cprconfig *cf = &cprconfig;
	static const char long_name[] = "Statefile pathname is too long.\n";
	static const char lookup_fmt[] = "Lookup failed for "
	    "cpr statefile device %s.\n";
	static const char path_chg_fmt[] = "Device path for statefile "
	    "has changed from %s to %s.\t%s\n";
	static const char rerun[] = "Please rerun pmconfig(1m).";
	struct vfs *vfsp = NULL, *vfsp_save = rootvfs;
	ufsvfs_t *ufsvfsp = (ufsvfs_t *)rootvfs->vfs_data;
	ufsvfs_t *ufsvfsp_save = ufsvfsp;
	int error;
	struct vnode *vp;
	char *slash, *tail, *longest;
	char *errstr;
	int found = 0;
	union {
		char un_devpath[OBP_MAXPATHLEN];
		char un_sfpath[MAXNAMELEN];
	} un;
#define	devpath	un.un_devpath
#define	sfpath	un.un_sfpath

	ASSERT(cprconfig_loaded);
	/*
	 * We need not worry about locking or the timing of releasing
	 * the vnode, since we are single-threaded now.
	 */

	switch (cf->cf_type) {
	case CFT_SPEC:
		if (strlen(cf->cf_path) > sizeof (sfpath)) {
			cpr_err(CE_CONT, long_name);
			return (ENAMETOOLONG);
		}
		if ((error = lookupname(cf->cf_devfs,
		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
			cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
			return (error);
		}
		if (vp->v_type != VBLK)
			errstr = "statefile must be a block device";
		else if (vfs_devismounted(vp->v_rdev))
			errstr = "statefile device must not "
			    "have a file system mounted on it";
		else if (IS_SWAPVP(vp))
			errstr = "statefile device must not "
			    "be configured as swap file";
		else
			errstr = NULL;

		VN_RELE(vp);
		if (errstr) {
			cpr_err(CE_CONT, "%s.\n", errstr);
			return (ENOTSUP);
		}

		error = i_devname_to_promname(cf->cf_devfs, devpath,
		    OBP_MAXPATHLEN);
		if (error || strcmp(devpath, cf->cf_dev_prom)) {
			cpr_err(CE_CONT, path_chg_fmt,
			    cf->cf_dev_prom, devpath, rerun);
		}
		return (error);
	case CFT_UFS:
		break;		/* don't indent all the original code */
	default:
		cpr_err(CE_PANIC, "invalid cf_type");
	}

	/*
	 * The original code for UFS statefile
	 */
	if (strlen(cf->cf_fs) + strlen(cf->cf_path) + 2 > sizeof (sfpath)) {
		cpr_err(CE_CONT, long_name);
		return (ENAMETOOLONG);
	}

	bzero(sfpath, sizeof (sfpath));
	(void) strcpy(sfpath, cpr_cprconfig_to_path());

	if (*sfpath != '/') {
		cpr_err(CE_CONT, "Statefile pathname %s "
		    "must begin with a /\n", sfpath);
		return (EINVAL);
	}

	/*
	 * Find the longest prefix of the statefile pathname which
	 * is the mountpoint of a filesystem.  This string must
	 * match the cf_fs field we read from the config file.  Other-
	 * wise the user has changed things without running pmconfig.
	 */
	tail = longest = sfpath + 1;	/* pt beyond the leading "/" */
	while ((slash = strchr(tail, '/')) != NULL) {
		*slash = '\0';	  /* temporarily terminate the string */
		if ((error = lookupname(sfpath,
		    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
			*slash = '/';
			cpr_err(CE_CONT, "A directory in the "
			    "statefile path %s was not found.\n", sfpath);
			VN_RELE(vp);

			return (error);
		}

		vfs_list_read_lock();
		vfsp = rootvfs;
		do {
			ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
			if (ufsvfsp != NULL && ufsvfsp->vfs_root == vp) {
				found = 1;
				break;
			}
			vfsp = vfsp->vfs_next;
		} while (vfsp != rootvfs);
		vfs_list_unlock();

		/*
		 * If we have found a filesystem mounted on the current
		 * path prefix, remember the end of the string in
		 * "longest".  If it happens to be the the exact fs
		 * saved in the configuration file, save the current
		 * ufsvfsp so we can make additional checks further down.
		 */
		if (found) {
			longest = slash;
			if (strcmp(cf->cf_fs, sfpath) == 0) {
				ufsvfsp_save = ufsvfsp;
				vfsp_save = vfsp;
			}
			found = 0;
		}

		VN_RELE(vp);
		*slash = '/';
		tail = slash + 1;
	}
	*longest = '\0';
	if (cpr_is_ufs(vfsp_save) == 0 || strcmp(cf->cf_fs, sfpath)) {
		cpr_err(CE_CONT, "Filesystem containing "
		    "the statefile when pmconfig was run (%s) has "
		    "changed to %s. %s\n", cf->cf_fs, sfpath, rerun);
		return (EINVAL);
	}

	if ((error = lookupname(cf->cf_devfs,
	    UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) {
		cpr_err(CE_CONT, lookup_fmt, cf->cf_devfs);
		return (error);
	}

	if (ufsvfsp_save->vfs_devvp->v_rdev != vp->v_rdev) {
		cpr_err(CE_CONT, "Filesystem containing "
		    "statefile no longer mounted on device %s. "
		    "See power.conf(4).", cf->cf_devfs);
		VN_RELE(vp);
		return (ENXIO);
	}
	VN_RELE(vp);

	error = i_devname_to_promname(cf->cf_devfs, devpath, OBP_MAXPATHLEN);
	if (error || strcmp(devpath, cf->cf_dev_prom)) {
		cpr_err(CE_CONT, path_chg_fmt,
		    cf->cf_dev_prom, devpath, rerun);
		return (error);
	}

	return (0);
}

/*
 * Make sure that the statefile can be used as a block special statefile
 * (meaning that is exists and has nothing mounted on it)
 * Returns errno if not a valid statefile.
 */
int
cpr_check_spec_statefile(void)
{
	int err;

	if (err = cpr_get_config())
		return (err);
	ASSERT(cprconfig.cf_type == CFT_SPEC);

	if (cprconfig.cf_devfs == NULL)
		return (ENXIO);

	return (cpr_verify_statefile_path());

}

int
cpr_alloc_statefile(int alloc_retry)
{
	register int rc = 0;
	char *str;

	/*
	 * Statefile size validation. If checkpoint the first time, disk blocks
	 * allocation will be done; otherwise, just do file size check.
	 * if statefile allocation is being retried, C_VP will be inited
	 */
	if (alloc_retry) {
		str = "\n-->Retrying statefile allocation...";
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
			prom_printf(str);
		if (C_VP->v_type != VBLK)
			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL);
	} else {
		/*
		 * Open an exiting file for writing, the state file needs to be
		 * pre-allocated since we can't and don't want to do allocation
		 * during checkpoint (too much of the OS is disabled).
		 *    - do a preliminary size checking here, if it is too small,
		 *	allocate more space internally and retry.
		 *    - check the vp to make sure it's the right type.
		 */
		char *path = cpr_build_statefile_path();

		if (path == NULL)
			return (ENXIO);
		else if (rc = cpr_verify_statefile_path())
			return (rc);

		if (rc = vn_open(path, UIO_SYSSPACE,
		    FCREAT|FWRITE, 0600, &C_VP, CRCREAT, 0)) {
			cpr_err(CE_WARN, "cannot open statefile %s", path);
			return (rc);
		}
	}

	/*
	 * Only ufs and block special statefiles supported
	 */
	if (C_VP->v_type != VREG && C_VP->v_type != VBLK) {
		cpr_err(CE_CONT,
		    "Statefile must be regular file or block special file.");
		return (EACCES);
	}

	if (rc = cpr_statefile_ok(C_VP, alloc_retry))
		return (rc);

	if (C_VP->v_type != VBLK) {
		/*
		 * sync out the fs change due to the statefile reservation.
		 */
		(void) VFS_SYNC(C_VP->v_vfsp, 0, CRED());

		/*
		 * Validate disk blocks allocation for the state file.
		 * Ask the file system prepare itself for the dump operation.
		 */
		if (rc = VOP_DUMPCTL(C_VP, DUMP_ALLOC, NULL)) {
			cpr_err(CE_CONT, "Error allocating "
			    "blocks for cpr statefile.");
			return (rc);
		}
	}
	return (0);
}


/*
 * Lookup device size and return available space in bytes.
 * NOTE: Since prop_op(9E) can't tell the difference between a character
 * and a block reference, it is ok to ask for "Size" instead of "Nblocks".
 */
size_t
cpr_get_devsize(dev_t dev)
{
	size_t bytes = 0;

	bytes = cdev_Size(dev);
	if (bytes == 0)
		bytes = cdev_size(dev);

	if (bytes > CPR_SPEC_OFFSET)
		bytes -= CPR_SPEC_OFFSET;
	else
		bytes = 0;

	return (bytes);
}


/*
 * increase statefile size
 */
static int
cpr_grow_statefile(vnode_t *vp, u_longlong_t newsize)
{
	extern uchar_t cpr_pagecopy[];
	struct inode *ip = VTOI(vp);
	u_longlong_t offset;
	int error, increase;
	ssize_t resid;

	rw_enter(&ip->i_contents, RW_READER);
	increase = (ip->i_size < newsize);
	offset = ip->i_size;
	rw_exit(&ip->i_contents);

	if (increase == 0)
		return (0);

	/*
	 * write to each logical block to reserve disk space
	 */
	error = 0;
	cpr_pagecopy[0] = '1';
	for (; offset < newsize; offset += ip->i_fs->fs_bsize) {
		if (error = vn_rdwr(UIO_WRITE, vp, (caddr_t)cpr_pagecopy,
		    ip->i_fs->fs_bsize, (offset_t)offset, UIO_SYSSPACE, 0,
		    (rlim64_t)MAXOFF_T, CRED(), &resid)) {
			if (error == ENOSPC) {
				cpr_err(CE_WARN, "error %d while reserving "
				    "disk space for statefile %s\n"
				    "wanted %lld bytes, file is %lld short",
				    error, cpr_cprconfig_to_path(),
				    newsize, newsize - offset);
			}
			break;
		}
	}
	return (error);
}


/*
 * do a simple estimate of the space needed to hold the statefile
 * taking compression into account, but be fairly conservative
 * so we have a better chance of completing; when dump fails,
 * the retry cost is fairly high.
 *
 * Do disk blocks allocation for the state file if no space has
 * been allocated yet. Since the state file will not be removed,
 * allocation should only be done once.
 */
static int
cpr_statefile_ok(vnode_t *vp, int alloc_retry)
{
	extern size_t cpr_bitmap_size;
	struct inode *ip = VTOI(vp);
	const int UCOMP_RATE = 20; /* comp. ratio*10 for user pages */
	u_longlong_t size, isize, ksize, raw_data;
	char *str, *est_fmt;
	size_t space;
	int error;

	/*
	 * number of pages short for swapping.
	 */
	STAT->cs_nosw_pages = k_anoninfo.ani_mem_resv;
	if (STAT->cs_nosw_pages < 0)
		STAT->cs_nosw_pages = 0;

	str = "cpr_statefile_ok:";

	CPR_DEBUG(CPR_DEBUG9, "Phys swap: max=%lu resv=%lu\n",
	    k_anoninfo.ani_max, k_anoninfo.ani_phys_resv);
	CPR_DEBUG(CPR_DEBUG9, "Mem swap: max=%ld resv=%lu\n",
	    MAX(availrmem - swapfs_minfree, 0),
	    k_anoninfo.ani_mem_resv);
	CPR_DEBUG(CPR_DEBUG9, "Total available swap: %ld\n",
	    CURRENT_TOTAL_AVAILABLE_SWAP);

	/*
	 * try increasing filesize by 15%
	 */
	if (alloc_retry) {
		/*
		 * block device doesn't get any bigger
		 */
		if (vp->v_type == VBLK) {
			if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
				prom_printf(
				    "Retry statefile on special file\n");
			return (ENOMEM);
		} else {
			rw_enter(&ip->i_contents, RW_READER);
			size = (ip->i_size * SIZE_RATE) / INTEGRAL;
			rw_exit(&ip->i_contents);
		}
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
			prom_printf("Retry statefile size = %lld\n", size);
	} else {
		u_longlong_t cpd_size;
		pgcnt_t npages, nback;
		int ndvram;

		ndvram = 0;
		(void) callb_execute_class(CB_CL_CPR_FB,
		    (int)(uintptr_t)&ndvram);
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
			prom_printf("ndvram size = %d\n", ndvram);

		/*
		 * estimate 1 cpd_t for every (CPR_MAXCONTIG / 2) pages
		 */
		npages = cpr_count_kpages(REGULAR_BITMAP, cpr_nobit);
		cpd_size = sizeof (cpd_t) * (npages / (CPR_MAXCONTIG / 2));
		raw_data = cpd_size + cpr_bitmap_size;
		ksize = ndvram + mmu_ptob(npages);

		est_fmt = "%s estimated size with "
		    "%scompression %lld, ksize %lld\n";
		nback = mmu_ptob(STAT->cs_nosw_pages);
		if (CPR->c_flags & C_COMPRESSING) {
			size = ((ksize * COMPRESS_PERCENT) / INTEGRAL) +
			    raw_data + ((nback * 10) / UCOMP_RATE);
			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "", size, ksize);
		} else {
			size = ksize + raw_data + nback;
			CPR_DEBUG(CPR_DEBUG1, est_fmt, str, "no ",
			    size, ksize);
		}
	}

	/*
	 * All this is much simpler for a block device
	 */
	if (vp->v_type == VBLK) {
		space = cpr_get_devsize(vp->v_rdev);
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
			prom_printf("statefile dev size %lu\n", space);

		/*
		 * Export the estimated filesize info, this value will be
		 * compared before dumping out the statefile in the case of
		 * no compression.
		 */
		STAT->cs_est_statefsz = size;
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6))
			prom_printf("%s Estimated statefile size %llu, "
			    "space %lu\n", str, size, space);
		if (size > space) {
			cpr_err(CE_CONT, "Statefile partition too small.");
			return (ENOMEM);
		}
		return (0);
	} else {
		if (CPR->c_alloc_cnt++ > C_MAX_ALLOC_RETRY) {
			cpr_err(CE_CONT, "Statefile allocation retry failed\n");
			return (ENOMEM);
		}

		/*
		 * Estimate space needed for the state file.
		 *
		 * State file size in bytes:
		 * 	kernel size + non-cache pte seg +
		 *	bitmap size + cpr state file headers size
		 * (round up to fs->fs_bsize)
		 */
		size = blkroundup(ip->i_fs, size);

		/*
		 * Export the estimated filesize info, this value will be
		 * compared before dumping out the statefile in the case of
		 * no compression.
		 */
		STAT->cs_est_statefsz = size;
		error = cpr_grow_statefile(vp, size);
		if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)) {
			rw_enter(&ip->i_contents, RW_READER);
			isize = ip->i_size;
			rw_exit(&ip->i_contents);
			prom_printf("%s Estimated statefile size %lld, "
			    "i_size %lld\n", str, size, isize);
		}

		return (error);
	}
}


void
cpr_statef_close(void)
{
	if (C_VP) {
		if (!cpr_reusable_mode)
			(void) VOP_DUMPCTL(C_VP, DUMP_FREE, NULL);
		(void) VOP_CLOSE(C_VP, FWRITE, 1, (offset_t)0, CRED());
		VN_RELE(C_VP);
		C_VP = 0;
	}
}


/*
 * open cpr default file and display error
 */
int
cpr_open_deffile(int mode, vnode_t **vpp)
{
	int error;

	if (error = cpr_open(cpr_default_path, mode, vpp))
		cpr_err(CE_CONT, "cannot open \"%s\", error %d\n",
		    cpr_default_path, error);
	return (error);
}


/*
 * write cdef_t to disk.  This contains the original values of prom
 * properties that we modify.  We fill in the magic number of the file
 * here as a signal to the booter code that the state file is valid.
 * Be sure the file gets synced, since we may be shutting down the OS.
 */
int
cpr_write_deffile(cdef_t *cdef)
{
	struct vnode *vp;
	char *str;
	int rc;

	if (rc = cpr_open_deffile(FCREAT|FWRITE, &vp))
		return (rc);

	if (rc = cpr_rdwr(UIO_WRITE, vp, cdef, sizeof (*cdef)))
		str = "write";
	else if (rc = VOP_FSYNC(vp, FSYNC, CRED()))
		str = "fsync";
	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED());
	VN_RELE(vp);

	if (rc) {
		cpr_err(CE_WARN, "%s error %d, file \"%s\"",
		    str, rc, cpr_default_path);
	}
	return (rc);
}

/*
 * Clear the magic number in the defaults file.  This tells the booter
 * program that the state file is not current and thus prevents
 * any attempt to restore from an obsolete state file.
 */
void
cpr_clear_definfo(void)
{
	struct vnode *vp;
	cmini_t mini;

	if ((CPR->c_cprboot_magic != CPR_DEFAULT_MAGIC) ||
	    cpr_open_deffile(FCREAT|FWRITE, &vp))
		return;
	mini.magic = mini.reusable = 0;
	(void) cpr_rdwr(UIO_WRITE, vp, &mini, sizeof (mini));
	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED());
	VN_RELE(vp);
}

/*
 * If the cpr default file is invalid, then we must not be in reusable mode
 * if it is valid, it tells us our mode
 */
int
cpr_get_reusable_mode(void)
{
	struct vnode *vp;
	cmini_t mini;
	int rc;

	if (cpr_open(cpr_default_path, FREAD, &vp))
		return (0);

	rc = cpr_rdwr(UIO_READ, vp, &mini, sizeof (mini));
	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
	VN_RELE(vp);
	if (rc == 0 && mini.magic == CPR_DEFAULT_MAGIC)
		return (mini.reusable);

	return (0);
}
#endif

/*
 * clock/time related routines
 */
static time_t   cpr_time_stamp;


void
cpr_tod_get(cpr_time_t *ctp)
{
	timestruc_t ts;

	mutex_enter(&tod_lock);
	ts = TODOP_GET(tod_ops);
	mutex_exit(&tod_lock);
	ctp->tv_sec = (time32_t)ts.tv_sec;
	ctp->tv_nsec = (int32_t)ts.tv_nsec;
}

void
cpr_tod_fault_reset(void)
{
	mutex_enter(&tod_lock);
	tod_fault_reset();
	mutex_exit(&tod_lock);
}

void
cpr_save_time(void)
{
	cpr_time_stamp = gethrestime_sec();
}

/*
 * correct time based on saved time stamp or hardware clock
 */
void
cpr_restore_time(void)
{
	clkset(cpr_time_stamp);
}

#if defined(__sparc)
/*
 * CPU ONLINE/OFFLINE CODE
 */
int
cpr_mp_offline(void)
{
	cpu_t *cp, *bootcpu;
	int rc = 0;
	int brought_up_boot = 0;

	/*
	 * Do nothing for UP.
	 */
	if (ncpus == 1)
		return (0);

	mutex_enter(&cpu_lock);

	cpr_save_mp_state();

	bootcpu = i_cpr_bootcpu();
	if (!CPU_ACTIVE(bootcpu)) {
		if ((rc = cpr_p_online(bootcpu, CPU_CPR_ONLINE))) {
			mutex_exit(&cpu_lock);
			return (rc);
		}
		brought_up_boot = 1;
	}

	cp = cpu_list;
	do {
		if (cp == bootcpu)
			continue;
		if (cp->cpu_flags & CPU_OFFLINE)
			continue;
		if ((rc = cpr_p_online(cp, CPU_CPR_OFFLINE))) {
			mutex_exit(&cpu_lock);
			return (rc);
		}
	} while ((cp = cp->cpu_next) != cpu_list);
	if (brought_up_boot && (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG6)))
		prom_printf("changed cpu %p to state %d\n",
		    bootcpu, CPU_CPR_ONLINE);
	mutex_exit(&cpu_lock);

	return (rc);
}

int
cpr_mp_online(void)
{
	cpu_t *cp, *bootcpu = CPU;
	int rc = 0;

	/*
	 * Do nothing for UP.
	 */
	if (ncpus == 1)
		return (0);

	/*
	 * cpr_save_mp_state() sets CPU_CPR_ONLINE in cpu_cpr_flags
	 * to indicate a cpu was online at the time of cpr_suspend();
	 * now restart those cpus that were marked as CPU_CPR_ONLINE
	 * and actually are offline.
	 */
	mutex_enter(&cpu_lock);
	for (cp = bootcpu->cpu_next; cp != bootcpu; cp = cp->cpu_next) {
		/*
		 * Clear the CPU_FROZEN flag in all cases.
		 */
		cp->cpu_flags &= ~CPU_FROZEN;

		if (CPU_CPR_IS_OFFLINE(cp))
			continue;
		if (CPU_ACTIVE(cp))
			continue;
		if ((rc = cpr_p_online(cp, CPU_CPR_ONLINE))) {
			mutex_exit(&cpu_lock);
			return (rc);
		}
	}

	/*
	 * turn off the boot cpu if it was offlined
	 */
	if (CPU_CPR_IS_OFFLINE(bootcpu)) {
		if ((rc = cpr_p_online(bootcpu, CPU_CPR_OFFLINE))) {
			mutex_exit(&cpu_lock);
			return (rc);
		}
	}
	mutex_exit(&cpu_lock);
	return (0);
}

static void
cpr_save_mp_state(void)
{
	cpu_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	cp = cpu_list;
	do {
		cp->cpu_cpr_flags &= ~CPU_CPR_ONLINE;
		if (CPU_ACTIVE(cp))
			CPU_SET_CPR_FLAGS(cp, CPU_CPR_ONLINE);
	} while ((cp = cp->cpu_next) != cpu_list);
}

/*
 * change cpu to online/offline
 */
static int
cpr_p_online(cpu_t *cp, int state)
{
	int rc;

	ASSERT(MUTEX_HELD(&cpu_lock));

	switch (state) {
	case CPU_CPR_ONLINE:
		rc = cpu_online(cp);
		break;
	case CPU_CPR_OFFLINE:
		rc = cpu_offline(cp, CPU_FORCED);
		break;
	}
	if (rc) {
		cpr_err(CE_WARN, "Failed to change processor %d to "
		    "state %d, (errno %d)", cp->cpu_id, state, rc);
	}
	return (rc);
}

/*
 * Construct the pathname of the state file and return a pointer to
 * caller.  Read the config file to get the mount point of the
 * filesystem and the pathname within fs.
 */
char *
cpr_build_statefile_path(void)
{
	struct cprconfig *cf = &cprconfig;

	if (cpr_get_config())
		return (NULL);

	switch (cf->cf_type) {
	case CFT_UFS:
		if (strlen(cf->cf_path) + strlen(cf->cf_fs) >= MAXNAMELEN - 1) {
			cpr_err(CE_CONT, "Statefile path is too long.\n");
			return (NULL);
		}
		return (cpr_cprconfig_to_path());
	case CFT_SPEC:
		return (cf->cf_devfs);
	default:
		cpr_err(CE_PANIC, "invalid statefile type");
		/*NOTREACHED*/
		return (NULL);
	}
}

int
cpr_statefile_is_spec(void)
{
	if (cpr_get_config())
		return (0);
	return (cprconfig.cf_type == CFT_SPEC);
}

char *
cpr_get_statefile_prom_path(void)
{
	struct cprconfig *cf = &cprconfig;

	ASSERT(cprconfig_loaded);
	ASSERT(cf->cf_magic == CPR_CONFIG_MAGIC);
	ASSERT(cf->cf_type == CFT_SPEC);
	return (cf->cf_dev_prom);
}


/*
 * XXX The following routines need to be in the vfs source code.
 */

int
cpr_is_ufs(struct vfs *vfsp)
{
	char *fsname;

	fsname = vfssw[vfsp->vfs_fstype].vsw_name;
	return (strcmp(fsname, "ufs") == 0);
}

/*
 * This is a list of file systems that are allowed to be writeable when a
 * reusable statefile checkpoint is taken.  They must not have any state that
 * cannot be restored to consistency by simply rebooting using the checkpoint.
 * (In contrast to ufs, cachefs and pcfs which have disk state that could get
 * out of sync with the in-kernel data).
 */
int
cpr_reusable_mount_check(void)
{
	struct vfs *vfsp;
	char *fsname;
	char **cpp;
	static char *cpr_writeok_fss[] = {
		"autofs", "devfs", "fd", "lofs", "mntfs", "namefs", "nfs",
		"proc", "tmpfs", "ctfs", "objfs", "dev", NULL
	};

	vfs_list_read_lock();
	vfsp = rootvfs;
	do {
		if (vfsp->vfs_flag & VFS_RDONLY) {
			vfsp = vfsp->vfs_next;
			continue;
		}
		fsname = vfssw[vfsp->vfs_fstype].vsw_name;
		for (cpp = cpr_writeok_fss; *cpp; cpp++) {
			if (strcmp(fsname, *cpp) == 0)
				break;
		}
		/*
		 * if the inner loop reached the NULL terminator,
		 * the current fs-type does not match any OK-type
		 */
		if (*cpp == NULL) {
			cpr_err(CE_CONT, "a filesystem of type %s is "
			    "mounted read/write.\nReusable statefile requires "
			    "no writeable filesystem of this type be mounted\n",
			    fsname);
			vfs_list_unlock();
			return (EINVAL);
		}
		vfsp = vfsp->vfs_next;
	} while (vfsp != rootvfs);
	vfs_list_unlock();
	return (0);
}

/*
 * return statefile offset in DEV_BSIZE units
 */
int
cpr_statefile_offset(void)
{
	return (cpr_statefile_is_spec() ? btod(CPR_SPEC_OFFSET) : 0);
}

/*
 * Force a fresh read of the cprinfo per uadmin 3 call
 */
void
cpr_forget_cprconfig(void)
{
	cprconfig_loaded = 0;
}
#endif