usr/src/uts/common/fs/fsflush.c
changeset 0 68f95e015346
child 3290 256464cbb73c
equal deleted inserted replaced
-1:000000000000 0:68f95e015346
       
     1 /*
       
     2  * CDDL HEADER START
       
     3  *
       
     4  * The contents of this file are subject to the terms of the
       
     5  * Common Development and Distribution License, Version 1.0 only
       
     6  * (the "License").  You may not use this file except in compliance
       
     7  * with the License.
       
     8  *
       
     9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
       
    10  * or http://www.opensolaris.org/os/licensing.
       
    11  * See the License for the specific language governing permissions
       
    12  * and limitations under the License.
       
    13  *
       
    14  * When distributing Covered Code, include this CDDL HEADER in each
       
    15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
       
    16  * If applicable, add the following below this CDDL HEADER, with the
       
    17  * fields enclosed by brackets "[]" replaced with your own identifying
       
    18  * information: Portions Copyright [yyyy] [name of copyright owner]
       
    19  *
       
    20  * CDDL HEADER END
       
    21  */
       
    22 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
       
    23 /*	  All Rights Reserved  	*/
       
    24 
       
    25 
       
    26 /*
       
    27  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
       
    28  * Use is subject to license terms.
       
    29  */
       
    30 
       
    31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
       
    32 
       
    33 #include <sys/types.h>
       
    34 #include <sys/t_lock.h>
       
    35 #include <sys/param.h>
       
    36 #include <sys/tuneable.h>
       
    37 #include <sys/inline.h>
       
    38 #include <sys/systm.h>
       
    39 #include <sys/proc.h>
       
    40 #include <sys/user.h>
       
    41 #include <sys/var.h>
       
    42 #include <sys/buf.h>
       
    43 #include <sys/vfs.h>
       
    44 #include <sys/cred.h>
       
    45 #include <sys/kmem.h>
       
    46 #include <sys/vnode.h>
       
    47 #include <sys/swap.h>
       
    48 #include <sys/vm.h>
       
    49 #include <sys/debug.h>
       
    50 #include <sys/cmn_err.h>
       
    51 #include <sys/sysinfo.h>
       
    52 #include <sys/callb.h>
       
    53 #include <sys/reboot.h>
       
    54 #include <sys/time.h>
       
    55 #include <sys/fs/ufs_inode.h>
       
    56 #include <sys/fs/ufs_bio.h>
       
    57 
       
    58 #include <vm/hat.h>
       
    59 #include <vm/page.h>
       
    60 #include <vm/pvn.h>
       
    61 #include <vm/seg_kmem.h>
       
    62 
       
int doiflush = 1;	/* non-zero to turn inode flushing on */
int dopageflush = 1;	/* non-zero to turn page flushing on */

/*
 * To improve boot performance, don't run the inode flushing loop until
 * the specified number of seconds after boot.  To revert to the old
 * behavior, set fsflush_iflush_delay to 0.  We have not created any new
 * filesystem danger that did not exist previously, since there is always a
 * window in between when fsflush does the inode flush loop during which the
 * system could crash, fail to sync the filesystem, and fsck will be needed
 * to recover.  We have, however, widened this window.  Finally,
 * we never delay inode flushing if we're booting into single user mode,
 * where the administrator may be modifying files or using fsck.  This
 * modification avoids inode flushes during boot whose only purpose is to
 * update atimes on files which have been accessed during boot.
 */
int fsflush_iflush_delay = 60;	/* seconds after boot before inode flushing */

kcondvar_t fsflush_cv;		/* signalled once a second to wake fsflush */
static kmutex_t fsflush_lock;	/* just for the cv_wait */
ksema_t fsflush_sema;		/* to serialize with reboot */
       
    84 
       
/*
 * some statistics for fsflush_do_pages
 */
typedef struct {
	ulong_t fsf_scan;	/* number of pages scanned */
	ulong_t fsf_examined;	/* number of page_t's actually examined, can */
				/* be less than fsf_scan due to large pages */
	ulong_t fsf_locked;	/* pages we actually page_lock()ed */
	ulong_t fsf_modified;	/* number of modified pages found */
	ulong_t fsf_coalesce;	/* number of page coalesces done */
	ulong_t fsf_time;	/* nanoseconds of run time */
	ulong_t fsf_releases;	/* number of page_release() done */
} fsf_stat_t;

fsf_stat_t fsf_recent;	/* counts for most recent duty cycle */
fsf_stat_t fsf_total;	/* total of counts */
ulong_t fsf_cycles;	/* number of runs reflected in fsf_total */
       
   102 
       
/*
 * data used to determine when we can coalesce consecutive free pages
 * into larger pages.  Initialized once in fsflush() from the
 * page_get_pagesize()/page_get_pagecnt() tables.
 */
#define	MAX_PAGESIZES	32
static ulong_t		fsf_npgsz;	/* number of supported page sizes */
static pgcnt_t		fsf_pgcnt[MAX_PAGESIZES];	/* size-ix pages per */
							/* size-(ix+1) page */
static pgcnt_t		fsf_mask[MAX_PAGESIZES];	/* pfn alignment mask */
							/* for size ix+1 */
       
   111 
       
   112 
       
/*
 * Scan page_t's and issue I/O's for modified pages.
 *
 * Also coalesces consecutive small sized free pages into the next larger
 * pagesize. This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 *
 * The scan cursor (pp/pp_cookie) and the per-call quota (nscan) are kept
 * in static storage so that successive calls continue the walk where the
 * previous one left off, covering all of physical memory once every
 * v_autoup seconds.
 */
static void
fsflush_do_pages()
{
	vnode_t		*vp;
	ulong_t		pcount;
	hrtime_t	timer = gethrtime();	/* for fsf_time statistics */
	ulong_t		releases = 0;
	ulong_t		nexamined = 0;
	ulong_t		nlocked = 0;
	ulong_t		nmodified = 0;
	ulong_t		ncoalesce = 0;
	int		mod;
	u_offset_t	offset;
	uint_t		szc;

	page_t		*coal_page = NULL;  /* 1st page in group to coalesce */
	uint_t		coal_szc = 0;	    /* size code, coal_page->p_szc */
	uint_t		coal_cnt = 0;	    /* count of pages seen */

	static ulong_t	nscan = 0;		/* pages to scan per call */
	static pgcnt_t	last_total_pages = 0;	/* to detect DR mem changes */
	static void	*pp_cookie = NULL;	/* opaque scan cursor state */
	static page_t	*pp;			/* current scan position */

	/*
	 * Check to see if total_pages has changed.
	 * If so, recompute the per-call quota so the whole of memory is
	 * still covered once every v.v_autoup seconds at t_fsflushr rate.
	 */
	if (total_pages != last_total_pages) {
		last_total_pages = total_pages;
		nscan = (last_total_pages * (tune.t_fsflushr))/v.v_autoup;
	}

	/*
	 * On first time through initialize the cookie used for page_t scans
	 */
	if (pp_cookie == NULL)
		pp = page_next_scan_init(&pp_cookie);

	pcount = 0;
	while (pcount <= nscan) {

		/*
		 * move to the next page, skipping over large pages
		 * and issuing prefetches.
		 */
		pp = page_next_scan_large(pp, &pcount, &pp_cookie);
		prefetch_page_r((void *)pp);
		ASSERT(pp != NULL);

		/*
		 * Do a bunch of dirty tests (ie. no locking) to determine
		 * if we can quickly skip this page. These tests are repeated
		 * after acquiring the page lock.
		 */
		++nexamined;
		if (PP_ISSWAP(pp)) {
			/* swap pages break any coalesce run in progress */
			coal_page = NULL;
			continue;
		}

		/*
		 * skip free pages too, but try coalescing them into larger
		 * pagesizes
		 */
		if (PP_ISFREE(pp)) {
			/*
			 * skip pages with a file system identity or that
			 * are already maximum size
			 */
			szc = pp->p_szc;
			if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
				coal_page = NULL;
				continue;
			}

			/*
			 * If not in a coalescing candidate page or the size
			 * codes are different, start a new candidate.
			 */
			if (coal_page == NULL || coal_szc != szc) {

				/*
				 * page must be properly aligned
				 * (on a next-larger-size boundary)
				 */
				if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
					coal_page = NULL;
					continue;
				}
				coal_page = pp;
				coal_szc = szc;
				coal_cnt = 1;
				continue;
			}

			/*
			 * acceptable to add this to existing candidate page
			 */
			++coal_cnt;
			if (coal_cnt < fsf_pgcnt[coal_szc])
				continue;

			/*
			 * We've got enough pages to coalesce, so do it.
			 * After promoting, we clear coal_page, so it will
			 * take another pass to promote this to an even
			 * larger page.
			 */
			++ncoalesce;
			(void) page_promote_size(coal_page, coal_szc);
			coal_page = NULL;
			continue;
		} else {
			/* in-use page; abandon any coalesce run */
			coal_page = NULL;
		}

		/*
		 * Cheap, lock-free rejections: kernel pages, pages already
		 * locked by someone else, and locked-down or COW pages.
		 */
		if (pp->p_vnode == &kvp ||
		    PAGE_LOCKED(pp) ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0)
			continue;


		/*
		 * Reject pages that can't be "exclusively" locked.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;
		++nlocked;


		/*
		 * After locking the page, redo the above checks.
		 * Since we locked the page, leave out the PAGE_LOCKED() test.
		 */
		vp = pp->p_vnode;
		if (PP_ISSWAP(pp) ||
		    PP_ISFREE(pp) ||
		    vp == NULL ||
		    vp == &kvp ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0 ||
		    (vp->v_flag & VISSWAP) != 0) {
			page_unlock(pp);
			continue;
		}

		ASSERT(vp->v_type != VCHR);

		/*
		 * Check the modified bit. Leaving the bit alone in hardware.
		 * It will be cleared if we do the putpage.
		 */
		if (IS_VMODSORT(vp))
			mod = hat_ismod(pp);
		else
			mod = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;

		if (mod) {
			++nmodified;
			offset = pp->p_offset;

			/*
			 * Hold the vnode before releasing the page lock
			 * to prevent it from being freed and re-used by
			 * some other thread.
			 */
			VN_HOLD(vp);

			page_unlock(pp);

			/* start the async write-back; errors are ignored */
			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
			    kcred);

			VN_RELE(vp);
		} else {

			/*
			 * Catch any pages which should be on the cache list,
			 * but aren't yet.
			 */
			if (hat_page_is_mapped(pp) == 0) {
				++releases;
				(void) page_release(pp, 1);
			} else {
				page_unlock(pp);
			}
		}
	}

	/*
	 * maintain statistics
	 * reset every million wakeups, just to avoid overflow
	 */
	if (++fsf_cycles == 1000000) {
		fsf_cycles = 0;
		fsf_total.fsf_scan = 0;
		fsf_total.fsf_examined = 0;
		fsf_total.fsf_locked = 0;
		fsf_total.fsf_modified = 0;
		fsf_total.fsf_coalesce = 0;
		fsf_total.fsf_time = 0;
		fsf_total.fsf_releases = 0;
	} else {
		/* record this cycle in fsf_recent and add it to fsf_total */
		fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
		fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
		fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
		fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
		fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
		fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
		fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
	}
}
       
   333 
       
/*
 * As part of file system hardening, this daemon is awakened
 * every second to flush cached data which includes the
 * buffer cache, the inode cache and mapped pages.
 *
 * This is the body of the fsflush kernel process; it never returns.
 * Each pass it (1) writes back aged B_DELWRI buffers, (2) optionally
 * scans pages via fsflush_do_pages(), and (3) every icount passes,
 * syncs cached attributes (e.g. inodes) for all installed filesystems.
 */
void
fsflush()
{
	struct buf *bp, *dwp;
	struct hbuf *hp;
	int autoup;		/* buffer age threshold, in clock ticks */
	unsigned int ix, icount, count = 0;
	callb_cpr_t cprinfo;
	uint_t		bcount;
	kmutex_t	*hmp;
	struct vfssw *vswp;

	/* identify ourselves as the fsflush process and zero its times */
	proc_fsflush = ttoproc(curthread);
	proc_fsflush->p_cstime = 0;
	proc_fsflush->p_stime =  0;
	proc_fsflush->p_cutime =  0;
	proc_fsflush->p_utime = 0;
	bcopy("fsflush", u.u_psargs, 8);
	bcopy("fsflush", u.u_comm, 7);

	mutex_init(&fsflush_lock, NULL, MUTEX_DEFAULT, NULL);
	sema_init(&fsflush_sema, 0, NULL, SEMA_DEFAULT, NULL);

	/*
	 * Setup page coalescing.
	 */
	fsf_npgsz = page_num_pagesizes();
	ASSERT(fsf_npgsz < MAX_PAGESIZES);
	for (ix = 0; ix < fsf_npgsz - 1; ++ix) {
		/* how many size-ix pages make one size-(ix+1) page */
		fsf_pgcnt[ix] =
		    page_get_pagesize(ix + 1) / page_get_pagesize(ix);
		/* pfn alignment mask for a size-(ix+1) page */
		fsf_mask[ix] = page_get_pagecnt(ix + 1) - 1;
	}

	autoup = v.v_autoup * hz;
	icount = v.v_autoup / tune.t_fsflushr;
	CALLB_CPR_INIT(&cprinfo, &fsflush_lock, callb_generic_cpr, "fsflush");
loop:
	/*
	 * Post the semaphore while idle so reboot can serialize against us,
	 * then sleep (CPR-safe) until the clock signals fsflush_cv and
	 * re-acquire the semaphore before doing any work.
	 */
	sema_v(&fsflush_sema);
	mutex_enter(&fsflush_lock);
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&fsflush_cv, &fsflush_lock);		/* wait for clock */
	CALLB_CPR_SAFE_END(&cprinfo, &fsflush_lock);
	mutex_exit(&fsflush_lock);
	sema_p(&fsflush_sema);

	/*
	 * Write back all old B_DELWRI buffers on the freelist.
	 */
	bcount = 0;
	for (ix = 0; ix < v.v_hbuf; ix++) {

		hp = &hbuf[ix];
		dwp = (struct buf *)&dwbuf[ix];

		bcount += (hp->b_length);

		/* empty delayed-write list for this hash bucket */
		if (dwp->av_forw == dwp) {
			continue;
		}

		hmp = &hbuf[ix].b_lock;
		mutex_enter(hmp);
		bp = dwp->av_forw;

		/*
		 * Go down only on the delayed write lists.
		 */
		while (bp != dwp) {

			ASSERT(bp->b_flags & B_DELWRI);

			/*
			 * Only write back buffers older than autoup ticks
			 * whose semaphore we can take without blocking.
			 */
			if ((bp->b_flags & B_DELWRI) &&
			    (lbolt - bp->b_start >= autoup) &&
			    sema_tryp(&bp->b_sem)) {
				bp->b_flags |= B_ASYNC;
				hp->b_length--;
				notavail(bp);
				/* drop the hash lock across the write */
				mutex_exit(hmp);
				if (bp->b_vp == NULL) {
					BWRITE(bp);
				} else {
					UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs,
									bp);
				}
				mutex_enter(hmp);
				/* list may have changed; restart from head */
				bp = dwp->av_forw;
			} else {
				bp = bp->av_forw;
			}
		}
		mutex_exit(hmp);
	}

	/*
	 *
	 * There is no need to wakeup any thread waiting on bio_mem_cv
	 * since brelse will wake them up as soon as IO is complete.
	 */
	bfreelist.b_bcount = bcount;

	if (dopageflush)
		fsflush_do_pages();

	if (!doiflush)
		goto loop;

	/*
	 * If the system was not booted to single user mode, skip the
	 * inode flushing until after fsflush_iflush_delay secs have elapsed.
	 */
	if ((boothowto & RB_SINGLE) == 0 &&
	    (lbolt64 / hz) < fsflush_iflush_delay)
		goto loop;

	/*
	 * Flush cached attribute information (e.g. inodes).
	 * Done only once every icount wakeups.
	 */
	if (++count >= icount) {
		count = 0;

		/*
		 * Sync back cached data.
		 */
		RLOCK_VFSSW();
		for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
			if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
				/*
				 * Hold the entry and drop the switch lock
				 * while syncing so we don't block other
				 * vfssw readers/writers during the I/O.
				 */
				vfs_refvfssw(vswp);
				RUNLOCK_VFSSW();
				(void) fsop_sync_by_kind(vswp - vfssw,
					SYNC_ATTR, kcred);
				vfs_unrefvfssw(vswp);
				RLOCK_VFSSW();
			}
		}
		RUNLOCK_VFSSW();
	}
	goto loop;
}