usr/src/uts/common/fs/fsflush.c
changeset 0 68f95e015346
child 3290 256464cbb73c
equal deleted inserted replaced
-1:000000000000 0:68f95e015346
       
     1 /*
       
     2  * CDDL HEADER START
       
     3  *
       
     4  * The contents of this file are subject to the terms of the
       
     5  * Common Development and Distribution License, Version 1.0 only
       
     6  * (the "License").  You may not use this file except in compliance
       
     7  * with the License.
       
     8  *
       
     9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
       
    10  * or http://www.opensolaris.org/os/licensing.
       
    11  * See the License for the specific language governing permissions
       
    12  * and limitations under the License.
       
    13  *
       
    14  * When distributing Covered Code, include this CDDL HEADER in each
       
    15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
       
    16  * If applicable, add the following below this CDDL HEADER, with the
       
    17  * fields enclosed by brackets "[]" replaced with your own identifying
       
    18  * information: Portions Copyright [yyyy] [name of copyright owner]
       
    19  *
       
    20  * CDDL HEADER END
       
    21  */
       
    22 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
       
    23 /*	  All Rights Reserved  	*/
       
    24 
       
    25 
       
    26 /*
       
    27  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
       
    28  * Use is subject to license terms.
       
    29  */
       
    30 
       
    31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
       
    32 
       
    33 #include <sys/types.h>
       
    34 #include <sys/t_lock.h>
       
    35 #include <sys/param.h>
       
    36 #include <sys/tuneable.h>
       
    37 #include <sys/inline.h>
       
    38 #include <sys/systm.h>
       
    39 #include <sys/proc.h>
       
    40 #include <sys/user.h>
       
    41 #include <sys/var.h>
       
    42 #include <sys/buf.h>
       
    43 #include <sys/vfs.h>
       
    44 #include <sys/cred.h>
       
    45 #include <sys/kmem.h>
       
    46 #include <sys/vnode.h>
       
    47 #include <sys/swap.h>
       
    48 #include <sys/vm.h>
       
    49 #include <sys/debug.h>
       
    50 #include <sys/cmn_err.h>
       
    51 #include <sys/sysinfo.h>
       
    52 #include <sys/callb.h>
       
    53 #include <sys/reboot.h>
       
    54 #include <sys/time.h>
       
    55 #include <sys/fs/ufs_inode.h>
       
    56 #include <sys/fs/ufs_bio.h>
       
    57 
       
    58 #include <vm/hat.h>
       
    59 #include <vm/page.h>
       
    60 #include <vm/pvn.h>
       
    61 #include <vm/seg_kmem.h>
       
    62 
       
int doiflush = 1;	/* non-zero to turn inode flushing on */
int dopageflush = 1;	/* non-zero to turn page flushing on */

/*
 * To improve boot performance, don't run the inode flushing loop until
 * the specified number of seconds after boot.  To revert to the old
 * behavior, set fsflush_iflush_delay to 0.  We have not created any new
 * filesystem danger that did not exist previously, since there is always a
 * window in between when fsflush does the inode flush loop during which the
 * system could crash, fail to sync the filesystem, and fsck will be needed
 * to recover.  We have, however, widened this window.  Finally,
 * we never delay inode flushing if we're booting into single user mode,
 * where the administrator may be modifying files or using fsck.  This
 * modification avoids inode flushes during boot whose only purpose is to
 * update atimes on files which have been accessed during boot.
 */
int fsflush_iflush_delay = 60;	/* seconds after boot before inode flushing */

kcondvar_t fsflush_cv;		/* signalled once a second to wake fsflush */
static kmutex_t fsflush_lock;	/* just for the cv_wait */
ksema_t fsflush_sema;		/* to serialize with reboot */
       
    84 
       
/*
 * some statistics for fsflush_do_pages
 */
typedef struct {
	ulong_t fsf_scan;	/* number of pages scanned */
	ulong_t fsf_examined;	/* number of page_t's actually examined, can */
				/* be less than fsf_scan due to large pages */
	ulong_t fsf_locked;	/* pages we actually page_lock()ed */
	ulong_t fsf_modified;	/* number of modified pages found */
	ulong_t fsf_coalesce;	/* number of page coalesces done */
	ulong_t fsf_time;	/* nanoseconds of run time */
	ulong_t fsf_releases;	/* number of page_release() done */
} fsf_stat_t;

fsf_stat_t fsf_recent;	/* counts for most recent duty cycle */
fsf_stat_t fsf_total;	/* total of counts */
ulong_t fsf_cycles;	/* number of runs reflected in fsf_total */
       
   102 
       
/*
 * data used to determine when we can coalesce consecutive free pages
 * into larger pages.  Initialized once in fsflush() from the
 * page_get_pagesize()/page_get_pagecnt() tables.
 */
#define	MAX_PAGESIZES	32
static ulong_t		fsf_npgsz;	/* number of supported page sizes */
static pgcnt_t		fsf_pgcnt[MAX_PAGESIZES];	/* size-ix pages per */
							/* size-(ix+1) page */
static pgcnt_t		fsf_mask[MAX_PAGESIZES];	/* pfn alignment mask */
							/* for size ix+1 */
       
   111 
       
   112 
       
/*
 * Scan page_t's and issue I/O's for modified pages.
 *
 * Also coalesces consecutive small sized free pages into the next larger
 * pagesize. This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 *
 * The scan cursor (pp/pp_cookie) and the per-call quota (nscan) are kept
 * in static storage so that successive calls continue the walk where the
 * previous one left off, covering all of physical memory once every
 * v_autoup seconds.
 */
static void
fsflush_do_pages()
{
	vnode_t		*vp;
	ulong_t		pcount;
	hrtime_t	timer = gethrtime();	/* for fsf_time statistics */
	ulong_t		releases = 0;
	ulong_t		nexamined = 0;
	ulong_t		nlocked = 0;
	ulong_t		nmodified = 0;
	ulong_t		ncoalesce = 0;
	int		mod;
	u_offset_t	offset;
	uint_t		szc;

	page_t		*coal_page = NULL;  /* 1st page in group to coalesce */
	uint_t		coal_szc = 0;	    /* size code, coal_page->p_szc */
	uint_t		coal_cnt = 0;	    /* count of pages seen */

	static ulong_t	nscan = 0;		/* pages to scan per call */
	static pgcnt_t	last_total_pages = 0;	/* to detect DR mem changes */
	static void	*pp_cookie = NULL;	/* opaque scan cursor state */
	static page_t	*pp;			/* current scan position */

	/*
	 * Check to see if total_pages has changed.
	 * If so, recompute the per-call quota so the whole of memory is
	 * still covered once every v.v_autoup seconds at t_fsflushr rate.
	 */
	if (total_pages != last_total_pages) {
		last_total_pages = total_pages;
		nscan = (last_total_pages * (tune.t_fsflushr))/v.v_autoup;
	}

	/*
	 * On first time through initialize the cookie used for page_t scans
	 */
	if (pp_cookie == NULL)
		pp = page_next_scan_init(&pp_cookie);

	pcount = 0;
	while (pcount <= nscan) {

		/*
		 * move to the next page, skipping over large pages
		 * and issuing prefetches.
		 */
		pp = page_next_scan_large(pp, &pcount, &pp_cookie);
		prefetch_page_r((void *)pp);
		ASSERT(pp != NULL);

		/*
		 * Do a bunch of dirty tests (ie. no locking) to determine
		 * if we can quickly skip this page. These tests are repeated
		 * after acquiring the page lock.
		 */
		++nexamined;
		if (PP_ISSWAP(pp)) {
			/* swap pages break any coalesce run in progress */
			coal_page = NULL;
			continue;
		}

		/*
		 * skip free pages too, but try coalescing them into larger
		 * pagesizes
		 */
		if (PP_ISFREE(pp)) {
			/*
			 * skip pages with a file system identity or that
			 * are already maximum size
			 */
			szc = pp->p_szc;
			if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
				coal_page = NULL;
				continue;
			}

			/*
			 * If not in a coalescing candidate page or the size
			 * codes are different, start a new candidate.
			 */
			if (coal_page == NULL || coal_szc != szc) {

				/*
				 * page must be properly aligned
				 * (on a next-larger-size boundary)
				 */
				if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
					coal_page = NULL;
					continue;
				}
				coal_page = pp;
				coal_szc = szc;
				coal_cnt = 1;
				continue;
			}

			/*
			 * acceptable to add this to existing candidate page
			 */
			++coal_cnt;
			if (coal_cnt < fsf_pgcnt[coal_szc])
				continue;

			/*
			 * We've got enough pages to coalesce, so do it.
			 * After promoting, we clear coal_page, so it will
			 * take another pass to promote this to an even
			 * larger page.
			 */
			++ncoalesce;
			(void) page_promote_size(coal_page, coal_szc);
			coal_page = NULL;
			continue;
		} else {
			/* in-use page; abandon any coalesce run */
			coal_page = NULL;
		}

		/*
		 * Cheap, lock-free rejections: kernel pages, pages already
		 * locked by someone else, and locked-down or COW pages.
		 */
		if (pp->p_vnode == &kvp ||
		    PAGE_LOCKED(pp) ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0)
			continue;


		/*
		 * Reject pages that can't be "exclusively" locked.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;
		++nlocked;


		/*
		 * After locking the page, redo the above checks.
		 * Since we locked the page, leave out the PAGE_LOCKED() test.
		 */
		vp = pp->p_vnode;
		if (PP_ISSWAP(pp) ||
		    PP_ISFREE(pp) ||
		    vp == NULL ||
		    vp == &kvp ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0 ||
		    (vp->v_flag & VISSWAP) != 0) {
			page_unlock(pp);
			continue;
		}

		ASSERT(vp->v_type != VCHR);

		/*
		 * Check the modified bit. Leaving the bit alone in hardware.
		 * It will be cleared if we do the putpage.
		 */
		if (IS_VMODSORT(vp))
			mod = hat_ismod(pp);
		else
			mod = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;

		if (mod) {
			++nmodified;
			offset = pp->p_offset;

			/*
			 * Hold the vnode before releasing the page lock
			 * to prevent it from being freed and re-used by
			 * some other thread.
			 */
			VN_HOLD(vp);

			page_unlock(pp);

			/* start the async write-back; errors are ignored */
			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
			    kcred);

			VN_RELE(vp);
		} else {

			/*
			 * Catch any pages which should be on the cache list,
			 * but aren't yet.
			 */
			if (hat_page_is_mapped(pp) == 0) {
				++releases;
				(void) page_release(pp, 1);
			} else {
				page_unlock(pp);
			}
		}
	}

	/*
	 * maintain statistics
	 * reset every million wakeups, just to avoid overflow
	 */
	if (++fsf_cycles == 1000000) {
		fsf_cycles = 0;
		fsf_total.fsf_scan = 0;
		fsf_total.fsf_examined = 0;
		fsf_total.fsf_locked = 0;
		fsf_total.fsf_modified = 0;
		fsf_total.fsf_coalesce = 0;
		fsf_total.fsf_time = 0;
		fsf_total.fsf_releases = 0;
	} else {
		/* record this cycle in fsf_recent and add it to fsf_total */
		fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
		fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
		fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
		fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
		fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
		fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
		fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
	}
}
       
   333 
       
/*
 * As part of file system hardening, this daemon is awakened
 * every second to flush cached data which includes the
 * buffer cache, the inode cache and mapped pages.
 *
 * This is the body of the fsflush kernel process; it never returns.
 * Each pass it (1) writes back aged B_DELWRI buffers, (2) optionally
 * scans pages via fsflush_do_pages(), and (3) every icount passes,
 * syncs cached attributes (e.g. inodes) for all installed filesystems.
 */
void
fsflush()
{
	struct buf *bp, *dwp;
	struct hbuf *hp;
	int autoup;		/* buffer age threshold, in clock ticks */
	unsigned int ix, icount, count = 0;
	callb_cpr_t cprinfo;
	uint_t		bcount;
	kmutex_t	*hmp;
	struct vfssw *vswp;

	/* identify ourselves as the fsflush process and zero its times */
	proc_fsflush = ttoproc(curthread);
	proc_fsflush->p_cstime = 0;
	proc_fsflush->p_stime =  0;
	proc_fsflush->p_cutime =  0;
	proc_fsflush->p_utime = 0;
	bcopy("fsflush", u.u_psargs, 8);
	bcopy("fsflush", u.u_comm, 7);

	mutex_init(&fsflush_lock, NULL, MUTEX_DEFAULT, NULL);
	sema_init(&fsflush_sema, 0, NULL, SEMA_DEFAULT, NULL);

	/*
	 * Setup page coalescing.
	 */
	fsf_npgsz = page_num_pagesizes();
	ASSERT(fsf_npgsz < MAX_PAGESIZES);
	for (ix = 0; ix < fsf_npgsz - 1; ++ix) {
		/* how many size-ix pages make one size-(ix+1) page */
		fsf_pgcnt[ix] =
		    page_get_pagesize(ix + 1) / page_get_pagesize(ix);
		/* pfn alignment mask for a size-(ix+1) page */
		fsf_mask[ix] = page_get_pagecnt(ix + 1) - 1;
	}

	autoup = v.v_autoup * hz;
	icount = v.v_autoup / tune.t_fsflushr;
	CALLB_CPR_INIT(&cprinfo, &fsflush_lock, callb_generic_cpr, "fsflush");
loop:
	/*
	 * Post the semaphore while idle so reboot can serialize against us,
	 * then sleep (CPR-safe) until the clock signals fsflush_cv and
	 * re-acquire the semaphore before doing any work.
	 */
	sema_v(&fsflush_sema);
	mutex_enter(&fsflush_lock);
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&fsflush_cv, &fsflush_lock);		/* wait for clock */
	CALLB_CPR_SAFE_END(&cprinfo, &fsflush_lock);
	mutex_exit(&fsflush_lock);
	sema_p(&fsflush_sema);

	/*
	 * Write back all old B_DELWRI buffers on the freelist.
	 */
	bcount = 0;
	for (ix = 0; ix < v.v_hbuf; ix++) {

		hp = &hbuf[ix];
		dwp = (struct buf *)&dwbuf[ix];

		bcount += (hp->b_length);

		/* empty delayed-write list for this hash bucket */
		if (dwp->av_forw == dwp) {
			continue;
		}

		hmp = &hbuf[ix].b_lock;
		mutex_enter(hmp);
		bp = dwp->av_forw;

		/*
		 * Go down only on the delayed write lists.
		 */
		while (bp != dwp) {

			ASSERT(bp->b_flags & B_DELWRI);

			/*
			 * Only write back buffers older than autoup ticks
			 * whose semaphore we can take without blocking.
			 */
			if ((bp->b_flags & B_DELWRI) &&
			    (lbolt - bp->b_start >= autoup) &&
			    sema_tryp(&bp->b_sem)) {
				bp->b_flags |= B_ASYNC;
				hp->b_length--;
				notavail(bp);
				/* drop the hash lock across the write */
				mutex_exit(hmp);
				if (bp->b_vp == NULL) {
					BWRITE(bp);
				} else {
					UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs,
									bp);
				}
				mutex_enter(hmp);
				/* list may have changed; restart from head */
				bp = dwp->av_forw;
			} else {
				bp = bp->av_forw;
			}
		}
		mutex_exit(hmp);
	}

	/*
	 *
	 * There is no need to wakeup any thread waiting on bio_mem_cv
	 * since brelse will wake them up as soon as IO is complete.
	 */
	bfreelist.b_bcount = bcount;

	if (dopageflush)
		fsflush_do_pages();

	if (!doiflush)
		goto loop;

	/*
	 * If the system was not booted to single user mode, skip the
	 * inode flushing until after fsflush_iflush_delay secs have elapsed.
	 */
	if ((boothowto & RB_SINGLE) == 0 &&
	    (lbolt64 / hz) < fsflush_iflush_delay)
		goto loop;

	/*
	 * Flush cached attribute information (e.g. inodes).
	 * Done only once every icount wakeups.
	 */
	if (++count >= icount) {
		count = 0;

		/*
		 * Sync back cached data.
		 */
		RLOCK_VFSSW();
		for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
			if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
				/*
				 * Hold the entry and drop the switch lock
				 * while syncing so we don't block other
				 * vfssw readers/writers during the I/O.
				 */
				vfs_refvfssw(vswp);
				RUNLOCK_VFSSW();
				(void) fsop_sync_by_kind(vswp - vfssw,
					SYNC_ATTR, kcred);
				vfs_unrefvfssw(vswp);
				RLOCK_VFSSW();
			}
		}
		RUNLOCK_VFSSW();
	}
	goto loop;
}