|
1 /* |
|
2 * CDDL HEADER START |
|
3 * |
|
4 * The contents of this file are subject to the terms of the |
|
5 * Common Development and Distribution License, Version 1.0 only |
|
6 * (the "License"). You may not use this file except in compliance |
|
7 * with the License. |
|
8 * |
|
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
|
10 * or http://www.opensolaris.org/os/licensing. |
|
11 * See the License for the specific language governing permissions |
|
12 * and limitations under the License. |
|
13 * |
|
14 * When distributing Covered Code, include this CDDL HEADER in each |
|
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
|
16 * If applicable, add the following below this CDDL HEADER, with the |
|
17 * fields enclosed by brackets "[]" replaced with your own identifying |
|
18 * information: Portions Copyright [yyyy] [name of copyright owner] |
|
19 * |
|
20 * CDDL HEADER END |
|
21 */ |
|
22 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ |
|
23 /* All Rights Reserved */ |
|
24 |
|
25 |
|
26 /* |
|
27 * Copyright 2004 Sun Microsystems, Inc. All rights reserved. |
|
28 * Use is subject to license terms. |
|
29 */ |
|
30 |
|
31 #pragma ident "%Z%%M% %I% %E% SMI" |
|
32 |
|
33 #include <sys/types.h> |
|
34 #include <sys/t_lock.h> |
|
35 #include <sys/param.h> |
|
36 #include <sys/tuneable.h> |
|
37 #include <sys/inline.h> |
|
38 #include <sys/systm.h> |
|
39 #include <sys/proc.h> |
|
40 #include <sys/user.h> |
|
41 #include <sys/var.h> |
|
42 #include <sys/buf.h> |
|
43 #include <sys/vfs.h> |
|
44 #include <sys/cred.h> |
|
45 #include <sys/kmem.h> |
|
46 #include <sys/vnode.h> |
|
47 #include <sys/swap.h> |
|
48 #include <sys/vm.h> |
|
49 #include <sys/debug.h> |
|
50 #include <sys/cmn_err.h> |
|
51 #include <sys/sysinfo.h> |
|
52 #include <sys/callb.h> |
|
53 #include <sys/reboot.h> |
|
54 #include <sys/time.h> |
|
55 #include <sys/fs/ufs_inode.h> |
|
56 #include <sys/fs/ufs_bio.h> |
|
57 |
|
58 #include <vm/hat.h> |
|
59 #include <vm/page.h> |
|
60 #include <vm/pvn.h> |
|
61 #include <vm/seg_kmem.h> |
|
62 |
|
int doiflush = 1;	/* non-zero to turn inode flushing on */
int dopageflush = 1;	/* non-zero to turn page flushing on */

/*
 * To improve boot performance, don't run the inode flushing loop until
 * the specified number of seconds after boot.  To revert to the old
 * behavior, set fsflush_iflush_delay to 0.  We have not created any new
 * filesystem danger that did not exist previously, since there is always a
 * window in between when fsflush does the inode flush loop during which the
 * system could crash, fail to sync the filesystem, and fsck will be needed
 * to recover.  We have, however, widened this window.  Finally,
 * we never delay inode flushing if we're booting into single user mode,
 * where the administrator may be modifying files or using fsck.  This
 * modification avoids inode flushes during boot whose only purpose is to
 * update atimes on files which have been accessed during boot.
 */
int fsflush_iflush_delay = 60;	/* seconds after boot to defer inode flush */
|
80 |
|
kcondvar_t fsflush_cv;		/* signaled by the clock to wake fsflush */
static kmutex_t fsflush_lock;	/* just for the cv_wait */
ksema_t fsflush_sema;		/* to serialize with reboot */
|
84 |
|
/*
 * some statistics for fsflush_do_pages
 */
typedef struct {
	ulong_t fsf_scan;	/* number of pages scanned */
	ulong_t fsf_examined;	/* number of page_t's actually examined, can */
				/* be less than fsf_scan due to large pages */
	ulong_t fsf_locked;	/* pages we actually page_lock()ed */
	ulong_t fsf_modified;	/* number of modified pages found */
	ulong_t fsf_coalesce;	/* number of page coalesces done */
	ulong_t fsf_time;	/* nanoseconds of run time */
	ulong_t fsf_releases;	/* number of page_release() done */
} fsf_stat_t;

fsf_stat_t fsf_recent;	/* counts for most recent duty cycle */
fsf_stat_t fsf_total;	/* total of counts */
ulong_t fsf_cycles;	/* number of runs reflected in fsf_total */
|
102 |
|
/*
 * data used to determine when we can coalesce consecutive free pages
 * into larger pages.  All three are initialized in fsflush() from the
 * page_get_pagesize()/page_get_pagecnt() tables.
 */
#define	MAX_PAGESIZES	32
static ulong_t		fsf_npgsz;	/* number of supported page sizes */
static pgcnt_t		fsf_pgcnt[MAX_PAGESIZES];	/* pages of size ix per page of size ix+1 */
static pgcnt_t		fsf_mask[MAX_PAGESIZES];	/* pfn alignment mask for size ix+1 */
|
111 |
|
112 |
|
/*
 * Scan page_t's and issue I/O's for modified pages.
 *
 * Also coalesces consecutive small sized free pages into the next larger
 * pagesize. This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 *
 * Scan position and budget are kept in function-local statics, so each
 * invocation resumes the circular scan where the previous one stopped.
 */
static void
fsflush_do_pages()
{
	vnode_t		*vp;
	ulong_t		pcount;
	hrtime_t	timer = gethrtime();	/* for fsf_time statistic */
	ulong_t		releases = 0;
	ulong_t		nexamined = 0;
	ulong_t		nlocked = 0;
	ulong_t		nmodified = 0;
	ulong_t		ncoalesce = 0;
	int		mod;
	u_offset_t	offset;
	uint_t		szc;

	page_t		*coal_page = NULL;  /* 1st page in group to coalesce */
	uint_t		coal_szc = 0;	    /* size code, coal_page->p_szc */
	uint_t		coal_cnt = 0;	    /* count of pages seen */

	static ulong_t	nscan = 0;	/* pages to scan per duty cycle */
	static pgcnt_t	last_total_pages = 0;
	static void	*pp_cookie = NULL;	/* opaque scan-position cookie */
	static page_t	*pp;

	/*
	 * Check to see if total_pages has changed.  If so, recompute the
	 * per-cycle scan budget; the formula spreads one full pass over
	 * all of memory across v.v_autoup seconds (assuming fsflush is
	 * woken every tune.t_fsflushr seconds — see fsflush()).
	 */
	if (total_pages != last_total_pages) {
		last_total_pages = total_pages;
		nscan = (last_total_pages * (tune.t_fsflushr))/v.v_autoup;
	}

	/*
	 * On first time through initialize the cookie used for page_t scans
	 */
	if (pp_cookie == NULL)
		pp = page_next_scan_init(&pp_cookie);

	pcount = 0;
	while (pcount <= nscan) {

		/*
		 * move to the next page, skipping over large pages
		 * and issuing prefetches.
		 */
		pp = page_next_scan_large(pp, &pcount, &pp_cookie);
		prefetch_page_r((void *)pp);
		ASSERT(pp != NULL);

		/*
		 * Do a bunch of dirty tests (ie. no locking) to determine
		 * if we can quickly skip this page. These tests are repeated
		 * after acquiring the page lock.
		 */
		++nexamined;
		if (PP_ISSWAP(pp)) {
			coal_page = NULL;	/* breaks any coalesce run */
			continue;
		}

		/*
		 * skip free pages too, but try coalescing them into larger
		 * pagesizes
		 */
		if (PP_ISFREE(pp)) {
			/*
			 * skip pages with a file system identity or that
			 * are already maximum size
			 */
			szc = pp->p_szc;
			if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
				coal_page = NULL;
				continue;
			}

			/*
			 * If not in a coalescing candidate page or the size
			 * codes are different, start a new candidate.
			 */
			if (coal_page == NULL || coal_szc != szc) {

				/*
				 * page must be properly aligned
				 * (to the next larger pagesize boundary)
				 */
				if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
					coal_page = NULL;
					continue;
				}
				coal_page = pp;
				coal_szc = szc;
				coal_cnt = 1;
				continue;
			}

			/*
			 * acceptable to add this to existing candidate page
			 */
			++coal_cnt;
			if (coal_cnt < fsf_pgcnt[coal_szc])
				continue;

			/*
			 * We've got enough pages to coalesce, so do it.
			 * After promoting, we clear coal_page, so it will
			 * take another pass to promote this to an even
			 * larger page.
			 */
			++ncoalesce;
			(void) page_promote_size(coal_page, coal_szc);
			coal_page = NULL;
			continue;
		} else {
			coal_page = NULL;
		}

		/*
		 * Cheap (unlocked) rejections: kernel pages, pages already
		 * locked by someone, and locked-down/COW pages.
		 */
		if (pp->p_vnode == &kvp ||
		    PAGE_LOCKED(pp) ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0)
			continue;


		/*
		 * Reject pages that can't be "exclusively" locked.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;
		++nlocked;


		/*
		 * After locking the page, redo the above checks.
		 * Since we locked the page, leave out the PAGE_LOCKED() test.
		 */
		vp = pp->p_vnode;
		if (PP_ISSWAP(pp) ||
		    PP_ISFREE(pp) ||
		    vp == NULL ||
		    vp == &kvp ||
		    pp->p_lckcnt != 0 ||
		    pp->p_cowcnt != 0 ||
		    (vp->v_flag & VISSWAP) != 0) {
			page_unlock(pp);
			continue;
		}

		ASSERT(vp->v_type != VCHR);

		/*
		 * Check the modified bit. Leaving the bit alone in hardware.
		 * It will be cleared if we do the putpage.
		 */
		if (IS_VMODSORT(vp))
			mod = hat_ismod(pp);
		else
			mod = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;

		if (mod) {
			++nmodified;
			offset = pp->p_offset;

			/*
			 * Hold the vnode before releasing the page lock
			 * to prevent it from being freed and re-used by
			 * some other thread.
			 */
			VN_HOLD(vp);

			page_unlock(pp);

			/* async write-back; errors are intentionally ignored */
			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
			    kcred);

			VN_RELE(vp);
		} else {

			/*
			 * Catch any pages which should be on the cache list,
			 * but aren't yet.
			 */
			if (hat_page_is_mapped(pp) == 0) {
				++releases;
				/* page_release() drops the page lock */
				(void) page_release(pp, 1);
			} else {
				page_unlock(pp);
			}
		}
	}

	/*
	 * maintain statistics
	 * reset every million wakeups, just to avoid overflow
	 */
	if (++fsf_cycles == 1000000) {
		fsf_cycles = 0;
		fsf_total.fsf_scan = 0;
		fsf_total.fsf_examined = 0;
		fsf_total.fsf_locked = 0;
		fsf_total.fsf_modified = 0;
		fsf_total.fsf_coalesce = 0;
		fsf_total.fsf_time = 0;
		fsf_total.fsf_releases = 0;
	} else {
		fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
		fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
		fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
		fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
		fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
		fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
		fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
	}
}
|
333 |
|
/*
 * As part of file system hardening, this daemon is awakened
 * every second to flush cached data which includes the
 * buffer cache, the inode cache and mapped pages.
 *
 * This is the body of the fsflush kernel process: it initializes its
 * own state, then loops forever, sleeping on fsflush_cv until the
 * clock signals it.  fsflush_sema is held across each duty cycle to
 * serialize with reboot.
 */
void
fsflush()
{
	struct buf *bp, *dwp;
	struct hbuf *hp;
	int autoup;		/* buffer age threshold, in clock ticks */
	unsigned int ix, icount, count = 0;
	callb_cpr_t cprinfo;
	uint_t		bcount;	/* running total of freelist buffers */
	kmutex_t	*hmp;	/* hash bucket lock */
	struct vfssw *vswp;

	/* zero the accounting fields of our proc structure */
	proc_fsflush = ttoproc(curthread);
	proc_fsflush->p_cstime = 0;
	proc_fsflush->p_stime = 0;
	proc_fsflush->p_cutime = 0;
	proc_fsflush->p_utime = 0;
	bcopy("fsflush", u.u_psargs, 8);	/* 8 bytes includes the NUL */
	bcopy("fsflush", u.u_comm, 7);

	mutex_init(&fsflush_lock, NULL, MUTEX_DEFAULT, NULL);
	sema_init(&fsflush_sema, 0, NULL, SEMA_DEFAULT, NULL);

	/*
	 * Setup page coalescing.  For each page size (except the largest),
	 * record how many pages of that size make up a page of the next
	 * size, and the pfn alignment mask for the next size.
	 */
	fsf_npgsz = page_num_pagesizes();
	ASSERT(fsf_npgsz < MAX_PAGESIZES);
	for (ix = 0; ix < fsf_npgsz - 1; ++ix) {
		fsf_pgcnt[ix] =
		    page_get_pagesize(ix + 1) / page_get_pagesize(ix);
		fsf_mask[ix] = page_get_pagecnt(ix + 1) - 1;
	}

	autoup = v.v_autoup * hz;
	icount = v.v_autoup / tune.t_fsflushr;	/* cycles between inode flushes */
	CALLB_CPR_INIT(&cprinfo, &fsflush_lock, callb_generic_cpr, "fsflush");
loop:
	/*
	 * Release the reboot-serialization semaphore while asleep, and
	 * mark ourselves CPR-safe around the cv_wait.
	 */
	sema_v(&fsflush_sema);
	mutex_enter(&fsflush_lock);
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&fsflush_cv, &fsflush_lock);	/* wait for clock */
	CALLB_CPR_SAFE_END(&cprinfo, &fsflush_lock);
	mutex_exit(&fsflush_lock);
	sema_p(&fsflush_sema);

	/*
	 * Write back all old B_DELWRI buffers on the freelist.
	 */
	bcount = 0;
	for (ix = 0; ix < v.v_hbuf; ix++) {

		hp = &hbuf[ix];
		dwp = (struct buf *)&dwbuf[ix];

		bcount += (hp->b_length);

		/* empty delayed-write list for this bucket */
		if (dwp->av_forw == dwp) {
			continue;
		}

		hmp = &hbuf[ix].b_lock;
		mutex_enter(hmp);
		bp = dwp->av_forw;

		/*
		 * Go down only on the delayed write lists.
		 */
		while (bp != dwp) {

			ASSERT(bp->b_flags & B_DELWRI);

			/*
			 * Only write buffers that are old enough (b_start
			 * older than autoup ticks) and whose b_sem we can
			 * take without blocking.
			 */
			if ((bp->b_flags & B_DELWRI) &&
			    (lbolt - bp->b_start >= autoup) &&
			    sema_tryp(&bp->b_sem)) {
				bp->b_flags |= B_ASYNC;
				hp->b_length--;
				notavail(bp);
				/* drop the bucket lock across the write */
				mutex_exit(hmp);
				if (bp->b_vp == NULL) {
					BWRITE(bp);
				} else {
					UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs,
					    bp);
				}
				mutex_enter(hmp);
				/*
				 * restart from the list head; it may have
				 * changed while hmp was dropped
				 */
				bp = dwp->av_forw;
			} else {
				bp = bp->av_forw;
			}
		}
		mutex_exit(hmp);
	}

	/*
	 *
	 * There is no need to wakeup any thread waiting on bio_mem_cv
	 * since brelse will wake them up as soon as IO is complete.
	 */
	bfreelist.b_bcount = bcount;

	if (dopageflush)
		fsflush_do_pages();

	if (!doiflush)
		goto loop;

	/*
	 * If the system was not booted to single user mode, skip the
	 * inode flushing until after fsflush_iflush_delay secs have elapsed.
	 */
	if ((boothowto & RB_SINGLE) == 0 &&
	    (lbolt64 / hz) < fsflush_iflush_delay)
		goto loop;

	/*
	 * Flush cached attribute information (e.g. inodes).
	 * Done only once every icount wakeups.
	 */
	if (++count >= icount) {
		count = 0;

		/*
		 * Sync back cached data.  The vfssw read lock is dropped
		 * around each sync call; the per-entry reference keeps the
		 * filesystem type from being unloaded meanwhile.
		 */
		RLOCK_VFSSW();
		for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
			if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
				vfs_refvfssw(vswp);
				RUNLOCK_VFSSW();
				(void) fsop_sync_by_kind(vswp - vfssw,
				    SYNC_ATTR, kcred);
				vfs_unrefvfssw(vswp);
				RLOCK_VFSSW();
			}
		}
		RUNLOCK_VFSSW();
	}
	goto loop;
}