|
1 /* |
|
2 * CDDL HEADER START |
|
3 * |
|
4 * The contents of this file are subject to the terms of the |
|
5 * Common Development and Distribution License (the "License"). |
|
6 * You may not use this file except in compliance with the License. |
|
7 * |
|
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
|
9 * or http://www.opensolaris.org/os/licensing. |
|
10 * See the License for the specific language governing permissions |
|
11 * and limitations under the License. |
|
12 * |
|
13 * When distributing Covered Code, include this CDDL HEADER in each |
|
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
|
15 * If applicable, add the following below this CDDL HEADER, with the |
|
16 * fields enclosed by brackets "[]" replaced with your own identifying |
|
17 * information: Portions Copyright [yyyy] [name of copyright owner] |
|
18 * |
|
19 * CDDL HEADER END |
|
20 */ |
|
21 /* |
|
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. |
|
23 * Use is subject to license terms. |
|
24 */ |
|
25 |
|
26 #pragma ident "%Z%%M% %I% %E% SMI" |
|
27 |
|
28 #include <sys/dsl_pool.h> |
|
29 #include <sys/dsl_dataset.h> |
|
30 #include <sys/dsl_prop.h> |
|
31 #include <sys/dsl_dir.h> |
|
32 #include <sys/dsl_synctask.h> |
|
33 #include <sys/dnode.h> |
|
34 #include <sys/dmu_tx.h> |
|
35 #include <sys/dmu_objset.h> |
|
36 #include <sys/arc.h> |
|
37 #include <sys/zap.h> |
|
38 #include <sys/zio.h> |
|
39 #include <sys/zfs_context.h> |
|
40 #include <sys/fs/zfs.h> |
|
41 #include <sys/zfs_znode.h> |
|
42 #include <sys/spa_impl.h> |
|
43 #include <sys/vdev_impl.h> |
|
44 |
|
45 /* XXX */ |
|
46 #ifndef _KERNEL |
|
47 #include <ucontext.h> |
|
48 #include <stdio.h> |
|
49 #endif |
|
50 |
|
/*
 * Per-block scrub callback: invoked for each block pointer visited during
 * pool traversal.  Returns 0 on success.
 */
typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);

static scrub_cb_t dsl_pool_scrub_clean_cb;
static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;

/* Tunables controlling how long each txg's scrub pass may run. */
int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */
int zfs_scrub_max_time = 2; /* scrub for at most 2 sec each txg */
boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */

extern int zfs_txg_timeout;

/* Callback table indexed by enum scrub_func; SCRUB_FUNC_NONE has no cb. */
static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = {
	NULL,
	dsl_pool_scrub_clean_cb
};
|
66 |
|
/*
 * Initialize a zbookmark_t in place.
 *
 * Wrapped in do { } while (0) so the macro expands to a single statement
 * and is safe in an unbraced if/else (the original bare-brace form would
 * produce a stray ';' before an 'else').  Each argument is parenthesized
 * to avoid operator-precedence surprises when callers pass expressions.
 */
#define	SET_BOOKMARK(zb, objset, object, level, blkid)	\
do {							\
	(zb)->zb_objset = (objset);			\
	(zb)->zb_object = (object);			\
	(zb)->zb_level = (level);			\
	(zb)->zb_blkid = (blkid);			\
} while (0)
|
74 |
|
/*
 * Sync task that (re)initializes scrub state: cancels any scrub already in
 * progress, computes the txg range to examine (narrowed to the resilver
 * range when vdev_resilver_needed() reports one), creates the work-queue
 * ZAP object, and persists all scrub state in the MOS pool directory so
 * the scrub survives export/reboot.
 */
/* ARGSUSED */
static void
dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_pool_t *dp = arg1;
	enum scrub_func *funcp = arg2;
	dmu_object_type_t ot = 0;
	boolean_t complete = B_FALSE;

	/* Tear down any existing scrub (marked incomplete). */
	dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx);

	ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE);
	ASSERT(*funcp > SCRUB_FUNC_NONE);
	ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS);

	/* Default range: everything up to this txg. */
	dp->dp_scrub_min_txg = 0;
	dp->dp_scrub_max_txg = tx->tx_txg;

	if (*funcp == SCRUB_FUNC_CLEAN) {
		vdev_t *rvd = dp->dp_spa->spa_root_vdev;

		/* rewrite all disk labels */
		vdev_config_dirty(rvd);

		/*
		 * If a resilver is needed, restrict [min, max] to the
		 * range the DTLs report, clamped to the current txg.
		 */
		if (vdev_resilver_needed(rvd,
		    &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) {
			spa_event_notify(dp->dp_spa, NULL,
			    ESC_ZFS_RESILVER_START);
			dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
			    tx->tx_txg);
		}

		/* zero out the scrub stats in all vdev_stat_t's */
		vdev_scrub_stat_update(rvd,
		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
		    POOL_SCRUB_EVERYTHING, B_FALSE);

		dp->dp_spa->spa_scrub_started = B_TRUE;
	}

	/* back to the generic stuff */

	/*
	 * Older pool versions can't use the DMU_OT_SCRUB_QUEUE object
	 * type; fall back to a generic ZAP object instead.
	 */
	if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB)
		ot = DMU_OT_ZAP_OTHER;

	dp->dp_scrub_func = *funcp;
	dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
	    ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
	dp->dp_scrub_restart = B_FALSE;

	/* Persist all scrub state in the MOS pool directory. */
	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
	    &dp->dp_scrub_func, tx));
	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
	    &dp->dp_scrub_queue_obj, tx));
	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
	    &dp->dp_scrub_min_txg, tx));
	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
	    &dp->dp_scrub_max_txg, tx));
	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
	    &dp->dp_scrub_bookmark, tx));
	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
	    &dp->dp_spa->spa_scrub_errors, tx));

	spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr,
	    "func=%u mintxg=%llu maxtxg=%llu",
	    *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg);
}
|
149 |
|
150 int |
|
151 dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func) |
|
152 { |
|
153 return (dsl_sync_task_do(dp, NULL, |
|
154 dsl_pool_scrub_setup_sync, dp, &func, 0)); |
|
155 } |
|
156 |
|
/*
 * Sync task that tears down scrub state.  Waits for in-flight scrub i/o
 * to drain, frees the work-queue object, removes all persisted scrub
 * state from the MOS, and updates vdev stats/DTLs according to whether
 * the scrub completed (*completep).  No-op if no scrub is active.
 */
/* ARGSUSED */
static void
dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_pool_t *dp = arg1;
	boolean_t *completep = arg2;

	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
		return;

	mutex_enter(&dp->dp_scrub_cancel_lock);

	/* A pending restart means the scrub didn't really finish. */
	if (dp->dp_scrub_restart) {
		dp->dp_scrub_restart = B_FALSE;
		*completep = B_FALSE;
	}

	/* XXX this is scrub-clean specific */
	/* Wait for all outstanding scrub reads to complete. */
	mutex_enter(&dp->dp_spa->spa_scrub_lock);
	while (dp->dp_spa->spa_scrub_inflight > 0) {
		cv_wait(&dp->dp_spa->spa_scrub_io_cv,
		    &dp->dp_spa->spa_scrub_lock);
	}
	mutex_exit(&dp->dp_spa->spa_scrub_lock);
	dp->dp_spa->spa_scrub_started = B_FALSE;

	/* Drop the in-core state and the on-disk queue object. */
	dp->dp_scrub_func = SCRUB_FUNC_NONE;
	VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
	    dp->dp_scrub_queue_obj, tx));
	dp->dp_scrub_queue_obj = 0;
	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));

	/* Remove every persisted scrub entry from the pool directory. */
	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_QUEUE, tx));
	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_MIN_TXG, tx));
	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_MAX_TXG, tx));
	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_BOOKMARK, tx));
	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_FUNC, tx));
	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_ERRORS, tx));

	spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
	    "complete=%u", *completep);

	/* below is scrub-clean specific */
	vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE,
	    *completep);
	/*
	 * If the scrub/resilver completed, update all DTLs to reflect this.
	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
	 */
	vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
	    *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
	/* dp_scrub_min_txg != 0 means this was a resilver */
	if (dp->dp_scrub_min_txg && *completep)
		spa_event_notify(dp->dp_spa, NULL, ESC_ZFS_RESILVER_FINISH);
	spa_errlog_rotate(dp->dp_spa);

	/*
	 * We may have finished replacing a device.
	 * Let the async thread assess this and handle the detach.
	 */
	spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE);

	dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0;
	mutex_exit(&dp->dp_scrub_cancel_lock);
}
|
227 |
|
228 int |
|
229 dsl_pool_scrub_cancel(dsl_pool_t *dp) |
|
230 { |
|
231 boolean_t complete = B_FALSE; |
|
232 |
|
233 return (dsl_sync_task_do(dp, NULL, |
|
234 dsl_pool_scrub_cancel_sync, dp, &complete, 3)); |
|
235 } |
|
236 |
|
237 int |
|
238 dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp, |
|
239 zio_done_func_t *done, void *private, uint32_t arc_flags) |
|
240 { |
|
241 /* |
|
242 * This function will be used by bp-rewrite wad to intercept frees. |
|
243 */ |
|
244 return (arc_free(pio, dp->dp_spa, txg, (blkptr_t *)bpp, |
|
245 done, private, arc_flags)); |
|
246 } |
|
247 |
|
248 static boolean_t |
|
249 bookmark_is_zero(const zbookmark_t *zb) |
|
250 { |
|
251 return (zb->zb_objset == 0 && zb->zb_object == 0 && |
|
252 zb->zb_level == 0 && zb->zb_blkid == 0); |
|
253 } |
|
254 |
|
/*
 * Returns B_TRUE if zb1 (and, implicitly, the entire subtree beneath it)
 * precedes zb2 in traversal order.  zb2 must be a leaf (level-0)
 * bookmark in the same objset.
 *
 * dnp is the dnode for zb1->zb_object, or NULL if zb1 refers to the
 * objset_phys_t itself.
 */
static boolean_t
bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1,
    const zbookmark_t *zb2)
{
	uint64_t nextL0;

	ASSERT(zb1->zb_objset == zb2->zb_objset);
	ASSERT(zb1->zb_object != -1ULL);
	ASSERT(zb2->zb_object != 0);
	ASSERT(zb2->zb_level == 0);

	/*
	 * A bookmark in the deadlist is considered to be after
	 * everything else.
	 */
	if (zb2->zb_object == -1ULL)
		return (B_TRUE);

	/* The objset_phys_t isn't before anything. */
	if (dnp == NULL)
		return (B_FALSE);

	/*
	 * First level-0 blkid past the subtree rooted at zb1: each
	 * indirect level fans out to 2^(indblkshift - SPA_BLKPTRSHIFT)
	 * children.
	 */
	nextL0 = (zb1->zb_blkid + 1) <<
	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));

	if (zb1->zb_object == 0) {
		/*
		 * zb1 is in the meta-dnode; convert the L0 blkid to the
		 * first object number past the subtree (dnodes per block
		 * = datablksz >> DNODE_SHIFT).
		 */
		uint64_t nextobj = nextL0 *
		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
		return (nextobj <= zb2->zb_object);
	}

	if (zb1->zb_object < zb2->zb_object)
		return (B_TRUE);
	if (zb1->zb_object > zb2->zb_object)
		return (B_FALSE);

	/* Same object: compare by first L0 blkid past zb1's subtree. */
	return (nextL0 <= zb2->zb_blkid);
}
|
294 |
|
295 static boolean_t |
|
296 scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb) |
|
297 { |
|
298 int elapsed_ticks; |
|
299 |
|
300 if (dp->dp_scrub_pausing) |
|
301 return (B_TRUE); /* we're already pausing */ |
|
302 |
|
303 if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) |
|
304 return (B_FALSE); /* we're resuming */ |
|
305 |
|
306 /* we don't yet know how to resume from anything but leaf blocks */ |
|
307 if (zb->zb_object == 0 || zb->zb_level != 0) |
|
308 return (B_FALSE); |
|
309 |
|
310 elapsed_ticks = lbolt64 - dp->dp_scrub_start_time; |
|
311 if (elapsed_ticks > hz * zfs_txg_timeout || |
|
312 (elapsed_ticks > hz * zfs_scrub_min_time && txg_sync_waiting(dp))) { |
|
313 dprintf("pausing at %llx/%llx/%llx/%llx\n", |
|
314 (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, |
|
315 (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); |
|
316 dp->dp_scrub_pausing = B_TRUE; |
|
317 dp->dp_scrub_bookmark = *zb; |
|
318 return (B_TRUE); |
|
319 } |
|
320 return (B_FALSE); |
|
321 } |
|
322 |
|
323 /* ARGSUSED */ |
|
324 static void |
|
325 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) |
|
326 { |
|
327 dsl_pool_t *dp = arg; |
|
328 |
|
329 if (bp->blk_birth <= dp->dp_scrub_min_txg) |
|
330 return; |
|
331 |
|
332 if (claim_txg != 0 || bp->blk_birth < spa_first_txg(dp->dp_spa)) { |
|
333 zbookmark_t zb = { 0 }; |
|
334 /* XXX figure out zb.objset */ |
|
335 zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; |
|
336 VERIFY(0 == |
|
337 scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); |
|
338 } |
|
339 } |
|
340 |
|
341 /* ARGSUSED */ |
|
342 static void |
|
343 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) |
|
344 { |
|
345 dsl_pool_t *dp = arg; |
|
346 |
|
347 if (lrc->lrc_txtype == TX_WRITE) { |
|
348 lr_write_t *lr = (lr_write_t *)lrc; |
|
349 blkptr_t *bp = &lr->lr_blkptr; |
|
350 |
|
351 if (bp->blk_birth <= dp->dp_scrub_min_txg) |
|
352 return; |
|
353 |
|
354 if (claim_txg != 0 && bp->blk_birth >= claim_txg) { |
|
355 zbookmark_t zb = { 0 }; |
|
356 /* XXX figure out zb.objset */ |
|
357 zb.zb_object = lr->lr_foid; |
|
358 zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); |
|
359 VERIFY(0 == |
|
360 scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); |
|
361 } |
|
362 } |
|
363 } |
|
364 |
|
365 static void |
|
366 traverse_zil(dsl_pool_t *dp, zil_header_t *zh) |
|
367 { |
|
368 uint64_t claim_txg = zh->zh_claim_txg; |
|
369 zilog_t *zilog; |
|
370 |
|
371 /* |
|
372 * We only want to visit blocks that have been claimed but not yet |
|
373 * replayed (or, in read-only mode, blocks that *would* be claimed). |
|
374 */ |
|
375 if (claim_txg == 0 && (spa_mode & FWRITE)) |
|
376 return; |
|
377 |
|
378 zilog = zil_alloc(dp->dp_meta_objset, zh); |
|
379 |
|
380 (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, dp, |
|
381 claim_txg); |
|
382 |
|
383 zil_free(zilog); |
|
384 } |
|
385 |
|
/*
 * Recursively visit bp and everything beneath it, invoking the current
 * scrub function on each block pointer.  Cooperates with pause/resume:
 * scrub_pause() may save the current bookmark and set dp_scrub_pausing,
 * in which case the traversal unwinds; while a saved bookmark is
 * pending, subtrees already visited (per bookmark_is_before()) are
 * skipped.
 *
 * dnp is the dnode containing bp (NULL at the objset root); pbuf is the
 * ARC buffer holding bp, passed through to arc_read().
 */
static void
scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
    arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
{
	int err;
	arc_buf_t *buf = NULL;

	/* birth txg 0: nothing was ever written here */
	if (bp->blk_birth == 0)
		return;

	dprintf_bp(bp, "scrub_visitbp bm %lld/%lld/%lld/%lld: ",
	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);

	/* already covered by a previous pass of this scrub */
	if (bp->blk_birth <= dp->dp_scrub_min_txg)
		return;

	if (scrub_pause(dp, zb))
		return;

	if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) {
		/*
		 * If we already visited this bp & everything below (in
		 * a prior txg), don't bother doing it again.
		 */
		if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark))
			return;

		/*
		 * If we found the block we're trying to resume from, or
		 * we went past it to a different object, zero it out to
		 * indicate that it's OK to start checking for pausing
		 * again.
		 */
		if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 ||
		    zb->zb_object > dp->dp_scrub_bookmark.zb_object) {
			dprintf("resuming at %llx/%llx/%llx/%llx\n",
			    (longlong_t)zb->zb_objset,
			    (longlong_t)zb->zb_object,
			    (longlong_t)zb->zb_level,
			    (longlong_t)zb->zb_blkid);
			bzero(&dp->dp_scrub_bookmark, sizeof (*zb));
		}
	}

	if (BP_GET_LEVEL(bp) > 0) {
		/* indirect block: read it and recurse on each child bp */
		uint32_t flags = ARC_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err) {
			/* count the read error and skip this subtree */
			mutex_enter(&dp->dp_spa->spa_scrub_lock);
			dp->dp_spa->spa_scrub_errors++;
			mutex_exit(&dp->dp_spa->spa_scrub_lock);
			return;
		}
		cbp = buf->b_data;

		for (i = 0; i < epb; i++, cbp++) {
			zbookmark_t czb;

			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			scrub_visitbp(dp, dnp, buf, cbp, &czb);
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
		/* block of dnodes: recurse on every blkptr of each dnode */
		uint32_t flags = ARC_WAIT;
		dnode_phys_t *child_dnp;
		int i, j;
		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;

		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err) {
			/* count the read error and skip this subtree */
			mutex_enter(&dp->dp_spa->spa_scrub_lock);
			dp->dp_spa->spa_scrub_errors++;
			mutex_exit(&dp->dp_spa->spa_scrub_lock);
			return;
		}
		child_dnp = buf->b_data;

		for (i = 0; i < epb; i++, child_dnp++) {
			for (j = 0; j < child_dnp->dn_nblkptr; j++) {
				zbookmark_t czb;

				SET_BOOKMARK(&czb, zb->zb_objset,
				    zb->zb_blkid * epb + i,
				    child_dnp->dn_nlevels - 1, j);
				scrub_visitbp(dp, child_dnp, buf,
				    &child_dnp->dn_blkptr[j], &czb);
			}
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
		/* objset: scan its ZIL, then recurse from the meta-dnode */
		uint32_t flags = ARC_WAIT;
		objset_phys_t *osp;
		int j;

		err = arc_read_nolock(NULL, dp->dp_spa, bp,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err) {
			/* count the read error and skip this subtree */
			mutex_enter(&dp->dp_spa->spa_scrub_lock);
			dp->dp_spa->spa_scrub_errors++;
			mutex_exit(&dp->dp_spa->spa_scrub_lock);
			return;
		}

		osp = buf->b_data;

		traverse_zil(dp, &osp->os_zil_header);

		for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) {
			zbookmark_t czb;

			SET_BOOKMARK(&czb, zb->zb_objset, 0,
			    osp->os_meta_dnode.dn_nlevels - 1, j);
			scrub_visitbp(dp, &osp->os_meta_dnode, buf,
			    &osp->os_meta_dnode.dn_blkptr[j], &czb);
		}
	}

	/* finally, scrub this block itself */
	(void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
	if (buf)
		(void) arc_buf_remove_ref(buf, &buf);
}
|
516 |
|
517 static void |
|
518 scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) |
|
519 { |
|
520 zbookmark_t zb; |
|
521 |
|
522 SET_BOOKMARK(&zb, ds ? ds->ds_object : 0, 0, -1, 0); |
|
523 scrub_visitbp(dp, NULL, NULL, bp, &zb); |
|
524 } |
|
525 |
|
/*
 * Called (in syncing context) when dataset ds is being destroyed, to
 * keep the scrub bookmark and work queue consistent.  If we were paused
 * inside ds, invalidate the bookmark (zb_objset = -1) so the sync loop
 * picks a new objset; otherwise remove ds's work-queue entry (returning
 * if it had none).  In either case, ds's next snapshot (if any) takes
 * its place in the queue.
 */
void
dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
		return;

	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
		SET_BOOKMARK(&dp->dp_scrub_bookmark, -1, 0, 0, 0);
	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
	    ds->ds_object, tx) != 0) {
		/* ds was neither the bookmark nor queued; nothing to fix */
		return;
	}

	if (ds->ds_phys->ds_next_snap_obj != 0) {
		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
	}
	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
}
|
547 |
|
/*
 * Called (in syncing context) when dataset ds has just been snapshotted.
 * The blocks being scrubbed now belong to the new snapshot (ds's new
 * previous snapshot), so redirect the paused bookmark — or ds's
 * work-queue entry — from ds to ds_prev_snap_obj.
 */
void
dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
		return;

	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);

	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
		dp->dp_scrub_bookmark.zb_objset =
		    ds->ds_phys->ds_prev_snap_obj;
	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
	    ds->ds_object, tx) == 0) {
		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
		    ds->ds_phys->ds_prev_snap_obj, tx) == 0);
	}
}
|
567 |
|
/*
 * Argument bundle for enqueue_clones_cb(): the syncing tx plus the
 * object number of the origin snapshot whose clones should be enqueued.
 */
struct enqueue_clones_arg {
	dmu_tx_t *tx;
	uint64_t originobj;
};
|
572 |
|
/*
 * dmu_objset_find_spa() callback used on pre-SPA_VERSION_DSL_SCRUB
 * pools (which don't maintain ds_next_clones_obj): if dsobj is a clone
 * of eca->originobj, walk back through its snapshots to the one whose
 * previous snapshot is the origin, and add that dataset to the scrub
 * work queue.
 */
/* ARGSUSED */
static int
enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	struct enqueue_clones_arg *eca = arg;
	dsl_dataset_t *ds;
	int err;
	dsl_pool_t *dp;

	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
	if (err)
		return (err);
	dp = ds->ds_dir->dd_pool;

	if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
		/* Step back until ds's prev snapshot is the origin. */
		while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
			dsl_dataset_t *prev;
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);

			/* ds is released before err is checked */
			dsl_dataset_rele(ds, FTAG);
			if (err)
				return (err);
			ds = prev;
		}
		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
		    ds->ds_object, eca->tx) == 0);
	}
	dsl_dataset_rele(ds, FTAG);
	return (0);
}
|
604 |
|
/*
 * Scrub the blocks of dataset dsobj that were born after its previous
 * snapshot (dp_scrub_min_txg is temporarily raised to ds_prev_snap_txg),
 * then — unless the traversal paused — enqueue the dataset's
 * descendents: its next snapshot and, for snapshots with clones, each
 * clone branch.
 */
static void
scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx)
{
	dsl_dataset_t *ds;
	uint64_t min_txg_save;

	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));

	/*
	 * Iterate over the bps in this ds.
	 */
	min_txg_save = dp->dp_scrub_min_txg;
	dp->dp_scrub_min_txg =
	    MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg);
	scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp);
	dp->dp_scrub_min_txg = min_txg_save;

	if (dp->dp_scrub_pausing)
		goto out;

	/*
	 * Add descendent datasets to work queue.
	 */
	if (ds->ds_phys->ds_next_snap_obj != 0) {
		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
	}
	if (ds->ds_phys->ds_num_children > 1) {
		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
			/*
			 * Old pools lack ds_next_clones_obj, so clones
			 * must be found by scanning all datasets.
			 */
			struct enqueue_clones_arg eca;
			eca.tx = tx;
			eca.originobj = ds->ds_object;

			(void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
			    NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
		} else {
			/* Newer pools track clones directly; merge them in. */
			VERIFY(zap_join(dp->dp_meta_objset,
			    ds->ds_phys->ds_next_clones_obj,
			    dp->dp_scrub_queue_obj, tx) == 0);
		}
	}

out:
	dsl_dataset_rele(ds, FTAG);
}
|
650 |
|
/*
 * dmu_objset_find_spa() callback used on pre-SPA_VERSION_DSL_SCRUB
 * pools: walk dsobj back to its oldest snapshot and add that to the
 * scrub work queue.  Clone branches are abandoned here; they are
 * handled separately (see enqueue_clones_cb()).
 */
/* ARGSUSED */
static int
enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds;
	int err;
	dsl_pool_t *dp;

	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
	if (err)
		return (err);

	dp = ds->ds_dir->dd_pool;

	/* Step backwards to the oldest snapshot in this lineage. */
	while (ds->ds_phys->ds_prev_snap_obj != 0) {
		dsl_dataset_t *prev;
		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
		    FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		/*
		 * If this is a clone, we don't need to worry about it for now.
		 */
		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			dsl_dataset_rele(prev, FTAG);
			return (0);
		}
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
	}

	VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
	    ds->ds_object, tx) == 0);
	dsl_dataset_rele(ds, FTAG);
	return (0);
}
|
692 |
|
/*
 * Called each txg while a scrub is active: perform a bounded amount of
 * traversal work (see scrub_pause()), then either persist the pause
 * bookmark and error count (out:) or, if the work queue drained,
 * complete the scrub via dsl_pool_scrub_cancel_sync().
 */
void
dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	boolean_t complete = B_TRUE;

	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
		return;

	/* If the spa is not fully loaded, don't bother. */
	if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE)
		return;

	/* A restart request re-runs setup, which resets all scrub state. */
	if (dp->dp_scrub_restart) {
		enum scrub_func func = dp->dp_scrub_func;
		dp->dp_scrub_restart = B_FALSE;
		dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
	}

	if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
		/*
		 * We must have resumed after rebooting; reset the vdev
		 * stats to know that we're doing a scrub (although it
		 * will think we're just starting now).
		 */
		vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev,
		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
		    POOL_SCRUB_EVERYTHING, B_FALSE);
	}

	dp->dp_scrub_pausing = B_FALSE;
	dp->dp_scrub_start_time = lbolt64;
	dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);

	if (dp->dp_scrub_bookmark.zb_objset == 0) {
		/* First do the MOS & ORIGIN */
		scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp);
		if (dp->dp_scrub_pausing)
			goto out;

		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
			/* Old pools: seed the queue by scanning everything. */
			VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
			    NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
		} else {
			scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
		}
		ASSERT(!dp->dp_scrub_pausing);
	} else if (dp->dp_scrub_bookmark.zb_objset != -1ULL) {
		/*
		 * If we were paused, continue from here.  Note if the
		 * ds we were paused on was deleted, the zb_objset will
		 * be -1, so we will skip this and find a new objset
		 * below.
		 */
		scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx);
		if (dp->dp_scrub_pausing)
			goto out;
	}

	/*
	 * In case we were paused right at the end of the ds, zero the
	 * bookmark so we don't think that we're still trying to resume.
	 */
	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));

	/* keep pulling things out of the zap-object-as-queue */
	/*
	 * The cursor is re-initialized each iteration because
	 * scrub_visitds() may add entries to the queue ZAP.
	 */
	while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj),
	    zap_cursor_retrieve(&zc, &za) == 0) {
		VERIFY(0 == zap_remove(dp->dp_meta_objset,
		    dp->dp_scrub_queue_obj, za.za_name, tx));
		scrub_visitds(dp, za.za_first_integer, tx);
		if (dp->dp_scrub_pausing)
			break;
		zap_cursor_fini(&zc);
	}
	/* fini the cursor from the final (or broken-out-of) init above */
	zap_cursor_fini(&zc);
	if (dp->dp_scrub_pausing)
		goto out;

	/* done. */

	dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx);
	return;
out:
	/* Persist the resume point and error count for the next txg. */
	VERIFY(0 == zap_update(dp->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
	    &dp->dp_scrub_bookmark, tx));
	VERIFY(0 == zap_update(dp->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
	    &dp->dp_spa->spa_scrub_errors, tx));

	/* XXX this is scrub-clean specific */
	/* Wait for this pass's outstanding scrub reads to drain. */
	mutex_enter(&dp->dp_spa->spa_scrub_lock);
	while (dp->dp_spa->spa_scrub_inflight > 0) {
		cv_wait(&dp->dp_spa->spa_scrub_io_cv,
		    &dp->dp_spa->spa_scrub_lock);
	}
	mutex_exit(&dp->dp_spa->spa_scrub_lock);
}
|
795 |
|
/*
 * Request that the scrub restart from the beginning.  The flag is
 * consumed in syncing context by dsl_pool_scrub_sync() /
 * dsl_pool_scrub_cancel_sync(); dp_scrub_cancel_lock serializes the
 * update against the cancel path.
 */
void
dsl_pool_scrub_restart(dsl_pool_t *dp)
{
	mutex_enter(&dp->dp_scrub_cancel_lock);
	dp->dp_scrub_restart = B_TRUE;
	mutex_exit(&dp->dp_scrub_cancel_lock);
}
|
803 |
|
804 /* |
|
805 * scrub consumers |
|
806 */ |
|
807 |
|
808 static void |
|
809 dsl_pool_scrub_clean_done(zio_t *zio) |
|
810 { |
|
811 spa_t *spa = zio->io_spa; |
|
812 |
|
813 zio_data_buf_free(zio->io_data, zio->io_size); |
|
814 |
|
815 mutex_enter(&spa->spa_scrub_lock); |
|
816 spa->spa_scrub_inflight--; |
|
817 cv_broadcast(&spa->spa_scrub_io_cv); |
|
818 |
|
819 if (zio->io_error) |
|
820 spa->spa_scrub_errors++; |
|
821 mutex_exit(&spa->spa_scrub_lock); |
|
822 } |
|
823 |
|
/*
 * Per-block callback for SCRUB_FUNC_CLEAN.  Accounts the examined bytes
 * on each DVA's top-level vdev, decides whether the block needs i/o
 * (always for a scrub; for a resilver, only when a DVA's vdev DTL
 * contains the birth txg), and if so issues an asynchronous read,
 * throttled to spa_scrub_maxinflight concurrent i/os.  Always returns 0.
 */
static int
dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
    const blkptr_t *bp, const zbookmark_t *zb)
{
	size_t size = BP_GET_LSIZE(bp);
	int d;
	spa_t *spa = dp->dp_spa;
	boolean_t needs_io;
	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
	int zio_priority;

	dprintf_bp(bp, "visiting %s", "");

	if (dp->dp_scrub_isresilver == 0) {
		/* It's a scrub */
		zio_flags |= ZIO_FLAG_SCRUB;
		zio_priority = ZIO_PRIORITY_SCRUB;
		needs_io = B_TRUE;
	} else {
		/* It's a resilver */
		zio_flags |= ZIO_FLAG_RESILVER;
		zio_priority = ZIO_PRIORITY_RESILVER;
		needs_io = B_FALSE;
	}

	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
		vdev_t *vd = vdev_lookup_top(spa,
		    DVA_GET_VDEV(&bp->blk_dva[d]));

		/*
		 * Keep track of how much data we've examined so that
		 * zpool(1M) status can make useful progress reports.
		 */
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_examined +=
		    DVA_GET_ASIZE(&bp->blk_dva[d]);
		mutex_exit(&vd->vdev_stat_lock);

		/* if it's a resilver, this may not be in the target range */
		if (!needs_io) {
			if (DVA_GET_GANG(&bp->blk_dva[d])) {
				/*
				 * Gang members may be spread across multiple
				 * vdevs, so the best we can do is look at the
				 * pool-wide DTL.
				 * XXX -- it would be better to change our
				 * allocation policy to ensure that this can't
				 * happen.
				 */
				vd = spa->spa_root_vdev;
			}
			needs_io = vdev_dtl_contains(&vd->vdev_dtl_map,
			    bp->blk_birth, 1);
		}
	}

	if (needs_io && !zfs_no_scrub_io) {
		void *data = zio_data_buf_alloc(size);

		/* throttle: cap the number of concurrent scrub i/os */
		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
		spa->spa_scrub_inflight++;
		mutex_exit(&spa->spa_scrub_lock);

		zio_nowait(zio_read(NULL, spa, bp, data, size,
		    dsl_pool_scrub_clean_done, NULL, zio_priority,
		    zio_flags, zb));
	}

	/* do not relocate this block */
	return (0);
}
|
897 |
|
/*
 * Entry point to start a scrub (or resilver) of the pool.  Purges vdev
 * caches via vdev_reopen(), then kicks off SCRUB_FUNC_CLEAN via
 * dsl_pool_scrub_setup().  Returns the setup error, 0 on success.
 */
int
dsl_pool_scrub_clean(dsl_pool_t *dp)
{
	/*
	 * Purge all vdev caches.  We do this here rather than in sync
	 * context because this requires a writer lock on the spa_config
	 * lock, which we can't do from sync context.  The
	 * spa_scrub_reopen flag indicates that vdev_open() should not
	 * attempt to start another scrub.
	 */
	spa_config_enter(dp->dp_spa, RW_WRITER, FTAG);
	dp->dp_spa->spa_scrub_reopen = B_TRUE;
	vdev_reopen(dp->dp_spa->spa_root_vdev);
	dp->dp_spa->spa_scrub_reopen = B_FALSE;
	spa_config_exit(dp->dp_spa, FTAG);

	return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN));
}