usr/src/uts/common/fs/zfs/dsl_scrub.c
changeset 7046 361307ae060d
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>

/* XXX */
#ifndef _KERNEL
#include <ucontext.h>
#include <stdio.h>
#endif

typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);

static scrub_cb_t dsl_pool_scrub_clean_cb;
static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;

int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */
int zfs_scrub_max_time = 2; /* scrub for at most 2 sec each txg */
boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */

extern int zfs_txg_timeout;

static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = {
	NULL,
	dsl_pool_scrub_clean_cb
};

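/*
 * Fill in a zbookmark_t with the (objset, object, level, blkid)
 * coordinates of a block; the traversal below uses this to label
 * every block it visits.
 */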
       
#define	SET_BOOKMARK(zb, objset, object, level, blkid)  \
{                                                       \
	(zb)->zb_objset = objset;                       \
	(zb)->zb_object = object;                       \
	(zb)->zb_level = level;                         \
	(zb)->zb_blkid = blkid;                         \
}

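/*
 * Set up the on-disk scrub state in sync context: record the scrub
 * function and the txg range to visit, reset the bookmark, and create
 * a new scrub queue object in the MOS pool directory.
 */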
       
/* ARGSUSED */
static void
dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_pool_t *dp = arg1;
	enum scrub_func *funcp = arg2;
	dmu_object_type_t ot = 0;
	boolean_t complete = B_FALSE;

	dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx);

	ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE);
	ASSERT(*funcp > SCRUB_FUNC_NONE);
	ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS);

	dp->dp_scrub_min_txg = 0;
	dp->dp_scrub_max_txg = tx->tx_txg;

	if (*funcp == SCRUB_FUNC_CLEAN) {
		vdev_t *rvd = dp->dp_spa->spa_root_vdev;

		/* rewrite all disk labels */
		vdev_config_dirty(rvd);

		if (vdev_resilver_needed(rvd,
		    &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) {
			spa_event_notify(dp->dp_spa, NULL,
			    ESC_ZFS_RESILVER_START);
			dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
			    tx->tx_txg);
		}

		/* zero out the scrub stats in all vdev_stat_t's */
		vdev_scrub_stat_update(rvd,
		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
		    POOL_SCRUB_EVERYTHING, B_FALSE);

		dp->dp_spa->spa_scrub_started = B_TRUE;
	}

	/* back to the generic stuff */

	if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB)
		ot = DMU_OT_ZAP_OTHER;

	dp->dp_scrub_func = *funcp;
	dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
	    ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
	dp->dp_scrub_restart = B_FALSE;

	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
	    &dp->dp_scrub_func, tx));
	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
	    &dp->dp_scrub_queue_obj, tx));
	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
	    &dp->dp_scrub_min_txg, tx));
	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
	    &dp->dp_scrub_max_txg, tx));
	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
	    &dp->dp_scrub_bookmark, tx));
	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
	    &dp->dp_spa->spa_scrub_errors, tx));

	spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr,
	    "func=%u mintxg=%llu maxtxg=%llu",
	    *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg);
}

int
dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func)
{
	return (dsl_sync_task_do(dp, NULL,
	    dsl_pool_scrub_setup_sync, dp, &func, 0));
}

/* ARGSUSED */
static void
dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_pool_t *dp = arg1;
	boolean_t *completep = arg2;

	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
		return;

	mutex_enter(&dp->dp_scrub_cancel_lock);

	if (dp->dp_scrub_restart) {
		dp->dp_scrub_restart = B_FALSE;
		*completep = B_FALSE;
	}

	/* XXX this is scrub-clean specific */
	mutex_enter(&dp->dp_spa->spa_scrub_lock);
	while (dp->dp_spa->spa_scrub_inflight > 0) {
		cv_wait(&dp->dp_spa->spa_scrub_io_cv,
		    &dp->dp_spa->spa_scrub_lock);
	}
	mutex_exit(&dp->dp_spa->spa_scrub_lock);
	dp->dp_spa->spa_scrub_started = B_FALSE;

	dp->dp_scrub_func = SCRUB_FUNC_NONE;
	VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
	    dp->dp_scrub_queue_obj, tx));
	dp->dp_scrub_queue_obj = 0;
	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));

	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_QUEUE, tx));
	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_MIN_TXG, tx));
	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_MAX_TXG, tx));
	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_BOOKMARK, tx));
	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_FUNC, tx));
	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_ERRORS, tx));

	spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
	    "complete=%u", *completep);

	/* below is scrub-clean specific */
	vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE,
	    *completep);
	/*
	 * If the scrub/resilver completed, update all DTLs to reflect this.
	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
	 */
	vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
	    *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
	if (dp->dp_scrub_min_txg && *completep)
		spa_event_notify(dp->dp_spa, NULL, ESC_ZFS_RESILVER_FINISH);
	spa_errlog_rotate(dp->dp_spa);

	/*
	 * We may have finished replacing a device.
	 * Let the async thread assess this and handle the detach.
	 */
	spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE);

	dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0;
	mutex_exit(&dp->dp_scrub_cancel_lock);
}

int
dsl_pool_scrub_cancel(dsl_pool_t *dp)
{
	boolean_t complete = B_FALSE;

	return (dsl_sync_task_do(dp, NULL,
	    dsl_pool_scrub_cancel_sync, dp, &complete, 3));
}

int
dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
    zio_done_func_t *done, void *private, uint32_t arc_flags)
{
	/*
	 * This function will be used by bp-rewrite wad to intercept frees.
	 */
	return (arc_free(pio, dp->dp_spa, txg, (blkptr_t *)bpp,
	    done, private, arc_flags));
}

static boolean_t
bookmark_is_zero(const zbookmark_t *zb)
{
	return (zb->zb_objset == 0 && zb->zb_object == 0 &&
	    zb->zb_level == 0 && zb->zb_blkid == 0);
}

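/*
 * Return B_TRUE if zb1 (and everything below it in the block tree) is
 * visited before the level-0 bookmark zb2 in traversal order.
 */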
       
/* dnp is the dnode for zb1->zb_object */
static boolean_t
bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1,
    const zbookmark_t *zb2)
{
	uint64_t nextL0;

	ASSERT(zb1->zb_objset == zb2->zb_objset);
	ASSERT(zb1->zb_object != -1ULL);
	ASSERT(zb2->zb_object != 0);
	ASSERT(zb2->zb_level == 0);

	/*
	 * A bookmark in the deadlist is considered to be after
	 * everything else.
	 */
	if (zb2->zb_object == -1ULL)
		return (B_TRUE);

	/* The objset_phys_t isn't before anything. */
	if (dnp == NULL)
		return (B_FALSE);

	nextL0 = (zb1->zb_blkid + 1) <<
	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));

	if (zb1->zb_object == 0) {
		uint64_t nextobj = nextL0 *
		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
		return (nextobj <= zb2->zb_object);
	}

	if (zb1->zb_object < zb2->zb_object)
		return (B_TRUE);
	if (zb1->zb_object > zb2->zb_object)
		return (B_FALSE);

	return (nextL0 <= zb2->zb_blkid);
}

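/*
 * Decide whether to pause the scrub at this point in the traversal.
 * We pause once we have run for a full txg sync interval, or once we
 * have exceeded zfs_scrub_min_time and a txg sync is waiting on us;
 * the current leaf bookmark is saved so the scrub can resume from it
 * in the next txg.
 */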
       
static boolean_t
scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb)
{
	int elapsed_ticks;

	if (dp->dp_scrub_pausing)
		return (B_TRUE); /* we're already pausing */

	if (!bookmark_is_zero(&dp->dp_scrub_bookmark))
		return (B_FALSE); /* we're resuming */

	/* we don't yet know how to resume from anything but leaf blocks */
	if (zb->zb_object == 0 || zb->zb_level != 0)
		return (B_FALSE);

	elapsed_ticks = lbolt64 - dp->dp_scrub_start_time;
	if (elapsed_ticks > hz * zfs_txg_timeout ||
	    (elapsed_ticks > hz * zfs_scrub_min_time && txg_sync_waiting(dp))) {
		dprintf("pausing at %llx/%llx/%llx/%llx\n",
		    (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object,
		    (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid);
		dp->dp_scrub_pausing = B_TRUE;
		dp->dp_scrub_bookmark = *zb;
		return (B_TRUE);
	}
	return (B_FALSE);
}

/* ARGSUSED */
static void
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
	dsl_pool_t *dp = arg;

	if (bp->blk_birth <= dp->dp_scrub_min_txg)
		return;

	if (claim_txg != 0 || bp->blk_birth < spa_first_txg(dp->dp_spa)) {
		zbookmark_t zb = { 0 };
		/* XXX figure out zb.objset */
		zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
		VERIFY(0 ==
		    scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
	}
}

/* ARGSUSED */
static void
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
	dsl_pool_t *dp = arg;

	if (lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;

		if (bp->blk_birth <= dp->dp_scrub_min_txg)
			return;

		if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
			zbookmark_t zb = { 0 };
			/* XXX figure out zb.objset */
			zb.zb_object = lr->lr_foid;
			zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
			VERIFY(0 ==
			    scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
		}
	}
}

static void
traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
{
	uint64_t claim_txg = zh->zh_claim_txg;
	zilog_t *zilog;

	/*
	 * We only want to visit blocks that have been claimed but not yet
	 * replayed (or, in read-only mode, blocks that *would* be claimed).
	 */
	if (claim_txg == 0 && (spa_mode & FWRITE))
		return;

	zilog = zil_alloc(dp->dp_meta_objset, zh);

	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, dp,
	    claim_txg);

	zil_free(zilog);
}

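/*
 * Recursively visit bp and everything below it: indirect blocks are
 * expanded into their child block pointers, dnode blocks into the
 * blkptrs of each dnode, and objset blocks into their ZIL and meta
 * dnode.  Each block visited is handed to the current scrub function,
 * and the pause/resume bookmark logic is applied along the way.
 */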
       
static void
scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
    arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
{
	int err;
	arc_buf_t *buf = NULL;

	if (bp->blk_birth == 0)
		return;

	dprintf_bp(bp, "scrub_visitbp bm %lld/%lld/%lld/%lld: ",
	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);

	if (bp->blk_birth <= dp->dp_scrub_min_txg)
		return;

	if (scrub_pause(dp, zb))
		return;

	if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) {
		/*
		 * If we already visited this bp & everything below (in
		 * a prior txg), don't bother doing it again.
		 */
		if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark))
			return;

		/*
		 * If we found the block we're trying to resume from, or
		 * we went past it to a different object, zero it out to
		 * indicate that it's OK to start checking for pausing
		 * again.
		 */
		if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 ||
		    zb->zb_object > dp->dp_scrub_bookmark.zb_object) {
			dprintf("resuming at %llx/%llx/%llx/%llx\n",
			    (longlong_t)zb->zb_objset,
			    (longlong_t)zb->zb_object,
			    (longlong_t)zb->zb_level,
			    (longlong_t)zb->zb_blkid);
			bzero(&dp->dp_scrub_bookmark, sizeof (*zb));
		}
	}

	if (BP_GET_LEVEL(bp) > 0) {
		uint32_t flags = ARC_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err) {
			mutex_enter(&dp->dp_spa->spa_scrub_lock);
			dp->dp_spa->spa_scrub_errors++;
			mutex_exit(&dp->dp_spa->spa_scrub_lock);
			return;
		}
		cbp = buf->b_data;

		for (i = 0; i < epb; i++, cbp++) {
			zbookmark_t czb;

			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			scrub_visitbp(dp, dnp, buf, cbp, &czb);
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
		uint32_t flags = ARC_WAIT;
		dnode_phys_t *child_dnp;
		int i, j;
		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;

		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err) {
			mutex_enter(&dp->dp_spa->spa_scrub_lock);
			dp->dp_spa->spa_scrub_errors++;
			mutex_exit(&dp->dp_spa->spa_scrub_lock);
			return;
		}
		child_dnp = buf->b_data;

		for (i = 0; i < epb; i++, child_dnp++) {
			for (j = 0; j < child_dnp->dn_nblkptr; j++) {
				zbookmark_t czb;

				SET_BOOKMARK(&czb, zb->zb_objset,
				    zb->zb_blkid * epb + i,
				    child_dnp->dn_nlevels - 1, j);
				scrub_visitbp(dp, child_dnp, buf,
				    &child_dnp->dn_blkptr[j], &czb);
			}
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
		uint32_t flags = ARC_WAIT;
		objset_phys_t *osp;
		int j;

		err = arc_read_nolock(NULL, dp->dp_spa, bp,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err) {
			mutex_enter(&dp->dp_spa->spa_scrub_lock);
			dp->dp_spa->spa_scrub_errors++;
			mutex_exit(&dp->dp_spa->spa_scrub_lock);
			return;
		}

		osp = buf->b_data;

		traverse_zil(dp, &osp->os_zil_header);

		for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) {
			zbookmark_t czb;

			SET_BOOKMARK(&czb, zb->zb_objset, 0,
			    osp->os_meta_dnode.dn_nlevels - 1, j);
			scrub_visitbp(dp, &osp->os_meta_dnode, buf,
			    &osp->os_meta_dnode.dn_blkptr[j], &czb);
		}
	}

	(void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
	if (buf)
		(void) arc_buf_remove_ref(buf, &buf);
}

static void
scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp)
{
	zbookmark_t zb;

	SET_BOOKMARK(&zb, ds ? ds->ds_object : 0, 0, -1, 0);
	scrub_visitbp(dp, NULL, NULL, bp, &zb);
}

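/*
 * Called when a dataset is destroyed while a scrub is in progress.
 * If the scrub was paused in this dataset, point the bookmark at the
 * deadlist (objset -1) so a new dataset gets picked from the queue;
 * if the dataset was sitting in the scrub queue, drop its entry.  In
 * either case its next snapshot, if any, is enqueued in its place.
 */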
       
void
dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
		return;

	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
		SET_BOOKMARK(&dp->dp_scrub_bookmark, -1, 0, 0, 0);
	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
	    ds->ds_object, tx) != 0) {
		return;
	}

	if (ds->ds_phys->ds_next_snap_obj != 0) {
		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
	}
	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
}

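/*
 * Called when a dataset is snapshotted while a scrub is in progress.
 * The work that was bookmarked or queued under the dataset now belongs
 * to the new snapshot, so retarget the bookmark or the queue entry at
 * ds_prev_snap_obj.
 */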
       
void
dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
		return;

	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);

	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
		dp->dp_scrub_bookmark.zb_objset =
		    ds->ds_phys->ds_prev_snap_obj;
	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
	    ds->ds_object, tx) == 0) {
		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
		    ds->ds_phys->ds_prev_snap_obj, tx) == 0);
	}
}

struct enqueue_clones_arg {
	dmu_tx_t *tx;
	uint64_t originobj;
};

/* ARGSUSED */
static int
enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	struct enqueue_clones_arg *eca = arg;
	dsl_dataset_t *ds;
	int err;
	dsl_pool_t *dp;

	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
	if (err)
		return (err);
	dp = ds->ds_dir->dd_pool;

	if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
		while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
			dsl_dataset_t *prev;
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);

			dsl_dataset_rele(ds, FTAG);
			if (err)
				return (err);
			ds = prev;
		}
		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
		    ds->ds_object, eca->tx) == 0);
	}
	dsl_dataset_rele(ds, FTAG);
	return (0);
}

static void
scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx)
{
	dsl_dataset_t *ds;
	uint64_t min_txg_save;

	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));

	/*
	 * Iterate over the bps in this ds.
	 */
	min_txg_save = dp->dp_scrub_min_txg;
	dp->dp_scrub_min_txg =
	    MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg);
	scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp);
	dp->dp_scrub_min_txg = min_txg_save;

	if (dp->dp_scrub_pausing)
		goto out;

	/*
	 * Add descendent datasets to work queue.
	 */
	if (ds->ds_phys->ds_next_snap_obj != 0) {
		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
	}
	if (ds->ds_phys->ds_num_children > 1) {
		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
			struct enqueue_clones_arg eca;
			eca.tx = tx;
			eca.originobj = ds->ds_object;

			(void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
			    NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
		} else {
			VERIFY(zap_join(dp->dp_meta_objset,
			    ds->ds_phys->ds_next_clones_obj,
			    dp->dp_scrub_queue_obj, tx) == 0);
		}
	}

out:
	dsl_dataset_rele(ds, FTAG);
}

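/*
 * dmu_objset_find_spa() callback used on pre-DSL_SCRUB pools: walk the
 * dataset back to its oldest snapshot and add that snapshot to the
 * scrub queue.  Clones are skipped here (see the comment below); they
 * are enqueued when their origin snapshot is visited.
 */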
       
/* ARGSUSED */
static int
enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds;
	int err;
	dsl_pool_t *dp;

	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
	if (err)
		return (err);

	dp = ds->ds_dir->dd_pool;

	while (ds->ds_phys->ds_prev_snap_obj != 0) {
		dsl_dataset_t *prev;
		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
		    FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		/*
		 * If this is a clone, we don't need to worry about it for now.
		 */
		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			dsl_dataset_rele(prev, FTAG);
			return (0);
		}
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
	}

	VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
	    ds->ds_object, tx) == 0);
	dsl_dataset_rele(ds, FTAG);
	return (0);
}

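/*
 * Perform one txg's worth of scrub work in sync context: handle any
 * requested restart, visit the MOS and the dataset at the current
 * bookmark, then keep pulling datasets off the scrub queue until the
 * pass pauses or the queue is empty.  On pause, the bookmark and error
 * count are persisted; on completion, the scrub state is torn down via
 * dsl_pool_scrub_cancel_sync().
 */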
       
void
dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	boolean_t complete = B_TRUE;

	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
		return;

	/* If the spa is not fully loaded, don't bother. */
	if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE)
		return;

	if (dp->dp_scrub_restart) {
		enum scrub_func func = dp->dp_scrub_func;
		dp->dp_scrub_restart = B_FALSE;
		dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
	}

	if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
		/*
		 * We must have resumed after rebooting; reset the vdev
		 * stats to know that we're doing a scrub (although it
		 * will think we're just starting now).
		 */
		vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev,
		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
		    POOL_SCRUB_EVERYTHING, B_FALSE);
	}

	dp->dp_scrub_pausing = B_FALSE;
	dp->dp_scrub_start_time = lbolt64;
	dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);

	if (dp->dp_scrub_bookmark.zb_objset == 0) {
		/* First do the MOS & ORIGIN */
		scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp);
		if (dp->dp_scrub_pausing)
			goto out;

		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
			VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
			    NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
		} else {
			scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
		}
		ASSERT(!dp->dp_scrub_pausing);
	} else if (dp->dp_scrub_bookmark.zb_objset != -1ULL) {
		/*
		 * If we were paused, continue from here.  Note if the
		 * ds we were paused on was deleted, the zb_objset will
		 * be -1, so we will skip this and find a new objset
		 * below.
		 */
		scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx);
		if (dp->dp_scrub_pausing)
			goto out;
	}

	/*
	 * In case we were paused right at the end of the ds, zero the
	 * bookmark so we don't think that we're still trying to resume.
	 */
	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));

	/* keep pulling things out of the zap-object-as-queue */
	while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj),
	    zap_cursor_retrieve(&zc, &za) == 0) {
		VERIFY(0 == zap_remove(dp->dp_meta_objset,
		    dp->dp_scrub_queue_obj, za.za_name, tx));
		scrub_visitds(dp, za.za_first_integer, tx);
		if (dp->dp_scrub_pausing)
			break;
		zap_cursor_fini(&zc);
	}
	zap_cursor_fini(&zc);
	if (dp->dp_scrub_pausing)
		goto out;

	/* done. */

	dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx);
	return;
out:
	VERIFY(0 == zap_update(dp->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
	    &dp->dp_scrub_bookmark, tx));
	VERIFY(0 == zap_update(dp->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
	    &dp->dp_spa->spa_scrub_errors, tx));

	/* XXX this is scrub-clean specific */
	mutex_enter(&dp->dp_spa->spa_scrub_lock);
	while (dp->dp_spa->spa_scrub_inflight > 0) {
		cv_wait(&dp->dp_spa->spa_scrub_io_cv,
		    &dp->dp_spa->spa_scrub_lock);
	}
	mutex_exit(&dp->dp_spa->spa_scrub_lock);
}

void
dsl_pool_scrub_restart(dsl_pool_t *dp)
{
	mutex_enter(&dp->dp_scrub_cancel_lock);
	dp->dp_scrub_restart = B_TRUE;
	mutex_exit(&dp->dp_scrub_cancel_lock);
}

/*
 * scrub consumers
 */

static void
dsl_pool_scrub_clean_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_data_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight--;
	cv_broadcast(&spa->spa_scrub_io_cv);

	if (zio->io_error)
		spa->spa_scrub_errors++;
	mutex_exit(&spa->spa_scrub_lock);
}

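/*
 * Per-block callback for SCRUB_FUNC_CLEAN.  Update the per-vdev
 * "examined" counters, decide whether the block actually needs I/O
 * (always for a scrub; only if a DTL covers its birth txg for a
 * resilver), and if so issue a throttled, asynchronous read whose
 * completion is handled by dsl_pool_scrub_clean_done().
 */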
       
static int
dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
    const blkptr_t *bp, const zbookmark_t *zb)
{
	size_t size = BP_GET_LSIZE(bp);
	int d;
	spa_t *spa = dp->dp_spa;
	boolean_t needs_io;
	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
	int zio_priority;

	dprintf_bp(bp, "visiting %s", "");

	if (dp->dp_scrub_isresilver == 0) {
		/* It's a scrub */
		zio_flags |= ZIO_FLAG_SCRUB;
		zio_priority = ZIO_PRIORITY_SCRUB;
		needs_io = B_TRUE;
	} else {
		/* It's a resilver */
		zio_flags |= ZIO_FLAG_RESILVER;
		zio_priority = ZIO_PRIORITY_RESILVER;
		needs_io = B_FALSE;
	}

	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
		vdev_t *vd = vdev_lookup_top(spa,
		    DVA_GET_VDEV(&bp->blk_dva[d]));

		/*
		 * Keep track of how much data we've examined so that
		 * zpool(1M) status can make useful progress reports.
		 */
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_examined +=
		    DVA_GET_ASIZE(&bp->blk_dva[d]);
		mutex_exit(&vd->vdev_stat_lock);

		/* if it's a resilver, this may not be in the target range */
		if (!needs_io) {
			if (DVA_GET_GANG(&bp->blk_dva[d])) {
				/*
				 * Gang members may be spread across multiple
				 * vdevs, so the best we can do is look at the
				 * pool-wide DTL.
				 * XXX -- it would be better to change our
				 * allocation policy to ensure that this can't
				 * happen.
				 */
				vd = spa->spa_root_vdev;
			}
			needs_io = vdev_dtl_contains(&vd->vdev_dtl_map,
			    bp->blk_birth, 1);
		}
	}

	if (needs_io && !zfs_no_scrub_io) {
		void *data = zio_data_buf_alloc(size);

		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
		spa->spa_scrub_inflight++;
		mutex_exit(&spa->spa_scrub_lock);

		zio_nowait(zio_read(NULL, spa, bp, data, size,
		    dsl_pool_scrub_clean_done, NULL, zio_priority,
		    zio_flags, zb));
	}

	/* do not relocate this block */
	return (0);
}

int
dsl_pool_scrub_clean(dsl_pool_t *dp)
{
	/*
	 * Purge all vdev caches.  We do this here rather than in sync
	 * context because this requires a writer lock on the spa_config
	 * lock, which we can't do from sync context.  The
	 * spa_scrub_reopen flag indicates that vdev_open() should not
	 * attempt to start another scrub.
	 */
	spa_config_enter(dp->dp_spa, RW_WRITER, FTAG);
	dp->dp_spa->spa_scrub_reopen = B_TRUE;
	vdev_reopen(dp->dp_spa->spa_root_vdev);
	dp->dp_spa->spa_scrub_reopen = B_FALSE;
	spa_config_exit(dp->dp_spa, FTAG);

	return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN));
}