usr/src/uts/common/fs/zfs/dsl_scan.c
changeset 13700 2889e2596bd6
parent 12839 1eab9192da8b
child 13743 95aba6e49b9f
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c	(13699:733714f4dc24)
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c	(13700:2889e2596bd6)
@@ -18,10 +18,11 @@
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dsl_scan.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
@@ -42,10 +43,11 @@
 #include <sys/zil_impl.h>
 #include <sys/zio_checksum.h>
 #include <sys/ddt.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
+#include <sys/zfeature.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
 
 typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
@@ -380,59 +382,10 @@
 {
 	return (arc_read_nolock(pio, spa, bpp, done, private,
 	    priority, zio_flags, arc_flags, zb));
 }
 
-static boolean_t
-bookmark_is_zero(const zbookmark_t *zb)
-{
-	return (zb->zb_objset == 0 && zb->zb_object == 0 &&
-	    zb->zb_level == 0 && zb->zb_blkid == 0);
-}
-
-/* dnp is the dnode for zb1->zb_object */
-static boolean_t
-bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
-    const zbookmark_t *zb2)
-{
-	uint64_t zb1nextL0, zb2thisobj;
-
-	ASSERT(zb1->zb_objset == zb2->zb_objset);
-	ASSERT(zb2->zb_level == 0);
-
-	/*
-	 * A bookmark in the deadlist is considered to be after
-	 * everything else.
-	 */
-	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
-		return (B_TRUE);
-
-	/* The objset_phys_t isn't before anything. */
-	if (dnp == NULL)
-		return (B_FALSE);
-
-	zb1nextL0 = (zb1->zb_blkid + 1) <<
-	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
-
-	zb2thisobj = zb2->zb_object ? zb2->zb_object :
-	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
-
-	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
-		uint64_t nextobj = zb1nextL0 *
-		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
-		return (nextobj <= zb2thisobj);
-	}
-
-	if (zb1->zb_object < zb2thisobj)
-		return (B_TRUE);
-	if (zb1->zb_object > zb2thisobj)
-		return (B_FALSE);
-	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
-		return (B_FALSE);
-	return (zb1nextL0 <= zb2->zb_blkid);
-}
-
 static uint64_t
 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 {
 	uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
 	if (dsl_dataset_is_snapshot(ds))
@@ -460,11 +413,11 @@
 		return (B_FALSE);
 
 	if (scn->scn_pausing)
 		return (B_TRUE); /* we're already pausing */
 
-	if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
+	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
 		return (B_FALSE); /* we're resuming */
 
 	/* We only know how to resume from level-0 blocks. */
 	if (zb && zb->zb_level != 0)
 		return (B_FALSE);
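
Note: the two file-local helpers deleted above are replaced by shared equivalents used in the new code, the ZB_IS_ZERO() macro and zbookmark_is_before(). Their definitions live outside this file, so the following is only a sketch of the zero test, assumed to mirror the deleted bookmark_is_zero():

	/* Sketch only; assumed equivalent of the deleted bookmark_is_zero(). */
	#define	ZB_IS_ZERO(zb)						\
		((zb)->zb_objset == 0 && (zb)->zb_object == 0 &&	\
		(zb)->zb_level == 0 && (zb)->zb_blkid == 0)

zbookmark_is_before() presumably carries over the "already visited in a prior txg sync" test that bookmark_is_before() implemented here, promoted to a shared routine.
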
@@ -615,17 +568,17 @@
     const zbookmark_t *zb)
 {
 	/*
 	 * We never skip over user/group accounting objects (obj<0)
 	 */
-	if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
+	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
 	    (int64_t)zb->zb_object >= 0) {
 		/*
 		 * If we already visited this bp & everything below (in
 		 * a prior txg sync), don't bother doing it again.
 		 */
-		if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+		if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
 			return (B_TRUE);
 
 		/*
 		 * If we found the block we're trying to resume from, or
 		 * we went past it to a different object, zero it out to
@@ -813,26 +766,10 @@
 	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
 	    pbuf, bp);
 
 	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
 		return;
-
-	if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
-		/*
-		 * For non-user-accounting blocks, we need to read the
-		 * new bp (from a deleted snapshot, found in
-		 * check_existing_xlation).  If we used the old bp,
-		 * pointers inside this block from before we resumed
-		 * would be untranslated.
-		 *
-		 * For user-accounting blocks, we need to read the old
-		 * bp, because we will apply the entire space delta to
-		 * it (original untranslated -> translations from
-		 * deleted snap -> now).
-		 */
-		bp_toread = *bp;
-	}
 
 	if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
 	    &buf) != 0)
 		return;
 
@@ -1394,23 +1331,32 @@
 			return;
 	}
 	zap_cursor_fini(&zc);
 }
 
-static int
-dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+static boolean_t
+dsl_scan_free_should_pause(dsl_scan_t *scn)
 {
-	dsl_scan_t *scn = arg;
 	uint64_t elapsed_nanosecs;
 
 	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
-
-	if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+	return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
 	    (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
 	    txg_sync_waiting(scn->scn_dp)) ||
-	    spa_shutting_down(scn->scn_dp->dp_spa))
-		return (ERESTART);
+	    spa_shutting_down(scn->scn_dp->dp_spa));
+}
+
+static int
+dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = arg;
+
+	if (!scn->scn_is_bptree ||
+	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
+		if (dsl_scan_free_should_pause(scn))
+			return (ERESTART);
+	}
 
 	zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
 	    dmu_tx_get_txg(tx), bp, 0));
 	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
 	    -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
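
Note on the restructuring above: the old dsl_scan_free_cb() both checked the time budget and freed the block. It is now split so the budget check, dsl_scan_free_should_pause(), can be applied selectively: when scn_is_bptree is set (the async-destroy traversal wired up below), only level-0 blocks that are not objsets consult it, presumably so the traversal only pauses at points it can later resume from. A minimal usage sketch, assuming the iterators propagate the callback's ERESTART the way dsl_scan_sync() below consumes it:

	/* sketch, not the actual dsl_scan_sync() text */
	scn->scn_is_bptree = B_FALSE;
	err = bpobj_iterate(&dp->dp_free_bpobj, dsl_scan_free_block_cb, scn, tx);
	if (err == ERESTART) {
		/* out of time this txg; remaining entries stay queued on disk */
	}
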
@@ -1431,10 +1377,14 @@
 		return (B_FALSE);
 
 	if (scn->scn_phys.scn_state == DSS_SCANNING)
 		return (B_TRUE);
 
+	if (spa_feature_is_active(spa,
+	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+		return (B_TRUE);
+	}
 	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
 		    &used, &comp, &uncomp);
 	}
 	return (used != 0);
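
The early B_TRUE added above (the enclosing function name falls outside this hunk; it appears to be the pool's pending-scan/free-work predicate) accounts for the new feature:

	/*
	 * Assumption drawn from the dsl_scan_sync() hunk below: with
	 * SPA_FEATURE_ASYNC_DESTROY active, queued destroys live in
	 * dp->dp_bptree_obj rather than only in dp->dp_free_bpobj, so an
	 * empty free bpobj no longer means there is nothing left to free.
	 */
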
@@ -1477,18 +1427,44 @@
 	 * any scanning.  This ensures that there is no free list when
 	 * we are scanning, so the scan code doesn't have to worry about
 	 * traversing it.
 	 */
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+		scn->scn_is_bptree = B_FALSE;
 		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 		    NULL, ZIO_FLAG_MUSTSUCCEED);
 		err = bpobj_iterate(&dp->dp_free_bpobj,
-		    dsl_scan_free_cb, scn, tx);
+		    dsl_scan_free_block_cb, scn, tx);
 		VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+
+		if (err == 0 && spa_feature_is_active(spa,
+		    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+			scn->scn_is_bptree = B_TRUE;
+			scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+			    NULL, ZIO_FLAG_MUSTSUCCEED);
+			err = bptree_iterate(dp->dp_meta_objset,
+			    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
+			    scn, tx);
+			VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+			if (err != 0)
+				return;
+
+			/* disable async destroy feature */
+			spa_feature_decr(spa,
+			    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx);
+			ASSERT(!spa_feature_is_active(spa,
+			    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]));
+			VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
+			    DMU_POOL_DIRECTORY_OBJECT,
+			    DMU_POOL_BPTREE_OBJ, tx));
+			VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset,
+			    dp->dp_bptree_obj, tx));
+			dp->dp_bptree_obj = 0;
+		}
 		if (scn->scn_visited_this_txg) {
 			zfs_dbgmsg("freed %llu blocks in %llums from "
-			    "free_bpobj txg %llu",
+			    "free_bpobj/bptree txg %llu",
 			    (longlong_t)scn->scn_visited_this_txg,
 			    (longlong_t)
 			    (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
 			    (longlong_t)tx->tx_txg);
 			scn->scn_visited_this_txg = 0;
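
Ordering note on the async-destroy block above, given as a descriptive comment rather than new code:

	/*
	 * spa_feature_decr(), zap_remove(DMU_POOL_BPTREE_OBJ) and
	 * bptree_free() run only after bptree_iterate() has returned 0,
	 * i.e. after every queued dataset has actually been freed.  An
	 * ERESTART from dsl_scan_free_block_cb() makes err != 0, so the
	 * early return leaves dp_bptree_obj and the feature refcount in
	 * place and the destroy resumes in a later txg.
	 */
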
@@ -1599,10 +1575,12 @@
 		return;
 
 	for (i = 0; i < 4; i++) {
 		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
 		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+		if (t & DMU_OT_NEWTYPE)
+			t = DMU_OT_OTHER;
 		zfs_blkstat_t *zb = &zab->zab_type[l][t];
 		int equal;
 
 		zb->zb_count++;
 		zb->zb_asize += BP_GET_ASIZE(bp);
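
The DMU_OT_NEWTYPE test added above keeps the per-type statistics table in bounds. The following is an assumption about the feature-flags object-type encoding, whose definitions are outside this file:

	/*
	 * Assumed encoding: "new" object types carry the DMU_OT_NEWTYPE
	 * flag bit and describe their byteswap/metadata properties
	 * directly instead of indexing the legacy dmu_ot[] table, so for
	 * zab->zab_type[l][t] accounting they are all counted under
	 * DMU_OT_OTHER, keeping t a valid index.
	 */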