6328632 zpool offline is a bit too conservative
author Jeff Bonwick <Jeff.Bonwick@Sun.COM>
date Mon, 01 Dec 2008 12:43:36 -0800
changeset 8241 5a60f16123ba
parent 8240 ca4d4a3ddd4c
child 8242 e46e4b2f0a03
6328632 zpool offline is a bit too conservative
6739487 ASSERT: txg <= spa_final_txg due to scrub/export race
6767129 ASSERT: cvd->vdev_isspare, in spa_vdev_detach()
6747698 checksum failures after offline -t / export / import / scrub
6745863 ZFS writes to disk after it has been offlined
6722540 50% slowdown on scrub/resilver with certain vdev configurations
6759999 resilver logic rewrites ditto blocks on both source and destination
6758107 I/O should never suspend during spa_load()
6776548 codereview(1) runs off the page when faced with multi-line comments
6761406 AMD errata 91 workaround doesn't work on 64-bit systems
usr/src/cmd/zdb/Makefile.com
usr/src/cmd/zdb/zdb.c
usr/src/cmd/ztest/ztest.c
usr/src/tools/codereview/lwlp.c
usr/src/uts/common/fs/zfs/arc.c
usr/src/uts/common/fs/zfs/dmu_traverse.c
usr/src/uts/common/fs/zfs/dsl_scrub.c
usr/src/uts/common/fs/zfs/metaslab.c
usr/src/uts/common/fs/zfs/spa.c
usr/src/uts/common/fs/zfs/spa_misc.c
usr/src/uts/common/fs/zfs/space_map.c
usr/src/uts/common/fs/zfs/sys/spa.h
usr/src/uts/common/fs/zfs/sys/spa_impl.h
usr/src/uts/common/fs/zfs/sys/space_map.h
usr/src/uts/common/fs/zfs/sys/vdev.h
usr/src/uts/common/fs/zfs/sys/vdev_impl.h
usr/src/uts/common/fs/zfs/sys/zio.h
usr/src/uts/common/fs/zfs/vdev.c
usr/src/uts/common/fs/zfs/vdev_disk.c
usr/src/uts/common/fs/zfs/vdev_file.c
usr/src/uts/common/fs/zfs/vdev_label.c
usr/src/uts/common/fs/zfs/vdev_mirror.c
usr/src/uts/common/fs/zfs/vdev_queue.c
usr/src/uts/common/fs/zfs/vdev_raidz.c
usr/src/uts/common/fs/zfs/zfs_ioctl.c
usr/src/uts/common/fs/zfs/zio.c
usr/src/uts/i86pc/os/trap.c
--- a/usr/src/cmd/zdb/Makefile.com	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/cmd/zdb/Makefile.com	Mon Dec 01 12:43:36 2008 -0800
@@ -20,11 +20,9 @@
 #
 
 #
-# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
-# ident	"%Z%%M%	%I%	%E% SMI"
-#
 
 PROG:sh=	cd ..; basename `pwd`
 SRCS= ../$(PROG).c ../zdb_il.c
@@ -35,7 +33,7 @@
 INCS += -I../../../lib/libzpool/common 
 INCS +=	-I../../../uts/common/fs/zfs
 
-LDLIBS += -lzpool -lumem -lavl -lnvpair -lzfs
+LDLIBS += -lzpool -lumem -lnvpair -lzfs
 
 C99MODE=	-xc99=%all
 C99LMODE=	-Xc99=%all
--- a/usr/src/cmd/zdb/zdb.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/cmd/zdb/zdb.c	Mon Dec 01 12:43:36 2008 -0800
@@ -521,44 +521,52 @@
 }
 
 static void
+dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	char *prefix = (void *)sm;
+
+	(void) printf("%s [%llu,%llu) length %llu\n",
+	    prefix,
+	    (u_longlong_t)start,
+	    (u_longlong_t)(start + size),
+	    (u_longlong_t)(size));
+}
+
+static void
 dump_dtl(vdev_t *vd, int indent)
 {
-	avl_tree_t *t = &vd->vdev_dtl_map.sm_root;
-	space_seg_t *ss;
-	vdev_t *pvd;
-	int c;
+	spa_t *spa = vd->vdev_spa;
+	boolean_t required;
+	char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
+	char prefix[256];
+
+	spa_vdev_state_enter(spa);
+	required = vdev_dtl_required(vd);
+	(void) spa_vdev_state_exit(spa, NULL, 0);
 
 	if (indent == 0)
 		(void) printf("\nDirty time logs:\n\n");
 
-	(void) printf("\t%*s%s\n", indent, "",
+	(void) printf("\t%*s%s [%s]\n", indent, "",
 	    vd->vdev_path ? vd->vdev_path :
-	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type :
-	    spa_name(vd->vdev_spa));
+	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
+	    required ? "DTL-required" : "DTL-expendable");
 
-	for (ss = avl_first(t); ss; ss = AVL_NEXT(t, ss)) {
-		/*
-		 * Everything in this DTL must appear in all parent DTL unions.
-		 */
-		for (pvd = vd; pvd; pvd = pvd->vdev_parent)
-			ASSERT(vdev_dtl_contains(&pvd->vdev_dtl_map,
-			    ss->ss_start, ss->ss_end - ss->ss_start));
-		(void) printf("\t%*soutage [%llu,%llu] length %llu\n",
-		    indent, "",
-		    (u_longlong_t)ss->ss_start,
-		    (u_longlong_t)ss->ss_end - 1,
-		    (u_longlong_t)(ss->ss_end - ss->ss_start));
+	for (int t = 0; t < DTL_TYPES; t++) {
+		space_map_t *sm = &vd->vdev_dtl[t];
+		if (sm->sm_space == 0)
+			continue;
+		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
+		    indent + 2, "", name[t]);
+		mutex_enter(sm->sm_lock);
+		space_map_walk(sm, dump_dtl_seg, (void *)prefix);
+		mutex_exit(sm->sm_lock);
+		if (dump_opt['d'] > 5 && vd->vdev_children == 0)
+			dump_spacemap(spa->spa_meta_objset,
+			    &vd->vdev_dtl_smo, sm);
 	}
 
-	(void) printf("\n");
-
-	if (dump_opt['d'] > 5 && vd->vdev_children == 0) {
-		dump_spacemap(vd->vdev_spa->spa_meta_objset, &vd->vdev_dtl,
-		    &vd->vdev_dtl_map);
-		(void) printf("\n");
-	}
-
-	for (c = 0; c < vd->vdev_children; c++)
+	for (int c = 0; c < vd->vdev_children; c++)
 		dump_dtl(vd->vdev_child[c], indent + 4);
 }
 
@@ -672,7 +680,8 @@
 				break;
 			fill += cbp->blk_fill;
 		}
-		ASSERT3U(fill, ==, bp->blk_fill);
+		if (!err)
+			ASSERT3U(fill, ==, bp->blk_fill);
 		(void) arc_buf_remove_ref(buf, &buf);
 	}
 
--- a/usr/src/cmd/ztest/ztest.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/cmd/ztest/ztest.c	Mon Dec 01 12:43:36 2008 -0800
@@ -419,10 +419,10 @@
 	return (r % range);
 }
 
+/* ARGSUSED */
 static void
 ztest_record_enospc(char *s)
 {
-	dprintf("ENOSPC doing: %s\n", s ? s : "<unknown>");
 	ztest_shared->zs_enospc_count++;
 }
 
@@ -970,7 +970,7 @@
 	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
 	uint64_t leaf, top;
 	uint64_t ashift = ztest_get_ashift();
-	uint64_t oldguid;
+	uint64_t oldguid, pguid;
 	size_t oldsize, newsize;
 	char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
 	int replacing;
@@ -1002,10 +1002,16 @@
 	 * Locate this vdev.
 	 */
 	oldvd = rvd->vdev_child[top];
-	if (zopt_mirrors >= 1)
+	if (zopt_mirrors >= 1) {
+		ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
+		ASSERT(oldvd->vdev_children >= zopt_mirrors);
 		oldvd = oldvd->vdev_child[leaf / zopt_raidz];
-	if (zopt_raidz > 1)
+	}
+	if (zopt_raidz > 1) {
+		ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
+		ASSERT(oldvd->vdev_children == zopt_raidz);
 		oldvd = oldvd->vdev_child[leaf % zopt_raidz];
+	}
 
 	/*
 	 * If we're already doing an attach or replace, oldvd may be a
@@ -1013,8 +1019,8 @@
 	 */
 	while (oldvd->vdev_children != 0) {
 		oldvd_has_siblings = B_TRUE;
-		ASSERT(oldvd->vdev_children == 2);
-		oldvd = oldvd->vdev_child[ztest_random(2)];
+		ASSERT(oldvd->vdev_children >= 2);
+		oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
 	}
 
 	oldguid = oldvd->vdev_guid;
@@ -1022,16 +1028,17 @@
 	oldvd_is_log = oldvd->vdev_top->vdev_islog;
 	(void) strcpy(oldpath, oldvd->vdev_path);
 	pvd = oldvd->vdev_parent;
+	pguid = pvd->vdev_guid;
 
 	/*
 	 * If oldvd has siblings, then half of the time, detach it.
 	 */
 	if (oldvd_has_siblings && ztest_random(2) == 0) {
 		spa_config_exit(spa, SCL_VDEV, FTAG);
-		error = spa_vdev_detach(spa, oldguid, B_FALSE);
-		if (error != 0 && error != ENODEV && error != EBUSY)
-			fatal(0, "detach (%s) returned %d",
-			    oldpath, error);
+		error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
+		if (error != 0 && error != ENODEV && error != EBUSY &&
+		    error != ENOTSUP)
+			fatal(0, "detach (%s) returned %d", oldpath, error);
 		(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
 		return;
 	}
@@ -1131,7 +1138,6 @@
 /*
  * Verify that dynamic LUN growth works as expected.
  */
-/* ARGSUSED */
 void
 ztest_vdev_LUN_growth(ztest_args_t *za)
 {
@@ -2050,8 +2056,6 @@
 	error = dmu_buf_hold(os, ZTEST_DIROBJ, blkoff, FTAG, &db);
 	za->za_dbuf = db;
 	if (error) {
-		dprintf("dmu_buf_hold(%s, %d, %llx) = %d\n",
-		    osname, ZTEST_DIROBJ, blkoff, error);
 		(void) mutex_unlock(lp);
 		return;
 	}
@@ -2062,11 +2066,8 @@
 
 	(void) mutex_unlock(lp);
 
-	if (error) {
-		dprintf("dmu_sync(%s, %d, %llx) = %d\n",
-		    osname, ZTEST_DIROBJ, off, error);
+	if (error)
 		return;
-	}
 
 	if (blk.blk_birth == 0)		/* concurrent free */
 		return;
@@ -2575,8 +2576,6 @@
 		maxfaults = INT_MAX;	/* no limit on cache devices */
 	}
 
-	dprintf("damaging %s and %s\n", path0, pathrand);
-
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	if (maxfaults == 0)
@@ -2586,10 +2585,13 @@
 	 * If we can tolerate two or more faults, randomly online/offline vd0.
 	 */
 	if (maxfaults >= 2 && guid0 != 0) {
-		if (ztest_random(10) < 6)
-			(void) vdev_offline(spa, guid0, B_TRUE);
-		else
-			(void) vdev_online(spa, guid0, B_FALSE, NULL);
+		if (ztest_random(10) < 6) {
+			int flags = (ztest_random(2) == 0 ?
+			    ZFS_OFFLINE_TEMPORARY : 0);
+			VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
+		} else {
+			(void) vdev_online(spa, guid0, 0, NULL);
+		}
 	}
 
 	/*
@@ -2844,7 +2846,7 @@
 static void
 ztest_spa_import_export(char *oldname, char *newname)
 {
-	nvlist_t *config;
+	nvlist_t *config, *newconfig;
 	uint64_t pool_guid;
 	spa_t *spa;
 	int error;
@@ -2866,6 +2868,12 @@
 	if (error)
 		fatal(0, "spa_open('%s') = %d", oldname, error);
 
+	/*
+	 * Kick off a scrub to tickle scrub/export races.
+	 */
+	if (ztest_random(2) == 0)
+		(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
+
 	pool_guid = spa_guid(spa);
 	spa_close(spa, FTAG);
 
@@ -2881,6 +2889,13 @@
 	ztest_walk_pool_directory("pools after export");
 
 	/*
+	 * Try to import it.
+	 */
+	newconfig = spa_tryimport(config);
+	ASSERT(newconfig != NULL);
+	nvlist_free(newconfig);
+
+	/*
 	 * Import it under the new name.
 	 */
 	error = spa_import(newname, config, NULL);
@@ -2922,22 +2937,25 @@
 	nvlist_free(config);
 }
 
+static void
+ztest_resume(spa_t *spa)
+{
+	if (spa_suspended(spa)) {
+		spa_vdev_state_enter(spa);
+		vdev_clear(spa, NULL);
+		(void) spa_vdev_state_exit(spa, NULL, 0);
+		zio_resume(spa);
+	}
+}
+
 static void *
-ztest_resume(void *arg)
+ztest_resume_thread(void *arg)
 {
 	spa_t *spa = arg;
 
 	while (!ztest_exiting) {
 		(void) poll(NULL, 0, 1000);
-
-		if (!spa_suspended(spa))
-			continue;
-
-		spa_vdev_state_enter(spa);
-		vdev_clear(spa, NULL);
-		(void) spa_vdev_state_exit(spa, NULL, 0);
-
-		zio_resume(spa);
+		ztest_resume(spa);
 	}
 	return (NULL);
 }
@@ -3080,9 +3098,19 @@
 	VERIFY(spa_open(pool, &spa, FTAG) == 0);
 
 	/*
+	 * We don't expect the pool to suspend unless maxfaults == 0,
+	 * in which case ztest_fault_inject() temporarily takes away
+	 * the only valid replica.
+	 */
+	if (zopt_maxfaults == 0)
+		spa->spa_failmode = ZIO_FAILURE_MODE_WAIT;
+	else
+		spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
+
+	/*
 	 * Create a thread to periodically resume suspended I/O.
 	 */
-	VERIFY(thr_create(0, 0, ztest_resume, spa, THR_BOUND,
+	VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND,
 	    &resume_tid) == 0);
 
 	/*
@@ -3200,6 +3228,7 @@
 	/* Kill the resume thread */
 	ztest_exiting = B_TRUE;
 	VERIFY(thr_join(resume_tid, NULL, NULL) == 0);
+	ztest_resume(spa);
 
 	/*
 	 * Right before closing the pool, kick off a bunch of async I/O;
@@ -3295,11 +3324,6 @@
 
 	process_options(argc, argv);
 
-	argc -= optind;
-	argv += optind;
-
-	dprintf_setup(&argc, argv);
-
 	/*
 	 * Blow away any existing copy of zpool.cache
 	 */
--- a/usr/src/tools/codereview/lwlp.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/tools/codereview/lwlp.c	Mon Dec 01 12:43:36 2008 -0800
@@ -736,7 +736,6 @@
 	nlines++;
 
 	restore_format_state(&format_state, outfile);
-	savestate(outfile);
 	return (nlines);
 }
 
@@ -893,6 +892,7 @@
 	if (bannerfile) {
 		current.lineno += printbanner(bannerfile, outfile);
 		bannerfile = NULL;
+		savestate(outfile);
 	}
 	for (; current.lineno < lines_per_page; ) {
 		if (fgetline(bufin, sizeof (bufin), infile) == (char *)NULL) {
@@ -1186,7 +1186,7 @@
 		    current.row - DEFAULT_DESCENDER_FRACTION * point_size,
 		    end_x,
 		    current.row +
-			(1.0 - DEFAULT_DESCENDER_FRACTION) * point_size,
+		    (1.0 - DEFAULT_DESCENDER_FRACTION) * point_size,
 		    SHADE);
 	}
 
--- a/usr/src/uts/common/fs/zfs/arc.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/arc.c	Mon Dec 01 12:43:36 2008 -0800
@@ -4458,7 +4458,7 @@
 void
 l2arc_start(void)
 {
-	if (!(spa_mode & FWRITE))
+	if (!(spa_mode_global & FWRITE))
 		return;
 
 	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
@@ -4468,7 +4468,7 @@
 void
 l2arc_stop(void)
 {
-	if (!(spa_mode & FWRITE))
+	if (!(spa_mode_global & FWRITE))
 		return;
 
 	mutex_enter(&l2arc_feed_thr_lock);
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c	Mon Dec 01 12:43:36 2008 -0800
@@ -119,7 +119,7 @@
 	 * We only want to visit blocks that have been claimed but not yet
 	 * replayed (or, in read-only mode, blocks that *would* be claimed).
 	 */
-	if (claim_txg == 0 && (spa_mode & FWRITE))
+	if (claim_txg == 0 && spa_writeable(td->td_spa))
 		return;
 
 	zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
--- a/usr/src/uts/common/fs/zfs/dsl_scrub.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/dsl_scrub.c	Mon Dec 01 12:43:36 2008 -0800
@@ -391,7 +391,7 @@
 	 * We only want to visit blocks that have been claimed but not yet
 	 * replayed (or, in read-only mode, blocks that *would* be claimed).
 	 */
-	if (claim_txg == 0 && (spa_mode & FWRITE))
+	if (claim_txg == 0 && spa_writeable(dp->dp_spa))
 		return;
 
 	zilog = zil_alloc(dp->dp_meta_objset, zh);
@@ -409,9 +409,6 @@
 	int err;
 	arc_buf_t *buf = NULL;
 
-	if (bp->blk_birth == 0)
-		return;
-
 	if (bp->blk_birth <= dp->dp_scrub_min_txg)
 		return;
 
@@ -740,6 +737,7 @@
 void
 dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 {
+	spa_t *spa = dp->dp_spa;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	boolean_t complete = B_TRUE;
@@ -747,8 +745,10 @@
 	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
 		return;
 
-	/* If the spa is not fully loaded, don't bother. */
-	if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE)
+	/*
+	 * If the pool is not loaded, or is trying to unload, leave it alone.
+	 */
+	if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa))
 		return;
 
 	if (dp->dp_scrub_restart) {
@@ -757,13 +757,13 @@
 		dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
 	}
 
-	if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
+	if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
 		/*
 		 * We must have resumed after rebooting; reset the vdev
 		 * stats to know that we're doing a scrub (although it
 		 * will think we're just starting now).
 		 */
-		vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev,
+		vdev_scrub_stat_update(spa->spa_root_vdev,
 		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
 		    POOL_SCRUB_EVERYTHING, B_FALSE);
 	}
@@ -771,7 +771,7 @@
 	dp->dp_scrub_pausing = B_FALSE;
 	dp->dp_scrub_start_time = lbolt64;
 	dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
-	dp->dp_spa->spa_scrub_active = B_TRUE;
+	spa->spa_scrub_active = B_TRUE;
 
 	if (dp->dp_scrub_bookmark.zb_objset == 0) {
 		/* First do the MOS & ORIGIN */
@@ -779,8 +779,8 @@
 		if (dp->dp_scrub_pausing)
 			goto out;
 
-		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
-			VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
+		if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
+			VERIFY(0 == dmu_objset_find_spa(spa,
 			    NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
 		} else {
 			scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
@@ -830,15 +830,13 @@
 	VERIFY(0 == zap_update(dp->dp_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
-	    &dp->dp_spa->spa_scrub_errors, tx));
+	    &spa->spa_scrub_errors, tx));
 
 	/* XXX this is scrub-clean specific */
-	mutex_enter(&dp->dp_spa->spa_scrub_lock);
-	while (dp->dp_spa->spa_scrub_inflight > 0) {
-		cv_wait(&dp->dp_spa->spa_scrub_io_cv,
-		    &dp->dp_spa->spa_scrub_lock);
-	}
-	mutex_exit(&dp->dp_spa->spa_scrub_lock);
+	mutex_enter(&spa->spa_scrub_lock);
+	while (spa->spa_scrub_inflight > 0)
+		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+	mutex_exit(&spa->spa_scrub_lock);
 }
 
 void
@@ -921,12 +919,16 @@
     const blkptr_t *bp, const zbookmark_t *zb)
 {
 	size_t size = BP_GET_LSIZE(bp);
-	int d;
 	spa_t *spa = dp->dp_spa;
 	boolean_t needs_io;
 	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
 	int zio_priority;
 
+	ASSERT(bp->blk_birth > dp->dp_scrub_min_txg);
+
+	if (bp->blk_birth >= dp->dp_scrub_max_txg)
+		return (0);
+
 	count_block(dp->dp_blkstats, bp);
 
 	if (dp->dp_scrub_isresilver == 0) {
@@ -945,7 +947,7 @@
 	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
 		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
-	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
 		vdev_t *vd = vdev_lookup_top(spa,
 		    DVA_GET_VDEV(&bp->blk_dva[d]));
 
@@ -963,16 +965,17 @@
 			if (DVA_GET_GANG(&bp->blk_dva[d])) {
 				/*
 				 * Gang members may be spread across multiple
-				 * vdevs, so the best we can do is look at the
-				 * pool-wide DTL.
+				 * vdevs, so the best estimate we have is the
+				 * scrub range, which has already been checked.
 				 * XXX -- it would be better to change our
-				 * allocation policy to ensure that this can't
-				 * happen.
+				 * allocation policy to ensure that all
+				 * gang members reside on the same vdev.
 				 */
-				vd = spa->spa_root_vdev;
+				needs_io = B_TRUE;
+			} else {
+				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
+				    bp->blk_birth, 1);
 			}
-			needs_io = vdev_dtl_contains(&vd->vdev_dtl_map,
-			    bp->blk_birth, 1);
 		}
 	}
 
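
For a non-gang DVA, the resilver decision above reduces to an interval test: repair I/O is issued only when the block's birth txg falls inside the top-level vdev's DTL_PARTIAL. A minimal, self-contained sketch of that containment test follows; the segment values and names are illustrative and are not the kernel space-map code.

/*
 * Toy model of vdev_dtl_contains(vd, DTL_PARTIAL, bp->blk_birth, 1).
 * DTL segments are half-open txg ranges [start, end); a block written
 * while the vdev was degraded needs resilver I/O on that vdev.
 */
#include <stdio.h>

typedef struct {
	unsigned long long start;	/* first txg with missing data */
	unsigned long long end;		/* first txg that is intact again */
} dtl_seg_t;

static int
dtl_contains(const dtl_seg_t *dtl, int nsegs, unsigned long long txg)
{
	for (int i = 0; i < nsegs; i++)
		if (txg >= dtl[i].start && txg < dtl[i].end)
			return (1);
	return (0);
}

int
main(void)
{
	/* This vdev missed txgs [100,150) and [200,210). */
	dtl_seg_t dtl[] = { { 100, 150 }, { 200, 210 } };
	unsigned long long births[] = { 90, 120, 205, 300 };

	for (int i = 0; i < 4; i++)
		printf("blk_birth %llu: needs_io = %d\n",
		    births[i], dtl_contains(dtl, 2, births[i]));
	return (0);
}

Run against the illustrative segments above, births 120 and 205 need repair I/O while 90 and 300 do not.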
--- a/usr/src/uts/common/fs/zfs/metaslab.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/metaslab.c	Mon Dec 01 12:43:36 2008 -0800
@@ -720,6 +720,8 @@
 	vdev_t *vd;
 	int dshift = 3;
 	int all_zero;
+	int zio_lock = B_FALSE;
+	boolean_t allocatable;
 	uint64_t offset = -1ULL;
 	uint64_t asize;
 	uint64_t distance;
@@ -778,11 +780,20 @@
 	all_zero = B_TRUE;
 	do {
 		vd = mg->mg_vd;
+
 		/*
 		 * Don't allocate from faulted devices.
 		 */
-		if (!vdev_allocatable(vd))
+		if (zio_lock) {
+			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
+			allocatable = vdev_allocatable(vd);
+			spa_config_exit(spa, SCL_ZIO, FTAG);
+		} else {
+			allocatable = vdev_allocatable(vd);
+		}
+		if (!allocatable)
 			goto next;
+
 		/*
 		 * Avoid writing single-copy data to a failing vdev
 		 */
@@ -858,6 +869,12 @@
 		goto top;
 	}
 
+	if (!zio_lock) {
+		dshift = 3;
+		zio_lock = B_TRUE;
+		goto top;
+	}
+
 	bzero(&dva[d], sizeof (dva_t));
 
 	return (ENOSPC);
@@ -946,7 +963,7 @@
 
 	space_map_claim(&msp->ms_map, offset, size);
 
-	if (spa_mode & FWRITE) {	/* don't dirty if we're zdb(1M) */
+	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
 		if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
 		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
--- a/usr/src/uts/common/fs/zfs/spa.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/spa.c	Mon Dec 01 12:43:36 2008 -0800
@@ -486,11 +486,12 @@
  * Activate an uninitialized pool.
  */
 static void
-spa_activate(spa_t *spa)
+spa_activate(spa_t *spa, int mode)
 {
 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 
 	spa->spa_state = POOL_STATE_ACTIVE;
+	spa->spa_mode = mode;
 
 	spa->spa_normal_class = metaslab_class_create();
 	spa->spa_log_class = metaslab_class_create();
@@ -640,11 +641,6 @@
 	mutex_exit(&spa->spa_async_root_lock);
 
 	/*
-	 * Drop and purge level 2 cache
-	 */
-	spa_l2cache_drop(spa);
-
-	/*
 	 * Close the dsl pool.
 	 */
 	if (spa->spa_dsl_pool) {
@@ -652,6 +648,13 @@
 		spa->spa_dsl_pool = NULL;
 	}
 
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+	/*
+	 * Drop and purge level 2 cache
+	 */
+	spa_l2cache_drop(spa);
+
 	/*
 	 * Close all vdevs.
 	 */
@@ -686,6 +689,8 @@
 	spa->spa_l2cache.sav_count = 0;
 
 	spa->spa_async_suspended = 0;
+
+	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
@@ -897,12 +902,9 @@
 
 		vd = oldvdevs[i];
 		if (vd != NULL) {
-			if ((spa_mode & FWRITE) &&
-			    spa_l2cache_exists(vd->vdev_guid, &pool) &&
-			    pool != 0ULL &&
-			    l2arc_vdev_present(vd)) {
+			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
+			    pool != 0ULL && l2arc_vdev_present(vd))
 				l2arc_remove_vdev(vd);
-			}
 			(void) vdev_close(vd);
 			spa_l2cache_remove(vd);
 		}
@@ -1018,8 +1020,16 @@
 	uint64_t pool_guid;
 	uint64_t version;
 	uint64_t autoreplace = 0;
+	int orig_mode = spa->spa_mode;
 	char *ereport = FM_EREPORT_ZFS_POOL;
 
+	/*
+	 * If this is an untrusted config, access the pool in read-only mode.
+	 * This prevents things like resilvering recently removed devices.
+	 */
+	if (!mosconfig)
+		spa->spa_mode = FREAD;
+
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	spa->spa_load_state = state;
@@ -1077,12 +1087,13 @@
 	 * Validate the labels for all leaf vdevs.  We need to grab the config
 	 * lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER.
 	 */
-	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-	error = vdev_validate(rvd);
-	spa_config_exit(spa, SCL_ALL, FTAG);
-
-	if (error != 0)
-		goto out;
+	if (mosconfig) {
+		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+		error = vdev_validate(rvd);
+		spa_config_exit(spa, SCL_ALL, FTAG);
+		if (error != 0)
+			goto out;
+	}
 
 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
 		error = ENXIO;
@@ -1184,7 +1195,7 @@
 		spa_config_set(spa, newconfig);
 		spa_unload(spa);
 		spa_deactivate(spa);
-		spa_activate(spa);
+		spa_activate(spa, orig_mode);
 
 		return (spa_load(spa, newconfig, state, B_TRUE));
 	}
@@ -1376,10 +1387,11 @@
 		goto out;
 	}
 
-	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
+	if (spa_writeable(spa)) {
 		dmu_tx_t *tx;
 		int need_update = B_FALSE;
-		int c;
+
+		ASSERT(state != SPA_LOAD_TRYIMPORT);
 
 		/*
 		 * Claim log blocks that haven't been committed yet.
@@ -1407,7 +1419,7 @@
 		    state == SPA_LOAD_IMPORT)
 			need_update = B_TRUE;
 
-		for (c = 0; c < rvd->vdev_children; c++)
+		for (int c = 0; c < rvd->vdev_children; c++)
 			if (rvd->vdev_child[c]->vdev_ms_array == 0)
 				need_update = B_TRUE;
 
@@ -1417,6 +1429,12 @@
 		 */
 		if (need_update)
 			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+
+		/*
+		 * Check all DTLs to see if anything needs resilvering.
+		 */
+		if (vdev_resilver_needed(rvd, NULL, NULL))
+			spa_async_request(spa, SPA_ASYNC_RESILVER);
 	}
 
 	error = 0;
@@ -1469,7 +1487,7 @@
 	}
 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
 
-		spa_activate(spa);
+		spa_activate(spa, spa_mode_global);
 
 		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
 
@@ -1873,11 +1891,9 @@
 		vd = sav->sav_vdevs[i];
 		ASSERT(vd != NULL);
 
-		if ((spa_mode & FWRITE) &&
-		    spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL &&
-		    l2arc_vdev_present(vd)) {
+		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
+		    pool != 0ULL && l2arc_vdev_present(vd))
 			l2arc_remove_vdev(vd);
-		}
 		if (vd->vdev_isl2cache)
 			spa_l2cache_remove(vd);
 		vdev_clear_stats(vd);
@@ -1918,7 +1934,7 @@
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 	spa = spa_add(pool, altroot);
-	spa_activate(spa);
+	spa_activate(spa, spa_mode_global);
 
 	spa->spa_uberblock.ub_txg = txg - 1;
 
@@ -2121,7 +2137,7 @@
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 	spa = spa_add(pool, altroot);
-	spa_activate(spa);
+	spa_activate(spa, spa_mode_global);
 
 	if (allowfaulted)
 		spa->spa_import_faulted = B_TRUE;
@@ -2160,7 +2176,8 @@
 		    VDEV_ALLOC_L2CACHE);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
-	if (error != 0 || (props && (error = spa_prop_set(spa, props)))) {
+	if (error != 0 || (props && spa_writeable(spa) &&
+	    (error = spa_prop_set(spa, props)))) {
 		if (loaderr != 0 && loaderr != EINVAL && allowfaulted) {
 			/*
 			 * If we failed to load the pool, but 'allowfaulted' is
@@ -2219,7 +2236,7 @@
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
-	if (spa_mode & FWRITE) {
+	if (spa_writeable(spa)) {
 		/*
 		 * Update the config cache to include the newly-imported pool.
 		 */
@@ -2489,7 +2506,7 @@
 	 */
 	mutex_enter(&spa_namespace_lock);
 	spa = spa_add(TRYIMPORT_NAME, NULL);
-	spa_activate(spa);
+	spa_activate(spa, FREAD);
 
 	/*
 	 * Pass off the heavy lifting to spa_load().
@@ -2575,7 +2592,7 @@
 	if (oldconfig)
 		*oldconfig = NULL;
 
-	if (!(spa_mode & FWRITE))
+	if (!(spa_mode_global & FWRITE))
 		return (EROFS);
 
 	mutex_enter(&spa_namespace_lock);
@@ -2710,7 +2727,7 @@
 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 {
 	uint64_t txg;
-	int c, error;
+	int error;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *tvd;
 	nvlist_t **spares, **l2cache;
@@ -2749,7 +2766,7 @@
 	/*
 	 * Transfer each new top-level vdev from vd to rvd.
 	 */
-	for (c = 0; c < vd->vdev_children; c++) {
+	for (int c = 0; c < vd->vdev_children; c++) {
 		tvd = vd->vdev_child[c];
 		vdev_remove_child(vd, tvd);
 		tvd->vdev_id = rvd->vdev_children;
@@ -2957,10 +2974,8 @@
 	 */
 	open_txg = txg + TXG_CONCURRENT_STATES - 1;
 
-	mutex_enter(&newvd->vdev_dtl_lock);
-	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
-	    open_txg - TXG_INITIAL + 1);
-	mutex_exit(&newvd->vdev_dtl_lock);
+	vdev_dtl_dirty(newvd, DTL_MISSING,
+	    TXG_INITIAL, open_txg - TXG_INITIAL + 1);
 
 	if (newvd->vdev_isspare)
 		spa_spare_activate(newvd);
@@ -3004,10 +3019,10 @@
  * is a replacing vdev.
  */
 int
-spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
+spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 {
 	uint64_t txg;
-	int c, t, error;
+	int error;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *pvd, *cvd, *tvd;
 	boolean_t unspare = B_FALSE;
@@ -3027,6 +3042,22 @@
 	pvd = vd->vdev_parent;
 
 	/*
+	 * If the parent/child relationship is not as expected, don't do it.
+	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
+	 * vdev that's replacing B with C.  The user's intent in replacing
+	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
+	 * the replace by detaching C, the expected behavior is to end up
+	 * M(A,B).  But suppose that right after deciding to detach C,
+	 * the replacement of B completes.  We would have M(A,C), and then
+	 * ask to detach C, which would leave us with just A -- not what
+	 * the user wanted.  To prevent this, we make sure that the
+	 * parent/child relationship hasn't changed -- in this example,
+	 * that C's parent is still the replacing vdev R.
+	 */
+	if (pvd->vdev_guid != pguid && pguid != 0)
+		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+	/*
 	 * If replace_done is specified, only remove this device if it's
 	 * the first child of a replacing vdev.  For the 'spare' vdev, either
 	 * disk can be removed.
@@ -3052,37 +3083,14 @@
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	/*
-	 * If there's only one replica, you can't detach it.
-	 */
-	if (pvd->vdev_children <= 1)
-		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
-
-	/*
-	 * If all siblings have non-empty DTLs, this device may have the only
-	 * valid copy of the data, which means we cannot safely detach it.
-	 *
-	 * XXX -- as in the vdev_offline() case, we really want a more
-	 * precise DTL check.
+	 * If this device has the only valid copy of some data,
+	 * we cannot safely detach it.
 	 */
-	for (c = 0; c < pvd->vdev_children; c++) {
-		uint64_t dirty;
-
-		cvd = pvd->vdev_child[c];
-		if (cvd == vd)
-			continue;
-		if (vdev_is_dead(cvd))
-			continue;
-		mutex_enter(&cvd->vdev_dtl_lock);
-		dirty = cvd->vdev_dtl_map.sm_space |
-		    cvd->vdev_dtl_scrub.sm_space;
-		mutex_exit(&cvd->vdev_dtl_lock);
-		if (!dirty)
-			break;
-	}
-
-	if (c == pvd->vdev_children)
+	if (vdev_dtl_required(vd))
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
+	ASSERT(pvd->vdev_children >= 2);
+
 	/*
 	 * If we are detaching the second disk from a replacing vdev, then
 	 * check to see if we changed the original vdev's path to have "/old"
@@ -3107,7 +3115,7 @@
 	 * active spare list for the pool.
 	 */
 	if (pvd->vdev_ops == &vdev_spare_ops &&
-	    vd->vdev_id == 0)
+	    vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare)
 		unspare = B_TRUE;
 
 	/*
@@ -3133,14 +3141,18 @@
 
 	/*
 	 * If we need to remove the remaining child from the list of hot spares,
-	 * do it now, marking the vdev as no longer a spare in the process.  We
-	 * must do this before vdev_remove_parent(), because that can change the
-	 * GUID if it creates a new toplevel GUID.
+	 * do it now, marking the vdev as no longer a spare in the process.
+	 * We must do this before vdev_remove_parent(), because that can
+	 * change the GUID if it creates a new toplevel GUID.  For a similar
+	 * reason, we must remove the spare now, in the same txg as the detach;
+	 * otherwise someone could attach a new sibling, change the GUID, and
+	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
 	 */
 	if (unspare) {
 		ASSERT(cvd->vdev_isspare);
 		spa_spare_remove(cvd);
 		unspare_guid = cvd->vdev_guid;
+		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
 	}
 
 	/*
@@ -3178,7 +3190,7 @@
 	 * But first make sure we're not on any *other* txg's DTL list, to
 	 * prevent vd from being accessed after it's freed.
 	 */
-	for (t = 0; t < TXG_SIZE; t++)
+	for (int t = 0; t < TXG_SIZE; t++)
 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
 	vd->vdev_detached = B_TRUE;
 	vdev_dirty(tvd, VDD_DTL, vd, txg);
@@ -3193,11 +3205,14 @@
 	 * list of every other pool.
 	 */
 	if (unspare) {
+		spa_t *myspa = spa;
 		spa = NULL;
 		mutex_enter(&spa_namespace_lock);
 		while ((spa = spa_next(spa)) != NULL) {
 			if (spa->spa_state != POOL_STATE_ACTIVE)
 				continue;
+			if (spa == myspa)
+				continue;
 			spa_open_ref(spa, FTAG);
 			mutex_exit(&spa_namespace_lock);
 			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
@@ -3261,10 +3276,12 @@
 	vdev_t *vd;
 	nvlist_t **spares, **l2cache, *nv;
 	uint_t nspares, nl2cache;
-	uint64_t txg;
+	uint64_t txg = 0;
 	int error = 0;
-
-	txg = spa_vdev_enter(spa);
+	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
+
+	if (!locked)
+		txg = spa_vdev_enter(spa);
 
 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
@@ -3307,7 +3324,10 @@
 		error = ENOENT;
 	}
 
-	return (spa_vdev_exit(spa, NULL, txg, error));
+	if (!locked)
+		return (spa_vdev_exit(spa, NULL, txg, error));
+
+	return (error);
 }
 
 /*
@@ -3333,13 +3353,9 @@
 		oldvd = vd->vdev_child[0];
 		newvd = vd->vdev_child[1];
 
-		mutex_enter(&newvd->vdev_dtl_lock);
-		if (newvd->vdev_dtl_map.sm_space == 0 &&
-		    newvd->vdev_dtl_scrub.sm_space == 0) {
-			mutex_exit(&newvd->vdev_dtl_lock);
+		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
+		    !vdev_dtl_required(oldvd))
 			return (oldvd);
-		}
-		mutex_exit(&newvd->vdev_dtl_lock);
 	}
 
 	/*
@@ -3349,15 +3365,12 @@
 		newvd = vd->vdev_child[0];
 		oldvd = vd->vdev_child[1];
 
-		mutex_enter(&newvd->vdev_dtl_lock);
 		if (newvd->vdev_unspare &&
-		    newvd->vdev_dtl_map.sm_space == 0 &&
-		    newvd->vdev_dtl_scrub.sm_space == 0) {
+		    vdev_dtl_empty(newvd, DTL_MISSING) &&
+		    !vdev_dtl_required(oldvd)) {
 			newvd->vdev_unspare = 0;
-			mutex_exit(&newvd->vdev_dtl_lock);
 			return (oldvd);
 		}
-		mutex_exit(&newvd->vdev_dtl_lock);
 	}
 
 	return (NULL);
@@ -3366,36 +3379,37 @@
 static void
 spa_vdev_resilver_done(spa_t *spa)
 {
-	vdev_t *vd;
-	vdev_t *pvd;
-	uint64_t guid;
-	uint64_t pguid = 0;
-
-	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+	vdev_t *vd, *pvd, *ppvd;
+	uint64_t guid, sguid, pguid, ppguid;
+
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
+		pvd = vd->vdev_parent;
+		ppvd = pvd->vdev_parent;
 		guid = vd->vdev_guid;
+		pguid = pvd->vdev_guid;
+		ppguid = ppvd->vdev_guid;
+		sguid = 0;
 		/*
 		 * If we have just finished replacing a hot spared device, then
 		 * we need to detach the parent's first child (the original hot
 		 * spare) as well.
 		 */
-		pvd = vd->vdev_parent;
-		if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
-		    pvd->vdev_id == 0) {
+		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
-			ASSERT(pvd->vdev_parent->vdev_children == 2);
-			pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
+			ASSERT(ppvd->vdev_children == 2);
+			sguid = ppvd->vdev_child[1]->vdev_guid;
 		}
-		spa_config_exit(spa, SCL_CONFIG, FTAG);
-		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
+		spa_config_exit(spa, SCL_ALL, FTAG);
+		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
 			return;
-		if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
+		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
 			return;
-		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	}
 
-	spa_config_exit(spa, SCL_CONFIG, FTAG);
+	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
@@ -3930,9 +3944,22 @@
 	 * into config changes that go out with this transaction group.
 	 */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
-	while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
-		vdev_state_clean(vd);
-		vdev_config_dirty(vd);
+	while (list_head(&spa->spa_state_dirty_list) != NULL) {
+		/*
+		 * We need the write lock here because, for aux vdevs,
+		 * calling vdev_config_dirty() modifies sav_config.
+		 * This is ugly and will become unnecessary when we
+		 * eliminate the aux vdev wart by integrating all vdevs
+		 * into the root vdev tree.
+		 */
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
+		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
+			vdev_state_clean(vd);
+			vdev_config_dirty(vd);
+		}
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 	}
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Mon Dec 01 12:43:36 2008 -0800
@@ -230,7 +230,7 @@
 static avl_tree_t spa_l2cache_avl;
 
 kmem_cache_t *spa_buffer_pool;
-int spa_mode;
+int spa_mode_global;
 
 #ifdef ZFS_DEBUG
 /* Everything except dprintf is on by default in debug builds */
@@ -880,8 +880,10 @@
 		txg_wait_synced(spa->spa_dsl_pool, txg);
 
 	if (vd != NULL) {
-		ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0);
+		ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0);
+		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 		vdev_free(vd);
+		spa_config_exit(spa, SCL_ALL, spa);
 	}
 
 	/*
@@ -912,6 +914,15 @@
 
 	spa_config_exit(spa, SCL_STATE_ALL, spa);
 
+	/*
+	 * If anything changed, wait for it to sync.  This ensures that,
+	 * from the system administrator's perspective, zpool(1M) commands
+	 * are synchronous.  This is important for things like zpool offline:
+	 * when the command completes, you expect no further I/O from ZFS.
+	 */
+	if (vd != NULL)
+		txg_wait_synced(spa->spa_dsl_pool, 0);
+
 	return (error);
 }
 
@@ -1351,7 +1362,7 @@
 	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
 	    offsetof(spa_aux_t, aux_avl));
 
-	spa_mode = mode;
+	spa_mode_global = mode;
 
 	refcount_init();
 	unique_init();
@@ -1408,3 +1419,15 @@
 {
 	return (spa->spa_is_root);
 }
+
+boolean_t
+spa_writeable(spa_t *spa)
+{
+	return (!!(spa->spa_mode & FWRITE));
+}
+
+int
+spa_mode(spa_t *spa)
+{
+	return (spa->spa_mode);
+}
--- a/usr/src/uts/common/fs/zfs/space_map.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/space_map.c	Mon Dec 01 12:43:36 2008 -0800
@@ -181,7 +181,7 @@
 	sm->sm_space -= size;
 }
 
-int
+boolean_t
 space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
 {
 	avl_index_t where;
@@ -221,59 +221,10 @@
 {
 	space_seg_t *ss;
 
-	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
-		func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
-}
-
-void
-space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
-{
-	avl_tree_t *t = &sm->sm_root;
-	avl_index_t where;
-	space_seg_t *ss, search;
-	uint64_t end = start + size;
-	uint64_t rm_start, rm_end;
-
 	ASSERT(MUTEX_HELD(sm->sm_lock));
 
-	search.ss_start = start;
-	search.ss_end = start;
-
-	for (;;) {
-		ss = avl_find(t, &search, &where);
-
-		if (ss == NULL)
-			ss = avl_nearest(t, where, AVL_AFTER);
-
-		if (ss == NULL || ss->ss_start >= end)
-			break;
-
-		rm_start = MAX(ss->ss_start, start);
-		rm_end = MIN(ss->ss_end, end);
-
-		space_map_remove(sm, rm_start, rm_end - rm_start);
-	}
-}
-
-/*
- * Replace smd with the union of smd and sms.
- */
-void
-space_map_union(space_map_t *smd, space_map_t *sms)
-{
-	avl_tree_t *t = &sms->sm_root;
-	space_seg_t *ss;
-
-	ASSERT(MUTEX_HELD(smd->sm_lock));
-
-	/*
-	 * For each source segment, remove any intersections with the
-	 * destination, then add the source segment to the destination.
-	 */
-	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
-		space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start);
-		space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start);
-	}
+	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+		func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
 }
 
 /*
@@ -505,3 +456,131 @@
 	smo->smo_objsize = 0;
 	smo->smo_alloc = 0;
 }
+
+/*
+ * Space map reference trees.
+ *
+ * A space map is a collection of integers.  Every integer is either
+ * in the map, or it's not.  A space map reference tree generalizes
+ * the idea: it allows its members to have arbitrary reference counts,
+ * as opposed to the implicit reference count of 0 or 1 in a space map.
+ * This representation comes in handy when computing the union or
+ * intersection of multiple space maps.  For example, the union of
+ * N space maps is the subset of the reference tree with refcnt >= 1.
+ * The intersection of N space maps is the subset with refcnt >= N.
+ *
+ * [It's very much like a Fourier transform.  Unions and intersections
+ * are hard to perform in the 'space map domain', so we convert the maps
+ * into the 'reference count domain', where it's trivial, then invert.]
+ *
+ * vdev_dtl_reassess() uses computations of this form to determine
+ * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
+ * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
+ * has an outage wherever refcnt >= vdev_children.
+ */
+static int
+space_map_ref_compare(const void *x1, const void *x2)
+{
+	const space_ref_t *sr1 = x1;
+	const space_ref_t *sr2 = x2;
+
+	if (sr1->sr_offset < sr2->sr_offset)
+		return (-1);
+	if (sr1->sr_offset > sr2->sr_offset)
+		return (1);
+
+	if (sr1 < sr2)
+		return (-1);
+	if (sr1 > sr2)
+		return (1);
+
+	return (0);
+}
+
+void
+space_map_ref_create(avl_tree_t *t)
+{
+	avl_create(t, space_map_ref_compare,
+	    sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
+}
+
+void
+space_map_ref_destroy(avl_tree_t *t)
+{
+	space_ref_t *sr;
+	void *cookie = NULL;
+
+	while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
+		kmem_free(sr, sizeof (*sr));
+
+	avl_destroy(t);
+}
+
+static void
+space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
+{
+	space_ref_t *sr;
+
+	sr = kmem_alloc(sizeof (*sr), KM_SLEEP);
+	sr->sr_offset = offset;
+	sr->sr_refcnt = refcnt;
+
+	avl_add(t, sr);
+}
+
+void
+space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
+	int64_t refcnt)
+{
+	space_map_ref_add_node(t, start, refcnt);
+	space_map_ref_add_node(t, end, -refcnt);
+}
+
+/*
+ * Convert (or add) a space map into a reference tree.
+ */
+void
+space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt)
+{
+	space_seg_t *ss;
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+
+	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+		space_map_ref_add_seg(t, ss->ss_start, ss->ss_end, refcnt);
+}
+
+/*
+ * Convert a reference tree into a space map.  The space map will contain
+ * all members of the reference tree for which refcnt >= minref.
+ */
+void
+space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref)
+{
+	uint64_t start = -1ULL;
+	int64_t refcnt = 0;
+	space_ref_t *sr;
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+
+	space_map_vacate(sm, NULL, NULL);
+
+	for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
+		refcnt += sr->sr_refcnt;
+		if (refcnt >= minref) {
+			if (start == -1ULL) {
+				start = sr->sr_offset;
+			}
+		} else {
+			if (start != -1ULL) {
+				uint64_t end = sr->sr_offset;
+				ASSERT(start <= end);
+				if (end > start)
+					space_map_add(sm, start, end - start);
+				start = -1ULL;
+			}
+		}
+	}
+	ASSERT(refcnt == 0);
+	ASSERT(start == -1ULL);
+}
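
The reference-tree comment above carries the key idea behind the new DTL machinery, so a minimal userland sketch of the same sweep may help; the names and numbers are illustrative and this is not the kernel implementation. Each segment [start, end) contributes +refcnt at its start and -refcnt at its end; walking the events in offset order and keeping the ranges where the running refcnt >= minref gives the union (minref = 1, as for DTL_PARTIAL) or the stricter forms used for DTL_MISSING (minref = vdev_children for a mirror, nparity + 1 for RAID-Z).

/*
 * Toy model of space_map_ref_add_seg() / space_map_ref_generate_map():
 * each segment [start, end) contributes +refcnt at start and -refcnt at
 * end; sweeping the events in offset order and emitting the ranges where
 * the running refcnt >= minref gives the union (minref = 1) or the
 * intersection (minref = number of maps).  Illustrative only.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct {
	long long offset;
	int delta;
} ref_event_t;

static int
ref_compare(const void *a, const void *b)
{
	const ref_event_t *ra = a, *rb = b;

	if (ra->offset < rb->offset)
		return (-1);
	return (ra->offset > rb->offset);
}

static void
generate_map(ref_event_t *ev, int nev, int minref, const char *label)
{
	long long start = -1;
	int refcnt = 0;

	qsort(ev, nev, sizeof (ref_event_t), ref_compare);
	printf("%s:", label);
	for (int i = 0; i < nev; i++) {
		refcnt += ev[i].delta;
		if (refcnt >= minref && start == -1)
			start = ev[i].offset;
		else if (refcnt < minref && start != -1) {
			printf(" [%lld,%lld)", start, ev[i].offset);
			start = -1;
		}
	}
	printf("\n");
}

int
main(void)
{
	/* Two children missing txgs [10,20) and [15,30) respectively. */
	ref_event_t ev[] = {
		{ 10, +1 }, { 20, -1 },		/* child 0's DTL_MISSING */
		{ 15, +1 }, { 30, -1 },		/* child 1's DTL_MISSING */
	};
	int nev = sizeof (ev) / sizeof (ev[0]);

	/* Parent DTL_PARTIAL: any child missing (refcnt >= 1). */
	generate_map(ev, nev, 1, "DTL_PARTIAL");

	/* Two-way mirror's DTL_MISSING: every child missing (refcnt >= 2). */
	generate_map(ev, nev, 2, "DTL_MISSING");

	return (0);
}

Compiled and run, this prints [10,30) for the minref = 1 pass and [15,20) for the minref = 2 pass: any child missing versus both children missing.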
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Mon Dec 01 12:43:36 2008 -0800
@@ -352,7 +352,8 @@
 extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
 extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
     int replacing);
-extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done);
+extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
+    int replace_done);
 extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
 extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
 
@@ -476,6 +477,8 @@
 extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp);
 extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
+extern boolean_t spa_writeable(spa_t *spa);
+extern int spa_mode(spa_t *spa);
 
 /* history logging */
 typedef enum history_log_type {
@@ -546,7 +549,7 @@
 #define	dprintf_bp(bp, fmt, ...)
 #endif
 
-extern int spa_mode;			/* mode, e.g. FREAD | FWRITE */
+extern int spa_mode_global;			/* mode, e.g. FREAD | FWRITE */
 
 #ifdef	__cplusplus
 }
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Mon Dec 01 12:43:36 2008 -0800
@@ -170,6 +170,7 @@
 	boolean_t	spa_import_faulted;	/* allow faulted vdevs */
 	boolean_t	spa_is_root;		/* pool is root */
 	int		spa_minref;		/* num refs when first opened */
+	int		spa_mode;		/* FREAD | FWRITE */
 	spa_log_state_t spa_log_state;		/* log state */
 	/*
 	 * spa_refcnt & spa_config_lock must be the last elements
--- a/usr/src/uts/common/fs/zfs/sys/space_map.h	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/space_map.h	Mon Dec 01 12:43:36 2008 -0800
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_SPACE_MAP_H
 #define	_SYS_SPACE_MAP_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/avl.h>
 #include <sys/dmu.h>
 
@@ -58,6 +56,12 @@
 	uint64_t	ss_end;		/* ending offset (non-inclusive) */
 } space_seg_t;
 
+typedef struct space_ref {
+	avl_node_t	sr_node;	/* AVL node */
+	uint64_t	sr_offset;	/* offset (start or end) */
+	int64_t		sr_refcnt;	/* associated reference count */
+} space_ref_t;
+
 typedef struct space_map_obj {
 	uint64_t	smo_object;	/* on-disk space map object */
 	uint64_t	smo_objsize;	/* size of the object */
@@ -133,13 +137,12 @@
 extern void space_map_destroy(space_map_t *sm);
 extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
 extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
-extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size);
+extern boolean_t space_map_contains(space_map_t *sm,
+    uint64_t start, uint64_t size);
 extern void space_map_vacate(space_map_t *sm,
     space_map_func_t *func, space_map_t *mdest);
 extern void space_map_walk(space_map_t *sm,
     space_map_func_t *func, space_map_t *mdest);
-extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_union(space_map_t *smd, space_map_t *sms);
 
 extern void space_map_load_wait(space_map_t *sm);
 extern int space_map_load(space_map_t *sm, space_map_ops_t *ops,
@@ -155,6 +158,15 @@
 extern void space_map_truncate(space_map_obj_t *smo,
     objset_t *os, dmu_tx_t *tx);
 
+extern void space_map_ref_create(avl_tree_t *t);
+extern void space_map_ref_destroy(avl_tree_t *t);
+extern void space_map_ref_add_seg(avl_tree_t *t,
+    uint64_t start, uint64_t end, int64_t refcnt);
+extern void space_map_ref_add_map(avl_tree_t *t,
+    space_map_t *sm, int64_t refcnt);
+extern void space_map_ref_generate_map(avl_tree_t *t,
+    space_map_t *sm, int64_t minref);
+
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h	Mon Dec 01 12:43:36 2008 -0800
@@ -36,6 +36,14 @@
 extern "C" {
 #endif
 
+typedef enum vdev_dtl_type {
+	DTL_MISSING,	/* 0% replication: no copies of the data */
+	DTL_PARTIAL,	/* less than 100% replication: some copies missing */
+	DTL_SCRUB,	/* unable to fully repair during scrub/resilver */
+	DTL_OUTAGE,	/* temporarily missing (used to attempt detach) */
+	DTL_TYPES
+} vdev_dtl_type_t;
+
 extern boolean_t zfs_nocacheflush;
 
 extern int vdev_open(vdev_t *);
@@ -50,10 +58,14 @@
 extern boolean_t vdev_is_bootable(vdev_t *vd);
 extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
 extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
-extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size);
-extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size);
+extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
+    uint64_t txg, uint64_t size);
+extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
+    uint64_t txg, uint64_t size);
+extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
 extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
     int scrub_done);
+extern boolean_t vdev_dtl_required(vdev_t *vd);
 extern boolean_t vdev_resilver_needed(vdev_t *vd,
     uint64_t *minp, uint64_t *maxp);
 
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h	Mon Dec 01 12:43:36 2008 -0800
@@ -123,8 +123,7 @@
 	vdev_t		*vdev_parent;	/* parent vdev			*/
 	vdev_t		**vdev_child;	/* array of children		*/
 	uint64_t	vdev_children;	/* number of children		*/
-	space_map_t	vdev_dtl_map;	/* dirty time log in-core state	*/
-	space_map_t	vdev_dtl_scrub;	/* DTL for scrub repair writes	*/
+	space_map_t	vdev_dtl[DTL_TYPES]; /* in-core dirty time logs	*/
 	vdev_stat_t	vdev_stat;	/* virtual device statistics	*/
 
 	/*
@@ -149,7 +148,7 @@
 	 * Leaf vdev state.
 	 */
 	uint64_t	vdev_psize;	/* physical device capacity	*/
-	space_map_obj_t	vdev_dtl;	/* dirty time log on-disk state	*/
+	space_map_obj_t	vdev_dtl_smo;	/* dirty time log space map obj	*/
 	txg_node_t	vdev_dtl_node;	/* per-txg dirty DTL linkage	*/
 	uint64_t	vdev_wholedisk;	/* true if this is a whole disk */
 	uint64_t	vdev_offline;	/* persistent offline state	*/
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Mon Dec 01 12:43:36 2008 -0800
@@ -132,12 +132,13 @@
 #define	ZIO_FLAG_IO_RETRY		0x00400
 #define	ZIO_FLAG_IO_REWRITE		0x00800
 
-#define	ZIO_FLAG_PROBE			0x01000
+#define	ZIO_FLAG_SELF_HEAL		0x01000
 #define	ZIO_FLAG_RESILVER		0x02000
 #define	ZIO_FLAG_SCRUB			0x04000
 #define	ZIO_FLAG_SCRUB_THREAD		0x08000
 
-#define	ZIO_FLAG_GANG_CHILD		0x10000
+#define	ZIO_FLAG_PROBE			0x10000
+#define	ZIO_FLAG_GANG_CHILD		0x20000
 
 #define	ZIO_FLAG_GANG_INHERIT		\
 	(ZIO_FLAG_CANFAIL |		\
@@ -146,6 +147,7 @@
 	ZIO_FLAG_DONT_RETRY |		\
 	ZIO_FLAG_DONT_CACHE |		\
 	ZIO_FLAG_DONT_AGGREGATE |	\
+	ZIO_FLAG_SELF_HEAL |		\
 	ZIO_FLAG_RESILVER |		\
 	ZIO_FLAG_SCRUB |		\
 	ZIO_FLAG_SCRUB_THREAD)
@@ -156,6 +158,14 @@
 	ZIO_FLAG_IO_RETRY |		\
 	ZIO_FLAG_PROBE)
 
+#define	ZIO_FLAG_AGG_INHERIT		\
+	(ZIO_FLAG_DONT_AGGREGATE |	\
+	ZIO_FLAG_IO_REPAIR |		\
+	ZIO_FLAG_SELF_HEAL |		\
+	ZIO_FLAG_RESILVER |		\
+	ZIO_FLAG_SCRUB |		\
+	ZIO_FLAG_SCRUB_THREAD)
+
 #define	ZIO_PIPELINE_CONTINUE		0x100
 #define	ZIO_PIPELINE_STOP		0x101
 
--- a/usr/src/uts/common/fs/zfs/vdev.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Mon Dec 01 12:43:36 2008 -0800
@@ -316,8 +316,10 @@
 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
-	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
-	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
+	for (int t = 0; t < DTL_TYPES; t++) {
+		space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
+		    &vd->vdev_dtl_lock);
+	}
 	txg_list_create(&vd->vdev_ms_list,
 	    offsetof(struct metaslab, ms_txg_node));
 	txg_list_create(&vd->vdev_dtl_list,
@@ -474,7 +476,7 @@
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) {
 		if (alloctype == VDEV_ALLOC_LOAD) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
-			    &vd->vdev_dtl.smo_object);
+			    &vd->vdev_dtl_smo.smo_object);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
 			    &vd->vdev_unspare);
 		}
@@ -566,12 +568,14 @@
 
 	txg_list_destroy(&vd->vdev_ms_list);
 	txg_list_destroy(&vd->vdev_dtl_list);
+
 	mutex_enter(&vd->vdev_dtl_lock);
-	space_map_unload(&vd->vdev_dtl_map);
-	space_map_destroy(&vd->vdev_dtl_map);
-	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
-	space_map_destroy(&vd->vdev_dtl_scrub);
+	for (int t = 0; t < DTL_TYPES; t++) {
+		space_map_unload(&vd->vdev_dtl[t]);
+		space_map_destroy(&vd->vdev_dtl[t]);
+	}
 	mutex_exit(&vd->vdev_dtl_lock);
+
 	mutex_destroy(&vd->vdev_dtl_lock);
 	mutex_destroy(&vd->vdev_stat_lock);
 	mutex_destroy(&vd->vdev_probe_lock);
@@ -709,14 +713,18 @@
 
 	vdev_remove_child(mvd, cvd);
 	vdev_remove_child(pvd, mvd);
+
 	/*
 	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
 	 * Otherwise, we could have detached an offline device, and when we
 	 * go to import the pool we'll think we have two top-level vdevs,
 	 * instead of a different version of the same top-level vdev.
 	 */
-	if (mvd->vdev_top == mvd)
-		cvd->vdev_guid = cvd->vdev_guid_sum = mvd->vdev_guid;
+	if (mvd->vdev_top == mvd) {
+		uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
+		cvd->vdev_guid += guid_delta;
+		cvd->vdev_guid_sum += guid_delta;
+	}
 	cvd->vdev_id = mvd->vdev_id;
 	vdev_add_child(pvd, cvd);
 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
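
A quick worked example of the guid_delta bookkeeping above, using made-up guids: if mvd->vdev_guid = 1000 and cvd->vdev_guid = 300 with cvd->vdev_guid_sum = 900 (cvd plus any children), then guid_delta = 700; after the update, cvd->vdev_guid = 1000 -- the same top-level guid the old code assigned -- and cvd->vdev_guid_sum = 1600 = 900 - 300 + 1000, so the sum still reflects exactly the guids present in cvd's subtree. Applying one delta to both fields keeps that invariant even when cvd is not a leaf.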
@@ -815,6 +823,7 @@
 static void
 vdev_probe_done(zio_t *zio)
 {
+	spa_t *spa = zio->io_spa;
 	vdev_probe_stats_t *vps = zio->io_private;
 	vdev_t *vd = vps->vps_vd;
 
@@ -822,7 +831,7 @@
 		ASSERT(zio->io_vd == vd);
 		if (zio->io_error == 0)
 			vps->vps_readable = 1;
-		if (zio->io_error == 0 && (spa_mode & FWRITE)) {
+		if (zio->io_error == 0 && spa_writeable(spa)) {
 			zio_nowait(zio_write_phys(vps->vps_root, vd,
 			    zio->io_offset, zio->io_size, zio->io_data,
 			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
@@ -843,12 +852,12 @@
 		vd->vdev_cant_write |= !vps->vps_writeable;
 
 		if (vdev_readable(vd) &&
-		    (vdev_writeable(vd) || !(spa_mode & FWRITE))) {
+		    (vdev_writeable(vd) || !spa_writeable(spa))) {
 			zio->io_error = 0;
 		} else {
 			ASSERT(zio->io_error != 0);
 			zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
-			    zio->io_spa, vd, NULL, 0, 0);
+			    spa, vd, NULL, 0, 0);
 			zio->io_error = ENXIO;
 		}
 		kmem_free(vps, sizeof (*vps));
@@ -916,12 +925,15 @@
 int
 vdev_open(vdev_t *vd)
 {
+	spa_t *spa = vd->vdev_spa;
 	int error;
 	int c;
 	uint64_t osize = 0;
 	uint64_t asize, psize;
 	uint64_t ashift = 0;
 
+	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
 	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
 	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
 	    vd->vdev_state == VDEV_STATE_OFFLINE);
@@ -1055,16 +1067,12 @@
 
 	/*
 	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
-	 * resilver.  But don't do this if we are doing a reopen for a
-	 * scrub, since this would just restart the scrub we are already
-	 * doing.
+	 * resilver.  But don't do this if we are doing a reopen for a scrub,
+	 * since this would just restart the scrub we are already doing.
 	 */
-	if (vd->vdev_children == 0 && !vd->vdev_spa->spa_scrub_reopen) {
-		mutex_enter(&vd->vdev_dtl_lock);
-		if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd))
-			spa_async_request(vd->vdev_spa, SPA_ASYNC_RESILVER);
-		mutex_exit(&vd->vdev_dtl_lock);
-	}
+	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
+	    vdev_resilver_needed(vd, NULL, NULL))
+		spa_async_request(spa, SPA_ASYNC_RESILVER);
 
 	return (0);
 }
@@ -1165,6 +1173,10 @@
 void
 vdev_close(vdev_t *vd)
 {
+	spa_t *spa = vd->vdev_spa;
+
+	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
 	vd->vdev_ops->vdev_op_close(vd);
 
 	vdev_cache_purge(vd);
@@ -1283,34 +1295,88 @@
 	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
 }
 
+/*
+ * DTLs.
+ *
+ * A vdev's DTL (dirty time log) is the set of transaction groups for which
+ * the vdev has less than perfect replication.  There are four kinds of DTL:
+ *
+ * DTL_MISSING: txgs for which the vdev has no valid copies of the data
+ *
+ * DTL_PARTIAL: txgs for which data is available, but not fully replicated
+ *
+ * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
+ *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
+ *	txgs that was scrubbed.
+ *
+ * DTL_OUTAGE: txgs which cannot currently be read, whether due to
+ *	persistent errors or just some device being offline.
+ *	Unlike the other three, the DTL_OUTAGE map is not generally
+ *	maintained; it's only computed when needed, typically to
+ *	determine whether a device can be detached.
+ *
+ * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
+ * either has the data or it doesn't.
+ *
+ * For interior vdevs such as mirror and RAID-Z the picture is more complex.
+ * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
+ * if any child is less than fully replicated, then so is its parent.
+ * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
+ * comprising only those txgs which appear in more than 'maxfaults' children;
+ * those are the txgs we don't have enough replication to read.  For example,
+ * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
+ * thus, its DTL_MISSING consists of the set of txgs that appear in more than
+ * two child DTL_MISSING maps.
+ *
+ * It should be clear from the above that to compute the DTLs and outage maps
+ * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
+ * Therefore, that is all we keep on disk.  When loading the pool, or after
+ * a configuration change, we generate all other DTLs from first principles.
+ */
 void
-vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
+vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
+	space_map_t *sm = &vd->vdev_dtl[t];
+
+	ASSERT(t < DTL_TYPES);
+	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+
 	mutex_enter(sm->sm_lock);
 	if (!space_map_contains(sm, txg, size))
 		space_map_add(sm, txg, size);
 	mutex_exit(sm->sm_lock);
 }
 
-int
-vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
+boolean_t
+vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
-	int dirty;
-
-	/*
-	 * Quick test without the lock -- covers the common case that
-	 * there are no dirty time segments.
-	 */
-	if (sm->sm_space == 0)
-		return (0);
+	space_map_t *sm = &vd->vdev_dtl[t];
+	boolean_t dirty = B_FALSE;
+
+	ASSERT(t < DTL_TYPES);
+	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
 	mutex_enter(sm->sm_lock);
-	dirty = space_map_contains(sm, txg, size);
+	if (sm->sm_space != 0)
+		dirty = space_map_contains(sm, txg, size);
 	mutex_exit(sm->sm_lock);
 
 	return (dirty);
 }
 
+boolean_t
+vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
+{
+	space_map_t *sm = &vd->vdev_dtl[t];
+	boolean_t empty;
+
+	mutex_enter(sm->sm_lock);
+	empty = (sm->sm_space == 0);
+	mutex_exit(sm->sm_lock);
+
+	return (empty);
+}
+
 /*
  * Reassess DTLs after a config change or scrub completion.
  */
@@ -1318,11 +1384,19 @@
 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
 {
 	spa_t *spa = vd->vdev_spa;
-	int c;
-
-	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
-
-	if (vd->vdev_children == 0) {
+	avl_tree_t reftree;
+	int minref;
+
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+	for (int c = 0; c < vd->vdev_children; c++)
+		vdev_dtl_reassess(vd->vdev_child[c], txg,
+		    scrub_txg, scrub_done);
+
+	if (vd == spa->spa_root_vdev)
+		return;
+
+	if (vd->vdev_ops->vdev_op_leaf) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		if (scrub_txg != 0 &&
 		    (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
@@ -1333,12 +1407,38 @@
 			 * will be valid, so excise the old region and
 			 * fold in the scrub dtl.  Otherwise, leave the
 			 * dtl as-is if there was an error.
+			 *
+			 * There's a little trick here: to excise the beginning
+			 * of the DTL_MISSING map, we put it into a reference
+			 * tree and then add a segment with refcnt -1 that
+			 * covers the range [0, scrub_txg).  This means
+			 * that each txg in that range has refcnt -1 or 0.
+			 * We then add DTL_SCRUB with a refcnt of 2, so that
+			 * entries in the range [0, scrub_txg) will have a
+			 * positive refcnt -- either 1 or 2.  We then convert
+			 * the reference tree into the new DTL_MISSING map.
 			 */
-			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
-			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
+			space_map_ref_create(&reftree);
+			space_map_ref_add_map(&reftree,
+			    &vd->vdev_dtl[DTL_MISSING], 1);
+			space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
+			space_map_ref_add_map(&reftree,
+			    &vd->vdev_dtl[DTL_SCRUB], 2);
+			space_map_ref_generate_map(&reftree,
+			    &vd->vdev_dtl[DTL_MISSING], 1);
+			space_map_ref_destroy(&reftree);
 		}
+		space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
+		space_map_walk(&vd->vdev_dtl[DTL_MISSING],
+		    space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
 		if (scrub_done)
-			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+			space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
+		space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
+		if (!vdev_readable(vd))
+			space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
+		else
+			space_map_walk(&vd->vdev_dtl[DTL_MISSING],
+			    space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
 		mutex_exit(&vd->vdev_dtl_lock);
 
 		if (txg != 0)
@@ -1346,35 +1446,34 @@
 		return;
 	}
 
-	/*
-	 * Make sure the DTLs are always correct under the scrub lock.
-	 */
-	if (vd == spa->spa_root_vdev)
-		mutex_enter(&spa->spa_scrub_lock);
-
 	mutex_enter(&vd->vdev_dtl_lock);
-	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
-	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
+	for (int t = 0; t < DTL_TYPES; t++) {
+		if (t == DTL_SCRUB)
+			continue;			/* leaf vdevs only */
+		if (t == DTL_PARTIAL)
+			minref = 1;			/* i.e. non-zero */
+		else if (vd->vdev_nparity != 0)
+			minref = vd->vdev_nparity + 1;	/* RAID-Z */
+		else
+			minref = vd->vdev_children;	/* any kind of mirror */
+		space_map_ref_create(&reftree);
+		for (int c = 0; c < vd->vdev_children; c++) {
+			vdev_t *cvd = vd->vdev_child[c];
+			mutex_enter(&cvd->vdev_dtl_lock);
+			space_map_ref_add_map(&reftree, &cvd->vdev_dtl[t], 1);
+			mutex_exit(&cvd->vdev_dtl_lock);
+		}
+		space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
+		space_map_ref_destroy(&reftree);
+	}
 	mutex_exit(&vd->vdev_dtl_lock);
-
-	for (c = 0; c < vd->vdev_children; c++) {
-		vdev_t *cvd = vd->vdev_child[c];
-		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
-		mutex_enter(&vd->vdev_dtl_lock);
-		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
-		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
-		mutex_exit(&vd->vdev_dtl_lock);
-	}
-
-	if (vd == spa->spa_root_vdev)
-		mutex_exit(&spa->spa_scrub_lock);
 }
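
The scrub-excision trick used in the leaf-vdev branch above can be sanity-checked one txg at a time with a trivial sketch; the names are hypothetical, and the real code operates on whole segments through the reference tree rather than per txg.

	/*
	 * Refcount for a single txg: +1 if it was in the old DTL_MISSING,
	 * -1 if it lies in [0, scrub_txg), +2 if the scrub could not
	 * repair it.  The new DTL_MISSING keeps txgs with refcount >= 1.
	 */
	static int
	new_missing_contains(int in_old_missing, int in_scrub_dtl,
	    uint64_t txg, uint64_t scrub_txg)
	{
		int ref = 0;

		if (in_old_missing)
			ref += 1;
		if (txg < scrub_txg)
			ref -= 1;
		if (in_scrub_dtl)
			ref += 2;

		return (ref >= 1);
	}

A txg below scrub_txg that the scrub repaired nets out to 0 and is excised; one the scrub could not repair nets out to 2 and is kept; txgs at or beyond scrub_txg keep their old membership.
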
 
 static int
 vdev_dtl_load(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
-	space_map_obj_t *smo = &vd->vdev_dtl;
+	space_map_obj_t *smo = &vd->vdev_dtl_smo;
 	objset_t *mos = spa->spa_meta_objset;
 	dmu_buf_t *db;
 	int error;
@@ -1392,7 +1491,8 @@
 	dmu_buf_rele(db, FTAG);
 
 	mutex_enter(&vd->vdev_dtl_lock);
-	error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos);
+	error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
+	    NULL, SM_ALLOC, smo, mos);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	return (error);
@@ -1402,8 +1502,8 @@
 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
-	space_map_obj_t *smo = &vd->vdev_dtl;
-	space_map_t *sm = &vd->vdev_dtl_map;
+	space_map_obj_t *smo = &vd->vdev_dtl_smo;
+	space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
 	objset_t *mos = spa->spa_meta_objset;
 	space_map_t smsync;
 	kmutex_t smlock;
@@ -1461,6 +1561,37 @@
 }
 
 /*
+ * Determine whether the specified vdev can be offlined/detached/removed
+ * without losing data.
+ */
+boolean_t
+vdev_dtl_required(vdev_t *vd)
+{
+	spa_t *spa = vd->vdev_spa;
+	vdev_t *tvd = vd->vdev_top;
+	uint8_t cant_read = vd->vdev_cant_read;
+	boolean_t required;
+
+	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+	if (vd == spa->spa_root_vdev || vd == tvd)
+		return (B_TRUE);
+
+	/*
+	 * Temporarily mark the device as unreadable, and then determine
+	 * whether this results in any DTL outages in the top-level vdev.
+	 * If not, we can safely offline/detach/remove the device.
+	 */
+	vd->vdev_cant_read = B_TRUE;
+	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
+	required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
+	vd->vdev_cant_read = cant_read;
+	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
+
+	return (required);
+}
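
A hypothetical caller sketch of the check above (locking omitted; the actual consumers are the offline and detach paths later in this change): a device may be taken away only when it is an aux vdev or when no top-level outage would result.

	static boolean_t
	can_take_device_away(vdev_t *vd)
	{
		/* Aux vdevs (assumed spares/cache here) never hold the only copy. */
		if (vd->vdev_aux != NULL)
			return (B_TRUE);
		return (!vdev_dtl_required(vd));
	}
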
+
+/*
  * Determine if resilver is needed, and if so the txg range.
  */
 boolean_t
@@ -1472,19 +1603,19 @@
 
 	if (vd->vdev_children == 0) {
 		mutex_enter(&vd->vdev_dtl_lock);
-		if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) {
+		if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
+		    vdev_writeable(vd)) {
 			space_seg_t *ss;
 
-			ss = avl_first(&vd->vdev_dtl_map.sm_root);
+			ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
 			thismin = ss->ss_start - 1;
-			ss = avl_last(&vd->vdev_dtl_map.sm_root);
+			ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
 			thismax = ss->ss_end;
 			needed = B_TRUE;
 		}
 		mutex_exit(&vd->vdev_dtl_lock);
 	} else {
-		int c;
-		for (c = 0; c < vd->vdev_children; c++) {
+		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 			uint64_t cmin, cmax;
 
@@ -1506,12 +1637,10 @@
 void
 vdev_load(vdev_t *vd)
 {
-	int c;
-
 	/*
 	 * Recursively load all children.
 	 */
-	for (c = 0; c < vd->vdev_children; c++)
+	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_load(vd->vdev_child[c]);
 
 	/*
@@ -1731,11 +1860,7 @@
 	    vd->vdev_parent->vdev_child[0] == vd)
 		vd->vdev_unspare = B_TRUE;
 
-	(void) spa_vdev_state_exit(spa, vd, 0);
-
-	VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
-
-	return (0);
+	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 int
@@ -1756,13 +1881,10 @@
 	 */
 	if (!vd->vdev_offline) {
 		/*
-		 * If this device's top-level vdev has a non-empty DTL,
-		 * don't allow the device to be offlined.
-		 *
-		 * XXX -- make this more precise by allowing the offline
-		 * as long as the remaining devices don't have any DTL holes.
+		 * If this device has the only valid copy of some data,
+		 * don't allow it to be offlined.
 		 */
-		if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
+		if (vd->vdev_aux == NULL && vdev_dtl_required(vd))
 			return (spa_vdev_state_exit(spa, NULL, EBUSY));
 
 		/*
@@ -1772,7 +1894,7 @@
 		 */
 		vd->vdev_offline = B_TRUE;
 		vdev_reopen(vd->vdev_top);
-		if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) {
+		if (vd->vdev_aux == NULL && vdev_is_dead(vd->vdev_top)) {
 			vd->vdev_offline = B_FALSE;
 			vdev_reopen(vd->vdev_top);
 			return (spa_vdev_state_exit(spa, NULL, EBUSY));
@@ -1852,13 +1974,17 @@
 boolean_t
 vdev_allocatable(vdev_t *vd)
 {
+	uint64_t state = vd->vdev_state;
+
 	/*
-	 * We currently allow allocations from vdevs which maybe in the
+	 * We currently allow allocations from vdevs which may be in the
 	 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
 	 * fails to reopen then we'll catch it later when we're holding
-	 * the proper locks.
+	 * the proper locks.  Note that we have to get the vdev state
+	 * in a local variable because although it changes atomically,
+	 * we're asking two separate questions about it.
 	 */
-	return (!(vdev_is_dead(vd) && vd->vdev_state != VDEV_STATE_CLOSED) &&
+	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
 	    !vd->vdev_cant_write);
 }
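
The snapshot pattern the comment above calls for can also be written as a tiny helper (hypothetical; not in the source), so that both questions are asked of one observed value:

	static boolean_t
	state_allows_alloc(uint64_t state)
	{
		/*
		 * One snapshot answers both questions: is the vdev dead,
		 * and if so, is it merely closed (i.e. being reopened)?
		 */
		return (!(state < VDEV_STATE_DEGRADED &&
		    state != VDEV_STATE_CLOSED));
	}

Calling state_allows_alloc(vd->vdev_state) loads the state once, so a concurrent transition cannot satisfy each half of the test against a different value.
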
 
@@ -1928,7 +2054,8 @@
 void
 vdev_stat_update(zio_t *zio, uint64_t psize)
 {
-	vdev_t *rvd = zio->io_spa->spa_root_vdev;
+	spa_t *spa = zio->io_spa;
+	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
 	vdev_t *pvd;
 	uint64_t txg = zio->io_txg;
@@ -1961,21 +2088,23 @@
 			return;
 
 		ASSERT(vd == zio->io_vd);
-		if (!(flags & ZIO_FLAG_IO_BYPASS)) {
-			mutex_enter(&vd->vdev_stat_lock);
-			vs->vs_ops[type]++;
-			vs->vs_bytes[type] += psize;
-			mutex_exit(&vd->vdev_stat_lock);
-		}
+
+		if (flags & ZIO_FLAG_IO_BYPASS)
+			return;
+
+		mutex_enter(&vd->vdev_stat_lock);
+
 		if (flags & ZIO_FLAG_IO_REPAIR) {
-			ASSERT(zio->io_delegate_list == NULL);
-			mutex_enter(&vd->vdev_stat_lock);
 			if (flags & ZIO_FLAG_SCRUB_THREAD)
 				vs->vs_scrub_repaired += psize;
-			else
+			if (flags & ZIO_FLAG_SELF_HEAL)
 				vs->vs_self_healed += psize;
-			mutex_exit(&vd->vdev_stat_lock);
 		}
+
+		vs->vs_ops[type]++;
+		vs->vs_bytes[type] += psize;
+
+		mutex_exit(&vd->vdev_stat_lock);
 		return;
 	}
 
@@ -1993,19 +2122,39 @@
 		vs->vs_write_errors++;
 	mutex_exit(&vd->vdev_stat_lock);
 
-	if (type == ZIO_TYPE_WRITE && txg != 0 && vd->vdev_children == 0) {
-		if (flags & ZIO_FLAG_SCRUB_THREAD) {
-			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
-			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
-				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
+	if (type == ZIO_TYPE_WRITE && txg != 0 &&
+	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
+	    (flags & ZIO_FLAG_SCRUB_THREAD))) {
+		/*
+		 * This is either a normal write (not a repair), or it's a
+		 * repair induced by the scrub thread.  In the normal case,
+		 * we commit the DTL change in the same txg as the block
+		 * was born.  In the scrub-induced repair case, we know that
+		 * scrubs run in first-pass syncing context, so we commit
+		 * the DTL change in spa->spa_syncing_txg.
+		 *
+		 * We currently do not make DTL entries for failed spontaneous
+		 * self-healing writes triggered by normal (non-scrubbing)
+		 * reads, because we have no transactional context in which to
+		 * do so -- and it's not clear that it'd be desirable anyway.
+		 */
+		if (vd->vdev_ops->vdev_op_leaf) {
+			uint64_t commit_txg = txg;
+			if (flags & ZIO_FLAG_SCRUB_THREAD) {
+				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+				ASSERT(spa_sync_pass(spa) == 1);
+				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
+				commit_txg = spa->spa_syncing_txg;
+			}
+			ASSERT(commit_txg >= spa->spa_syncing_txg);
+			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
+				return;
+			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
+			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
 		}
-		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
-			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
-				return;
-			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
-			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
-				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
-		}
+		if (vd != rvd)
+			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
 	}
 }
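
A compact restatement of the commit-txg rule from the comment above (hypothetical helper; vdev_stat_update() itself also dirties the DTLs and marks the top-level vdev dirty):

	static uint64_t
	dtl_commit_txg(uint64_t block_txg, uint64_t syncing_txg,
	    boolean_t scrub_repair)
	{
		/*
		 * Normal writes commit the DTL change in the txg the block
		 * was born; scrub-induced repairs run in first-pass syncing
		 * context and commit in the currently syncing txg.
		 */
		return (scrub_repair ? syncing_txg : block_txg);
	}

So a scrub repair of a block born in txg 50, performed while txg 200 is syncing, dirties DTL_SCRUB for txg 50 but commits that change as part of txg 200.
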
 
@@ -2218,7 +2367,8 @@
 void
 vdev_propagate_state(vdev_t *vd)
 {
-	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+	spa_t *spa = vd->vdev_spa;
+	vdev_t *rvd = spa->spa_root_vdev;
 	int degraded = 0, faulted = 0;
 	int corrupted = 0;
 	int c;
@@ -2229,7 +2379,7 @@
 			child = vd->vdev_child[c];
 
 			if (!vdev_readable(child) ||
-			    (!vdev_writeable(child) && (spa_mode & FWRITE))) {
+			    (!vdev_writeable(child) && spa_writeable(spa))) {
 				/*
 				 * Root special: if there is a top-level log
 				 * device, treat the root vdev as if it were
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c	Mon Dec 01 12:43:36 2008 -0800
@@ -47,6 +47,7 @@
 static int
 vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
 {
+	spa_t *spa = vd->vdev_spa;
 	vdev_disk_t *dvd;
 	struct dk_minfo dkm;
 	int error;
@@ -95,7 +96,7 @@
 
 	error = EINVAL;		/* presume failure */
 
-	if (vd->vdev_path != NULL && !spa_is_root(vd->vdev_spa)) {
+	if (vd->vdev_path != NULL && !spa_is_root(spa)) {
 		ddi_devid_t devid;
 
 		if (vd->vdev_wholedisk == -1ULL) {
@@ -105,18 +106,18 @@
 
 			(void) snprintf(buf, len, "%ss0", vd->vdev_path);
 
-			if (ldi_open_by_name(buf, spa_mode, kcred,
+			if (ldi_open_by_name(buf, spa_mode(spa), kcred,
 			    &lh, zfs_li) == 0) {
 				spa_strfree(vd->vdev_path);
 				vd->vdev_path = buf;
 				vd->vdev_wholedisk = 1ULL;
-				(void) ldi_close(lh, spa_mode, kcred);
+				(void) ldi_close(lh, spa_mode(spa), kcred);
 			} else {
 				kmem_free(buf, len);
 			}
 		}
 
-		error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred,
+		error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred,
 		    &dvd->vd_lh, zfs_li);
 
 		/*
@@ -126,7 +127,8 @@
 		    ldi_get_devid(dvd->vd_lh, &devid) == 0) {
 			if (ddi_devid_compare(devid, dvd->vd_devid) != 0) {
 				error = EINVAL;
-				(void) ldi_close(dvd->vd_lh, spa_mode, kcred);
+				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
+				    kcred);
 				dvd->vd_lh = NULL;
 			}
 			ddi_devid_free(devid);
@@ -146,7 +148,7 @@
 	 */
 	if (error != 0 && vd->vdev_devid != NULL)
 		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
-		    spa_mode, kcred, &dvd->vd_lh, zfs_li);
+		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
 
 	/*
 	 * If all else fails, then try opening by physical path (if available)
@@ -157,7 +159,7 @@
 	if (error) {
 		if (vd->vdev_physpath != NULL &&
 		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != ENODEV)
-			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode,
+			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
 			    kcred, &dvd->vd_lh, zfs_li);
 
 		/*
@@ -165,10 +167,9 @@
 		 * as above.  This hasn't been used in a very long time and we
 		 * don't need to propagate its oddities to this edge condition.
 		 */
-		if (error && vd->vdev_path != NULL &&
-		    !spa_is_root(vd->vdev_spa))
-			error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred,
-			    &dvd->vd_lh, zfs_li);
+		if (error && vd->vdev_path != NULL && !spa_is_root(spa))
+			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
+			    kcred, &dvd->vd_lh, zfs_li);
 	}
 
 	if (error) {
@@ -253,7 +254,7 @@
 		ddi_devid_free(dvd->vd_devid);
 
 	if (dvd->vd_lh != NULL)
-		(void) ldi_close(dvd->vd_lh, spa_mode, kcred);
+		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
 
 	kmem_free(dvd, sizeof (vdev_disk_t));
 	vd->vdev_tsd = NULL;
@@ -469,7 +470,7 @@
 	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
 	    &minor_name) == 0) {
 		error = ldi_open_by_devid(tmpdevid, minor_name,
-		    spa_mode, kcred, &vd_lh, zfs_li);
+		    FREAD, kcred, &vd_lh, zfs_li);
 		ddi_devid_free(tmpdevid);
 		ddi_devid_str_free(minor_name);
 	}
--- a/usr/src/uts/common/fs/zfs/vdev_file.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c	Mon Dec 01 12:43:36 2008 -0800
@@ -61,7 +61,7 @@
 	 */
 	ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
 	error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
-	    spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
+	    spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
 
 	if (error) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
@@ -105,7 +105,8 @@
 
 	if (vf->vf_vnode != NULL) {
 		(void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL);
-		(void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL);
+		(void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
+		    kcred, NULL);
 		VN_RELE(vf->vf_vnode);
 	}
 
--- a/usr/src/uts/common/fs/zfs/vdev_label.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c	Mon Dec 01 12:43:36 2008 -0800
@@ -277,9 +277,9 @@
 		    vd->vdev_islog) == 0);
 	}
 
-	if (vd->vdev_dtl.smo_object != 0)
+	if (vd->vdev_dtl_smo.smo_object != 0)
 		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
-		    vd->vdev_dtl.smo_object) == 0);
+		    vd->vdev_dtl_smo.smo_object) == 0);
 
 	if (getstats) {
 		vdev_stat_t vs;
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c	Mon Dec 01 12:43:36 2008 -0800
@@ -225,7 +225,7 @@
 			mc->mc_skipped = 1;
 			continue;
 		}
-		if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1))
+		if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
 			return (c);
 		mc->mc_error = ESTALE;
 		mc->mc_skipped = 1;
@@ -282,20 +282,10 @@
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 
 		/*
-		 * If this is a resilvering I/O to a replacing vdev,
-		 * only the last child should be written -- unless the
-		 * first child happens to have a DTL entry here as well.
-		 * All other writes go to all children.
+		 * Writes go to all children.
 		 */
-		if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing &&
-		    !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map,
-		    zio->io_txg, 1)) {
-			c = mm->mm_children - 1;
-			children = 1;
-		} else {
-			c = 0;
-			children = mm->mm_children;
-		}
+		c = 0;
+		children = mm->mm_children;
 	}
 
 	while (children--) {
@@ -398,7 +388,7 @@
 		ASSERT(zio->io_error != 0);
 	}
 
-	if (good_copies && (spa_mode & FWRITE) &&
+	if (good_copies && spa_writeable(zio->io_spa) &&
 	    (unexpected_errors ||
 	    (zio->io_flags & ZIO_FLAG_RESILVER) ||
 	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
@@ -419,7 +409,7 @@
 				if (mc->mc_tried)
 					continue;
 				if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
-				    !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map,
+				    !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
 				    zio->io_txg, 1))
 					continue;
 				mc->mc_error = ESTALE;
@@ -429,7 +419,8 @@
 			    mc->mc_vd, mc->mc_offset,
 			    zio->io_data, zio->io_size,
 			    ZIO_TYPE_WRITE, zio->io_priority,
-			    ZIO_FLAG_IO_REPAIR, NULL, NULL));
+			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
+			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 		}
 	}
 }
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c	Mon Dec 01 12:43:36 2008 -0800
@@ -176,6 +176,7 @@
 	zio_t *fio, *lio, *aio, *dio;
 	avl_tree_t *tree;
 	uint64_t size;
+	int flags;
 
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 
@@ -187,21 +188,32 @@
 
 	tree = fio->io_vdev_tree;
 	size = fio->io_size;
+	flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
 
-	while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
-	    !((dio->io_flags | fio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) &&
-	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
-		dio->io_delegate_next = fio;
-		fio = dio;
-		size += dio->io_size;
-	}
-
-	while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
-	    !((lio->io_flags | dio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) &&
-	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
-		lio->io_delegate_next = dio;
-		lio = dio;
-		size += dio->io_size;
+	if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
+		/*
+		 * We can aggregate I/Os that are adjacent and of the
+		 * same flavor, as expressed by the AGG_INHERIT flags.
+		 * The latter is necessary so that certain attributes
+		 * of the I/O, such as whether it's a normal I/O or a
+		 * scrub/resilver, can be preserved in the aggregate.
+		 */
+		while ((dio = AVL_PREV(tree, fio)) != NULL &&
+		    IS_ADJACENT(dio, fio) &&
+		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+		    size + dio->io_size <= zfs_vdev_aggregation_limit) {
+			dio->io_delegate_next = fio;
+			fio = dio;
+			size += dio->io_size;
+		}
+		while ((dio = AVL_NEXT(tree, lio)) != NULL &&
+		    IS_ADJACENT(lio, dio) &&
+		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+		    size + dio->io_size <= zfs_vdev_aggregation_limit) {
+			lio->io_delegate_next = dio;
+			lio = dio;
+			size += dio->io_size;
+		}
 	}
 
 	if (fio != lio) {
@@ -212,7 +224,7 @@
 
 		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
 		    buf, size, fio->io_type, ZIO_PRIORITY_NOW,
-		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
+		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
 		    vdev_queue_agg_io_done, NULL);
 
 		aio->io_delegate_list = fio;
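
The aggregation rule in the hunk above reduces to a simple pairwise predicate. This is an illustrative sketch with a hypothetical name, not the queue code itself, which accumulates the running size across many adjacent I/Os:

	static boolean_t
	can_aggregate(const zio_t *io, const zio_t *nio, uint64_t limit)
	{
		/*
		 * Merge only I/Os that are physically adjacent, agree on
		 * every inheritable flag (so a scrub/resilver I/O is never
		 * folded into a normal one), and stay under the size limit.
		 */
		return (io->io_offset + io->io_size == nio->io_offset &&
		    (io->io_flags & ZIO_FLAG_AGG_INHERIT) ==
		    (nio->io_flags & ZIO_FLAG_AGG_INHERIT) &&
		    io->io_size + nio->io_size <= limit);
	}
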
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c	Mon Dec 01 12:43:36 2008 -0800
@@ -687,7 +687,7 @@
 			rc->rc_skipped = 1;
 			continue;
 		}
-		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
+		if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
 			if (c >= rm->rm_firstdatacol)
 				rm->rm_missingdata++;
 			else
@@ -1165,7 +1165,7 @@
 done:
 	zio_checksum_verified(zio);
 
-	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
+	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
 		/*
 		 * Use the good data we have in hand to repair damaged children.
@@ -1180,7 +1180,8 @@
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_data, rc->rc_size,
 			    ZIO_TYPE_WRITE, zio->io_priority,
-			    ZIO_FLAG_IO_REPAIR, NULL, NULL));
+			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
+			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 		}
 	}
 }
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Mon Dec 01 12:43:36 2008 -0800
@@ -1163,7 +1163,7 @@
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
-	error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE);
+	error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE);
 
 	spa_close(spa, FTAG);
 	return (error);
--- a/usr/src/uts/common/fs/zfs/zio.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/zio.c	Mon Dec 01 12:43:36 2008 -0800
@@ -1790,7 +1790,30 @@
 
 	ASSERT(P2PHASE(zio->io_offset, align) == 0);
 	ASSERT(P2PHASE(zio->io_size, align) == 0);
-	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
+	ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
+
+	/*
+	 * If this is a repair I/O, and there's no self-healing involved --
+	 * that is, we're just resilvering what we expect to resilver --
+	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
+	 * This prevents spurious resilvering with nested replication.
+	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
+	 * A is out of date, we'll read from C+D, then use the data to
+	 * resilver A+B -- but we don't actually want to resilver B, just A.
+	 * The top-level mirror has no way to know this, so instead we just
+	 * discard unnecessary repairs as we work our way down the vdev tree.
+	 * The same logic applies to any form of nested replication:
+	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
+	 */
+	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
+	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
+	    zio->io_txg != 0 &&	/* not a delegated i/o */
+	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
+		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+		ASSERT(zio->io_delegate_list == NULL);
+		zio_vdev_io_bypass(zio);
+		return (ZIO_PIPELINE_CONTINUE);
+	}
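
Viewed as a predicate, the bypass condition above is (illustrative restatement; the pipeline additionally asserts that the I/O is a write with no delegates before discarding it):

	static boolean_t
	repair_write_is_unneeded(const zio_t *zio, vdev_t *vd)
	{
		/*
		 * A pure resilver write -- a repair that is not self-healing
		 * and carries a real txg -- is unnecessary at this point in
		 * the tree if nothing at or below vd is missing the block's
		 * txg, e.g. mirror B in (A+B)+(C+D) when only A is stale.
		 */
		return ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
		    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
		    zio->io_txg != 0 &&
		    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1));
	}
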
 
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
@@ -1806,7 +1829,6 @@
 			zio_interrupt(zio);
 			return (ZIO_PIPELINE_STOP);
 		}
-
 	}
 
 	return (vd->vdev_ops->vdev_op_io_start(zio));
@@ -2157,6 +2179,7 @@
 		if ((zio->io_type == ZIO_TYPE_READ ||
 		    zio->io_type == ZIO_TYPE_FREE) &&
 		    zio->io_error == ENXIO &&
+		    spa->spa_load_state == SPA_LOAD_NONE &&
 		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
--- a/usr/src/uts/i86pc/os/trap.c	Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/i86pc/os/trap.c	Mon Dec 01 12:43:36 2008 -0800
@@ -32,7 +32,6 @@
 /*		All Rights Reserved   				*/
 /*								*/
 
-
 #include <sys/types.h>
 #include <sys/sysmacros.h>
 #include <sys/param.h>
@@ -422,29 +421,27 @@
  * The first byte of prefetch instructions is always 0x0F.
  * The second byte is 0x18 for regular prefetch or 0x0D for AMD 3dnow prefetch.
  * The third byte is between 0 and 3 inclusive.
+ *
+ * In 64-bit mode, there may be a one-byte REX prefix (0x40-0x4F).
  */
 
-#define	PREFETCHSIZE 3
-
 static int
 cmp_to_prefetch(uchar_t *p)
 {
-	if (*p == 0x0F && (*(p+1) == 0x18 || *(p+1) == 0x0D) && *(p+2) <= 3)
-		return (1);
-	return (0);
+#ifdef _LP64
+	if ((p[0] & 0xF0) == 0x40)	/* 64-bit REX prefix */
+		p++;
+#endif
+	return (p[0] == 0x0F && (p[1] == 0x18 || p[1] == 0x0D) && p[2] <= 3);
 }
 
 static int
 instr_is_prefetch(caddr_t pc)
 {
-	uchar_t instr[PREFETCHSIZE];
-	int	error;
+	uchar_t instr[4];	/* optional REX prefix plus 3-byte opcode */
 
-	error = copyin_nowatch(pc, (caddr_t)instr, PREFETCHSIZE);
-
-	if (error == 0 && cmp_to_prefetch(instr))
-		return (1);
-	return (0);
+	return (copyin_nowatch(pc, instr, sizeof (instr)) == 0 &&
+	    cmp_to_prefetch(instr));
 }
 
 #endif /* OPTERON_ERRATUM_91 */
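
For illustration, two byte sequences the updated check should accept on a 64-bit kernel; these are hypothetical test vectors, not data from the source.

	/* prefetchnta (%rcx): opcode 0x0F 0x18, ModRM 0x01 */
	static const uchar_t prefetch_plain[] = { 0x0F, 0x18, 0x01 };

	/* prefetchnta (%r9): the same opcode behind a REX.B prefix (0x41) */
	static const uchar_t prefetch_rex[] = { 0x41, 0x0F, 0x18, 0x01 };

Both arrays satisfy cmp_to_prefetch() when _LP64 is defined; before this change the REX-prefixed form was not recognized at all.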