usr/src/uts/common/fs/zfs/spa.c
changeset 4451 24fbf2d7a5d7
parent 4309 3dfde0f4662d
child 4527 5d5b6ba91b17
--- a/usr/src/uts/common/fs/zfs/spa.c	Tue Jun 12 11:51:07 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/spa.c	Tue Jun 12 13:18:17 2007 -0700
@@ -424,6 +424,24 @@
 }
 
 /*
+ * Walk the vdev tree under 'vd'.  Each leaf for which vdev_is_dead() holds gets
+ * zfs_post_autoreplace() plus an ESC_ZFS_VDEV_CHECK sysevent.
+ */
+static void
+spa_check_removed(vdev_t *vd)
+{
+	int i;
+
+	for (i = 0; i < vd->vdev_children; i++)
+		spa_check_removed(vd->vdev_child[i]);
+
+	if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_dead(vd))
+		return;
+	zfs_post_autoreplace(vd->vdev_spa, vd);
+	spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
+}
+
+/*
  * Load an existing storage pool, using the pool's builtin spa_config as a
  * source of configuration information.
  */
@@ -438,6 +456,7 @@
 	uint64_t pool_guid;
 	uint64_t version;
 	zio_t *zio;
+	uint64_t autoreplace = 0;
 
 	spa->spa_load_state = state;
 
@@ -711,11 +730,25 @@
 	if (error == 0) {
 		(void) zap_lookup(spa->spa_meta_objset,
 		    spa->spa_pool_props_object,
-		    zpool_prop_to_name(ZFS_PROP_BOOTFS),
+		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
 		    sizeof (uint64_t), 1, &spa->spa_bootfs);
+		(void) zap_lookup(spa->spa_meta_objset,
+		    spa->spa_pool_props_object,
+		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
+		    sizeof (uint64_t), 1, &autoreplace);
 	}
 
 	/*
+	 * If the 'autoreplace' property is set, then post a resource notifying
+	 * the ZFS DE that it should not issue any faults for unopenable
+	 * devices.  We also iterate over the vdevs, and post a sysevent for any
+	 * unopenable vdevs so that the normal autoreplace handler can take
+	 * over.
+	 */
+	if (autoreplace)
+		spa_check_removed(spa->spa_root_vdev);
+
+	/*
 	 * Load the vdev state for all toplevel vdevs.
 	 */
 	vdev_load(rvd);
@@ -795,7 +828,7 @@
  * The import case is identical to an open except that the configuration is sent
  * down from userland, instead of grabbed from the configuration cache.  For the
  * case of an open, the pool configuration will exist in the
- * POOL_STATE_UNITIALIZED state.
+ * POOL_STATE_UNINITIALIZED state.
  *
  * The stats information (gen/count/ustats) is used to gather vdev statistics at
  * the same time open the pool, without having to keep around the spa_t in some
@@ -879,6 +912,13 @@
 	}
 
 	spa_open_ref(spa, tag);
+
+	/*
+	 * If we just loaded the pool, resilver anything that's out of date.
+	 */
+	if (loaded && (spa_mode & FWRITE))
+		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
 	if (locked)
 		mutex_exit(&spa_namespace_lock);
 
@@ -890,12 +930,6 @@
 		spa_config_exit(spa, FTAG);
 	}
 
-	/*
-	 * If we just loaded the pool, resilver anything that's out of date.
-	 */
-	if (loaded && (spa_mode & FWRITE))
-		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
-
 	return (0);
 }
 
@@ -1219,7 +1253,7 @@
 
 	dmu_tx_commit(tx);
 
-	spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
+	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
 	spa->spa_sync_on = B_TRUE;
 	txg_sync_start(spa->spa_dsl_pool);
 
@@ -1325,14 +1359,14 @@
 	 */
 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 
-	mutex_exit(&spa_namespace_lock);
-
 	/*
 	 * Resilver anything that's out of date.
 	 */
 	if (spa_mode & FWRITE)
 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
 
+	mutex_exit(&spa_namespace_lock);
+
 	return (0);
 }
 
@@ -1476,6 +1510,8 @@
 		}
 	}
 
+	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
+
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 		spa_unload(spa);
 		spa_deactivate(spa);
@@ -1657,7 +1693,7 @@
  *
  * If 'replacing' is specified, the new device is intended to replace the
  * existing device; in this case the two devices are made into their own
- * mirror using the 'replacing' vdev, which is functionally idendical to
+ * mirror using the 'replacing' vdev, which is functionally identical to
  * the mirror vdev (it actually reuses all the same ops) but has a few
  * extra rules: you can't attach to it after it's been created, and upon
  * completion of resilvering, the first disk (the one being replaced)
@@ -1685,7 +1721,10 @@
 	pvd = oldvd->vdev_parent;
 
 	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
-	    VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
+	    VDEV_ALLOC_ADD)) != 0)
+		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+	if (newrootvd->vdev_children != 1)
 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 
 	newvd = newrootvd->vdev_child[0];
@@ -1818,9 +1857,12 @@
 	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
 
 	/*
-	 * Kick off a resilver to update newvd.
+	 * Kick off a resilver to update newvd.  We need to grab the namespace
+	 * lock because spa_scrub() needs to post a sysevent with the pool name.
 	 */
+	mutex_enter(&spa_namespace_lock);
 	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
@@ -1973,7 +2015,7 @@
 	/*
 	 * Reevaluate the parent vdev state.
 	 */
-	vdev_propagate_state(cvd->vdev_parent);
+	vdev_propagate_state(cvd);
 
 	/*
 	 * If the device we just detached was smaller than the others, it may be
@@ -1996,6 +2038,8 @@
 	vd->vdev_detached = B_TRUE;
 	vdev_dirty(tvd, VDD_DTL, vd, txg);
 
+	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
+
 	error = spa_vdev_exit(spa, vd, txg, 0);
 
 	/*
@@ -2098,20 +2142,24 @@
 }
 
 /*
- * Find any device that's done replacing, so we can detach it.
+ * Find any device that's done replacing, or a vdev marked 'unspare' that's
+ * currently spared, so we can detach it.
  */
 static vdev_t *
-spa_vdev_replace_done_hunt(vdev_t *vd)
+spa_vdev_resilver_done_hunt(vdev_t *vd)
 {
 	vdev_t *newvd, *oldvd;
 	int c;
 
 	for (c = 0; c < vd->vdev_children; c++) {
-		oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
+		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
 		if (oldvd != NULL)
 			return (oldvd);
 	}
 
+	/*
+	 * Check for a completed replacement.
+	 */
 	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
 		oldvd = vd->vdev_child[0];
 		newvd = vd->vdev_child[1];
@@ -2125,11 +2173,29 @@
 		mutex_exit(&newvd->vdev_dtl_lock);
 	}
 
+	/*
+	 * Check for a completed resilver with the 'unspare' flag set.
+	 */
+	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
+		newvd = vd->vdev_child[0];
+		oldvd = vd->vdev_child[1];
+
+		mutex_enter(&newvd->vdev_dtl_lock);
+		if (newvd->vdev_unspare &&
+		    newvd->vdev_dtl_map.sm_space == 0 &&
+		    newvd->vdev_dtl_scrub.sm_space == 0) {
+			newvd->vdev_unspare = 0;
+			mutex_exit(&newvd->vdev_dtl_lock);
+			return (oldvd);
+		}
+		mutex_exit(&newvd->vdev_dtl_lock);
+	}
+
 	return (NULL);
 }
 
 static void
-spa_vdev_replace_done(spa_t *spa)
+spa_vdev_resilver_done(spa_t *spa)
 {
 	vdev_t *vd;
 	vdev_t *pvd;
@@ -2138,7 +2204,7 @@
 
 	spa_config_enter(spa, RW_READER, FTAG);
 
-	while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
+	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
 		guid = vd->vdev_guid;
 		/*
 		 * If we have just finished replacing a hot spared device, then
@@ -2449,6 +2515,9 @@
 	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
 	spa_errlog_rotate(spa);
 
+	if (scrub_type == POOL_SCRUB_RESILVER && complete)
+		spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH);
+
 	spa_config_exit(spa, FTAG);
 
 	mutex_enter(&spa->spa_scrub_lock);
@@ -2457,7 +2526,7 @@
 	 * We may have finished replacing a device.
 	 * Let the async thread assess this and handle the detach.
 	 */
-	spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
+	spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 
 	/*
 	 * If we were told to restart, our final act is to start a new scrub.
@@ -2568,7 +2637,7 @@
 		 */
 		if (type == POOL_SCRUB_RESILVER) {
 			type = POOL_SCRUB_NONE;
-			spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
+			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 		}
 	} else {
 		/*
@@ -2593,6 +2662,8 @@
 		mintxg = ss->ss_start - 1;
 		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
 		maxtxg = MIN(ss->ss_end, maxtxg);
+
+		spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
 	}
 
 	mutex_exit(&rvd->vdev_dtl_lock);
@@ -2624,29 +2695,29 @@
  */
 
 static void
-spa_async_reopen(spa_t *spa)
+spa_async_remove(spa_t *spa, vdev_t *vd)
 {
-	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *tvd;
 	int c;
 
-	spa_config_enter(spa, RW_WRITER, FTAG);
-
-	for (c = 0; c < rvd->vdev_children; c++) {
-		tvd = rvd->vdev_child[c];
-		if (tvd->vdev_reopen_wanted) {
-			tvd->vdev_reopen_wanted = 0;
-			vdev_reopen(tvd);
+	for (c = 0; c < vd->vdev_children; c++) {
+		tvd = vd->vdev_child[c];
+		if (tvd->vdev_remove_wanted) {
+			tvd->vdev_remove_wanted = 0;
+			vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED,
+			    VDEV_AUX_NONE);
+			vdev_clear(spa, tvd);
+			vdev_config_dirty(tvd->vdev_top);
 		}
+		spa_async_remove(spa, tvd);
 	}
-
-	spa_config_exit(spa, FTAG);
 }
 
 static void
 spa_async_thread(spa_t *spa)
 {
 	int tasks;
+	uint64_t txg;
 
 	ASSERT(spa->spa_sync_on);
 
@@ -2665,28 +2736,40 @@
 	}
 
 	/*
-	 * See if any devices need to be reopened.
+	 * See if any devices need to be marked REMOVED.
 	 */
-	if (tasks & SPA_ASYNC_REOPEN)
-		spa_async_reopen(spa);
+	if (tasks & SPA_ASYNC_REMOVE) {
+		txg = spa_vdev_enter(spa);
+		spa_async_remove(spa, spa->spa_root_vdev);
+		(void) spa_vdev_exit(spa, NULL, txg, 0);
+	}
 
 	/*
 	 * If any devices are done replacing, detach them.
 	 */
-	if (tasks & SPA_ASYNC_REPLACE_DONE)
-		spa_vdev_replace_done(spa);
+	if (tasks & SPA_ASYNC_RESILVER_DONE)
+		spa_vdev_resilver_done(spa);
 
 	/*
-	 * Kick off a scrub.
+	 * Kick off a scrub.  When starting a RESILVER scrub (or an EVERYTHING
+	 * scrub which can become a resilver), we need to hold
+	 * spa_namespace_lock because the sysevent we post via
+	 * spa_event_notify() needs to get the name of the pool.
 	 */
-	if (tasks & SPA_ASYNC_SCRUB)
+	if (tasks & SPA_ASYNC_SCRUB) {
+		mutex_enter(&spa_namespace_lock);
 		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
+		mutex_exit(&spa_namespace_lock);
+	}
 
 	/*
 	 * Kick off a resilver.
 	 */
-	if (tasks & SPA_ASYNC_RESILVER)
+	if (tasks & SPA_ASYNC_RESILVER) {
+		mutex_enter(&spa_namespace_lock);
 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+		mutex_exit(&spa_namespace_lock);
+	}
 
 	/*
 	 * Let the world know that we're done.
@@ -2810,7 +2893,7 @@
 	/*
 	 * Update the MOS nvlist describing the list of available spares.
 	 * spa_validate_spares() will have already made sure this nvlist is
-	 * valid and the vdevs are labelled appropriately.
+	 * valid and the vdevs are labeled appropriately.
 	 */
 	if (spa->spa_spares_object == 0) {
 		spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
@@ -2869,6 +2952,7 @@
 	nvpair_t *nvpair;
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t zapobj;
+	uint64_t intval;
 
 	mutex_enter(&spa->spa_props_lock);
 	if (spa->spa_pool_props_object == 0) {
@@ -2886,14 +2970,23 @@
 	nvpair = NULL;
 	while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) {
 		switch (zpool_name_to_prop(nvpair_name(nvpair))) {
-		case ZFS_PROP_BOOTFS:
+		case ZPOOL_PROP_BOOTFS:
 			VERIFY(nvlist_lookup_uint64(nvp,
 			    nvpair_name(nvpair), &spa->spa_bootfs) == 0);
 			VERIFY(zap_update(mos,
 			    spa->spa_pool_props_object,
-			    zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1,
+			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 8, 1,
 			    &spa->spa_bootfs, tx) == 0);
 			break;
+
+		case ZPOOL_PROP_AUTOREPLACE:
+			VERIFY(nvlist_lookup_uint64(nvp,
+			    nvpair_name(nvpair), &intval) == 0);
+			VERIFY(zap_update(mos,
+			    spa->spa_pool_props_object,
+			    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 8, 1,
+			    &intval, tx) == 0);
+			break;
 		}
 	}
 }
@@ -3191,7 +3284,7 @@
 	zap_attribute_t za;
 	objset_t *mos = spa->spa_meta_objset;
 	zfs_source_t src;
-	zfs_prop_t prop;
+	zpool_prop_t prop;
 	nvlist_t *propval;
 	uint64_t value;
 	int err;
@@ -3215,14 +3308,14 @@
 		VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		switch (za.za_integer_length) {
 		case 8:
-			if (zfs_prop_default_numeric(prop) ==
+			if (zpool_prop_default_numeric(prop) ==
 			    za.za_first_integer)
 				src = ZFS_SRC_DEFAULT;
 			else
 				src = ZFS_SRC_LOCAL;
 			value = za.za_first_integer;
 
-			if (prop == ZFS_PROP_BOOTFS) {
+			if (prop == ZPOOL_PROP_BOOTFS) {
 				dsl_pool_t *dp;
 				dsl_dataset_t *ds = NULL;
 				char strval[MAXPATHLEN];
@@ -3274,7 +3367,74 @@
 	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
 		VERIFY(zap_remove(spa->spa_meta_objset,
 		    spa->spa_pool_props_object,
-		    zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0);
+		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
 		spa->spa_bootfs = 0;
 	}
 }
+
+/*
+ * Post a sysevent corresponding to the given event.  The 'name' must be one of
+ * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
+ * filled in from the spa and (optionally) the vdev.  This doesn't do anything
+ * in the userland libzpool, as we don't want consumers to misinterpret ztest
+ * or zdb as real changes.
+ *
+ * The payload carries the pool name and guid, plus the vdev guid and (when
+ * set) the vdev path if 'vd' is non-NULL.  If building or attaching the
+ * payload fails, the event is dropped without being logged.
+ */
+void
+spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
+{
+#ifdef _KERNEL
+	sysevent_t		*ev;
+	sysevent_attr_list_t	*attr = NULL;
+	sysevent_value_t	value;
+	sysevent_id_t		eid;
+
+	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
+	    SE_SLEEP);
+
+	value.value_type = SE_DATA_TYPE_STRING;
+	value.value.sv_string = spa_name(spa);
+	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
+		goto done;
+
+	value.value_type = SE_DATA_TYPE_UINT64;
+	value.value.sv_uint64 = spa_guid(spa);
+	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
+		goto done;
+
+	if (vd) {
+		value.value_type = SE_DATA_TYPE_UINT64;
+		value.value.sv_uint64 = vd->vdev_guid;
+		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
+		    SE_SLEEP) != 0)
+			goto done;
+
+		if (vd->vdev_path) {
+			value.value_type = SE_DATA_TYPE_STRING;
+			value.value.sv_string = vd->vdev_path;
+			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
+			    &value, SE_SLEEP) != 0)
+				goto done;
+		}
+	}
+
+	/*
+	 * Hand the attribute list to the event; without this the event is
+	 * logged with no payload.  On success the event owns the list, so
+	 * clear 'attr' to keep the cleanup below from freeing it again.
+	 */
+	if (sysevent_attach_attributes(ev, attr) != 0)
+		goto done;
+	attr = NULL;
+
+	(void) log_sysevent(ev, SE_SLEEP, &eid);
+
+done:
+	if (attr)
+		sysevent_free_attr(attr);
+	sysevent_free(ev);
+#endif
+}