usr/src/uts/common/fs/zfs/spa.c
changeset 10921 8aac17999e4d
parent 10822 2a6b5dc1374c
child 10922 e2081f502306
--- a/usr/src/uts/common/fs/zfs/spa.c	Fri Oct 30 16:13:16 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/spa.c	Fri Oct 30 18:47:17 2009 -0600
@@ -1151,12 +1151,91 @@
 		spa_check_removed(sav->sav_vdevs[i]);
 }
 
+typedef struct spa_load_error {
+	uint64_t	sle_metadata_count;	/* failed reads counted as metadata */
+	uint64_t	sle_data_count;		/* all other failed reads */
+} spa_load_error_t;
+
+/* zio done callback: count a failed read as metadata or data; free buffer. */
+static void
+spa_load_verify_done(zio_t *zio)
+{
+	spa_load_error_t *tally = zio->io_private;
+	blkptr_t *blk = zio->io_bp;
+	dmu_object_type_t ot = BP_GET_TYPE(blk);
+
+	if (zio->io_error != 0) {
+		/* Level above 0 or metadata-typed, except intent-log blocks. */
+		boolean_t is_meta = ((BP_GET_LEVEL(blk) != 0 ||
+		    dmu_ot[ot].ot_metadata) && ot != DMU_OT_INTENT_LOG);
+		atomic_add_64(is_meta ? &tally->sle_metadata_count :
+		    &tally->sle_data_count, 1);
+	}
+	zio_data_buf_free(zio->io_data, zio->io_size);
+}
+
+/*ARGSUSED*/
+static int
+spa_load_verify_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
+    const dnode_phys_t *dnp, void *arg)
+{
+	zio_t *parent = arg;	/* root zio; io_private carries the tally */
+	size_t psize = (bp != NULL) ? BP_GET_PSIZE(bp) : 0;
+
+	if (bp == NULL)		/* nothing to read at this visit */
+		return (0);
+	/* Queue a raw read without waiting; the done callback frees the buffer. */
+	zio_nowait(zio_read(parent, spa, bp, zio_data_buf_alloc(psize), psize,
+	    spa_load_verify_done, parent->io_private, ZIO_PRIORITY_SCRUB,
+	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
+	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
+	return (0);
+}
+
+/*
+ * Read every block that traverse_pool() visits from spa_verify_min_txg on,
+ * then judge the failure tallies against the limits in the rewind policy.
+ * Returns 0 when within limits; otherwise EIO, or ENXIO from the traversal.
+ */
+static int
+spa_load_verify(spa_t *spa)
+{
+	spa_load_error_t tally = { 0 };
+	zpool_rewind_policy_t limits;
+	zio_t *root;
+	int err;
+
+	root = zio_root(spa, NULL, &tally,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+	err = traverse_pool(spa, spa_load_verify_cb, root,
+	    spa->spa_verify_min_txg);
+	(void) zio_wait(root);	/* read failures are tallied, not returned */
+
+	zpool_get_rewind_policy(spa->spa_config, &limits);
+	spa->spa_load_meta_errors = tally.sle_metadata_count;
+	spa->spa_load_data_errors = tally.sle_data_count;
+
+	/* A failed traversal is reported as ENXIO or EIO, nothing else. */
+	if (err != 0)
+		return ((err == ENXIO || err == EIO) ? err : EIO);
+
+	/* Too many unreadable blocks for the policy: reject this load. */
+	if (tally.sle_metadata_count > limits.zrp_maxmeta ||
+	    tally.sle_data_count > limits.zrp_maxdata)
+		return (EIO);
+
+	/* Verified: record the txg and timestamp of the uberblock in use. */
+	spa->spa_load_txg = spa->spa_uberblock.ub_txg;
+	spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
+	return (0);
+}
+
 /*
  * Load an existing storage pool, using the pool's builtin spa_config as a
  * source of configuration information.
  */
 static int
-spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
+spa_load(spa_t *spa, spa_load_state_t state, int mosconfig)
 {
 	int error = 0;
 	nvlist_t *nvconfig, *nvroot = NULL;
@@ -1168,6 +1247,7 @@
 	uint64_t autoreplace = 0;
 	int orig_mode = spa->spa_mode;
 	char *ereport = FM_EREPORT_ZFS_POOL;
+	nvlist_t *config = spa->spa_config;
 
 	/*
 	 * If this is an untrusted config, access the pool in read-only mode.
@@ -1296,11 +1376,15 @@
 	 */
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_ubsync = spa->spa_uberblock;
-	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
+	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
+	    TXG_INITIAL : spa_last_synced_txg(spa) - TXG_DEFER_SIZE;
+	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
+	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
 	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
 	if (error) {
 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
 		goto out;
 	}
 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
@@ -1359,7 +1443,7 @@
 		spa_deactivate(spa);
 		spa_activate(spa, orig_mode);
 
-		return (spa_load(spa, nvconfig, state, B_TRUE));
+		return (spa_load(spa, state, B_TRUE));
 	}
 
 	if (zap_lookup(spa->spa_meta_objset,
@@ -1569,7 +1653,17 @@
 		goto out;
 	}
 
-	if (spa_writeable(spa)) {
+	if (state != SPA_LOAD_TRYIMPORT) {
+		error = spa_load_verify(spa);
+		if (error) {
+			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			goto out;
+		}
+	}
+
+	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
+	    spa->spa_load_max_txg == UINT64_MAX)) {
 		dmu_tx_t *tx;
 		int need_update = B_FALSE;
 
@@ -1578,6 +1672,7 @@
 		/*
 		 * Claim log blocks that haven't been committed yet.
 		 * This must all happen in a single txg.
+		 * Price of rollback is that we abandon the log.
 		 */
 		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
 		    spa_first_txg(spa));
@@ -1602,7 +1697,8 @@
 		 * in-core spa_config and update the disk labels.
 		 */
 		if (config_cache_txg != spa->spa_config_txg ||
-		    state == SPA_LOAD_IMPORT || spa->spa_load_verbatim)
+		    state == SPA_LOAD_IMPORT || spa->spa_load_verbatim ||
+		    state == SPA_LOAD_RECOVER)
 			need_update = B_TRUE;
 
 		for (int c = 0; c < rvd->vdev_children; c++)
@@ -1636,6 +1732,7 @@
 
 	error = 0;
 out:
+
 	spa->spa_minref = refcount_count(&spa->spa_refcount);
 	if (error && error != EBADF)
 		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
@@ -1645,6 +1742,76 @@
 	return (error);
 }
 
+/* Unload the pool, decrement spa_load_max_txg, and attempt the load again. */
+static int
+spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
+{
+	spa_unload(spa);
+	spa_deactivate(spa);
+	spa->spa_load_max_txg -= 1;	/* lower the max txg by one */
+
+	/* Re-activate and call spa_async_suspend() before loading again. */
+	spa_activate(spa, spa_mode_global);
+	spa_async_suspend(spa);
+	return (spa_load(spa, state, mosconfig));
+}
+
+/*
+ * Load the pool; if that fails and the request allows, retry at older txgs.
+ * Returns the rewind result for SPA_LOAD_RECOVER, else the first load error.
+ */
+static int
+spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
+    uint64_t max_request, boolean_t extreme)
+{
+	nvlist_t *config = NULL;
+	int load_error, rewind_error;
+	uint64_t safe_rollback_txg, min_txg;
+
+	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER)
+		spa->spa_load_max_txg = spa->spa_load_txg;
+	else
+		spa->spa_load_max_txg = max_request;
+
+	load_error = rewind_error = spa_load(spa, state, mosconfig);
+	if (load_error == 0)
+		return (0);
+
+	if (spa->spa_root_vdev != NULL)
+		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
+	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
+
+	/* specific txg requested */
+	if (spa->spa_load_max_txg != UINT64_MAX && !extreme) {
+		nvlist_free(config);
+		return (load_error);
+	}
+
+	/* Price of rolling back is discarding txgs, including log */
+	if (state == SPA_LOAD_RECOVER)
+		spa->spa_log_state = SPA_LOG_CLEAR;
+
+	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
+	safe_rollback_txg = spa->spa_uberblock.ub_txg - TXG_DEFER_SIZE;
+	min_txg = extreme ? TXG_INITIAL : safe_rollback_txg;
+	while (rewind_error && (spa->spa_uberblock.ub_txg >= min_txg)) {
+		if (spa->spa_load_max_txg < safe_rollback_txg)
+			spa->spa_extreme_rewind = B_TRUE;
+		rewind_error = spa_load_retry(spa, state, mosconfig);
+	}
+	if (config)
+		spa_rewind_data_to_nvlist(spa, config);
+	spa->spa_extreme_rewind = B_FALSE;
+	spa->spa_load_max_txg = UINT64_MAX;
+
+	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
+		spa_config_set(spa, config);
+	else
+		nvlist_free(config);	/* not handed off: free, don't leak */
+	return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
+}
+
 /*
  * Pool Open/Import
  *
@@ -1658,14 +1825,25 @@
  * ambiguous state.
  */
 static int
-spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
+spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
+    nvlist_t **config)
 {
 	spa_t *spa;
+	boolean_t norewind;
+	boolean_t extreme;
+	zpool_rewind_policy_t policy;
+	spa_load_state_t state = SPA_LOAD_OPEN;
 	int error;
 	int locked = B_FALSE;
 
 	*spapp = NULL;
 
+	zpool_get_rewind_policy(nvpolicy, &policy);
+	if (policy.zrp_request & ZPOOL_DO_REWIND)
+		state = SPA_LOAD_RECOVER;
+	norewind = (policy.zrp_request == ZPOOL_NO_REWIND);
+	extreme = ((policy.zrp_request & ZPOOL_EXTREME_REWIND) != 0);
+
 	/*
 	 * As disgusting as this is, we need to support recursive calls to this
 	 * function because dsl_dir_open() is called during spa_load(), and ends
@@ -1682,11 +1860,26 @@
 			mutex_exit(&spa_namespace_lock);
 		return (ENOENT);
 	}
+
 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
 
 		spa_activate(spa, spa_mode_global);
 
-		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
+		if (spa->spa_last_open_failed && norewind) {
+			if (config != NULL && spa->spa_config)
+				VERIFY(nvlist_dup(spa->spa_config,
+				    config, KM_SLEEP) == 0);
+			spa_deactivate(spa);
+			if (locked)
+				mutex_exit(&spa_namespace_lock);
+			return (spa->spa_last_open_failed);
+		}
+
+		if (state != SPA_LOAD_RECOVER)
+			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+
+		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
+		    extreme);
 
 		if (error == EBADF) {
 			/*
@@ -1711,38 +1904,49 @@
 			 * information: the state of each vdev after the
 			 * attempted vdev_open().  Return this to the user.
 			 */
-			if (config != NULL && spa->spa_root_vdev != NULL)
-				*config = spa_config_generate(spa, NULL, -1ULL,
-				    B_TRUE);
+			if (config != NULL && spa->spa_config)
+				VERIFY(nvlist_dup(spa->spa_config, config,
+				    KM_SLEEP) == 0);
 			spa_unload(spa);
 			spa_deactivate(spa);
-			spa->spa_last_open_failed = B_TRUE;
+			spa->spa_last_open_failed = error;
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			*spapp = NULL;
 			return (error);
-		} else {
-			spa->spa_last_open_failed = B_FALSE;
 		}
+
 	}
 
 	spa_open_ref(spa, tag);
 
+	spa->spa_last_open_failed = 0;
+
+	if (config != NULL)
+		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+
+	spa->spa_last_ubsync_txg = 0;
+	spa->spa_load_txg = 0;
+
 	if (locked)
 		mutex_exit(&spa_namespace_lock);
 
 	*spapp = spa;
 
-	if (config != NULL)
-		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
-
 	return (0);
 }
 
 int
+spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
+    nvlist_t **config)		/* all arguments pass through unchanged */
+{
+	return (spa_open_common(name, spapp, tag, policy, config));
+}
+
+int
 spa_open(const char *name, spa_t **spapp, void *tag)
 {
-	return (spa_open_common(name, spapp, tag, NULL));
+	return (spa_open_common(name, spapp, tag, NULL, NULL));
 }
 
 /*
@@ -1883,7 +2087,7 @@
 	spa_t *spa;
 
 	*config = NULL;
-	error = spa_open_common(name, &spa, FTAG, config);
+	error = spa_open_common(name, &spa, FTAG, NULL, config);
 
 	if (spa != NULL) {
 		/*
@@ -2143,7 +2347,7 @@
 	 */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
-	spa = spa_add(pool, altroot);
+	spa = spa_add(pool, NULL, altroot);
 	spa_activate(spa, spa_mode_global);
 
 	spa->spa_uberblock.ub_txg = txg - 1;
@@ -2450,7 +2654,7 @@
 		spa_remove(spa);
 	}
 
-	spa = spa_add(pname, NULL);
+	spa = spa_add(pname, config, NULL);
 	spa->spa_is_root = B_TRUE;
 	spa->spa_load_verbatim = B_TRUE;
 
@@ -2529,6 +2733,7 @@
 spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props)
 {
 	spa_t *spa;
+	zpool_rewind_policy_t policy;
 	char *altroot = NULL;
 
 	mutex_enter(&spa_namespace_lock);
@@ -2539,12 +2744,13 @@
 
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
-	spa = spa_add(pool, altroot);
+	spa = spa_add(pool, config, altroot);
+
+	zpool_get_rewind_policy(config, &policy);
+	spa->spa_load_max_txg = policy.zrp_txg;
 
 	spa->spa_load_verbatim = B_TRUE;
 
-	VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
-
 	if (props != NULL)
 		spa_configfile_set(spa, props, B_FALSE);
 
@@ -2564,6 +2770,8 @@
 {
 	spa_t *spa;
 	char *altroot = NULL;
+	spa_load_state_t state = SPA_LOAD_IMPORT;
+	zpool_rewind_policy_t policy;
 	int error;
 	nvlist_t *nvroot;
 	nvlist_t **spares, **l2cache;
@@ -2578,12 +2786,16 @@
 		return (EEXIST);
 	}
 
+	zpool_get_rewind_policy(config, &policy);
+	if (policy.zrp_request & ZPOOL_DO_REWIND)
+		state = SPA_LOAD_RECOVER;
+
 	/*
 	 * Create and initialize the spa structure.
 	 */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
-	spa = spa_add(pool, altroot);
+	spa = spa_add(pool, config, altroot);
 	spa_activate(spa, spa_mode_global);
 
 	/*
@@ -2596,7 +2808,16 @@
 	 * because the user-supplied config is actually the one to trust when
 	 * doing an import.
 	 */
-	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
+	if (state != SPA_LOAD_RECOVER)
+		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
+	    ((policy.zrp_request & ZPOOL_EXTREME_REWIND) != 0));
+
+	/*
+	 * Propagate anything learned about failing or best txgs
+	 * back to caller
+	 */
+	spa_rewind_data_to_nvlist(spa, config);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	/*
@@ -2726,7 +2947,7 @@
 	 * Create and initialize the spa structure.
 	 */
 	mutex_enter(&spa_namespace_lock);
-	spa = spa_add(TRYIMPORT_NAME, NULL);
+	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
 	spa_activate(spa, FREAD);
 
 	/*
@@ -2734,7 +2955,7 @@
 	 * Pass TRUE for mosconfig because the user-supplied config
 	 * is actually the one to trust when doing an import.
 	 */
-	error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
+	error = spa_load(spa, SPA_LOAD_TRYIMPORT, B_TRUE);
 
 	/*
 	 * If 'tryconfig' was at least parsable, return the current config.
@@ -4531,6 +4752,8 @@
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
+	spa_handle_ignored_writes(spa);
+
 	/*
 	 * If any async tasks have been requested, kick them off.
 	 */