--- a/usr/src/uts/common/fs/zfs/spa.c Mon Dec 01 11:27:58 2008 -0800
+++ b/usr/src/uts/common/fs/zfs/spa.c Mon Dec 01 12:43:36 2008 -0800
@@ -486,11 +486,12 @@
* Activate an uninitialized pool.
*/
static void
-spa_activate(spa_t *spa)
+spa_activate(spa_t *spa, int mode)
{
ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
spa->spa_state = POOL_STATE_ACTIVE;
+ spa->spa_mode = mode;
spa->spa_normal_class = metaslab_class_create();
spa->spa_log_class = metaslab_class_create();
@@ -640,11 +641,6 @@
mutex_exit(&spa->spa_async_root_lock);
/*
- * Drop and purge level 2 cache
- */
- spa_l2cache_drop(spa);
-
- /*
* Close the dsl pool.
*/
if (spa->spa_dsl_pool) {
@@ -652,6 +648,13 @@
spa->spa_dsl_pool = NULL;
}
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+ /*
+ * Drop and purge level 2 cache
+ */
+ spa_l2cache_drop(spa);
+
/*
* Close all vdevs.
*/
@@ -686,6 +689,8 @@
spa->spa_l2cache.sav_count = 0;
spa->spa_async_suspended = 0;
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
}
/*
@@ -897,12 +902,9 @@
vd = oldvdevs[i];
if (vd != NULL) {
- if ((spa_mode & FWRITE) &&
- spa_l2cache_exists(vd->vdev_guid, &pool) &&
- pool != 0ULL &&
- l2arc_vdev_present(vd)) {
+ if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
+ pool != 0ULL && l2arc_vdev_present(vd))
l2arc_remove_vdev(vd);
- }
(void) vdev_close(vd);
spa_l2cache_remove(vd);
}
@@ -1018,8 +1020,16 @@
uint64_t pool_guid;
uint64_t version;
uint64_t autoreplace = 0;
+ int orig_mode = spa->spa_mode;
char *ereport = FM_EREPORT_ZFS_POOL;
+ /*
+ * If this is an untrusted config, access the pool in read-only mode.
+ * This prevents things like resilvering recently removed devices.
+ */
+ if (!mosconfig)
+ spa->spa_mode = FREAD;
+
ASSERT(MUTEX_HELD(&spa_namespace_lock));
spa->spa_load_state = state;
@@ -1077,12 +1087,13 @@
* Validate the labels for all leaf vdevs. We need to grab the config
* lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER.
*/
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
- error = vdev_validate(rvd);
- spa_config_exit(spa, SCL_ALL, FTAG);
-
- if (error != 0)
- goto out;
+ if (mosconfig) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = vdev_validate(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (error != 0)
+ goto out;
+ }
if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
error = ENXIO;
@@ -1184,7 +1195,7 @@
spa_config_set(spa, newconfig);
spa_unload(spa);
spa_deactivate(spa);
- spa_activate(spa);
+ spa_activate(spa, orig_mode);
return (spa_load(spa, newconfig, state, B_TRUE));
}
@@ -1376,10 +1387,11 @@
goto out;
}
- if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
+ if (spa_writeable(spa)) {
dmu_tx_t *tx;
int need_update = B_FALSE;
- int c;
+
+ ASSERT(state != SPA_LOAD_TRYIMPORT);
/*
* Claim log blocks that haven't been committed yet.
@@ -1407,7 +1419,7 @@
state == SPA_LOAD_IMPORT)
need_update = B_TRUE;
- for (c = 0; c < rvd->vdev_children; c++)
+ for (int c = 0; c < rvd->vdev_children; c++)
if (rvd->vdev_child[c]->vdev_ms_array == 0)
need_update = B_TRUE;
@@ -1417,6 +1429,12 @@
*/
if (need_update)
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+
+ /*
+ * Check all DTLs to see if anything needs resilvering.
+ */
+ if (vdev_resilver_needed(rvd, NULL, NULL))
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
}
error = 0;
@@ -1469,7 +1487,7 @@
}
if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
- spa_activate(spa);
+ spa_activate(spa, spa_mode_global);
error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
@@ -1873,11 +1891,9 @@
vd = sav->sav_vdevs[i];
ASSERT(vd != NULL);
- if ((spa_mode & FWRITE) &&
- spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL &&
- l2arc_vdev_present(vd)) {
+ if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
+ pool != 0ULL && l2arc_vdev_present(vd))
l2arc_remove_vdev(vd);
- }
if (vd->vdev_isl2cache)
spa_l2cache_remove(vd);
vdev_clear_stats(vd);
@@ -1918,7 +1934,7 @@
(void) nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
spa = spa_add(pool, altroot);
- spa_activate(spa);
+ spa_activate(spa, spa_mode_global);
spa->spa_uberblock.ub_txg = txg - 1;
@@ -2121,7 +2137,7 @@
(void) nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
spa = spa_add(pool, altroot);
- spa_activate(spa);
+ spa_activate(spa, spa_mode_global);
if (allowfaulted)
spa->spa_import_faulted = B_TRUE;
@@ -2160,7 +2176,8 @@
VDEV_ALLOC_L2CACHE);
spa_config_exit(spa, SCL_ALL, FTAG);
- if (error != 0 || (props && (error = spa_prop_set(spa, props)))) {
+ if (error != 0 || (props && spa_writeable(spa) &&
+ (error = spa_prop_set(spa, props)))) {
if (loaderr != 0 && loaderr != EINVAL && allowfaulted) {
/*
* If we failed to load the pool, but 'allowfaulted' is
@@ -2219,7 +2236,7 @@
spa->spa_l2cache.sav_sync = B_TRUE;
}
- if (spa_mode & FWRITE) {
+ if (spa_writeable(spa)) {
/*
* Update the config cache to include the newly-imported pool.
*/
@@ -2489,7 +2506,7 @@
*/
mutex_enter(&spa_namespace_lock);
spa = spa_add(TRYIMPORT_NAME, NULL);
- spa_activate(spa);
+ spa_activate(spa, FREAD);
/*
* Pass off the heavy lifting to spa_load().
@@ -2575,7 +2592,7 @@
if (oldconfig)
*oldconfig = NULL;
- if (!(spa_mode & FWRITE))
+ if (!(spa_mode_global & FWRITE))
return (EROFS);
mutex_enter(&spa_namespace_lock);
@@ -2710,7 +2727,7 @@
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
uint64_t txg;
- int c, error;
+ int error;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd, *tvd;
nvlist_t **spares, **l2cache;
@@ -2749,7 +2766,7 @@
/*
* Transfer each new top-level vdev from vd to rvd.
*/
- for (c = 0; c < vd->vdev_children; c++) {
+ for (int c = 0; c < vd->vdev_children; c++) {
tvd = vd->vdev_child[c];
vdev_remove_child(vd, tvd);
tvd->vdev_id = rvd->vdev_children;
@@ -2957,10 +2974,8 @@
*/
open_txg = txg + TXG_CONCURRENT_STATES - 1;
- mutex_enter(&newvd->vdev_dtl_lock);
- space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
- open_txg - TXG_INITIAL + 1);
- mutex_exit(&newvd->vdev_dtl_lock);
+ vdev_dtl_dirty(newvd, DTL_MISSING,
+ TXG_INITIAL, open_txg - TXG_INITIAL + 1);
if (newvd->vdev_isspare)
spa_spare_activate(newvd);
@@ -3004,10 +3019,10 @@
* is a replacing vdev.
*/
int
-spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
+spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
{
uint64_t txg;
- int c, t, error;
+ int error;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd, *pvd, *cvd, *tvd;
boolean_t unspare = B_FALSE;
@@ -3027,6 +3042,22 @@
pvd = vd->vdev_parent;
/*
+ * If the parent/child relationship is not as expected, don't do it.
+ * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
+ * vdev that's replacing B with C. The user's intent in replacing
+ * is to go from M(A,B) to M(A,C). If the user decides to cancel
+ * the replace by detaching C, the expected behavior is to end up
+ * M(A,B). But suppose that right after deciding to detach C,
+ * the replacement of B completes. We would have M(A,C), and then
+ * ask to detach C, which would leave us with just A -- not what
+ * the user wanted. To prevent this, we make sure that the
+ * parent/child relationship hasn't changed -- in this example,
+ * that C's parent is still the replacing vdev R.
+ */
+ if (pvd->vdev_guid != pguid && pguid != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
* If replace_done is specified, only remove this device if it's
* the first child of a replacing vdev. For the 'spare' vdev, either
* disk can be removed.
@@ -3052,37 +3083,14 @@
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
/*
- * If there's only one replica, you can't detach it.
- */
- if (pvd->vdev_children <= 1)
- return (spa_vdev_exit(spa, NULL, txg, EBUSY));
-
- /*
- * If all siblings have non-empty DTLs, this device may have the only
- * valid copy of the data, which means we cannot safely detach it.
- *
- * XXX -- as in the vdev_offline() case, we really want a more
- * precise DTL check.
+ * If this device has the only valid copy of some data,
+ * we cannot safely detach it.
*/
- for (c = 0; c < pvd->vdev_children; c++) {
- uint64_t dirty;
-
- cvd = pvd->vdev_child[c];
- if (cvd == vd)
- continue;
- if (vdev_is_dead(cvd))
- continue;
- mutex_enter(&cvd->vdev_dtl_lock);
- dirty = cvd->vdev_dtl_map.sm_space |
- cvd->vdev_dtl_scrub.sm_space;
- mutex_exit(&cvd->vdev_dtl_lock);
- if (!dirty)
- break;
- }
-
- if (c == pvd->vdev_children)
+ if (vdev_dtl_required(vd))
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+ ASSERT(pvd->vdev_children >= 2);
+
/*
* If we are detaching the second disk from a replacing vdev, then
* check to see if we changed the original vdev's path to have "/old"
@@ -3107,7 +3115,7 @@
* active spare list for the pool.
*/
if (pvd->vdev_ops == &vdev_spare_ops &&
- vd->vdev_id == 0)
+ vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare)
unspare = B_TRUE;
/*
@@ -3133,14 +3141,18 @@
/*
* If we need to remove the remaining child from the list of hot spares,
- * do it now, marking the vdev as no longer a spare in the process. We
- * must do this before vdev_remove_parent(), because that can change the
- * GUID if it creates a new toplevel GUID.
+ * do it now, marking the vdev as no longer a spare in the process.
+ * We must do this before vdev_remove_parent(), because that can
+ * change the GUID if it creates a new toplevel GUID. For a similar
+ * reason, we must remove the spare now, in the same txg as the detach;
+ * otherwise someone could attach a new sibling, change the GUID, and
+ * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
*/
if (unspare) {
ASSERT(cvd->vdev_isspare);
spa_spare_remove(cvd);
unspare_guid = cvd->vdev_guid;
+ (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
}
/*
@@ -3178,7 +3190,7 @@
* But first make sure we're not on any *other* txg's DTL list, to
* prevent vd from being accessed after it's freed.
*/
- for (t = 0; t < TXG_SIZE; t++)
+ for (int t = 0; t < TXG_SIZE; t++)
(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
vd->vdev_detached = B_TRUE;
vdev_dirty(tvd, VDD_DTL, vd, txg);
@@ -3193,11 +3205,14 @@
* list of every other pool.
*/
if (unspare) {
+ spa_t *myspa = spa;
spa = NULL;
mutex_enter(&spa_namespace_lock);
while ((spa = spa_next(spa)) != NULL) {
if (spa->spa_state != POOL_STATE_ACTIVE)
continue;
+ if (spa == myspa)
+ continue;
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
@@ -3261,10 +3276,12 @@
vdev_t *vd;
nvlist_t **spares, **l2cache, *nv;
uint_t nspares, nl2cache;
- uint64_t txg;
+ uint64_t txg = 0;
int error = 0;
-
- txg = spa_vdev_enter(spa);
+ boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
+
+ if (!locked)
+ txg = spa_vdev_enter(spa);
vd = spa_lookup_by_guid(spa, guid, B_FALSE);
@@ -3307,7 +3324,10 @@
error = ENOENT;
}
- return (spa_vdev_exit(spa, NULL, txg, error));
+ if (!locked)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ return (error);
}
/*
@@ -3333,13 +3353,9 @@
oldvd = vd->vdev_child[0];
newvd = vd->vdev_child[1];
- mutex_enter(&newvd->vdev_dtl_lock);
- if (newvd->vdev_dtl_map.sm_space == 0 &&
- newvd->vdev_dtl_scrub.sm_space == 0) {
- mutex_exit(&newvd->vdev_dtl_lock);
+ if (vdev_dtl_empty(newvd, DTL_MISSING) &&
+ !vdev_dtl_required(oldvd))
return (oldvd);
- }
- mutex_exit(&newvd->vdev_dtl_lock);
}
/*
@@ -3349,15 +3365,12 @@
newvd = vd->vdev_child[0];
oldvd = vd->vdev_child[1];
- mutex_enter(&newvd->vdev_dtl_lock);
if (newvd->vdev_unspare &&
- newvd->vdev_dtl_map.sm_space == 0 &&
- newvd->vdev_dtl_scrub.sm_space == 0) {
+ vdev_dtl_empty(newvd, DTL_MISSING) &&
+ !vdev_dtl_required(oldvd)) {
newvd->vdev_unspare = 0;
- mutex_exit(&newvd->vdev_dtl_lock);
return (oldvd);
}
- mutex_exit(&newvd->vdev_dtl_lock);
}
return (NULL);
@@ -3366,36 +3379,37 @@
static void
spa_vdev_resilver_done(spa_t *spa)
{
- vdev_t *vd;
- vdev_t *pvd;
- uint64_t guid;
- uint64_t pguid = 0;
-
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_t *vd, *pvd, *ppvd;
+ uint64_t guid, sguid, pguid, ppguid;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
+ pvd = vd->vdev_parent;
+ ppvd = pvd->vdev_parent;
guid = vd->vdev_guid;
+ pguid = pvd->vdev_guid;
+ ppguid = ppvd->vdev_guid;
+ sguid = 0;
/*
* If we have just finished replacing a hot spared device, then
* we need to detach the parent's first child (the original hot
* spare) as well.
*/
- pvd = vd->vdev_parent;
- if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
- pvd->vdev_id == 0) {
+ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
- ASSERT(pvd->vdev_parent->vdev_children == 2);
- pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
+ ASSERT(ppvd->vdev_children == 2);
+ sguid = ppvd->vdev_child[1]->vdev_guid;
}
- spa_config_exit(spa, SCL_CONFIG, FTAG);
- if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
return;
- if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
+ if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
return;
- spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
}
- spa_config_exit(spa, SCL_CONFIG, FTAG);
+ spa_config_exit(spa, SCL_ALL, FTAG);
}
/*
@@ -3930,9 +3944,22 @@
* into config changes that go out with this transaction group.
*/
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
- while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
- vdev_state_clean(vd);
- vdev_config_dirty(vd);
+ while (list_head(&spa->spa_state_dirty_list) != NULL) {
+ /*
+ * We need the write lock here because, for aux vdevs,
+ * calling vdev_config_dirty() modifies sav_config.
+ * This is ugly and will become unnecessary when we
+ * eliminate the aux vdev wart by integrating all vdevs
+ * into the root vdev tree.
+ */
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
+ while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
+ vdev_state_clean(vd);
+ vdev_config_dirty(vd);
+ }
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
}
spa_config_exit(spa, SCL_STATE, FTAG);