# HG changeset patch # User George Wilson # Date 1280361451 25200 # Node ID b521d551715f7901a8b19e119c5e0985ca421ef9 # Parent 59a6f9b695ef13ed623621c3c6fbbb36dfd57945 6733267 Allow a pool to be imported with a missing slog 6950437 missing logzillas should not fault pool when they contain no ZIL data diff -r 59a6f9b695ef -r b521d551715f usr/src/cmd/zdb/zdb.c --- a/usr/src/cmd/zdb/zdb.c Wed Jul 28 16:10:22 2010 -0700 +++ b/usr/src/cmd/zdb/zdb.c Wed Jul 28 16:57:31 2010 -0700 @@ -3076,8 +3076,11 @@ fatal("can't open '%s': %s", target, strerror(ENOMEM)); } - if ((error = spa_import(name, cfg, NULL)) != 0) - error = spa_import_verbatim(name, cfg, NULL); + if ((error = spa_import(name, cfg, NULL, + ZFS_IMPORT_MISSING_LOG)) != 0) { + error = spa_import(name, cfg, NULL, + ZFS_IMPORT_VERBATIM); + } } } diff -r 59a6f9b695ef -r b521d551715f usr/src/cmd/zpool/zpool_main.c --- a/usr/src/cmd/zpool/zpool_main.c Wed Jul 28 16:10:22 2010 -0700 +++ b/usr/src/cmd/zpool/zpool_main.c Wed Jul 28 16:57:31 2010 -0700 @@ -1499,7 +1499,7 @@ */ static int do_import(nvlist_t *config, const char *newname, const char *mntopts, - int force, nvlist_t *props, boolean_t do_verbatim) + nvlist_t *props, int flags) { zpool_handle_t *zhp; char *name; @@ -1517,7 +1517,8 @@ (void) fprintf(stderr, gettext("cannot import '%s': pool " "is formatted using a newer ZFS version\n"), name); return (1); - } else if (state != POOL_STATE_EXPORTED && !force) { + } else if (state != POOL_STATE_EXPORTED && + !(flags & ZFS_IMPORT_ANY_HOST)) { uint64_t hostid; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, @@ -1551,7 +1552,7 @@ } } - if (zpool_import_props(g_zfs, config, newname, props, do_verbatim) != 0) + if (zpool_import_props(g_zfs, config, newname, props, flags) != 0) return (1); if (newname != NULL) @@ -1620,7 +1621,6 @@ boolean_t do_all = B_FALSE; boolean_t do_destroyed = B_FALSE; char *mntopts = NULL; - boolean_t do_force = B_FALSE; nvpair_t *elem; nvlist_t *config; uint64_t searchguid = 0; @@ -1630,7 +1630,7 @@ nvlist_t *policy = NULL; nvlist_t *props = NULL; boolean_t first; - boolean_t do_verbatim = B_FALSE; + int flags = ZFS_IMPORT_NORMAL; uint32_t rewind_policy = ZPOOL_NO_REWIND; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; @@ -1640,7 +1640,7 @@ importargs_t idata = { 0 }; /* check options */ - while ((c = getopt(argc, argv, ":aCc:d:DEfFno:rR:VX")) != -1) { + while ((c = getopt(argc, argv, ":aCc:d:DEfFmno:rR:VX")) != -1) { switch (c) { case 'a': do_all = B_TRUE; @@ -1665,11 +1665,14 @@ do_destroyed = B_TRUE; break; case 'f': - do_force = B_TRUE; + flags |= ZFS_IMPORT_ANY_HOST; break; case 'F': do_rewind = B_TRUE; break; + case 'm': + flags |= ZFS_IMPORT_MISSING_LOG; + break; case 'n': dryrun = B_TRUE; break; @@ -1697,7 +1700,7 @@ goto error; break; case 'V': - do_verbatim = B_TRUE; + flags |= ZFS_IMPORT_VERBATIM; break; case 'X': xtreme_rewind = B_TRUE; @@ -1869,7 +1872,7 @@ if (do_all) { err |= do_import(config, NULL, mntopts, - do_force, props, do_verbatim); + props, flags); } else { show_import(config); } @@ -1918,7 +1921,7 @@ err = B_TRUE; } else { err |= do_import(found_config, argc == 1 ? NULL : - argv[1], mntopts, do_force, props, do_verbatim); + argv[1], mntopts, props, flags); } } diff -r 59a6f9b695ef -r b521d551715f usr/src/cmd/ztest/ztest.c --- a/usr/src/cmd/ztest/ztest.c Wed Jul 28 16:10:22 2010 -0700 +++ b/usr/src/cmd/ztest/ztest.c Wed Jul 28 16:57:31 2010 -0700 @@ -4847,19 +4847,19 @@ /* * Import it under the new name. */ - VERIFY3U(0, ==, spa_import(newname, config, NULL)); + VERIFY3U(0, ==, spa_import(newname, config, NULL, 0)); ztest_walk_pool_directory("pools after import"); /* * Try to import it again -- should fail with EEXIST. */ - VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL)); + VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); /* * Try to import it under a different name -- should fail with EEXIST. */ - VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL)); + VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); /* * Verify that the pool is no longer visible under the old name. diff -r 59a6f9b695ef -r b521d551715f usr/src/lib/libzfs/common/libzfs.h --- a/usr/src/lib/libzfs/common/libzfs.h Wed Jul 28 16:10:22 2010 -0700 +++ b/usr/src/lib/libzfs/common/libzfs.h Wed Jul 28 16:57:31 2010 -0700 @@ -326,7 +326,7 @@ extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, char *altroot); extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, - nvlist_t *, boolean_t); + nvlist_t *, int); /* * Search for pools to import diff -r 59a6f9b695ef -r b521d551715f usr/src/lib/libzfs/common/libzfs_pool.c --- a/usr/src/lib/libzfs/common/libzfs_pool.c Wed Jul 28 16:10:22 2010 -0700 +++ b/usr/src/lib/libzfs/common/libzfs_pool.c Wed Jul 28 16:57:31 2010 -0700 @@ -1202,19 +1202,23 @@ static void zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, - nvlist_t *rbi) + nvlist_t *config) { + nvlist_t *nv = NULL; uint64_t rewindto; int64_t loss = -1; struct tm t; char timestr[128]; - if (!hdl->libzfs_printerr || rbi == NULL) + if (!hdl->libzfs_printerr || config == NULL) + return; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0) return; - if (nvlist_lookup_uint64(rbi, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) return; - (void) nvlist_lookup_int64(rbi, ZPOOL_CONFIG_REWIND_TIME, &loss); + (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); if (localtime_r((time_t *)&rewindto, &t) != NULL && strftime(timestr, 128, 0, &t) != 0) { @@ -1249,6 +1253,7 @@ zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, nvlist_t *config) { + nvlist_t *nv = NULL; int64_t loss = -1; uint64_t edata = UINT64_MAX; uint64_t rewindto; @@ -1264,12 +1269,12 @@ (void) printf(dgettext(TEXT_DOMAIN, "\t")); /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */ - if (nvlist_lookup_uint64(config, - ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) goto no_info; - (void) nvlist_lookup_int64(config, ZPOOL_CONFIG_REWIND_TIME, &loss); - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_LOAD_DATA_ERRORS, + (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS, &edata); (void) printf(dgettext(TEXT_DOMAIN, @@ -1353,12 +1358,40 @@ } } - ret = zpool_import_props(hdl, config, newname, props, B_FALSE); + ret = zpool_import_props(hdl, config, newname, props, + ZFS_IMPORT_NORMAL); if (props) nvlist_free(props); return (ret); } +static void +print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv, + int indent) +{ + nvlist_t **child; + uint_t c, children; + char *vname; + uint64_t is_log = 0; + + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, + &is_log); + + if (name != NULL) + (void) printf("\t%*s%s%s\n", indent, "", name, + is_log ? " [log]" : ""); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return; + + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(hdl, NULL, child[c], B_TRUE); + print_vdev_tree(hdl, vname, child[c], indent + 2); + free(vname); + } +} + /* * Import the given pool using the known configuration and a list of * properties to be set. The configuration should have come from @@ -1367,15 +1400,17 @@ */ int zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, - nvlist_t *props, boolean_t importfaulted) + nvlist_t *props, int flags) { zfs_cmd_t zc = { 0 }; zpool_rewind_policy_t policy; - nvlist_t *nvi = NULL; + nvlist_t *nv = NULL; + nvlist_t *nvinfo = NULL; + nvlist_t *missing = NULL; char *thename; char *origname; - uint64_t returned_size; int ret; + int error = 0; char errbuf[1024]; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, @@ -1418,27 +1453,36 @@ nvlist_free(props); return (-1); } - returned_size = zc.zc_nvlist_conf_size + 512; - if (zcmd_alloc_dst_nvlist(hdl, &zc, returned_size) != 0) { + if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) { nvlist_free(props); return (-1); } - zc.zc_cookie = (uint64_t)importfaulted; - ret = 0; - if (zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc) != 0) { + zc.zc_cookie = flags; + while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 && + errno == ENOMEM) { + if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + } + if (ret != 0) + error = errno; + + (void) zcmd_read_dst_nvlist(hdl, &zc, &nv); + zpool_get_rewind_policy(config, &policy); + + if (error) { char desc[1024]; - (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); - zpool_get_rewind_policy(config, &policy); /* * Dry-run failed, but we print out what success * looks like if we found a best txg */ - if ((policy.zrp_request & ZPOOL_TRY_REWIND) && nvi) { + if (policy.zrp_request & ZPOOL_TRY_REWIND) { zpool_rewind_exclaim(hdl, newname ? origname : thename, - B_TRUE, nvi); - nvlist_free(nvi); + B_TRUE, nv); + nvlist_free(nv); return (-1); } @@ -1451,7 +1495,7 @@ dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"), origname, thename); - switch (errno) { + switch (error) { case ENOTSUP: /* * Unsupported version. @@ -1469,15 +1513,32 @@ (void) zfs_error(hdl, EZFS_BADDEV, desc); break; + case ENXIO: + if (nv && nvlist_lookup_nvlist(nv, + ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && + nvlist_lookup_nvlist(nvinfo, + ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "The devices below are missing, use " + "'-m' to import the pool anyway:\n")); + print_vdev_tree(hdl, NULL, missing, 2); + (void) printf("\n"); + } + (void) zpool_standard_error(hdl, error, desc); + break; + + case EEXIST: + (void) zpool_standard_error(hdl, error, desc); + break; + default: - (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); - (void) zpool_standard_error(hdl, errno, desc); + (void) zpool_standard_error(hdl, error, desc); zpool_explain_recover(hdl, - newname ? origname : thename, -errno, nvi); - nvlist_free(nvi); + newname ? origname : thename, -error, nv); break; } + nvlist_free(nv); ret = -1; } else { zpool_handle_t *zhp; @@ -1489,15 +1550,12 @@ ret = -1; else if (zhp != NULL) zpool_close(zhp); - (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); - zpool_get_rewind_policy(config, &policy); if (policy.zrp_request & (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { zpool_rewind_exclaim(hdl, newname ? origname : thename, - ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0), - nvi); + ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0), nv); } - nvlist_free(nvi); + nvlist_free(nv); return (0); } @@ -2816,6 +2874,7 @@ boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; nvlist_t *nvi = NULL; + int error; if (path) (void) snprintf(msg, sizeof (msg), @@ -2846,14 +2905,21 @@ zpool_get_rewind_policy(rewindnvl, &policy); zc.zc_cookie = policy.zrp_request; - if (zcmd_alloc_dst_nvlist(hdl, &zc, 8192) != 0) + if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) return (-1); if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, rewindnvl) != 0) return (-1); - if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0 || - ((policy.zrp_request & ZPOOL_TRY_REWIND) && + while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 && + errno == ENOMEM) { + if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + } + + if (!error || ((policy.zrp_request & ZPOOL_TRY_REWIND) && errno != EPERM && errno != EACCES)) { if (policy.zrp_request & (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { diff -r 59a6f9b695ef -r b521d551715f usr/src/lib/libzfs/common/libzfs_util.c --- a/usr/src/lib/libzfs/common/libzfs_util.c Wed Jul 28 16:10:22 2010 -0700 +++ b/usr/src/lib/libzfs/common/libzfs_util.c Wed Jul 28 16:57:31 2010 -0700 @@ -692,7 +692,7 @@ zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len) { if (len == 0) - len = 4*1024; + len = 16 * 1024; zc->zc_nvlist_dst_size = len; if ((zc->zc_nvlist_dst = (uint64_t)(uintptr_t) zfs_alloc(hdl, zc->zc_nvlist_dst_size)) == NULL) diff -r 59a6f9b695ef -r b521d551715f usr/src/uts/common/fs/zfs/spa.c --- a/usr/src/uts/common/fs/zfs/spa.c Wed Jul 28 16:10:22 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/spa.c Wed Jul 28 16:57:31 2010 -0700 @@ -1284,33 +1284,131 @@ } /* - * Load the slog device state from the config object since it's possible - * that the label does not contain the most up-to-date information. + * Validate the current config against the MOS config */ -void -spa_load_log_state(spa_t *spa, nvlist_t *nv) +static boolean_t +spa_config_valid(spa_t *spa, nvlist_t *config) { - vdev_t *ovd, *rvd = spa->spa_root_vdev; + vdev_t *mrvd, *rvd = spa->spa_root_vdev; + nvlist_t *nv; + + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + + ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); + + /* + * If we're doing a normal import, then build up any additional + * diagnostic information about missing devices in this config. + * We'll pass this up to the user for further processing. + */ + if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { + nvlist_t **child, *nv; + uint64_t idx = 0; + + child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), + KM_SLEEP); + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + vdev_t *mtvd = mrvd->vdev_child[c]; + + if (tvd->vdev_ops == &vdev_missing_ops && + mtvd->vdev_ops != &vdev_missing_ops && + mtvd->vdev_islog) + child[idx++] = vdev_config_generate(spa, mtvd, + B_FALSE, 0); + } + + if (idx) { + VERIFY(nvlist_add_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, child, idx) == 0); + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); + + for (int i = 0; i < idx; i++) + nvlist_free(child[i]); + } + nvlist_free(nv); + kmem_free(child, rvd->vdev_children * sizeof (char **)); + } /* - * Load the original root vdev tree from the passed config. + * Compare the root vdev tree with the information we have + * from the MOS config (mrvd). Check each top-level vdev + * with the corresponding MOS config top-level (mtvd). */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *cvd = rvd->vdev_child[c]; - if (cvd->vdev_islog) - vdev_load_log_state(cvd, ovd->vdev_child[c]); + vdev_t *tvd = rvd->vdev_child[c]; + vdev_t *mtvd = mrvd->vdev_child[c]; + + /* + * Resolve any "missing" vdevs in the current configuration. + * If we find that the MOS config has more accurate information + * about the top-level vdev then use that vdev instead. + */ + if (tvd->vdev_ops == &vdev_missing_ops && + mtvd->vdev_ops != &vdev_missing_ops) { + + if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) + continue; + + /* + * Device specific actions. + */ + if (mtvd->vdev_islog) { + spa_set_log_state(spa, SPA_LOG_CLEAR); + } else { + /* + * XXX - once we have 'readonly' pool + * support we should be able to handle + * missing data devices by transitioning + * the pool to readonly. + */ + continue; + } + + /* + * Swap the missing vdev with the data we were + * able to obtain from the MOS config. + */ + vdev_remove_child(rvd, tvd); + vdev_remove_child(mrvd, mtvd); + + vdev_add_child(rvd, mtvd); + vdev_add_child(mrvd, tvd); + + spa_config_exit(spa, SCL_ALL, FTAG); + vdev_load(mtvd); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + vdev_reopen(rvd); + } else if (mtvd->vdev_islog) { + /* + * Load the slog device's state from the MOS config + * since it's possible that the label does not + * contain the most up-to-date information. + */ + vdev_load_log_state(tvd, mtvd); + vdev_reopen(tvd); + } } - vdev_free(ovd); + vdev_free(mrvd); spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * Ensure we were able to validate the config. + */ + return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); } /* * Check for missing log devices */ -int +static int spa_check_logs(spa_t *spa) { switch (spa->spa_log_state) { @@ -1474,9 +1572,19 @@ if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && sle.sle_data_count <= policy.zrp_maxdata) { + int64_t loss = 0; + verify_ok = B_TRUE; spa->spa_load_txg = spa->spa_uberblock.ub_txg; spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; + + loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; + VERIFY(nvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); + VERIFY(nvlist_add_int64(spa->spa_load_info, + ZPOOL_CONFIG_REWIND_TIME, loss) == 0); + VERIFY(nvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } @@ -1669,7 +1777,7 @@ nvlist_t *nvroot = NULL; vdev_t *rvd; uberblock_t *ub = &spa->spa_uberblock; - uint64_t config_cache_txg = spa->spa_config_txg; + uint64_t children, config_cache_txg = spa->spa_config_txg; int orig_mode = spa->spa_mode; int parse; uint64_t obj; @@ -1768,9 +1876,13 @@ /* * If the vdev guid sum doesn't match the uberblock, we have an - * incomplete configuration. + * incomplete configuration. We first check to see if the pool + * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). + * If it is, defer the vdev_guid_sum check till later so we + * can handle missing vdevs. */ - if (mosconfig && type != SPA_IMPORT_ASSEMBLE && + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, + &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && rvd->vdev_guid_sum != ub->ub_guid_sum) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); @@ -1990,13 +2102,6 @@ spa_config_exit(spa, SCL_ALL, FTAG); /* - * Check the state of the root vdev. If it can't be opened, it - * indicates one or more toplevel vdevs are faulted. - */ - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) - return (ENXIO); - - /* * Load the DDTs (dedup tables). */ error = ddt_load(spa); @@ -2005,16 +2110,12 @@ spa_update_dspace(spa); - if (state != SPA_LOAD_TRYIMPORT) { - error = spa_load_verify(spa); - if (error) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, - error)); - } - /* - * Load the intent log state and check log integrity. If we're - * assembling a pool from a split, the log is not transferred over. + * Validate the config, using the MOS config to fill in any + * information which might be missing. If we fail to validate + * the config then declare the pool unfit for use. If we're + * assembling a pool from a split, the log is not transferred + * over. */ if (type != SPA_IMPORT_ASSEMBLE) { nvlist_t *nvconfig; @@ -2022,17 +2123,37 @@ if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - spa_load_log_state(spa, nvroot); + if (!spa_config_valid(spa, nvconfig)) { + nvlist_free(nvconfig); + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, + ENXIO)); + } nvlist_free(nvconfig); + /* + * Now that we've validate the config, check the state of the + * root vdev. If it can't be opened, it indicates one or + * more toplevel vdevs are faulted. + */ + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + return (ENXIO); + if (spa_check_logs(spa)) { *ereport = FM_EREPORT_ZFS_LOG_REPLAY; return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } } + /* + * We've successfully opened the pool, verify that we're ready + * to start pushing transactions. + */ + if (state != SPA_LOAD_TRYIMPORT) { + if (error = spa_load_verify(spa)) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + error)); + } + if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { dmu_tx_t *tx; @@ -2074,12 +2195,13 @@ * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. * - * If spa_load_verbatim is true, trust the current + * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ if (config_cache_txg != spa->spa_config_txg || - state == SPA_LOAD_IMPORT || spa->spa_load_verbatim || - state == SPA_LOAD_RECOVER) + state == SPA_LOAD_IMPORT || + state == SPA_LOAD_RECOVER || + (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) @@ -2181,9 +2303,6 @@ rewind_error = spa_load_retry(spa, state, mosconfig); } - if (config) - spa_rewind_data_to_nvlist(spa, config); - spa->spa_extreme_rewind = B_FALSE; spa->spa_load_max_txg = UINT64_MAX; @@ -2210,6 +2329,7 @@ nvlist_t **config) { spa_t *spa; + spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; @@ -2233,7 +2353,6 @@ } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { - spa_load_state_t state = SPA_LOAD_OPEN; zpool_rewind_policy_t policy; zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, @@ -2272,9 +2391,13 @@ * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ - if (config != NULL && spa->spa_config) + if (config != NULL && spa->spa_config) { VERIFY(nvlist_dup(spa->spa_config, config, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist(*config, + ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); + } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = error; @@ -2283,7 +2406,6 @@ *spapp = NULL; return (error); } - } spa_open_ref(spa, tag); @@ -2291,6 +2413,15 @@ if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + /* + * If we've recovered the pool, pass back any information we + * gathered while doing the load. + */ + if (state == SPA_LOAD_RECOVER) { + VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); + } + if (locked) { spa->spa_last_open_failed = 0; spa->spa_last_ubsync_txg = 0; @@ -3046,7 +3177,7 @@ spa = spa_add(pname, config, NULL); spa->spa_is_root = B_TRUE; - spa->spa_load_verbatim = B_TRUE; + spa->spa_import_flags = ZFS_IMPORT_VERBATIM; /* * Build up a vdev tree based on the boot device's label config. @@ -3115,43 +3246,10 @@ #endif /* - * Take a pool and insert it into the namespace as if it had been loaded at - * boot. - */ -int -spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) -{ - spa_t *spa; - char *altroot = NULL; - - mutex_enter(&spa_namespace_lock); - if (spa_lookup(pool) != NULL) { - mutex_exit(&spa_namespace_lock); - return (EEXIST); - } - - (void) nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(pool, config, altroot); - - spa->spa_load_verbatim = B_TRUE; - - if (props != NULL) - spa_configfile_set(spa, props, B_FALSE); - - spa_config_sync(spa, B_FALSE, B_TRUE); - - mutex_exit(&spa_namespace_lock); - spa_history_log_version(spa, LOG_POOL_IMPORT); - - return (0); -} - -/* * Import a non-root pool into the system. */ int -spa_import(const char *pool, nvlist_t *config, nvlist_t *props) +spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; char *altroot = NULL; @@ -3171,16 +3269,30 @@ return (EEXIST); } - zpool_get_rewind_policy(config, &policy); - if (policy.zrp_request & ZPOOL_DO_REWIND) - state = SPA_LOAD_RECOVER; - /* * Create and initialize the spa structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(pool, config, altroot); + spa->spa_import_flags = flags; + + /* + * Verbatim import - Take a pool and insert it into the namespace + * as if it had been loaded at boot. + */ + if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { + if (props != NULL) + spa_configfile_set(spa, props, B_FALSE); + + spa_config_sync(spa, B_FALSE, B_TRUE); + + mutex_exit(&spa_namespace_lock); + spa_history_log_version(spa, LOG_POOL_IMPORT); + + return (0); + } + spa_activate(spa, spa_mode_global); /* @@ -3188,6 +3300,10 @@ */ spa_async_suspend(spa); + zpool_get_rewind_policy(config, &policy); + if (policy.zrp_request & ZPOOL_DO_REWIND) + state = SPA_LOAD_RECOVER; + /* * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig * because the user-supplied config is actually the one to trust when @@ -3195,14 +3311,16 @@ */ if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, policy.zrp_request); /* - * Propagate anything learned about failing or best txgs - * back to caller + * Propagate anything learned while loading the pool and pass it + * back to caller (i.e. rewind info, missing devices, etc). */ - spa_rewind_data_to_nvlist(spa, config); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* diff -r 59a6f9b695ef -r b521d551715f usr/src/uts/common/fs/zfs/spa_config.c --- a/usr/src/uts/common/fs/zfs/spa_config.c Wed Jul 28 16:10:22 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/spa_config.c Wed Jul 28 16:57:31 2010 -0700 @@ -304,24 +304,6 @@ mutex_exit(&spa->spa_props_lock); } -/* Add discovered rewind info, if any to the provided nvlist */ -void -spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *tonvl) -{ - int64_t loss = 0; - - if (tonvl == NULL || spa->spa_load_txg == 0) - return; - - VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_TIME, - spa->spa_load_txg_ts) == 0); - if (spa->spa_last_ubsync_txg) - loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; - VERIFY(nvlist_add_int64(tonvl, ZPOOL_CONFIG_REWIND_TIME, loss) == 0); - VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_DATA_ERRORS, - spa->spa_load_data_errors) == 0); -} - /* * Generate the pool's configuration based on the current in-core state. * We infer whether to generate a complete config or just one top-level config @@ -403,8 +385,7 @@ /* * Add the top-level config. We even add this on pools which - * don't support holes in the namespace as older pools will - * just ignore it. + * don't support holes in the namespace. */ vdev_top_config_generate(spa, config); @@ -449,8 +430,6 @@ kmem_free(dds, sizeof (ddt_stat_t)); } - spa_rewind_data_to_nvlist(spa, config); - if (locked) spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); diff -r 59a6f9b695ef -r b521d551715f usr/src/uts/common/fs/zfs/spa_misc.c --- a/usr/src/uts/common/fs/zfs/spa_misc.c Wed Jul 28 16:10:22 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/spa_misc.c Wed Jul 28 16:57:31 2010 -0700 @@ -478,6 +478,9 @@ dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); list_insert_head(&spa->spa_config_list, dp); + VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + if (config != NULL) VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); @@ -516,6 +519,7 @@ list_destroy(&spa->spa_config_list); + nvlist_free(spa->spa_load_info); spa_config_set(spa, NULL); refcount_destroy(&spa->spa_refcount); @@ -886,10 +890,6 @@ */ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); - /* - * If the config changed, notify the scrub that it must restart. - * This will initiate a resilver if needed. - */ if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { config_changed = B_TRUE; spa->spa_config_generation++; diff -r 59a6f9b695ef -r b521d551715f usr/src/uts/common/fs/zfs/sys/spa.h --- a/usr/src/uts/common/fs/zfs/sys/spa.h Wed Jul 28 16:10:22 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/spa.h Wed Jul 28 16:57:31 2010 -0700 @@ -418,8 +418,8 @@ extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, const char *history_str, nvlist_t *zplprops); extern int spa_import_rootpool(char *devpath, char *devid); -extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props); -extern int spa_import_verbatim(const char *, nvlist_t *, nvlist_t *); +extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, + uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(char *pool); extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, @@ -620,7 +620,6 @@ extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); extern boolean_t spa_writeable(spa_t *spa); -extern void spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *to); extern int spa_mode(spa_t *spa); extern uint64_t strtonum(const char *str, char **nptr); diff -r 59a6f9b695ef -r b521d551715f usr/src/uts/common/fs/zfs/sys/spa_impl.h --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h Wed Jul 28 16:10:22 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h Wed Jul 28 16:57:31 2010 -0700 @@ -114,13 +114,14 @@ nvlist_t *spa_config; /* last synced config */ nvlist_t *spa_config_syncing; /* currently syncing config */ nvlist_t *spa_config_splitting; /* config for splitting */ + nvlist_t *spa_load_info; /* info and errors from load */ uint64_t spa_config_txg; /* txg of last config change */ int spa_sync_pass; /* iterate-to-convergence */ pool_state_t spa_state; /* pool state */ int spa_inject_ref; /* injection references */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ - boolean_t spa_load_verbatim; /* load the given config? */ + uint64_t spa_import_flags; /* import specific flags */ taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; metaslab_class_t *spa_normal_class; /* normal data class */ diff -r 59a6f9b695ef -r b521d551715f usr/src/uts/common/fs/zfs/sys/vdev_impl.h --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h Wed Jul 28 16:10:22 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h Wed Jul 28 16:57:31 2010 -0700 @@ -283,6 +283,7 @@ * vdev sync load and sync */ extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd); +extern boolean_t vdev_log_state_valid(vdev_t *vd); extern void vdev_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); diff -r 59a6f9b695ef -r b521d551715f usr/src/uts/common/fs/zfs/vdev.c --- a/usr/src/uts/common/fs/zfs/vdev.c Wed Jul 28 16:10:22 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/vdev.c Wed Jul 28 16:57:31 2010 -0700 @@ -1369,10 +1369,10 @@ nvlist_free(label); /* - * If spa->spa_load_verbatim is true, no need to check the + * If this is a verbatim import, no need to check the * state of the pool. */ - if (!spa->spa_load_verbatim && + if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) return (EBADF); @@ -2064,7 +2064,7 @@ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { - vdev_t *vd; + vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); @@ -2074,6 +2074,8 @@ if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; + /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we @@ -2093,7 +2095,7 @@ * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ - if (!vd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { + if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; @@ -2101,7 +2103,7 @@ * If we reopen the device and it's not dead, only then do we * mark it degraded. */ - vdev_reopen(vd); + vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); @@ -2343,7 +2345,7 @@ */ vd->vdev_forcefault = B_TRUE; - vd->vdev_faulted = vd->vdev_degraded = 0; + vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; @@ -3036,32 +3038,52 @@ /* * Load the state from the original vdev tree (ovd) which * we've retrieved from the MOS config object. If the original - * vdev was offline then we transfer that state to the device - * in the current vdev tree (nvd). + * vdev was offline or faulted then we transfer that state to the + * device in the current vdev tree (nvd). */ void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) { spa_t *spa = nvd->vdev_spa; + ASSERT(nvd->vdev_top->vdev_islog); ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); for (int c = 0; c < nvd->vdev_children; c++) vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); - if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) { + if (nvd->vdev_ops->vdev_op_leaf) { /* - * It would be nice to call vdev_offline() - * directly but the pool isn't fully loaded and - * the txg threads have not been started yet. + * Restore the persistent vdev state */ nvd->vdev_offline = ovd->vdev_offline; - vdev_reopen(nvd->vdev_top); + nvd->vdev_faulted = ovd->vdev_faulted; + nvd->vdev_degraded = ovd->vdev_degraded; + nvd->vdev_removed = ovd->vdev_removed; } } /* + * Determine if a log device has valid content. If the vdev was + * removed or faulted in the MOS config then we know that + * the content on the log device has already been written to the pool. + */ +boolean_t +vdev_log_state_valid(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && + !vd->vdev_removed) + return (B_TRUE); + + for (int c = 0; c < vd->vdev_children; c++) + if (vdev_log_state_valid(vd->vdev_child[c])) + return (B_TRUE); + + return (B_FALSE); +} + +/* * Expand a vdev if possible. */ void diff -r 59a6f9b695ef -r b521d551715f usr/src/uts/common/fs/zfs/zfs_ioctl.c --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c Wed Jul 28 16:10:22 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c Wed Jul 28 16:57:31 2010 -0700 @@ -1206,13 +1206,15 @@ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = EINVAL; - else if (zc->zc_cookie) - error = spa_import_verbatim(zc->zc_name, config, props); else - error = spa_import(zc->zc_name, config, props); - - if (zc->zc_nvlist_dst != 0) - (void) put_nvlist(zc, config); + error = spa_import(zc->zc_name, config, props, zc->zc_cookie); + + if (zc->zc_nvlist_dst != 0) { + int err; + + if ((err = put_nvlist(zc, config)) != 0) + error = err; + } nvlist_free(config); @@ -3847,7 +3849,10 @@ error = spa_open_rewind(zc->zc_name, &spa, FTAG, policy, &config); if (config != NULL) { - (void) put_nvlist(zc, config); + int err; + + if ((err = put_nvlist(zc, config)) != 0) + error = err; nvlist_free(config); } nvlist_free(policy); diff -r 59a6f9b695ef -r b521d551715f usr/src/uts/common/fs/zfs/zil.c --- a/usr/src/uts/common/fs/zfs/zil.c Wed Jul 28 16:10:22 2010 -0700 +++ b/usr/src/uts/common/fs/zfs/zil.c Wed Jul 28 16:57:31 2010 -0700 @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include @@ -640,6 +640,7 @@ { zilog_t *zilog; objset_t *os; + blkptr_t *bp; int error; ASSERT(tx == NULL); @@ -651,6 +652,29 @@ } zilog = dmu_objset_zil(os); + bp = (blkptr_t *)&zilog->zl_header->zh_log; + + /* + * Check the first block and determine if it's on a log device + * which may have been removed or faulted prior to loading this + * pool. If so, there's no point in checking the rest of the log + * as its content should have already been synced to the pool. + */ + if (!BP_IS_HOLE(bp)) { + vdev_t *vd; + boolean_t valid = B_TRUE; + + spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); + vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); + if (vd->vdev_islog && vdev_is_dead(vd)) + valid = vdev_log_state_valid(vd); + spa_config_exit(os->os_spa, SCL_STATE, FTAG); + + if (!valid) { + dmu_objset_rele(os, FTAG); + return (0); + } + } /* * Because tx == NULL, zil_claim_log_block() will not actually claim diff -r 59a6f9b695ef -r b521d551715f usr/src/uts/common/sys/fs/zfs.h --- a/usr/src/uts/common/sys/fs/zfs.h Wed Jul 28 16:10:22 2010 -0700 +++ b/usr/src/uts/common/sys/fs/zfs.h Wed Jul 28 16:57:31 2010 -0700 @@ -486,6 +486,8 @@ #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ +#define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ +#define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such @@ -811,6 +813,14 @@ #define ZFS_OFFLINE_TEMPORARY 0x1 /* + * Flags for ZFS_IOC_POOL_IMPORT + */ +#define ZFS_IMPORT_NORMAL 0x0 +#define ZFS_IMPORT_VERBATIM 0x1 +#define ZFS_IMPORT_ANY_HOST 0x2 +#define ZFS_IMPORT_MISSING_LOG 0x4 + +/* * Sysevent payload members. ZFS will generate the following sysevents with the * given payloads: *