PSARC 2008/486 Intent log replay failure handling
6707530 log device failure needs some work
--- a/usr/src/cmd/fm/dicts/ZFS.dict Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/cmd/fm/dicts/ZFS.dict Mon Aug 04 20:36:57 2008 -0700
@@ -45,3 +45,4 @@
fault.fs.zfs.vdev.checksum=15
fault.fs.zfs.io_failure_wait=16
fault.fs.zfs.io_failure_continue=17
+fault.fs.zfs.log_replay=18
--- a/usr/src/cmd/fm/dicts/ZFS.po Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/cmd/fm/dicts/ZFS.po Mon Aug 04 20:36:57 2008 -0700
@@ -297,3 +297,20 @@
msgstr "Read and write I/Os cannot be serviced."
msgid "ZFS-8000-JQ.action"
msgstr "Make sure the affected devices are connected, then run\n 'zpool clear'."
+#
+# code: ZFS-8000-K4
+# keys: fault.fs.zfs.log_replay
+#
+msgid "ZFS-8000-K4.type"
+msgstr "Error"
+msgid "ZFS-8000-K4.severity"
+msgstr "Major"
+msgid "ZFS-8000-K4.description"
+msgstr "A ZFS intent log device could not be read. Refer to %s for more information."
+msgid "ZFS-8000-K4.response"
+msgstr "No automated response will be taken."
+msgid "ZFS-8000-K4.impact"
+msgstr "The intent log(s) cannot be replayed."
+msgid "ZFS-8000-K4.action"
+msgstr "Either restore the affected device(s) and run 'zpool online',\n or ignore the intent log records by running 'zpool clear'."
+
--- a/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs_de.c Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/cmd/fm/modules/common/zfs-diagnosis/zfs_de.c Mon Aug 04 20:36:57 2008 -0700
@@ -563,6 +563,12 @@
}
zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool", B_TRUE);
+ } else if (fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) {
+ /*
+ * Pool level fault for reading the intent logs.
+ */
+ zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay", B_TRUE);
} else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) {
/*
* Device fault. If this occurred during pool open, then defer
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c Mon Aug 04 20:36:57 2008 -0700
@@ -1123,6 +1123,24 @@
case VDEV_AUX_BAD_LABEL:
aux = "BAD_LABEL";
break;
+ case VDEV_AUX_VERSION_NEWER:
+ aux = "VERS_NEWER";
+ break;
+ case VDEV_AUX_VERSION_OLDER:
+ aux = "VERS_OLDER";
+ break;
+ case VDEV_AUX_SPARED:
+ aux = "SPARED";
+ break;
+ case VDEV_AUX_ERR_EXCEEDED:
+ aux = "ERR_EXCEEDED";
+ break;
+ case VDEV_AUX_IO_FAILURE:
+ aux = "IO_FAILURE";
+ break;
+ case VDEV_AUX_BAD_LOG:
+ aux = "BAD_LOG";
+ break;
default:
aux = "UNKNOWN";
break;
--- a/usr/src/cmd/zpool/zpool_main.c Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/cmd/zpool/zpool_main.c Mon Aug 04 20:36:57 2008 -0700
@@ -1142,16 +1142,23 @@
(void) printf(gettext("status: The pool is formatted using an "
"incompatible version.\n"));
break;
+
case ZPOOL_STATUS_HOSTID_MISMATCH:
(void) printf(gettext("status: The pool was last accessed by "
"another system.\n"));
break;
+
case ZPOOL_STATUS_FAULTED_DEV_R:
case ZPOOL_STATUS_FAULTED_DEV_NR:
(void) printf(gettext("status: One or more devices are "
"faulted.\n"));
break;
+ case ZPOOL_STATUS_BAD_LOG:
+ (void) printf(gettext("status: An intent log record cannot be "
+ "read.\n"));
+ break;
+
default:
/*
* No other status can be seen when importing pools.
@@ -2553,7 +2560,7 @@
pool = argv[1];
device = argc == 3 ? argv[2] : NULL;
- if ((zhp = zpool_open(g_zfs, pool)) == NULL)
+ if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL)
return (1);
if (zpool_clear(zhp, device) != 0)
@@ -2840,6 +2847,10 @@
(void) printf(gettext("experienced I/O failures"));
break;
+ case VDEV_AUX_BAD_LOG:
+ (void) printf(gettext("bad intent log"));
+ break;
+
default:
(void) printf(gettext("corrupted data"));
break;
@@ -3130,6 +3141,17 @@
"are connected, then run 'zpool clear'.\n"));
break;
+ case ZPOOL_STATUS_BAD_LOG: /* an intent log chain could not be read */
+ (void) printf(gettext("status: An intent log record "
+ "could not be read.\n"
+ "\tWaiting for administrator intervention to fix the "
+ "faulted pool.\n"));
+ (void) printf(gettext("action: Either restore the affected "
+ "device(s) and run 'zpool online',\n"
+ "\tor ignore the intent log records by running "
+ "'zpool clear'.\n"));
+ break;
+
default:
/*
* The remaining errors can't actually be generated, yet.
--- a/usr/src/lib/libzfs/common/libzfs.h Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/lib/libzfs/common/libzfs.h Mon Aug 04 20:36:57 2008 -0700
@@ -260,6 +260,7 @@
ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */
ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */
ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */
+ ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */
/*
* The following are not faults per se, but still an error possibly
--- a/usr/src/lib/libzfs/common/libzfs_pool.c Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c Mon Aug 04 20:36:57 2008 -0700
@@ -137,8 +137,21 @@
uint64_t value;
zprop_source_t source;
- if (zhp->zpool_props == NULL && zpool_get_all_props(zhp))
+ if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) {
+ /*
+ * zpool_get_all_props() has most likely failed because
+ * the pool is faulted, but if all we need is the top level
+ * vdev's guid then get it from the zhp config nvlist.
+ */
+ if ((prop == ZPOOL_PROP_GUID) &&
+ (nvlist_lookup_nvlist(zhp->zpool_config,
+ ZPOOL_CONFIG_VDEV_TREE, &nv) == 0) &&
+ (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value)
+ == 0)) {
+ return (value);
+ }
return (zpool_prop_default_numeric(prop));
+ }
nvl = zhp->zpool_props;
if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) {
@@ -169,7 +182,7 @@
case VDEV_STATE_REMOVED:
return (gettext("REMOVED"));
case VDEV_STATE_CANT_OPEN:
- if (aux == VDEV_AUX_CORRUPT_DATA)
+ if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG)
return (gettext("FAULTED"));
else
return (gettext("UNAVAIL"));
--- a/usr/src/lib/libzfs/common/libzfs_status.c Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/lib/libzfs/common/libzfs_status.c Mon Aug 04 20:36:57 2008 -0700
@@ -64,7 +64,8 @@
"ZFS-8000-A5",
"ZFS-8000-EY",
"ZFS-8000-HC",
- "ZFS-8000-JQ"
+ "ZFS-8000-JQ",
+ "ZFS-8000-K4",
};
#define NMSGID (sizeof (zfs_msgid_table) / sizeof (zfs_msgid_table[0]))
@@ -243,6 +244,14 @@
}
/*
+ * Could not read a log.
+ */
+ if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
+ vs->vs_aux == VDEV_AUX_BAD_LOG) {
+ return (ZPOOL_STATUS_BAD_LOG);
+ }
+
+ /*
* Bad devices in non-replicated config.
*/
if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
--- a/usr/src/lib/libzfs_jni/common/libzfs_jni_pool.c Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/lib/libzfs_jni/common/libzfs_jni_pool.c Mon Aug 04 20:36:57 2008 -0700
@@ -133,6 +133,7 @@
{ ZPOOL_STATUS_HOSTID_MISMATCH, "ZPOOL_STATUS_HOSTID_MISMATCH" },
{ ZPOOL_STATUS_FAULTED_DEV_R, "ZPOOL_STATUS_FAULTED_DEV_R" },
{ ZPOOL_STATUS_FAULTED_DEV_NR, "ZPOOL_STATUS_FAULTED_DEV_NR" },
+ { ZPOOL_STATUS_BAD_LOG, "ZPOOL_STATUS_BAD_LOG" },
{ ZPOOL_STATUS_VERSION_OLDER, "ZPOOL_STATUS_VERSION_OLDER" },
{ ZPOOL_STATUS_RESILVERING, "ZPOOL_STATUS_RESILVERING" },
{ ZPOOL_STATUS_OFFLINE_DEV, "ZPOOL_STATUS_OFFLINE_DEV" },
--- a/usr/src/uts/common/fs/zfs/arc.c Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/arc.c Mon Aug 04 20:36:57 2008 -0700
@@ -2338,9 +2338,6 @@
if (HDR_IN_HASH_TABLE(hdr))
buf_hash_remove(hdr);
freeable = refcount_is_zero(&hdr->b_refcnt);
- /* convert checksum errors into IO errors */
- if (zio->io_error == ECKSUM)
- zio->io_error = EIO;
}
/*
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c Mon Aug 04 20:36:57 2008 -0700
@@ -207,6 +207,9 @@
ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
if (err) {
kmem_free(osi, sizeof (objset_impl_t));
+ /* convert checksum errors into IO errors */
+ if (err == ECKSUM)
+ err = EIO;
return (err);
}
osi->os_phys = osi->os_phys_buf->b_data;
--- a/usr/src/uts/common/fs/zfs/spa.c Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/spa.c Mon Aug 04 20:36:57 2008 -0700
@@ -956,6 +956,32 @@
}
/*
+ * Check for missing log devices; returns 1 if a log chain check fails, else 0.
+ */
+int
+spa_check_logs(spa_t *spa)
+{
+ switch (spa->spa_log_state) {
+ case SPA_LOG_MISSING:
+ /* FALLTHROUGH: need to recheck in case slog has been restored */
+ case SPA_LOG_UNKNOWN:
+ if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
+ DS_FIND_CHILDREN)) {
+ spa->spa_log_state = SPA_LOG_MISSING; /* remembered for a later clear */
+ return (1);
+ }
+ break;
+
+ case SPA_LOG_CLEAR: /* set by the pool clear ioctl when the state was MISSING */
+ (void) dmu_objset_find(spa->spa_name, zil_clear_log_chain, NULL,
+ DS_FIND_CHILDREN);
+ break;
+ } /* SPA_LOG_GOOD matches no case and falls straight through */
+ spa->spa_log_state = SPA_LOG_GOOD;
+ return (0); /* chains were checked without error, or were just cleared */
+}
+
+/*
* Load an existing storage pool, using the pool's builtin spa_config as a
* source of configuration information.
*/
@@ -971,6 +997,7 @@
uint64_t version;
zio_t *zio;
uint64_t autoreplace = 0;
+ char *ereport = FM_EREPORT_ZFS_POOL;
spa->spa_load_state = state;
@@ -1259,6 +1286,15 @@
spa_config_exit(spa, FTAG);
}
+ if (spa_check_logs(spa)) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LOG);
+ error = ENXIO;
+ ereport = FM_EREPORT_ZFS_LOG_REPLAY;
+ goto out;
+ }
+
+
spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
@@ -1368,7 +1404,7 @@
out:
spa->spa_minref = refcount_count(&spa->spa_refcount);
if (error && error != EBADF)
- zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
+ zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
spa->spa_load_state = SPA_LOAD_NONE;
spa->spa_ena = 0;
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h Mon Aug 04 20:36:57 2008 -0700
@@ -80,6 +80,13 @@
char *scd_path;
} spa_config_dirent_t;
+typedef enum spa_log_state {
+ SPA_LOG_UNKNOWN = 0, /* unknown log state; spa_check_logs() walks the chains */
+ SPA_LOG_MISSING, /* missing log(s): a chain check returned an error */
+ SPA_LOG_CLEAR, /* clear the log(s) on the next spa_check_logs() */
+ SPA_LOG_GOOD, /* log(s) are good: checked without error, or cleared */
+} spa_log_state_t;
+
struct spa {
/*
* Fields protected by spa_namespace_lock.
@@ -159,6 +166,7 @@
boolean_t spa_import_faulted; /* allow faulted vdevs */
boolean_t spa_is_root; /* pool is root */
int spa_minref; /* num refs when first opened */
+ spa_log_state_t spa_log_state; /* log state */
/*
* spa_refcnt & spa_config_lock must be the last elements
* because refcount_t changes size based on compilation options.
--- a/usr/src/uts/common/fs/zfs/sys/zil.h Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h Mon Aug 04 20:36:57 2008 -0700
@@ -362,6 +362,8 @@
extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid);
extern int zil_claim(char *osname, void *txarg);
+extern int zil_check_log_chain(char *osname, void *txarg);
+extern int zil_clear_log_chain(char *osname, void *txarg);
extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
extern void zil_clean(zilog_t *zilog);
extern int zil_is_committed(zilog_t *zilog);
--- a/usr/src/uts/common/fs/zfs/zfs_acl.c Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_acl.c Mon Aug 04 20:36:57 2008 -0700
@@ -972,6 +972,9 @@
if (error != 0) {
zfs_acl_free(aclp);
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = EIO;
return (error);
}
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c Mon Aug 04 20:36:57 2008 -0700
@@ -2669,6 +2669,21 @@
uint64_t txg;
int error;
+ /*
+ * On zpool clear we also fix up missing slogs
+ */
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_lookup(zc->zc_name);
+ if (spa == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (EIO);
+ }
+ if (spa->spa_log_state == SPA_LOG_MISSING) {
+ /* we need to let spa_open/spa_load clear the chains */
+ spa->spa_log_state = SPA_LOG_CLEAR;
+ }
+ mutex_exit(&spa_namespace_lock);
+
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
return (error);
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c Mon Aug 04 20:36:57 2008 -0700
@@ -501,8 +501,12 @@
error = mappedread(vp, nbytes, uio);
else
error = dmu_read_uio(os, zp->z_id, uio, nbytes);
- if (error)
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = EIO;
break;
+ }
n -= nbytes;
}
@@ -3897,6 +3901,9 @@
if (err) {
/* On error, toss the entire kluster */
pvn_read_done(pp, B_ERROR);
+ /* convert checksum errors into IO errors */
+ if (err == ECKSUM)
+ err = EIO;
return (err);
}
cur_pp = cur_pp->p_next;
--- a/usr/src/uts/common/fs/zfs/zil.c Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zil.c Mon Aug 04 20:36:57 2008 -0700
@@ -505,7 +505,7 @@
error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
if (error) {
- cmn_err(CE_WARN, "can't process intent log for %s", osname);
+ cmn_err(CE_WARN, "can't open objset for %s", osname);
return (0);
}
@@ -532,6 +532,83 @@
return (0);
}
+/*
+ * Check the log of the objset named osname by walking the log chain.
+ * Checksum errors are ok as they indicate the end of the chain.
+ * Any other error (no device or read failure) is returned to the caller.
+ */
+/* ARGSUSED */
+int
+zil_check_log_chain(char *osname, void *txarg)
+{
+ zilog_t *zilog;
+ zil_header_t *zh;
+ blkptr_t blk;
+ arc_buf_t *abuf;
+ objset_t *os;
+ char *lrbuf;
+ zil_trailer_t *ztp;
+ int error;
+
+ error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
+ if (error) {
+ cmn_err(CE_WARN, "can't open objset for %s", osname);
+ return (0); /* warn only: an open failure is not reported as a bad log */
+ }
+
+ zilog = dmu_objset_zil(os);
+ zh = zil_header_in_syncing_context(zilog);
+ blk = zh->zh_log; /* head of the chain, copied so the header is not modified */
+ if (BP_IS_HOLE(&blk)) {
+ dmu_objset_close(os);
+ return (0); /* no chain */
+ }
+
+ for (;;) { /* the only exit is a failed block read */
+ error = zil_read_log_block(zilog, &blk, &abuf);
+ if (error)
+ break;
+ lrbuf = abuf->b_data;
+ ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1; /* last trailer-sized slot */
+ blk = ztp->zit_next_blk; /* advance to the next block in the chain */
+ VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+ }
+ dmu_objset_close(os);
+ if (error == ECKSUM)
+ return (0); /* normal end of chain */
+ return (error); /* any other read failure is passed up unchanged */
+}
+
+/*
+ * Clear a log chain: zero the zh_log block pointer of the objset named osname.
+ */
+/* ARGSUSED */
+int
+zil_clear_log_chain(char *osname, void *txarg)
+{
+ zilog_t *zilog;
+ zil_header_t *zh;
+ objset_t *os;
+ dmu_tx_t *tx;
+ int error;
+
+ error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
+ if (error) {
+ cmn_err(CE_WARN, "can't open objset for %s", osname);
+ return (0); /* warn only: the open error is not returned to the caller */
+ }
+
+ zilog = dmu_objset_zil(os);
+ tx = dmu_tx_create(zilog->zl_os);
+ (void) dmu_tx_assign(tx, TXG_WAIT); /* NOTE(review): result ignored; confirm it cannot fail here */
+ zh = zil_header_in_syncing_context(zilog);
+ BP_ZERO(&zh->zh_log); /* drop the head pointer so no chain remains to walk */
+ dsl_dataset_dirty(dmu_objset_ds(os), tx); /* presumably so the zeroed header is written out; TODO confirm */
+ dmu_tx_commit(tx);
+ dmu_objset_close(os);
+ return (0); /* always 0 */
+}
+
static int
zil_vdev_compare(const void *x1, const void *x2)
{
--- a/usr/src/uts/common/fs/zfs/zvol.c Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zvol.c Mon Aug 04 20:36:57 2008 -0700
@@ -1284,8 +1284,12 @@
dmu_tx_commit(tx);
}
}
- if (error)
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = EIO;
break;
+ }
off += size;
addr += size;
resid -= size;
@@ -1388,8 +1392,12 @@
bytes = volsize - uio->uio_loffset;
error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
- if (error)
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = EIO;
break;
+ }
}
zfs_range_unlock(rl);
return (error);
--- a/usr/src/uts/common/sys/fm/fs/zfs.h Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/uts/common/sys/fm/fs/zfs.h Mon Aug 04 20:36:57 2008 -0700
@@ -47,6 +47,7 @@
#define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label"
#define FM_EREPORT_ZFS_IO_FAILURE "io_failure"
#define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure"
+#define FM_EREPORT_ZFS_LOG_REPLAY "log_replay"
#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool"
#define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode"
--- a/usr/src/uts/common/sys/fs/zfs.h Mon Aug 04 19:27:44 2008 -0700
+++ b/usr/src/uts/common/sys/fs/zfs.h Mon Aug 04 20:36:57 2008 -0700
@@ -108,7 +108,7 @@
/*
* Pool properties are identified by these constants and must be added to the
- * end of this list to ensure that external conumsers are not affected
+ * end of this list to ensure that external consumers are not affected
* by the change. If you make any changes to this list, be sure to update
* the property table in usr/src/common/zfs/zpool_prop.c.
*/
@@ -409,7 +409,8 @@
VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */
VDEV_AUX_SPARED, /* hot spare used in another pool */
VDEV_AUX_ERR_EXCEEDED, /* too many errors */
- VDEV_AUX_IO_FAILURE /* experienced I/O failure */
+ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */
+ VDEV_AUX_BAD_LOG /* cannot read log chain(s) */
} vdev_aux_t;
/*