usr/src/uts/common/fs/zfs/zfs_fm.c
author eschrock
Tue, 05 Sep 2006 11:37:36 -0700
changeset 2676 5cee47eddab6
parent 1955 24fe75aaee9a
child 4451 24fbf2d7a5d7
permissions -rw-r--r--
PSARC 2006/486 ZFS canmount property PSARC 2006/497 ZFS create time properties PSARC 2006/502 ZFS get all datasets PSARC 2006/504 ZFS user properties 6269805 properties should be set via an nvlist. 6281585 user defined properties 6349494 'zfs list' output annoying for even moderately long dataset names 6366244 'canmount' option for container-like functionality 6367103 create-time properties 6416639 RFE: provide zfs get -a 6437808 ZFS module version should match on-disk version 6454551 'zfs create -b blocksize filesystem' should fail. 6457478 unrecognized character in error message with 'zpool create -R' command 6457865 missing device name in the error message of 'zpool clear' command 6458571 zfs_ioc_set_prop() doesn't validate input 6458614 zfs ACL #defines should use prefix 6458638 get_configs() accesses bogus memory 6458678 zvol functions should be moved out of zfs_ioctl.h 6458683 zfs_cmd_t could use more cleanup 6458691 common routines to manage zfs_cmd_t nvlists 6460398 zpool import cores on zfs_prop_get 6461029 zpool status -x noexisting-pool has incorrect error message. 6461223 index translations should live with property definitions 6461424 zpool_unmount_datasets() has some busted logic 6461427 zfs_realloc() would be useful 6461757 'zpool status' can report the wrong number of persistent errors 6461784 recursive zfs_snapshot() leaks memory

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>

#include <sys/fm/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
#include <sys/sysevent.h>

/*
 * This general routine is responsible for generating all the different ZFS
 * ereports.  The payload is dependent on the class, and which arguments are
 * supplied to the function:
 *
 * 	EREPORT			POOL	VDEV	IO
 * 	block			X	X	X
 * 	data			X		X
 * 	device			X	X
 * 	pool			X
 *
 * If we are in a loading state, all errors are chained together by the same
 * SPA-wide ENA.
 *
 * For isolated I/O requests, we get the ENA from the zio_t. The propagation
 * gets very complicated due to RAID-Z, gang blocks, and vdev caching.  We want
 * to chain together all ereports associated with a logical piece of data.  For
 * read I/Os, there  are basically three 'types' of I/O, which form a roughly
 * layered diagram:
 *
 *      +---------------+
 * 	| Aggregate I/O |	No associated logical data or device
 * 	+---------------+
 *              |
 *              V
 * 	+---------------+	Reads associated with a piece of logical data.
 * 	|   Read I/O    |	This includes reads on behalf of RAID-Z,
 * 	+---------------+       mirrors, gang blocks, retries, etc.
 *              |
 *              V
 * 	+---------------+	Reads associated with a particular device, but
 * 	| Physical I/O  |	no logical data.  Issued as part of vdev caching
 * 	+---------------+	and I/O aggregation.
 *
 * Note that 'physical I/O' here is not the same terminology as used in the rest
 * of ZIO.  Typically, 'physical I/O' simply means that there is no attached
 * blockpointer.  But I/O with no associated block pointer can still be related
 * to a logical piece of data (i.e. RAID-Z requests).
 *
 * Purely physical I/O always have unique ENAs.  They are not related to a
 * particular piece of logical data, and therefore cannot be chained together.
 * We still generate an ereport, but the DE doesn't correlate it with any
 * logical piece of data.  When such an I/O fails, the delegated I/O requests
 * will issue a retry, which will trigger the 'real' ereport with the correct
 * ENA.
 *
 * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
 * When a new logical I/O is issued, we set this to point to itself.  Child I/Os
 * then inherit this pointer, so that when it is first set subsequent failures
 * will use the same ENA.  If a physical I/O is issued (by passing the
 * ZIO_FLAG_NOBOOKMARK flag), then this pointer is reset, guaranteeing that a
 * unique ENA will be generated.  For an aggregate I/O, this pointer is set to
 * NULL, and no ereport will be generated (since it doesn't actually correspond
 * to any particular device or piece of data).
 */
void
zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
    uint64_t stateoroffset, uint64_t size)
{
#ifdef _KERNEL
	nvlist_t *ereport, *detector;
	uint64_t ena;
	char class[64];

	/*
	 * If we are doing a spa_tryimport(), ignore errors.
	 */
	if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
		return;

	/*
	 * If we are in the middle of opening a pool, and the previous attempt
	 * failed, don't bother logging any new ereports - we're just going to
	 * get the same diagnosis anyway.
	 */
	if (spa->spa_load_state != SPA_LOAD_NONE &&
	    spa->spa_last_open_failed)
		return;

	/*
	 * Ignore any errors from I/Os that we are going to retry anyway - we
	 * only generate errors from the final failure.
	 */
	if (zio && zio_should_retry(zio))
		return;

	/*
	 * If this is not a read or write zio, ignore the error.  This can occur
	 * if the DKIOCFLUSHWRITECACHE ioctl fails.
	 */
	if (zio && zio->io_type != ZIO_TYPE_READ &&
	    zio->io_type != ZIO_TYPE_WRITE)
		return;

	if ((ereport = fm_nvlist_create(NULL)) == NULL)
		return;

	if ((detector = fm_nvlist_create(NULL)) == NULL) {
		fm_nvlist_destroy(ereport, FM_NVA_FREE);
		return;
	}

	/*
	 * Serialize ereport generation
	 */
	mutex_enter(&spa->spa_errlist_lock);

	/*
	 * Determine the ENA to use for this event.  If we are in a loading
	 * state, use a SPA-wide ENA.  Otherwise, if we are in an I/O state, use
	 * a root zio-wide ENA.  Otherwise, simply use a unique ENA.
	 */
	if (spa->spa_load_state != SPA_LOAD_NONE) {
		if (spa->spa_ena == 0)
			spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
		ena = spa->spa_ena;
	} else if (zio != NULL && zio->io_logical != NULL) {
		if (zio->io_logical->io_ena == 0)
			zio->io_logical->io_ena =
			    fm_ena_generate(0, FM_ENA_FMT1);
		ena = zio->io_logical->io_ena;
	} else {
		ena = fm_ena_generate(0, FM_ENA_FMT1);
	}

	/*
	 * Construct the full class, detector, and other standard FMA fields.
	 */
	(void) snprintf(class, sizeof (class), "%s.%s",
	    ZFS_ERROR_CLASS, subclass);

	fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
	    vd != NULL ? vd->vdev_guid : 0);

	fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);

	/*
	 * Construct the per-ereport payload, depending on which parameters are
	 * passed in.
	 */

	/*
	 * Generic payload members common to all ereports.
	 *
	 * The direct reference to spa_name is used rather than spa_name()
	 * because of the asynchronous nature of the zio pipeline.  spa_name()
	 * asserts that the config lock is held in some form.  This is always
	 * the case in I/O context, but because the check for RW_WRITER compares
	 * against 'curthread', we may be in an asynchronous context and blow
	 * this assert.  Rather than loosen this assert, we acknowledge that all
	 * contexts in which this function is called (pool open, I/O) are safe,
	 * and dereference the name directly.
	 */
	fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL,
	    DATA_TYPE_STRING, spa->spa_name, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
	    DATA_TYPE_UINT64, spa_guid(spa),
	    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
	    spa->spa_load_state, NULL);

	if (vd != NULL) {
		vdev_t *pvd = vd->vdev_parent;

		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    DATA_TYPE_UINT64, vd->vdev_guid,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
		    DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
		if (vd->vdev_path)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
			    DATA_TYPE_STRING, vd->vdev_path, NULL);
		if (vd->vdev_devid)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
			    DATA_TYPE_STRING, vd->vdev_devid, NULL);

		if (pvd != NULL) {
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
			    DATA_TYPE_UINT64, pvd->vdev_guid,
			    FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
			    DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
			    NULL);
			if (pvd->vdev_path)
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
				    DATA_TYPE_STRING, vd->vdev_path, NULL);
			if (pvd->vdev_devid)
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
				    DATA_TYPE_STRING, pvd->vdev_devid, NULL);
		}
	}

	if (zio != NULL) {
		/*
		 * Payload common to all I/Os.
		 */
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
		    DATA_TYPE_INT32, zio->io_error, NULL);

		/*
		 * If the 'size' parameter is non-zero, it indicates this is a
		 * RAID-Z or other I/O where the physical offset and length are
		 * provided for us, instead of within the zio_t.
		 */
		if (vd != NULL) {
			if (size)
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
				    DATA_TYPE_UINT64, stateoroffset,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
				    DATA_TYPE_UINT64, size, NULL);
			else
				fm_payload_set(ereport,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
				    DATA_TYPE_UINT64, zio->io_offset,
				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
				    DATA_TYPE_UINT64, zio->io_size, NULL);
		}

		/*
		 * Payload for I/Os with corresponding logical information.
		 */
		if (zio->io_logical != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
			    DATA_TYPE_UINT64,
			    zio->io_logical->io_bookmark.zb_objset,
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
			    DATA_TYPE_UINT64,
			    zio->io_logical->io_bookmark.zb_object,
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
			    DATA_TYPE_INT32,
			    zio->io_logical->io_bookmark.zb_level,
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
			    DATA_TYPE_UINT64,
			    zio->io_logical->io_bookmark.zb_blkid, NULL);
	} else if (vd != NULL) {
		/*
		 * If we have a vdev but no zio, this is a device fault, and the
		 * 'stateoroffset' parameter indicates the previous state of the
		 * vdev.
		 */
		fm_payload_set(ereport,
		    FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
		    DATA_TYPE_UINT64, stateoroffset, NULL);
	}
	mutex_exit(&spa->spa_errlist_lock);

	fm_ereport_post(ereport, EVCH_SLEEP);

	fm_nvlist_destroy(ereport, FM_NVA_FREE);
	fm_nvlist_destroy(detector, FM_NVA_FREE);
#endif
}

/*
 * The 'resource.fs.zfs.ok' event is an internal signal that the associated
 * resource (pool or disk) has been identified by ZFS as healthy.  This will
 * then trigger the DE to close the associated case, if any.
 */
void
zfs_post_ok(spa_t *spa, vdev_t *vd)
{
#ifdef _KERNEL
	nvlist_t *resource;
	char class[64];

	if ((resource = fm_nvlist_create(NULL)) == NULL)
		return;

	(void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE,
	    ZFS_ERROR_CLASS, FM_RESOURCE_OK);
	VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0);
	VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0);
	VERIFY(nvlist_add_uint64(resource,
	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0);
	if (vd)
		VERIFY(nvlist_add_uint64(resource,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0);

	fm_ereport_post(resource, EVCH_SLEEP);

	fm_nvlist_destroy(resource, FM_NVA_FREE);
#endif
}