usr/src/uts/common/fs/zfs/zio.c
changeset 7754 b80e4842ad54
parent 7181 8d299641aa23
child 7872 40a9434212f6
equal deleted inserted replaced
7753:ebbac916a413 7754:b80e4842ad54
    21 /*
    21 /*
    22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
    22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
    23  * Use is subject to license terms.
    23  * Use is subject to license terms.
    24  */
    24  */
    25 
    25 
    26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
       
    27 
       
    28 #include <sys/zfs_context.h>
    26 #include <sys/zfs_context.h>
    29 #include <sys/fm/fs/zfs.h>
    27 #include <sys/fm/fs/zfs.h>
    30 #include <sys/spa.h>
    28 #include <sys/spa.h>
    31 #include <sys/txg.h>
    29 #include <sys/txg.h>
    32 #include <sys/spa_impl.h>
    30 #include <sys/spa_impl.h>
    59  * ==========================================================================
    57  * ==========================================================================
    60  */
    58  */
    61 char *zio_type_name[ZIO_TYPES] = {
    59 char *zio_type_name[ZIO_TYPES] = {
    62 	"null", "read", "write", "free", "claim", "ioctl" };
    60 	"null", "read", "write", "free", "claim", "ioctl" };
    63 
    61 
    64 /* Force an allocation failure when non-zero */
    62 #define	SYNC_PASS_DEFERRED_FREE	1	/* defer frees after this pass */
    65 uint16_t zio_zil_fail_shift = 0;
    63 #define	SYNC_PASS_DONT_COMPRESS	4	/* don't compress after this pass */
    66 uint16_t zio_io_fail_shift = 0;
    64 #define	SYNC_PASS_REWRITE	1	/* rewrite new bps after this pass */
    67 
       
    68 /* Enable/disable the write-retry logic */
       
    69 int zio_write_retry = 1;
       
    70 
       
    71 /* Taskq to handle reissuing of I/Os */
       
    72 taskq_t *zio_taskq;
       
    73 int zio_resume_threads = 4;
       
    74 
       
    75 typedef struct zio_sync_pass {
       
    76 	int	zp_defer_free;		/* defer frees after this pass */
       
    77 	int	zp_dontcompress;	/* don't compress after this pass */
       
    78 	int	zp_rewrite;		/* rewrite new bps after this pass */
       
    79 } zio_sync_pass_t;
       
    80 
       
    81 zio_sync_pass_t zio_sync_pass = {
       
    82 	1,	/* zp_defer_free */
       
    83 	4,	/* zp_dontcompress */
       
    84 	1,	/* zp_rewrite */
       
    85 };
       
    86 
       
    87 static boolean_t zio_io_should_fail(uint16_t);
       
    88 
    65 
    89 /*
    66 /*
    90  * ==========================================================================
    67  * ==========================================================================
    91  * I/O kmem caches
    68  * I/O kmem caches
    92  * ==========================================================================
    69  * ==========================================================================
    98 #ifdef _KERNEL
    75 #ifdef _KERNEL
    99 extern vmem_t *zio_alloc_arena;
    76 extern vmem_t *zio_alloc_arena;
   100 #endif
    77 #endif
   101 
    78 
   102 /*
    79 /*
   103  * Determine if we are allowed to issue the IO based on the
    80  * An allocating zio is one that either currently has the DVA allocate
   104  * pool state. If we must wait then block until we are told
    81  * stage set or will have it later in its lifetime.
   105  * that we may continue.
       
   106  */
       
   107 #define	ZIO_ENTER(spa) {						\
       
   108 	if (spa->spa_state == POOL_STATE_IO_FAILURE) {			\
       
   109 		mutex_enter(&spa->spa_zio_lock);			\
       
   110 		while (spa->spa_state == POOL_STATE_IO_FAILURE)		\
       
   111 			cv_wait(&spa->spa_zio_cv, &spa->spa_zio_lock);	\
       
   112 		mutex_exit(&spa->spa_zio_lock);				\
       
   113 	}								\
       
   114 }
       
   115 
       
   116 /*
       
   117  * An allocation zio is one that either currently has the DVA allocate
       
   118  * stage set or will have it later in it's lifetime.
       
   119  */
    82  */
   120 #define	IO_IS_ALLOCATING(zio) \
    83 #define	IO_IS_ALLOCATING(zio) \
   121 	((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
    84 	((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
   122 
    85 
   123 void
    86 void
   127 	vmem_t *data_alloc_arena = NULL;
    90 	vmem_t *data_alloc_arena = NULL;
   128 
    91 
   129 #ifdef _KERNEL
    92 #ifdef _KERNEL
   130 	data_alloc_arena = zio_alloc_arena;
    93 	data_alloc_arena = zio_alloc_arena;
   131 #endif
    94 #endif
   132 
       
   133 	zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
    95 	zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
   134 	    NULL, NULL, NULL, NULL, NULL, 0);
    96 	    NULL, NULL, NULL, NULL, NULL, 0);
   135 
    97 
   136 	/*
    98 	/*
   137 	 * For small buffers, we want a cache for each multiple of
    99 	 * For small buffers, we want a cache for each multiple of
   163 
   125 
   164 			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
   126 			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
   165 			zio_data_buf_cache[c] = kmem_cache_create(name, size,
   127 			zio_data_buf_cache[c] = kmem_cache_create(name, size,
   166 			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
   128 			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
   167 			    KMC_NODEBUG);
   129 			    KMC_NODEBUG);
   168 
       
   169 		}
   130 		}
   170 	}
   131 	}
   171 
   132 
   172 	while (--c != 0) {
   133 	while (--c != 0) {
   173 		ASSERT(zio_buf_cache[c] != NULL);
   134 		ASSERT(zio_buf_cache[c] != NULL);
   176 
   137 
   177 		ASSERT(zio_data_buf_cache[c] != NULL);
   138 		ASSERT(zio_data_buf_cache[c] != NULL);
   178 		if (zio_data_buf_cache[c - 1] == NULL)
   139 		if (zio_data_buf_cache[c - 1] == NULL)
   179 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
   140 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
   180 	}
   141 	}
   181 
       
   182 	zio_taskq = taskq_create("zio_taskq", zio_resume_threads,
       
   183 	    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
       
   184 
   142 
   185 	zio_inject_init();
   143 	zio_inject_init();
   186 }
   144 }
   187 
   145 
   188 void
   146 void
   204 			kmem_cache_destroy(zio_data_buf_cache[c]);
   162 			kmem_cache_destroy(zio_data_buf_cache[c]);
   205 		}
   163 		}
   206 		zio_data_buf_cache[c] = NULL;
   164 		zio_data_buf_cache[c] = NULL;
   207 	}
   165 	}
   208 
   166 
   209 	taskq_destroy(zio_taskq);
       
   210 
       
   211 	kmem_cache_destroy(zio_cache);
   167 	kmem_cache_destroy(zio_cache);
   212 
   168 
   213 	zio_inject_fini();
   169 	zio_inject_fini();
   214 }
   170 }
   215 
   171 
   275  * ==========================================================================
   231  * ==========================================================================
   276  * Push and pop I/O transform buffers
   232  * Push and pop I/O transform buffers
   277  * ==========================================================================
   233  * ==========================================================================
   278  */
   234  */
   279 static void
   235 static void
   280 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
   236 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
       
   237 	zio_transform_func_t *transform)
   281 {
   238 {
   282 	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
   239 	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
   283 
   240 
   284 	zt->zt_data = data;
   241 	zt->zt_orig_data = zio->io_data;
   285 	zt->zt_size = size;
   242 	zt->zt_orig_size = zio->io_size;
   286 	zt->zt_bufsize = bufsize;
   243 	zt->zt_bufsize = bufsize;
       
   244 	zt->zt_transform = transform;
   287 
   245 
   288 	zt->zt_next = zio->io_transform_stack;
   246 	zt->zt_next = zio->io_transform_stack;
   289 	zio->io_transform_stack = zt;
   247 	zio->io_transform_stack = zt;
   290 
   248 
   291 	zio->io_data = data;
   249 	zio->io_data = data;
   292 	zio->io_size = size;
   250 	zio->io_size = size;
   293 }
   251 }
   294 
   252 
   295 static void
   253 static void
   296 zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
   254 zio_pop_transforms(zio_t *zio)
   297 {
   255 {
   298 	zio_transform_t *zt = zio->io_transform_stack;
   256 	zio_transform_t *zt;
   299 
   257 
   300 	*data = zt->zt_data;
   258 	while ((zt = zio->io_transform_stack) != NULL) {
   301 	*size = zt->zt_size;
   259 		if (zt->zt_transform != NULL)
   302 	*bufsize = zt->zt_bufsize;
   260 			zt->zt_transform(zio,
   303 
   261 			    zt->zt_orig_data, zt->zt_orig_size);
   304 	zio->io_transform_stack = zt->zt_next;
   262 
   305 	kmem_free(zt, sizeof (zio_transform_t));
   263 		zio_buf_free(zio->io_data, zt->zt_bufsize);
   306 
   264 
   307 	if ((zt = zio->io_transform_stack) != NULL) {
   265 		zio->io_data = zt->zt_orig_data;
   308 		zio->io_data = zt->zt_data;
   266 		zio->io_size = zt->zt_orig_size;
   309 		zio->io_size = zt->zt_size;
   267 		zio->io_transform_stack = zt->zt_next;
   310 	}
   268 
   311 }
   269 		kmem_free(zt, sizeof (zio_transform_t));
   312 
   270 	}
       
   271 }
       
   272 
       
   273 /*
       
   274  * ==========================================================================
       
   275  * I/O transform callbacks for subblocks and decompression
       
   276  * ==========================================================================
       
   277  */
   313 static void
   278 static void
   314 zio_clear_transform_stack(zio_t *zio)
   279 zio_subblock(zio_t *zio, void *data, uint64_t size)
   315 {
   280 {
   316 	void *data;
   281 	ASSERT(zio->io_size > size);
   317 	uint64_t size, bufsize;
   282 
   318 
   283 	if (zio->io_type == ZIO_TYPE_READ)
   319 	ASSERT(zio->io_transform_stack != NULL);
   284 		bcopy(zio->io_data, data, size);
   320 
   285 }
   321 	zio_pop_transform(zio, &data, &size, &bufsize);
   286 
   322 	while (zio->io_transform_stack != NULL) {
   287 static void
   323 		zio_buf_free(data, bufsize);
   288 zio_decompress(zio_t *zio, void *data, uint64_t size)
   324 		zio_pop_transform(zio, &data, &size, &bufsize);
   289 {
   325 	}
   290 	if (zio->io_error == 0 &&
   326 }
   291 	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
   327 
   292 	    zio->io_data, zio->io_size, data, size) != 0)
   328 /*
   293 		zio->io_error = EIO;
   329  * ==========================================================================
   294 }
   330  * Create the various types of I/O (read, write, free)
   295 
       
   296 /*
       
   297  * ==========================================================================
       
   298  * I/O parent/child relationships and pipeline interlocks
       
   299  * ==========================================================================
       
   300  */
       
   301 
       
   302 static void
       
   303 zio_add_child(zio_t *pio, zio_t *zio)
       
   304 {
       
   305 	mutex_enter(&pio->io_lock);
       
   306 	if (zio->io_stage < ZIO_STAGE_READY)
       
   307 		pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++;
       
   308 	if (zio->io_stage < ZIO_STAGE_DONE)
       
   309 		pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++;
       
   310 	zio->io_sibling_prev = NULL;
       
   311 	zio->io_sibling_next = pio->io_child;
       
   312 	if (pio->io_child != NULL)
       
   313 		pio->io_child->io_sibling_prev = zio;
       
   314 	pio->io_child = zio;
       
   315 	zio->io_parent = pio;
       
   316 	mutex_exit(&pio->io_lock);
       
   317 }
       
   318 
       
   319 static void
       
   320 zio_remove_child(zio_t *pio, zio_t *zio)
       
   321 {
       
   322 	zio_t *next, *prev;
       
   323 
       
   324 	ASSERT(zio->io_parent == pio);
       
   325 
       
   326 	mutex_enter(&pio->io_lock);
       
   327 	next = zio->io_sibling_next;
       
   328 	prev = zio->io_sibling_prev;
       
   329 	if (next != NULL)
       
   330 		next->io_sibling_prev = prev;
       
   331 	if (prev != NULL)
       
   332 		prev->io_sibling_next = next;
       
   333 	if (pio->io_child == zio)
       
   334 		pio->io_child = next;
       
   335 	mutex_exit(&pio->io_lock);
       
   336 }
       
   337 
       
   338 static boolean_t
       
   339 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
       
   340 {
       
   341 	uint64_t *countp = &zio->io_children[child][wait];
       
   342 	boolean_t waiting = B_FALSE;
       
   343 
       
   344 	mutex_enter(&zio->io_lock);
       
   345 	ASSERT(zio->io_stall == NULL);
       
   346 	if (*countp != 0) {
       
   347 		zio->io_stage--;
       
   348 		zio->io_stall = countp;
       
   349 		waiting = B_TRUE;
       
   350 	}
       
   351 	mutex_exit(&zio->io_lock);
       
   352 
       
   353 	return (waiting);
       
   354 }
       
   355 
       
   356 static void
       
   357 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
       
   358 {
       
   359 	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
       
   360 	int *errorp = &pio->io_child_error[zio->io_child_type];
       
   361 
       
   362 	mutex_enter(&pio->io_lock);
       
   363 	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
       
   364 		*errorp = zio_worst_error(*errorp, zio->io_error);
       
   365 	pio->io_reexecute |= zio->io_reexecute;
       
   366 	ASSERT3U(*countp, >, 0);
       
   367 	if (--*countp == 0 && pio->io_stall == countp) {
       
   368 		pio->io_stall = NULL;
       
   369 		mutex_exit(&pio->io_lock);
       
   370 		zio_execute(pio);
       
   371 	} else {
       
   372 		mutex_exit(&pio->io_lock);
       
   373 	}
       
   374 }
       
   375 
       
   376 static void
       
   377 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
       
   378 {
       
   379 	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
       
   380 		zio->io_error = zio->io_child_error[c];
       
   381 }
       
   382 
       
   383 /*
       
   384  * ==========================================================================
       
   385  * Create the various types of I/O (read, write, free, etc)
   331  * ==========================================================================
   386  * ==========================================================================
   332  */
   387  */
   333 static zio_t *
   388 static zio_t *
   334 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
   389 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
   335     void *data, uint64_t size, zio_done_func_t *done, void *private,
   390     void *data, uint64_t size, zio_done_func_t *done, void *private,
   336     zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
   391     zio_type_t type, int priority, int flags, vdev_t *vd, uint64_t offset,
       
   392     const zbookmark_t *zb, uint8_t stage, uint32_t pipeline)
   337 {
   393 {
   338 	zio_t *zio;
   394 	zio_t *zio;
   339 
   395 
   340 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
   396 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
   341 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
   397 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
   342 
   398 	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
   343 	/* Only we should set CONFIG_GRABBED */
   399 
   344 	ASSERT(!(flags & ZIO_FLAG_CONFIG_GRABBED));
   400 	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
       
   401 	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
       
   402 	ASSERT(vd || stage == ZIO_STAGE_OPEN);
   345 
   403 
   346 	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
   404 	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
   347 	bzero(zio, sizeof (zio_t));
   405 	bzero(zio, sizeof (zio_t));
   348 	zio->io_parent = pio;
   406 
   349 	zio->io_spa = spa;
   407 	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
   350 	zio->io_txg = txg;
   408 	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
   351 	zio->io_flags = flags;
   409 
       
   410 	if (vd != NULL)
       
   411 		zio->io_child_type = ZIO_CHILD_VDEV;
       
   412 	else if (flags & ZIO_FLAG_GANG_CHILD)
       
   413 		zio->io_child_type = ZIO_CHILD_GANG;
       
   414 	else
       
   415 		zio->io_child_type = ZIO_CHILD_LOGICAL;
       
   416 
   352 	if (bp != NULL) {
   417 	if (bp != NULL) {
   353 		zio->io_bp = bp;
   418 		zio->io_bp = bp;
   354 		zio->io_bp_copy = *bp;
   419 		zio->io_bp_copy = *bp;
   355 		zio->io_bp_orig = *bp;
   420 		zio->io_bp_orig = *bp;
   356 	}
   421 		if (type != ZIO_TYPE_WRITE)
       
   422 			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
       
   423 		if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
       
   424 			if (BP_IS_GANG(bp))
       
   425 				pipeline |= ZIO_GANG_STAGES;
       
   426 			zio->io_logical = zio;
       
   427 		}
       
   428 	}
       
   429 
       
   430 	zio->io_spa = spa;
       
   431 	zio->io_txg = txg;
       
   432 	zio->io_data = data;
       
   433 	zio->io_size = size;
   357 	zio->io_done = done;
   434 	zio->io_done = done;
   358 	zio->io_private = private;
   435 	zio->io_private = private;
   359 	zio->io_type = type;
   436 	zio->io_type = type;
   360 	zio->io_priority = priority;
   437 	zio->io_priority = priority;
   361 	zio->io_stage = stage;
   438 	zio->io_vd = vd;
   362 	zio->io_pipeline = pipeline;
   439 	zio->io_offset = offset;
   363 	zio->io_timestamp = lbolt64;
   440 	zio->io_orig_flags = zio->io_flags = flags;
   364 	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
   441 	zio->io_orig_stage = zio->io_stage = stage;
   365 	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
   442 	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
   366 	zio_push_transform(zio, data, size, size);
   443 
   367 
   444 	if (zb != NULL)
   368 	/*
   445 		zio->io_bookmark = *zb;
   369 	 * Note on config lock:
   446 
   370 	 *
   447 	if (pio != NULL) {
   371 	 * If CONFIG_HELD is set, then the caller already has the config
   448 		/*
   372 	 * lock, so we don't need it for this io.
   449 		 * Logical I/Os can have logical, gang, or vdev children.
   373 	 *
   450 		 * Gang I/Os can have gang or vdev children.
   374 	 * We set CONFIG_GRABBED to indicate that we have grabbed the
   451 		 * Vdev I/Os can only have vdev children.
   375 	 * config lock on behalf of this io, so it should be released
   452 		 * The following ASSERT captures all of these constraints.
   376 	 * in zio_done.
   453 		 */
   377 	 *
   454 		ASSERT(zio->io_child_type <= pio->io_child_type);
   378 	 * Unless CONFIG_HELD is set, we will grab the config lock for
   455 		if (zio->io_logical == NULL)
   379 	 * any top-level (parent-less) io, *except* NULL top-level ios.
       
   380 	 * The NULL top-level ios rarely have any children, so we delay
       
   381 	 * grabbing the lock until the first child is added (but it is
       
   382 	 * still grabbed on behalf of the top-level i/o, so additional
       
   383 	 * children don't need to also grab it).  This greatly reduces
       
   384 	 * contention on the config lock.
       
   385 	 */
       
   386 	if (pio == NULL) {
       
   387 		if (type != ZIO_TYPE_NULL &&
       
   388 		    !(flags & ZIO_FLAG_CONFIG_HELD)) {
       
   389 			spa_config_enter(spa, RW_READER, zio);
       
   390 			zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
       
   391 		}
       
   392 		zio->io_root = zio;
       
   393 	} else {
       
   394 		zio->io_root = pio->io_root;
       
   395 		if (!(flags & ZIO_FLAG_NOBOOKMARK))
       
   396 			zio->io_logical = pio->io_logical;
   456 			zio->io_logical = pio->io_logical;
   397 		mutex_enter(&pio->io_lock);
   457 		zio_add_child(pio, zio);
   398 		if (pio->io_parent == NULL &&
   458 	}
   399 		    pio->io_type == ZIO_TYPE_NULL &&
       
   400 		    !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
       
   401 		    !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
       
   402 			pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
       
   403 			spa_config_enter(spa, RW_READER, pio);
       
   404 		}
       
   405 		if (stage < ZIO_STAGE_READY)
       
   406 			pio->io_children_notready++;
       
   407 		pio->io_children_notdone++;
       
   408 		zio->io_sibling_next = pio->io_child;
       
   409 		zio->io_sibling_prev = NULL;
       
   410 		if (pio->io_child != NULL)
       
   411 			pio->io_child->io_sibling_prev = zio;
       
   412 		pio->io_child = zio;
       
   413 		zio->io_ndvas = pio->io_ndvas;
       
   414 		mutex_exit(&pio->io_lock);
       
   415 	}
       
   416 
       
   417 	/*
       
   418 	 * Save off the original state incase we need to retry later.
       
   419 	 */
       
   420 	zio->io_orig_stage = zio->io_stage;
       
   421 	zio->io_orig_pipeline = zio->io_pipeline;
       
   422 	zio->io_orig_flags = zio->io_flags;
       
   423 
       
   424 	/*
       
   425 	 * If this is not a null zio, and config is not already held,
       
   426 	 * then the root zio should have grabbed the config lock.
       
   427 	 * If this is not a root zio, it should not have grabbed the
       
   428 	 * config lock.
       
   429 	 */
       
   430 	ASSERT((zio->io_root->io_flags & ZIO_FLAG_CONFIG_HELD) ||
       
   431 	    zio->io_type == ZIO_TYPE_NULL ||
       
   432 	    (zio->io_root->io_flags & ZIO_FLAG_CONFIG_GRABBED));
       
   433 	ASSERT(zio->io_root == zio ||
       
   434 	    !(zio->io_flags & ZIO_FLAG_CONFIG_GRABBED));
       
   435 
   459 
   436 	return (zio);
   460 	return (zio);
   437 }
   461 }
   438 
   462 
   439 static void
   463 static void
   440 zio_reset(zio_t *zio)
   464 zio_destroy(zio_t *zio)
   441 {
   465 {
   442 	zio_clear_transform_stack(zio);
   466 	spa_t *spa = zio->io_spa;
   443 
   467 	uint8_t async_root = zio->io_async_root;
   444 	zio->io_flags = zio->io_orig_flags;
   468 
   445 	zio->io_stage = zio->io_orig_stage;
   469 	mutex_destroy(&zio->io_lock);
   446 	zio->io_pipeline = zio->io_orig_pipeline;
   470 	cv_destroy(&zio->io_cv);
   447 	zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size);
   471 	kmem_cache_free(zio_cache, zio);
       
   472 
       
   473 	if (async_root) {
       
   474 		mutex_enter(&spa->spa_async_root_lock);
       
   475 		if (--spa->spa_async_root_count == 0)
       
   476 			cv_broadcast(&spa->spa_async_root_cv);
       
   477 		mutex_exit(&spa->spa_async_root_lock);
       
   478 	}
   448 }
   479 }
   449 
   480 
   450 zio_t *
   481 zio_t *
   451 zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
   482 zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
   452 	int flags)
   483 	int flags)
   453 {
   484 {
   454 	zio_t *zio;
   485 	zio_t *zio;
   455 
   486 
   456 	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
   487 	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
   457 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
   488 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL,
   458 	    ZIO_WAIT_FOR_CHILDREN_PIPELINE);
   489 	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
   459 
   490 
   460 	return (zio);
   491 	return (zio);
   461 }
   492 }
   462 
   493 
   463 zio_t *
   494 zio_t *
   465 {
   496 {
   466 	return (zio_null(NULL, spa, done, private, flags));
   497 	return (zio_null(NULL, spa, done, private, flags));
   467 }
   498 }
   468 
   499 
   469 zio_t *
   500 zio_t *
   470 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
   501 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
   471     uint64_t size, zio_done_func_t *done, void *private,
   502     void *data, uint64_t size, zio_done_func_t *done, void *private,
   472     int priority, int flags, const zbookmark_t *zb)
   503     int priority, int flags, const zbookmark_t *zb)
   473 {
   504 {
   474 	zio_t *zio;
   505 	zio_t *zio;
   475 
       
   476 	ASSERT3U(size, ==, BP_GET_LSIZE(bp));
       
   477 
       
   478 	/*
       
   479 	 * If the user has specified that we allow I/Os to continue
       
   480 	 * then attempt to satisfy the read.
       
   481 	 */
       
   482 	if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
       
   483 		ZIO_ENTER(spa);
       
   484 
   506 
   485 	zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp,
   507 	zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp,
   486 	    data, size, done, private,
   508 	    data, size, done, private,
   487 	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
   509 	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
   488 	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
   510 	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
   489 	zio->io_bookmark = *zb;
       
   490 
       
   491 	zio->io_logical = zio;
       
   492 
       
   493 	/*
       
   494 	 * Work off our copy of the bp so the caller can free it.
       
   495 	 */
       
   496 	zio->io_bp = &zio->io_bp_copy;
       
   497 
   511 
   498 	return (zio);
   512 	return (zio);
   499 }
   513 }
   500 
   514 
   501 zio_t *
   515 zio_t *
   502 zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
   516 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
   503     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
   517     void *data, uint64_t size, zio_prop_t *zp,
   504     zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
   518     zio_done_func_t *ready, zio_done_func_t *done, void *private,
   505     int flags, const zbookmark_t *zb)
   519     int priority, int flags, const zbookmark_t *zb)
   506 {
   520 {
   507 	zio_t *zio;
   521 	zio_t *zio;
   508 
   522 
   509 	ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
   523 	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
   510 	    checksum < ZIO_CHECKSUM_FUNCTIONS);
   524 	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
   511 
   525 	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
   512 	ASSERT(compress >= ZIO_COMPRESS_OFF &&
   526 	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
   513 	    compress < ZIO_COMPRESS_FUNCTIONS);
   527 	    zp->zp_type < DMU_OT_NUMTYPES &&
   514 
   528 	    zp->zp_level < 32 &&
   515 	ZIO_ENTER(spa);
   529 	    zp->zp_ndvas > 0 &&
       
   530 	    zp->zp_ndvas <= spa_max_replication(spa));
       
   531 	ASSERT(ready != NULL);
   516 
   532 
   517 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
   533 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
   518 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
   534 	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
   519 	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
   535 	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
   520 
   536 
   521 	zio->io_ready = ready;
   537 	zio->io_ready = ready;
   522 
   538 	zio->io_prop = *zp;
   523 	zio->io_bookmark = *zb;
       
   524 
       
   525 	zio->io_logical = zio;
       
   526 
       
   527 	zio->io_checksum = checksum;
       
   528 	zio->io_compress = compress;
       
   529 	zio->io_ndvas = ncopies;
       
   530 
       
   531 	if (bp->blk_birth != txg) {
       
   532 		/* XXX the bp usually (always?) gets re-zeroed later */
       
   533 		BP_ZERO(bp);
       
   534 		BP_SET_LSIZE(bp, size);
       
   535 		BP_SET_PSIZE(bp, size);
       
   536 	} else {
       
   537 		/* Make sure someone doesn't change their mind on overwrites */
       
   538 		ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
       
   539 		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
       
   540 	}
       
   541 
   539 
   542 	return (zio);
   540 	return (zio);
   543 }
   541 }
   544 
   542 
   545 zio_t *
   543 zio_t *
   546 zio_rewrite(zio_t *pio, spa_t *spa, int checksum, uint64_t txg,
   544 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
   547     blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done,
   545     uint64_t size, zio_done_func_t *done, void *private, int priority,
   548     void *private, int priority, int flags, zbookmark_t *zb)
   546     int flags, zbookmark_t *zb)
   549 {
   547 {
   550 	zio_t *zio;
   548 	zio_t *zio;
   551 
   549 
   552 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
   550 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
   553 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
   551 	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
   554 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp));
   552 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
   555 
       
   556 	zio->io_bookmark = *zb;
       
   557 	zio->io_checksum = checksum;
       
   558 	zio->io_compress = ZIO_COMPRESS_OFF;
       
   559 
       
   560 	if (pio != NULL)
       
   561 		ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
       
   562 
       
   563 	return (zio);
       
   564 }
       
   565 
       
   566 static void
       
   567 zio_write_allocate_ready(zio_t *zio)
       
   568 {
       
   569 	/* Free up the previous block */
       
   570 	if (!BP_IS_HOLE(&zio->io_bp_orig)) {
       
   571 		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
       
   572 		    &zio->io_bp_orig, NULL, NULL));
       
   573 	}
       
   574 }
       
   575 
       
   576 static zio_t *
       
   577 zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
       
   578     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
       
   579     zio_done_func_t *done, void *private, int priority, int flags)
       
   580 {
       
   581 	zio_t *zio;
       
   582 
       
   583 	BP_ZERO(bp);
       
   584 	BP_SET_LSIZE(bp, size);
       
   585 	BP_SET_PSIZE(bp, size);
       
   586 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
       
   587 
       
   588 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
       
   589 	    ZIO_TYPE_WRITE, priority, flags,
       
   590 	    ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);
       
   591 
       
   592 	zio->io_checksum = checksum;
       
   593 	zio->io_compress = ZIO_COMPRESS_OFF;
       
   594 	zio->io_ready = zio_write_allocate_ready;
       
   595 
   553 
   596 	return (zio);
   554 	return (zio);
   597 }
   555 }
   598 
   556 
   599 zio_t *
   557 zio_t *
   600 zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
   558 zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
   601     zio_done_func_t *done, void *private)
   559     zio_done_func_t *done, void *private, int flags)
   602 {
   560 {
   603 	zio_t *zio;
   561 	zio_t *zio;
   604 
   562 
   605 	ASSERT(!BP_IS_HOLE(bp));
   563 	ASSERT(!BP_IS_HOLE(bp));
   606 
   564 
       
   565 	if (bp->blk_fill == BLK_FILL_ALREADY_FREED)
       
   566 		return (zio_null(pio, spa, NULL, NULL, flags));
       
   567 
   607 	if (txg == spa->spa_syncing_txg &&
   568 	if (txg == spa->spa_syncing_txg &&
   608 	    spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
   569 	    spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) {
   609 		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
   570 		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
   610 		return (zio_null(pio, spa, NULL, NULL, 0));
   571 		return (zio_null(pio, spa, NULL, NULL, flags));
   611 	}
   572 	}
   612 
   573 
   613 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
   574 	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
   614 	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
   575 	    done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
   615 	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp));
   576 	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
   616 
       
   617 	zio->io_bp = &zio->io_bp_copy;
       
   618 
   577 
   619 	return (zio);
   578 	return (zio);
   620 }
   579 }
   621 
   580 
   622 zio_t *
   581 zio_t *
   623 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
   582 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
   624     zio_done_func_t *done, void *private)
   583     zio_done_func_t *done, void *private, int flags)
   625 {
   584 {
   626 	zio_t *zio;
   585 	zio_t *zio;
   627 
   586 
   628 	/*
   587 	/*
   629 	 * A claim is an allocation of a specific block.  Claims are needed
   588 	 * A claim is an allocation of a specific block.  Claims are needed
   637 	 * starts allocating blocks -- so that nothing is allocated twice.
   596 	 * starts allocating blocks -- so that nothing is allocated twice.
   638 	 */
   597 	 */
   639 	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
   598 	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
   640 	ASSERT3U(spa_first_txg(spa), <=, txg);
   599 	ASSERT3U(spa_first_txg(spa), <=, txg);
   641 
   600 
   642 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
   601 	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
   643 	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
   602 	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
   644 	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp));
   603 	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
   645 
       
   646 	zio->io_bp = &zio->io_bp_copy;
       
   647 
   604 
   648 	return (zio);
   605 	return (zio);
   649 }
   606 }
   650 
   607 
   651 zio_t *
   608 zio_t *
   655 	zio_t *zio;
   612 	zio_t *zio;
   656 	int c;
   613 	int c;
   657 
   614 
   658 	if (vd->vdev_children == 0) {
   615 	if (vd->vdev_children == 0) {
   659 		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
   616 		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
   660 		    ZIO_TYPE_IOCTL, priority, flags,
   617 		    ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
   661 		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
   618 		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
   662 
   619 
   663 		zio->io_vd = vd;
       
   664 		zio->io_cmd = cmd;
   620 		zio->io_cmd = cmd;
   665 	} else {
   621 	} else {
   666 		zio = zio_null(pio, spa, NULL, NULL, flags);
   622 		zio = zio_null(pio, spa, NULL, NULL, flags);
   667 
   623 
   668 		for (c = 0; c < vd->vdev_children; c++)
   624 		for (c = 0; c < vd->vdev_children; c++)
   669 			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
   625 			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
   670 			    done, private, priority, flags));
   626 			    done, private, priority, flags));
   671 	}
   627 	}
   672 
   628 
   673 	return (zio);
   629 	return (zio);
   674 }
       
   675 
       
   676 static void
       
   677 zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
       
   678     int checksum, boolean_t labels)
       
   679 {
       
   680 	ASSERT(vd->vdev_children == 0);
       
   681 
       
   682 	ASSERT(size <= SPA_MAXBLOCKSIZE);
       
   683 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
       
   684 	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
       
   685 
       
   686 #ifdef ZFS_DEBUG
       
   687 	if (labels) {
       
   688 		ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
       
   689 		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
       
   690 	}
       
   691 #endif
       
   692 	ASSERT3U(offset + size, <=, vd->vdev_psize);
       
   693 
       
   694 	BP_ZERO(bp);
       
   695 
       
   696 	BP_SET_LSIZE(bp, size);
       
   697 	BP_SET_PSIZE(bp, size);
       
   698 
       
   699 	BP_SET_CHECKSUM(bp, checksum);
       
   700 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
       
   701 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
       
   702 
       
   703 	if (checksum != ZIO_CHECKSUM_OFF)
       
   704 		ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
       
   705 }
   630 }
   706 
   631 
   707 zio_t *
   632 zio_t *
   708 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
   633 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
   709     void *data, int checksum, zio_done_func_t *done, void *private,
   634     void *data, int checksum, zio_done_func_t *done, void *private,
   710     int priority, int flags, boolean_t labels)
   635     int priority, int flags, boolean_t labels)
   711 {
   636 {
   712 	zio_t *zio;
   637 	zio_t *zio;
   713 	blkptr_t blk;
   638 
   714 
   639 	ASSERT(vd->vdev_children == 0);
   715 	ZIO_ENTER(vd->vdev_spa);
   640 	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
   716 
   641 	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
   717 	zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);
   642 	ASSERT3U(offset + size, <=, vd->vdev_psize);
   718 
   643 
   719 	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
   644 	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
   720 	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
   645 	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
   721 	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
   646 	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
   722 
   647 
   723 	zio->io_vd = vd;
   648 	zio->io_prop.zp_checksum = checksum;
   724 	zio->io_offset = offset;
       
   725 
       
   726 	/*
       
   727 	 * Work off our copy of the bp so the caller can free it.
       
   728 	 */
       
   729 	zio->io_bp = &zio->io_bp_copy;
       
   730 
   649 
   731 	return (zio);
   650 	return (zio);
   732 }
   651 }
   733 
   652 
   734 zio_t *
   653 zio_t *
   735 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
   654 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
   736     void *data, int checksum, zio_done_func_t *done, void *private,
   655     void *data, int checksum, zio_done_func_t *done, void *private,
   737     int priority, int flags, boolean_t labels)
   656     int priority, int flags, boolean_t labels)
   738 {
   657 {
   739 	zio_block_tail_t *zbt;
       
   740 	void *wbuf;
       
   741 	zio_t *zio;
   658 	zio_t *zio;
   742 	blkptr_t blk;
   659 
   743 
   660 	ASSERT(vd->vdev_children == 0);
   744 	ZIO_ENTER(vd->vdev_spa);
   661 	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
   745 
   662 	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
   746 	zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);
   663 	ASSERT3U(offset + size, <=, vd->vdev_psize);
   747 
   664 
   748 	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
   665 	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
   749 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
   666 	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
   750 	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
   667 	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
   751 
   668 
   752 	zio->io_vd = vd;
   669 	zio->io_prop.zp_checksum = checksum;
   753 	zio->io_offset = offset;
       
   754 
       
   755 	zio->io_bp = &zio->io_bp_copy;
       
   756 	zio->io_checksum = checksum;
       
   757 
   670 
   758 	if (zio_checksum_table[checksum].ci_zbt) {
   671 	if (zio_checksum_table[checksum].ci_zbt) {
   759 		/*
   672 		/*
   760 		 * zbt checksums are necessarily destructive -- they modify
   673 		 * zbt checksums are necessarily destructive -- they modify
   761 		 * one word of the write buffer to hold the verifier/checksum.
   674 		 * the end of the write buffer to hold the verifier/checksum.
   762 		 * Therefore, we must make a local copy in case the data is
   675 		 * Therefore, we must make a local copy in case the data is
   763 		 * being written to multiple places.
   676 		 * being written to multiple places in parallel.
   764 		 */
   677 		 */
   765 		wbuf = zio_buf_alloc(size);
   678 		void *wbuf = zio_buf_alloc(size);
   766 		bcopy(data, wbuf, size);
   679 		bcopy(data, wbuf, size);
   767 		zio_push_transform(zio, wbuf, size, size);
   680 		zio_push_transform(zio, wbuf, size, size, NULL);
   768 
       
   769 		zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
       
   770 		zbt->zbt_cksum = blk.blk_cksum;
       
   771 	}
   681 	}
   772 
   682 
   773 	return (zio);
   683 	return (zio);
   774 }
   684 }
   775 
   685 
   776 /*
   686 /*
   777  * Create a child I/O to do some work for us.  It has no associated bp.
   687  * Create a child I/O to do some work for us.
   778  */
   688  */
   779 zio_t *
   689 zio_t *
   780 zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
   690 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
   781 	void *data, uint64_t size, int type, int priority, int flags,
   691 	void *data, uint64_t size, int type, int priority, int flags,
   782 	zio_done_func_t *done, void *private)
   692 	zio_done_func_t *done, void *private)
   783 {
   693 {
   784 	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
   694 	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
   785 	zio_t *cio;
   695 	zio_t *zio;
       
   696 
       
   697 	ASSERT(vd->vdev_parent ==
       
   698 	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
   786 
   699 
   787 	if (type == ZIO_TYPE_READ && bp != NULL) {
   700 	if (type == ZIO_TYPE_READ && bp != NULL) {
   788 		/*
   701 		/*
   789 		 * If we have the bp, then the child should perform the
   702 		 * If we have the bp, then the child should perform the
   790 		 * checksum and the parent need not.  This pushes error
   703 		 * checksum and the parent need not.  This pushes error
   791 		 * detection as close to the leaves as possible and
   704 		 * detection as close to the leaves as possible and
   792 		 * eliminates redundant checksums in the interior nodes.
   705 		 * eliminates redundant checksums in the interior nodes.
   793 		 */
   706 		 */
   794 		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
   707 		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
   795 		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
   708 		pio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
   796 	}
   709 	}
   797 
   710 
   798 	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
   711 	if (vd->vdev_children == 0)
       
   712 		offset += VDEV_LABEL_START_SIZE;
       
   713 
       
   714 	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
   799 	    done, private, type, priority,
   715 	    done, private, type, priority,
   800 	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
   716 	    (pio->io_flags & ZIO_FLAG_VDEV_INHERIT) |
       
   717 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | flags,
       
   718 	    vd, offset, &pio->io_bookmark,
   801 	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);
   719 	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);
   802 
   720 
   803 	cio->io_vd = vd;
   721 	return (zio);
   804 	cio->io_offset = offset;
   722 }
   805 
   723 
   806 	return (cio);
   724 zio_t *
   807 }
   725 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
   808 
   726 	int type, int priority, int flags, zio_done_func_t *done, void *private)
   809 /*
   727 {
   810  * ==========================================================================
   728 	zio_t *zio;
   811  * Initiate I/O, either sync or async
   729 
   812  * ==========================================================================
   730 	ASSERT(vd->vdev_ops->vdev_op_leaf);
   813  */
   731 
   814 static void
   732 	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
   815 zio_destroy(zio_t *zio)
   733 	    data, size, done, private, type, priority,
   816 {
   734 	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
   817 	mutex_destroy(&zio->io_lock);
   735 	    vd, offset, NULL,
   818 	cv_destroy(&zio->io_cv);
   736 	    ZIO_STAGE_VDEV_IO_START - 1, ZIO_VDEV_CHILD_PIPELINE);
   819 	if (zio->io_failed_vds != NULL) {
   737 
   820 		kmem_free(zio->io_failed_vds,
   738 	return (zio);
   821 		    zio->io_failed_vds_count * sizeof (vdev_t *));
       
   822 		zio->io_failed_vds = NULL;
       
   823 		zio->io_failed_vds_count = 0;
       
   824 	}
       
   825 	kmem_cache_free(zio_cache, zio);
       
   826 }
       
   827 
       
   828 int
       
   829 zio_wait(zio_t *zio)
       
   830 {
       
   831 	int error;
       
   832 
       
   833 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
       
   834 
       
   835 	zio->io_waiter = curthread;
       
   836 
       
   837 	zio_execute(zio);
       
   838 
       
   839 	mutex_enter(&zio->io_lock);
       
   840 	while (zio->io_stalled != ZIO_STAGE_DONE)
       
   841 		cv_wait(&zio->io_cv, &zio->io_lock);
       
   842 	mutex_exit(&zio->io_lock);
       
   843 
       
   844 	error = zio->io_error;
       
   845 	zio_destroy(zio);
       
   846 
       
   847 	return (error);
       
   848 }
   739 }
   849 
   740 
   850 void
   741 void
   851 zio_nowait(zio_t *zio)
   742 zio_flush(zio_t *zio, vdev_t *vd)
   852 {
   743 {
   853 	zio_execute(zio);
   744 	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
   854 }
   745 	    NULL, NULL, ZIO_PRIORITY_NOW,
   855 
   746 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
   856 void
   747 }
   857 zio_interrupt(zio_t *zio)
   748 
   858 {
   749 /*
   859 	(void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type],
   750  * ==========================================================================
   860 	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
   751  * Prepare to read and write logical blocks
   861 }
   752  * ==========================================================================
       
   753  */
   862 
   754 
   863 static int
   755 static int
   864 zio_issue_async(zio_t *zio)
   756 zio_read_bp_init(zio_t *zio)
   865 {
       
   866 	(void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type],
       
   867 	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
       
   868 
       
   869 	return (ZIO_PIPELINE_STOP);
       
   870 }
       
   871 
       
   872 /*
       
   873  * ==========================================================================
       
   874  * I/O pipeline interlocks: parent/child dependency scoreboarding
       
   875  * ==========================================================================
       
   876  */
       
   877 static int
       
   878 zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
       
   879 {
       
   880 	int rv = ZIO_PIPELINE_CONTINUE;
       
   881 
       
   882 	mutex_enter(&zio->io_lock);
       
   883 	ASSERT(zio->io_stalled == 0);
       
   884 	if (*countp != 0) {
       
   885 		zio->io_stalled = stage;
       
   886 		rv = ZIO_PIPELINE_STOP;
       
   887 	}
       
   888 	mutex_exit(&zio->io_lock);
       
   889 
       
   890 	return (rv);
       
   891 }
       
   892 
       
   893 static void
       
   894 zio_add_failed_vdev(zio_t *pio, zio_t *zio)
       
   895 {
       
   896 	uint64_t oldcount = pio->io_failed_vds_count;
       
   897 	vdev_t **new_vds;
       
   898 	int i;
       
   899 
       
   900 	ASSERT(MUTEX_HELD(&pio->io_lock));
       
   901 
       
   902 	if (zio->io_vd == NULL)
       
   903 		return;
       
   904 
       
   905 	for (i = 0; i < oldcount; i++) {
       
   906 		if (pio->io_failed_vds[i] == zio->io_vd)
       
   907 			return;
       
   908 	}
       
   909 
       
   910 	new_vds = kmem_zalloc((oldcount + 1) * sizeof (vdev_t *), KM_SLEEP);
       
   911 	if (pio->io_failed_vds != NULL) {
       
   912 		bcopy(pio->io_failed_vds, new_vds,
       
   913 		    oldcount * sizeof (vdev_t *));
       
   914 		kmem_free(pio->io_failed_vds, oldcount * sizeof (vdev_t *));
       
   915 	}
       
   916 	pio->io_failed_vds = new_vds;
       
   917 	pio->io_failed_vds[oldcount] = zio->io_vd;
       
   918 	pio->io_failed_vds_count++;
       
   919 }
       
   920 
       
   921 static void
       
   922 zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
       
   923 {
       
   924 	zio_t *pio = zio->io_parent;
       
   925 
       
   926 	mutex_enter(&pio->io_lock);
       
   927 	if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) {
       
   928 		pio->io_error = zio->io_error;
       
   929 		if (zio->io_error && zio->io_error != ENOTSUP)
       
   930 			zio_add_failed_vdev(pio, zio);
       
   931 	}
       
   932 	ASSERT3U(*countp, >, 0);
       
   933 	if (--*countp == 0 && pio->io_stalled == stage) {
       
   934 		pio->io_stalled = 0;
       
   935 		mutex_exit(&pio->io_lock);
       
   936 		zio_execute(pio);
       
   937 	} else {
       
   938 		mutex_exit(&pio->io_lock);
       
   939 	}
       
   940 }
       
   941 
       
   942 int
       
   943 zio_wait_for_children_ready(zio_t *zio)
       
   944 {
       
   945 	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
       
   946 	    &zio->io_children_notready));
       
   947 }
       
   948 
       
   949 int
       
   950 zio_wait_for_children_done(zio_t *zio)
       
   951 {
       
   952 	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
       
   953 	    &zio->io_children_notdone));
       
   954 }
       
   955 
       
   956 static int
       
   957 zio_read_init(zio_t *zio)
       
   958 {
   757 {
   959 	blkptr_t *bp = zio->io_bp;
   758 	blkptr_t *bp = zio->io_bp;
   960 
   759 
   961 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
   760 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_logical == zio) {
   962 		uint64_t csize = BP_GET_PSIZE(bp);
   761 		uint64_t csize = BP_GET_PSIZE(bp);
   963 		void *cbuf = zio_buf_alloc(csize);
   762 		void *cbuf = zio_buf_alloc(csize);
   964 
   763 
   965 		zio_push_transform(zio, cbuf, csize, csize);
   764 		zio_push_transform(zio, cbuf, csize, csize, zio_decompress);
   966 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
       
   967 	}
       
   968 
       
   969 	if (BP_IS_GANG(bp)) {
       
   970 		uint64_t gsize = SPA_GANGBLOCKSIZE;
       
   971 		void *gbuf = zio_buf_alloc(gsize);
       
   972 
       
   973 		zio_push_transform(zio, gbuf, gsize, gsize);
       
   974 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
       
   975 	}
   765 	}
   976 
   766 
   977 	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
   767 	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
   978 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
   768 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
   979 
   769 
   980 	return (ZIO_PIPELINE_CONTINUE);
   770 	return (ZIO_PIPELINE_CONTINUE);
   981 }
   771 }
   982 
   772 
   983 static int
   773 static int
   984 zio_ready(zio_t *zio)
   774 zio_write_bp_init(zio_t *zio)
   985 {
   775 {
   986 	zio_t *pio = zio->io_parent;
   776 	zio_prop_t *zp = &zio->io_prop;
   987 
   777 	int compress = zp->zp_compress;
   988 	if (zio->io_ready)
       
   989 		zio->io_ready(zio);
       
   990 
       
   991 	if (pio != NULL)
       
   992 		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
       
   993 		    &pio->io_children_notready);
       
   994 
       
   995 	if (zio->io_bp)
       
   996 		zio->io_bp_copy = *zio->io_bp;
       
   997 
       
   998 	return (ZIO_PIPELINE_CONTINUE);
       
   999 }
       
  1000 
       
  1001 static int
       
  1002 zio_vdev_retry_io(zio_t *zio)
       
  1003 {
       
  1004 	zio_t *pio = zio->io_parent;
       
  1005 
       
  1006 	/*
       
  1007 	 * Preserve the failed bp so that the io_ready() callback can
       
  1008 	 * update the accounting accordingly. The callback will also be
       
  1009 	 * responsible for freeing the previously allocated block, if one
       
  1010 	 * exists.
       
  1011 	 */
       
  1012 	zio->io_bp_orig = *zio->io_bp;
       
  1013 
       
  1014 	/*
       
  1015 	 * We must zero out the old DVA and blk_birth before reallocating
       
  1016 	 * the bp.
       
  1017 	 */
       
  1018 	BP_ZERO_DVAS(zio->io_bp);
       
  1019 	zio_reset(zio);
       
  1020 
       
  1021 	if (pio) {
       
  1022 		/*
       
  1023 		 * Let the parent know that we will
       
  1024 		 * re-alloc the write (=> new bp info).
       
  1025 		 */
       
  1026 		mutex_enter(&pio->io_lock);
       
  1027 		pio->io_children_notready++;
       
  1028 
       
  1029 		/*
       
  1030 		 * If the parent I/O is still in the open stage, then
       
  1031 		 * don't bother telling it to retry since it hasn't
       
  1032 		 * progressed far enough for it to care.
       
  1033 		 */
       
  1034 		if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio))
       
  1035 			pio->io_flags |= ZIO_FLAG_WRITE_RETRY;
       
  1036 
       
  1037 		ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE);
       
  1038 		mutex_exit(&pio->io_lock);
       
  1039 	}
       
  1040 
       
  1041 	/*
       
  1042 	 * We are getting ready to process the retry request so clear
       
  1043 	 * the flag and the zio's current error status.
       
  1044 	 */
       
  1045 	zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY;
       
  1046 	zio->io_error = 0;
       
  1047 
       
  1048 	return (ZIO_PIPELINE_CONTINUE);
       
  1049 }
       
  1050 
       
  1051 int
       
  1052 zio_vdev_resume_io(spa_t *spa)
       
  1053 {
       
  1054 	zio_t *zio;
       
  1055 
       
  1056 	mutex_enter(&spa->spa_zio_lock);
       
  1057 
       
  1058 	/*
       
  1059 	 * Probe all of vdevs that have experienced an I/O error.
       
  1060 	 * If we are still unable to verify the integrity of the vdev
       
  1061 	 * then we prevent the resume from proceeding.
       
  1062 	 */
       
  1063 	for (zio = list_head(&spa->spa_zio_list); zio != NULL;
       
  1064 	    zio = list_next(&spa->spa_zio_list, zio)) {
       
  1065 		int error = 0;
       
  1066 
       
  1067 		/* We only care about I/Os that must succeed */
       
  1068 		if (zio->io_vd == NULL || zio->io_flags & ZIO_FLAG_CANFAIL)
       
  1069 			continue;
       
  1070 		error = vdev_probe(zio->io_vd);
       
  1071 		if (error) {
       
  1072 			mutex_exit(&spa->spa_zio_lock);
       
  1073 			return (error);
       
  1074 		}
       
  1075 	}
       
  1076 
       
  1077 	/*
       
  1078 	 * Clear the vdev stats so that I/O can flow.
       
  1079 	 */
       
  1080 	vdev_clear(spa, NULL, B_FALSE);
       
  1081 
       
  1082 	spa->spa_state = POOL_STATE_ACTIVE;
       
  1083 	while ((zio = list_head(&spa->spa_zio_list)) != NULL) {
       
  1084 		list_remove(&spa->spa_zio_list, zio);
       
  1085 		zio->io_error = 0;
       
  1086 
       
  1087 		/*
       
  1088 		 * If we are resuming an allocating I/O then we force it
       
  1089 		 * to retry and let it resume operation where it left off.
       
  1090 		 * Otherwise, go back to the ready stage and pick up from
       
  1091 		 * there.
       
  1092 		 */
       
  1093 		if (zio_write_retry && IO_IS_ALLOCATING(zio)) {
       
  1094 			zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
       
  1095 			zio->io_stage--;
       
  1096 		} else {
       
  1097 			zio->io_stage = ZIO_STAGE_READY;
       
  1098 		}
       
  1099 
       
  1100 		(void) taskq_dispatch(zio_taskq, (task_func_t *)zio_execute,
       
  1101 		    zio, TQ_SLEEP);
       
  1102 	}
       
  1103 	mutex_exit(&spa->spa_zio_lock);
       
  1104 
       
  1105 	/*
       
  1106 	 * Wait for the taskqs to finish and recheck the pool state since
       
  1107 	 * it's possible that a resumed I/O has failed again.
       
  1108 	 */
       
  1109 	taskq_wait(zio_taskq);
       
  1110 	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
       
  1111 		return (EIO);
       
  1112 
       
  1113 	mutex_enter(&spa->spa_zio_lock);
       
  1114 	cv_broadcast(&spa->spa_zio_cv);
       
  1115 	mutex_exit(&spa->spa_zio_lock);
       
  1116 
       
  1117 	return (0);
       
  1118 }
       
  1119 
       
  1120 static int
       
  1121 zio_vdev_suspend_io(zio_t *zio)
       
  1122 {
       
  1123 	spa_t *spa = zio->io_spa;
       
  1124 
       
  1125 	/*
       
  1126 	 * We've experienced an unrecoverable failure so
       
  1127 	 * set the pool state accordingly and queue all
       
  1128 	 * failed IOs.
       
  1129 	 */
       
  1130 	spa->spa_state = POOL_STATE_IO_FAILURE;
       
  1131 
       
  1132 	mutex_enter(&spa->spa_zio_lock);
       
  1133 	list_insert_tail(&spa->spa_zio_list, zio);
       
  1134 
       
  1135 #ifndef _KERNEL
       
  1136 	/* Used to notify ztest that the pool has suspended */
       
  1137 	cv_broadcast(&spa->spa_zio_cv);
       
  1138 #endif
       
  1139 	mutex_exit(&spa->spa_zio_lock);
       
  1140 
       
  1141 	return (ZIO_PIPELINE_STOP);
       
  1142 }
       
  1143 
       
  1144 static void
       
  1145 zio_handle_io_failure(zio_t *zio, vdev_t *vd)
       
  1146 {
       
  1147 	spa_t *spa = zio->io_spa;
       
  1148 	blkptr_t *bp = zio->io_bp;
       
  1149 	char *blkbuf;
       
  1150 
       
  1151 #ifdef ZFS_DEBUG
       
  1152 	blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP);
       
  1153 	if (blkbuf) {
       
  1154 		sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
       
  1155 		    bp ? bp : &zio->io_bp_copy);
       
  1156 	}
       
  1157 	cmn_err(CE_WARN, "ZFS: %s (%s on %s off %llx: zio %p %s): error %d",
       
  1158 	    zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
       
  1159 	    zio_type_name[zio->io_type], vdev_description(vd),
       
  1160 	    (u_longlong_t)zio->io_offset, (void *)zio,
       
  1161 	    blkbuf ? blkbuf : "", zio->io_error);
       
  1162 	if (blkbuf)
       
  1163 		kmem_free(blkbuf, BP_SPRINTF_LEN);
       
  1164 #endif
       
  1165 
       
  1166 	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) {
       
  1167 		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
       
  1168 		    "failure and the failure mode property for this pool "
       
  1169 		    "is set to panic.", spa_name(spa));
       
  1170 	}
       
  1171 	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
       
  1172 	vdev_set_state(vd, vd == spa->spa_root_vdev ? B_TRUE : B_FALSE,
       
  1173 	    VDEV_STATE_FAULTED, VDEV_AUX_IO_FAILURE);
       
  1174 }
       
  1175 
       
  1176 static int
       
  1177 zio_assess(zio_t *zio)
       
  1178 {
       
  1179 	spa_t *spa = zio->io_spa;
       
  1180 	blkptr_t *bp = zio->io_bp;
       
  1181 	vdev_t *vd = zio->io_vd;
       
  1182 
       
  1183 	ASSERT(zio->io_children_notready == 0);
       
  1184 	ASSERT(zio->io_children_notdone == 0);
       
  1185 
       
  1186 	if (bp != NULL) {
       
  1187 		ASSERT(bp->blk_pad[0] == 0);
       
  1188 		ASSERT(bp->blk_pad[1] == 0);
       
  1189 		ASSERT(bp->blk_pad[2] == 0);
       
  1190 		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
       
  1191 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
       
  1192 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
       
  1193 			ASSERT(!BP_SHOULD_BYTESWAP(bp));
       
  1194 			if (zio->io_ndvas != 0)
       
  1195 				ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
       
  1196 			ASSERT(BP_COUNT_GANG(bp) == 0 ||
       
  1197 			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
       
  1198 		}
       
  1199 	}
       
  1200 
       
  1201 	/*
       
  1202 	 * Some child I/O has indicated that a retry is necessary, so
       
  1203 	 * we set an error on the I/O and let the logic below do the
       
  1204 	 * rest.
       
  1205 	 */
       
  1206 	if (zio->io_flags & ZIO_FLAG_WRITE_RETRY)
       
  1207 		zio->io_error = ERESTART;
       
  1208 
       
  1209 	if (vd != NULL)
       
  1210 		vdev_stat_update(zio);
       
  1211 
       
  1212 	if (zio->io_error) {
       
  1213 		/*
       
  1214 		 * If this I/O is attached to a particular vdev,
       
  1215 		 * generate an error message describing the I/O failure
       
  1216 		 * at the block level.  We ignore these errors if the
       
  1217 		 * device is currently unavailable.
       
  1218 		 */
       
  1219 		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
       
  1220 			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
       
  1221 
       
  1222 		if ((zio->io_error == EIO ||
       
  1223 		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
       
  1224 		    zio->io_logical == zio) {
       
  1225 			/*
       
  1226 			 * For root I/O requests, tell the SPA to log the error
       
  1227 			 * appropriately.  Also, generate a logical data
       
  1228 			 * ereport.
       
  1229 			 */
       
  1230 			spa_log_error(spa, zio);
       
  1231 
       
  1232 			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
       
  1233 			    0, 0);
       
  1234 		}
       
  1235 
       
  1236 		/*
       
  1237 		 * If we are an allocating I/O then we attempt to reissue
       
  1238 		 * the I/O on another vdev unless the pool is out of space.
       
  1239 		 * We handle this condition based on the spa's failmode
       
  1240 		 * property.
       
  1241 		 */
       
  1242 		if (zio_write_retry && zio->io_error != ENOSPC &&
       
  1243 		    IO_IS_ALLOCATING(zio))
       
  1244 			return (zio_vdev_retry_io(zio));
       
  1245 
       
  1246 		ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
       
  1247 
       
  1248 		/*
       
  1249 		 * For I/O requests that cannot fail, we carry out
       
  1250 		 * the requested behavior based on the failmode pool
       
  1251 		 * property.
       
  1252 		 *
       
  1253 		 * XXX - Need to differentiate between an ENOSPC as
       
  1254 		 * a result of vdev failures vs. a full pool.
       
  1255 		 */
       
  1256 		if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
       
  1257 			int i;
       
  1258 
       
  1259 			for (i = 0; i < zio->io_failed_vds_count; i++) {
       
  1260 				zio_handle_io_failure(zio,
       
  1261 				    zio->io_failed_vds[i]);
       
  1262 			}
       
  1263 			if (zio->io_failed_vds_count == 0) {
       
  1264 				zio_handle_io_failure(zio,
       
  1265 				    vd ? vd : spa->spa_root_vdev);
       
  1266 			}
       
  1267 			if (zio->io_failed_vds != NULL) {
       
  1268 				kmem_free(zio->io_failed_vds,
       
  1269 				    zio->io_failed_vds_count *
       
  1270 				    sizeof (vdev_t *));
       
  1271 				zio->io_failed_vds = NULL;
       
  1272 				zio->io_failed_vds_count = 0;
       
  1273 			}
       
  1274 			return (zio_vdev_suspend_io(zio));
       
  1275 		}
       
  1276 	}
       
  1277 	ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
       
  1278 	ASSERT(zio->io_children_notready == 0);
       
  1279 
       
  1280 	return (ZIO_PIPELINE_CONTINUE);
       
  1281 }
       
  1282 
       
  1283 static int
       
  1284 zio_done(zio_t *zio)
       
  1285 {
       
  1286 	zio_t *pio = zio->io_parent;
       
  1287 	spa_t *spa = zio->io_spa;
       
  1288 
       
  1289 	ASSERT(zio->io_children_notready == 0);
       
  1290 	ASSERT(zio->io_children_notdone == 0);
       
  1291 
       
  1292 	zio_clear_transform_stack(zio);
       
  1293 
       
  1294 	if (zio->io_done)
       
  1295 		zio->io_done(zio);
       
  1296 
       
  1297 	ASSERT(zio->io_delegate_list == NULL);
       
  1298 	ASSERT(zio->io_delegate_next == NULL);
       
  1299 
       
  1300 	if (pio != NULL) {
       
  1301 		zio_t *next, *prev;
       
  1302 
       
  1303 		mutex_enter(&pio->io_lock);
       
  1304 		next = zio->io_sibling_next;
       
  1305 		prev = zio->io_sibling_prev;
       
  1306 		if (next != NULL)
       
  1307 			next->io_sibling_prev = prev;
       
  1308 		if (prev != NULL)
       
  1309 			prev->io_sibling_next = next;
       
  1310 		if (pio->io_child == zio)
       
  1311 			pio->io_child = next;
       
  1312 		mutex_exit(&pio->io_lock);
       
  1313 
       
  1314 		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
       
  1315 		    &pio->io_children_notdone);
       
  1316 	}
       
  1317 
       
  1318 	/*
       
  1319 	 * Note: this I/O is now done, and will shortly be freed, so there is no
       
  1320 	 * need to clear this (or any other) flag.
       
  1321 	 */
       
  1322 	if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)
       
  1323 		spa_config_exit(spa, zio);
       
  1324 
       
  1325 	if (zio->io_waiter != NULL) {
       
  1326 		mutex_enter(&zio->io_lock);
       
  1327 		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
       
  1328 		zio->io_stalled = zio->io_stage;
       
  1329 		cv_broadcast(&zio->io_cv);
       
  1330 		mutex_exit(&zio->io_lock);
       
  1331 	} else {
       
  1332 		zio_destroy(zio);
       
  1333 	}
       
  1334 
       
  1335 	return (ZIO_PIPELINE_STOP);
       
  1336 }
       
  1337 
       
  1338 /*
       
  1339  * ==========================================================================
       
  1340  * Compression support
       
  1341  * ==========================================================================
       
  1342  */
       
  1343 static int
       
  1344 zio_write_compress(zio_t *zio)
       
  1345 {
       
  1346 	int compress = zio->io_compress;
       
  1347 	blkptr_t *bp = zio->io_bp;
   778 	blkptr_t *bp = zio->io_bp;
  1348 	void *cbuf;
   779 	void *cbuf;
  1349 	uint64_t lsize = zio->io_size;
   780 	uint64_t lsize = zio->io_size;
  1350 	uint64_t csize = lsize;
   781 	uint64_t csize = lsize;
  1351 	uint64_t cbufsize = 0;
   782 	uint64_t cbufsize = 0;
  1352 	int pass;
   783 	int pass = 1;
       
   784 
       
   785 	/*
       
   786 	 * If our children haven't all reached the ready stage,
       
   787 	 * wait for them and then repeat this pipeline stage.
       
   788 	 */
       
   789 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
       
   790 	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
       
   791 		return (ZIO_PIPELINE_STOP);
       
   792 
       
   793 	if (!IO_IS_ALLOCATING(zio))
       
   794 		return (ZIO_PIPELINE_CONTINUE);
       
   795 
       
   796 	ASSERT(compress != ZIO_COMPRESS_INHERIT);
  1353 
   797 
  1354 	if (bp->blk_birth == zio->io_txg) {
   798 	if (bp->blk_birth == zio->io_txg) {
  1355 		/*
   799 		/*
  1356 		 * We're rewriting an existing block, which means we're
   800 		 * We're rewriting an existing block, which means we're
  1357 		 * working on behalf of spa_sync().  For spa_sync() to
   801 		 * working on behalf of spa_sync().  For spa_sync() to
  1360 		 * the blocksize, which forces a reallocate, and makes
   804 		 * the blocksize, which forces a reallocate, and makes
  1361 		 * convergence take longer.  Therefore, after the first
   805 		 * convergence take longer.  Therefore, after the first
  1362 		 * few passes, stop compressing to ensure convergence.
   806 		 * few passes, stop compressing to ensure convergence.
  1363 		 */
   807 		 */
  1364 		pass = spa_sync_pass(zio->io_spa);
   808 		pass = spa_sync_pass(zio->io_spa);
  1365 		if (pass > zio_sync_pass.zp_dontcompress)
   809 		ASSERT(pass > 1);
       
   810 
       
   811 		if (pass > SYNC_PASS_DONT_COMPRESS)
  1366 			compress = ZIO_COMPRESS_OFF;
   812 			compress = ZIO_COMPRESS_OFF;
  1367 	} else {
   813 
  1368 		ASSERT(BP_IS_HOLE(bp));
   814 		/*
  1369 		pass = 1;
   815 		 * Only MOS (objset 0) data should need to be rewritten.
  1370 	}
   816 		 */
  1371 
   817 		ASSERT(zio->io_logical->io_bookmark.zb_objset == 0);
  1372 	if (compress != ZIO_COMPRESS_OFF)
   818 
       
   819 		/* Make sure someone doesn't change their mind on overwrites */
       
   820 		ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp),
       
   821 		    spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp));
       
   822 	}
       
   823 
       
   824 	if (compress != ZIO_COMPRESS_OFF) {
  1373 		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
   825 		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
  1374 		    &cbuf, &csize, &cbufsize))
   826 		    &cbuf, &csize, &cbufsize)) {
  1375 			compress = ZIO_COMPRESS_OFF;
   827 			compress = ZIO_COMPRESS_OFF;
  1376 
   828 		} else if (csize != 0) {
  1377 	if (compress != ZIO_COMPRESS_OFF && csize != 0)
   829 			zio_push_transform(zio, cbuf, csize, cbufsize, NULL);
  1378 		zio_push_transform(zio, cbuf, csize, cbufsize);
   830 		}
       
   831 	}
  1379 
   832 
  1380 	/*
   833 	/*
  1381 	 * The final pass of spa_sync() must be all rewrites, but the first
   834 	 * The final pass of spa_sync() must be all rewrites, but the first
  1382 	 * few passes offer a trade-off: allocating blocks defers convergence,
   835 	 * few passes offer a trade-off: allocating blocks defers convergence,
  1383 	 * but newly allocated blocks are sequential, so they can be written
   836 	 * but newly allocated blocks are sequential, so they can be written
  1384 	 * to disk faster.  Therefore, we allow the first few passes of
   837 	 * to disk faster.  Therefore, we allow the first few passes of
  1385 	 * spa_sync() to reallocate new blocks, but force rewrites after that.
   838 	 * spa_sync() to allocate new blocks, but force rewrites after that.
  1386 	 * There should only be a handful of blocks after pass 1 in any case.
   839 	 * There should only be a handful of blocks after pass 1 in any case.
  1387 	 */
   840 	 */
  1388 	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
   841 	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
  1389 	    pass > zio_sync_pass.zp_rewrite) {
   842 	    pass > SYNC_PASS_REWRITE) {
  1390 		ASSERT(csize != 0);
   843 		ASSERT(csize != 0);
       
   844 		uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
       
   845 		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
       
   846 		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
       
   847 	} else {
       
   848 		BP_ZERO(bp);
       
   849 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
       
   850 	}
       
   851 
       
   852 	if (csize == 0) {
       
   853 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
       
   854 	} else {
       
   855 		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
  1391 		BP_SET_LSIZE(bp, lsize);
   856 		BP_SET_LSIZE(bp, lsize);
       
   857 		BP_SET_PSIZE(bp, csize);
  1392 		BP_SET_COMPRESS(bp, compress);
   858 		BP_SET_COMPRESS(bp, compress);
  1393 		zio->io_pipeline = ZIO_REWRITE_PIPELINE(bp);
   859 		BP_SET_CHECKSUM(bp, zp->zp_checksum);
  1394 	} else {
   860 		BP_SET_TYPE(bp, zp->zp_type);
  1395 		if (bp->blk_birth == zio->io_txg)
   861 		BP_SET_LEVEL(bp, zp->zp_level);
  1396 			BP_ZERO(bp);
   862 		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
  1397 		if (csize == 0) {
       
  1398 			BP_ZERO(bp);
       
  1399 			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
       
  1400 		} else {
       
  1401 			ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
       
  1402 			BP_SET_LSIZE(bp, lsize);
       
  1403 			BP_SET_PSIZE(bp, csize);
       
  1404 			BP_SET_COMPRESS(bp, compress);
       
  1405 		}
       
  1406 	}
   863 	}
  1407 
   864 
  1408 	return (ZIO_PIPELINE_CONTINUE);
   865 	return (ZIO_PIPELINE_CONTINUE);
  1409 }
   866 }
  1410 
   867 
       
   868 /*
       
   869  * ==========================================================================
       
   870  * Execute the I/O pipeline
       
   871  * ==========================================================================
       
   872  */
       
   873 
       
   874 static void
       
   875 zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
       
   876 {
       
   877 	zio_type_t t = zio->io_type;
       
   878 
       
   879 	/*
       
   880 	 * If we're a config writer, the normal issue and interrupt threads
       
   881 	 * may all be blocked waiting for the config lock.  In this case,
       
   882 	 * select the otherwise-unused taskq for ZIO_TYPE_NULL.
       
   883 	 */
       
   884 	if (zio->io_flags & ZIO_FLAG_CONFIG_WRITER)
       
   885 		t = ZIO_TYPE_NULL;
       
   886 
       
   887 	/*
       
   888 	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
       
   889 	 */
       
   890 	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
       
   891 		t = ZIO_TYPE_NULL;
       
   892 
       
   893 	(void) taskq_dispatch(zio->io_spa->spa_zio_taskq[t][q],
       
   894 	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
       
   895 }
       
   896 
       
   897 static boolean_t
       
   898 zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
       
   899 {
       
   900 	kthread_t *executor = zio->io_executor;
       
   901 	spa_t *spa = zio->io_spa;
       
   902 
       
   903 	for (zio_type_t t = 0; t < ZIO_TYPES; t++)
       
   904 		if (taskq_member(spa->spa_zio_taskq[t][q], executor))
       
   905 			return (B_TRUE);
       
   906 
       
   907 	return (B_FALSE);
       
   908 }
       
   909 
  1411 static int
   910 static int
  1412 zio_read_decompress(zio_t *zio)
   911 zio_issue_async(zio_t *zio)
  1413 {
   912 {
  1414 	blkptr_t *bp = zio->io_bp;
   913 	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
  1415 	void *data;
   914 
  1416 	uint64_t size;
   915 	return (ZIO_PIPELINE_STOP);
  1417 	uint64_t bufsize;
       
  1418 	int compress = BP_GET_COMPRESS(bp);
       
  1419 
       
  1420 	ASSERT(compress != ZIO_COMPRESS_OFF);
       
  1421 
       
  1422 	zio_pop_transform(zio, &data, &size, &bufsize);
       
  1423 
       
  1424 	if (zio_decompress_data(compress, data, size,
       
  1425 	    zio->io_data, zio->io_size))
       
  1426 		zio->io_error = EIO;
       
  1427 
       
  1428 	zio_buf_free(data, bufsize);
       
  1429 
       
  1430 	return (ZIO_PIPELINE_CONTINUE);
       
  1431 }
       
  1432 
       
  1433 /*
       
  1434  * ==========================================================================
       
  1435  * Gang block support
       
  1436  * ==========================================================================
       
  1437  */
       
  1438 static void
       
  1439 zio_gang_byteswap(zio_t *zio)
       
  1440 {
       
  1441 	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
       
  1442 
       
  1443 	if (BP_SHOULD_BYTESWAP(zio->io_bp))
       
  1444 		byteswap_uint64_array(zio->io_data, zio->io_size);
       
  1445 }
       
  1446 
       
  1447 static int
       
  1448 zio_get_gang_header(zio_t *zio)
       
  1449 {
       
  1450 	blkptr_t *bp = zio->io_bp;
       
  1451 	uint64_t gsize = SPA_GANGBLOCKSIZE;
       
  1452 	void *gbuf = zio_buf_alloc(gsize);
       
  1453 
       
  1454 	ASSERT(BP_IS_GANG(bp));
       
  1455 
       
  1456 	zio_push_transform(zio, gbuf, gsize, gsize);
       
  1457 
       
  1458 	zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
       
  1459 	    NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
       
  1460 	    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
       
  1461 	    ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE));
       
  1462 
       
  1463 	return (zio_wait_for_children_done(zio));
       
  1464 }
       
  1465 
       
  1466 static int
       
  1467 zio_read_gang_members(zio_t *zio)
       
  1468 {
       
  1469 	zio_gbh_phys_t *gbh;
       
  1470 	uint64_t gsize, gbufsize, loff, lsize;
       
  1471 	int i;
       
  1472 
       
  1473 	ASSERT(BP_IS_GANG(zio->io_bp));
       
  1474 
       
  1475 	zio_gang_byteswap(zio);
       
  1476 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
       
  1477 
       
  1478 	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
       
  1479 		blkptr_t *gbp = &gbh->zg_blkptr[i];
       
  1480 		lsize = BP_GET_PSIZE(gbp);
       
  1481 
       
  1482 		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
       
  1483 		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
       
  1484 		ASSERT3U(loff + lsize, <=, zio->io_size);
       
  1485 		ASSERT(i < SPA_GBH_NBLKPTRS);
       
  1486 		ASSERT(!BP_IS_HOLE(gbp));
       
  1487 
       
  1488 		zio_nowait(zio_read(zio, zio->io_spa, gbp,
       
  1489 		    (char *)zio->io_data + loff, lsize,
       
  1490 		    NULL, NULL, zio->io_priority,
       
  1491 		    zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
       
  1492 	}
       
  1493 
       
  1494 	zio_buf_free(gbh, gbufsize);
       
  1495 
       
  1496 	return (zio_wait_for_children_done(zio));
       
  1497 }
       
  1498 
       
  1499 static int
       
  1500 zio_rewrite_gang_members(zio_t *zio)
       
  1501 {
       
  1502 	zio_gbh_phys_t *gbh;
       
  1503 	uint64_t gsize, gbufsize, loff, lsize;
       
  1504 	int i;
       
  1505 
       
  1506 	ASSERT(BP_IS_GANG(zio->io_bp));
       
  1507 	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
       
  1508 
       
  1509 	zio_gang_byteswap(zio);
       
  1510 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
       
  1511 
       
  1512 	ASSERT(gsize == gbufsize);
       
  1513 
       
  1514 	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
       
  1515 		blkptr_t *gbp = &gbh->zg_blkptr[i];
       
  1516 		lsize = BP_GET_PSIZE(gbp);
       
  1517 
       
  1518 		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
       
  1519 		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
       
  1520 		ASSERT3U(loff + lsize, <=, zio->io_size);
       
  1521 		ASSERT(i < SPA_GBH_NBLKPTRS);
       
  1522 		ASSERT(!BP_IS_HOLE(gbp));
       
  1523 
       
  1524 		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
       
  1525 		    zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
       
  1526 		    NULL, NULL, zio->io_priority,
       
  1527 		    zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
       
  1528 	}
       
  1529 
       
  1530 	zio_push_transform(zio, gbh, gsize, gbufsize);
       
  1531 
       
  1532 	return (zio_wait_for_children_ready(zio));
       
  1533 }
       
  1534 
       
  1535 static int
       
  1536 zio_free_gang_members(zio_t *zio)
       
  1537 {
       
  1538 	zio_gbh_phys_t *gbh;
       
  1539 	uint64_t gsize, gbufsize;
       
  1540 	int i;
       
  1541 
       
  1542 	ASSERT(BP_IS_GANG(zio->io_bp));
       
  1543 
       
  1544 	zio_gang_byteswap(zio);
       
  1545 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
       
  1546 
       
  1547 	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
       
  1548 		blkptr_t *gbp = &gbh->zg_blkptr[i];
       
  1549 
       
  1550 		if (BP_IS_HOLE(gbp))
       
  1551 			continue;
       
  1552 		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
       
  1553 		    gbp, NULL, NULL));
       
  1554 	}
       
  1555 
       
  1556 	zio_buf_free(gbh, gbufsize);
       
  1557 
       
  1558 	return (ZIO_PIPELINE_CONTINUE);
       
  1559 }
       
  1560 
       
  1561 static int
       
  1562 zio_claim_gang_members(zio_t *zio)
       
  1563 {
       
  1564 	zio_gbh_phys_t *gbh;
       
  1565 	uint64_t gsize, gbufsize;
       
  1566 	int i;
       
  1567 
       
  1568 	ASSERT(BP_IS_GANG(zio->io_bp));
       
  1569 
       
  1570 	zio_gang_byteswap(zio);
       
  1571 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
       
  1572 
       
  1573 	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
       
  1574 		blkptr_t *gbp = &gbh->zg_blkptr[i];
       
  1575 		if (BP_IS_HOLE(gbp))
       
  1576 			continue;
       
  1577 		zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
       
  1578 		    gbp, NULL, NULL));
       
  1579 	}
       
  1580 
       
  1581 	zio_buf_free(gbh, gbufsize);
       
  1582 
       
  1583 	return (ZIO_PIPELINE_CONTINUE);
       
  1584 }
       
  1585 
       
  1586 static void
       
  1587 zio_write_allocate_gang_member_done(zio_t *zio)
       
  1588 {
       
  1589 	zio_t *pio = zio->io_parent;
       
  1590 	dva_t *cdva = zio->io_bp->blk_dva;
       
  1591 	dva_t *pdva = pio->io_bp->blk_dva;
       
  1592 	uint64_t asize;
       
  1593 	int d;
       
  1594 
       
  1595 	ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
       
  1596 	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
       
  1597 	ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
       
  1598 	ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
       
  1599 
       
  1600 	mutex_enter(&pio->io_lock);
       
  1601 	for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
       
  1602 		ASSERT(DVA_GET_GANG(&pdva[d]));
       
  1603 		asize = DVA_GET_ASIZE(&pdva[d]);
       
  1604 		asize += DVA_GET_ASIZE(&cdva[d]);
       
  1605 		DVA_SET_ASIZE(&pdva[d], asize);
       
  1606 	}
       
  1607 	mutex_exit(&pio->io_lock);
       
  1608 }
       
  1609 
       
/*
 * Write zio's data as a gang block: allocate a gang header, carve the data
 * into up to SPA_GBH_NBLKPTRS members, and issue a child write for each.
 *
 *	zio	the write that is being turned into a gang block
 *	mc	metaslab class to allocate the header and members from
 *
 * If an allocation fails, the error is stored in zio->io_error and
 * ZIO_PIPELINE_CONTINUE is returned.  On success the header buffer is
 * pushed onto the transform stack, the gang checksum stage is added to the
 * pipeline, and the result of zio_wait_for_children_done() is returned.
 */
static int
zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = bp->blk_dva;
	spa_t *spa = zio->io_spa;
	zio_gbh_phys_t *gbh;
	uint64_t txg = zio->io_txg;
	uint64_t resid = zio->io_size;		/* bytes not yet handed out */
	/* Start by trying members of half the I/O size, rounded up. */
	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
	uint64_t gsize, loff, lsize;
	uint32_t gbps_left;			/* unused header slots */
	int ndvas = zio->io_ndvas;
	/* The header gets one more copy than the data, up to the pool max. */
	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
	int error;
	int i, d;

	gsize = SPA_GANGBLOCKSIZE;
	gbps_left = SPA_GBH_NBLKPTRS;

	/* Allocate space for the gang header itself. */
	error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
	    B_FALSE);
	if (error) {
		zio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	/* Mark every header DVA as a gang DVA. */
	for (d = 0; d < gbh_ndvas; d++)
		DVA_SET_GANG(&dva[d], 1);

	bp->blk_birth = txg;

	gbh = zio_buf_alloc(gsize);
	bzero(gbh, gsize);

	/*
	 * One iteration per gang member: loff advances through the data,
	 * resid shrinks, and gbps_left counts down the remaining slots.
	 */
	for (loff = 0, i = 0; loff != zio->io_size;
	    loff += lsize, resid -= lsize, gbps_left--, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		/* NOTE(review): dva is not read again in this function. */
		dva = gbp->blk_dva;

		ASSERT(gbps_left != 0);
		maxalloc = MIN(maxalloc, resid);

		/*
		 * While the rest would fit in the remaining slots at
		 * maxalloc bytes each, try to allocate maxalloc directly,
		 * halving it after each ENOSPC.
		 */
		while (resid <= maxalloc * gbps_left) {
			error = metaslab_alloc(spa, mc, maxalloc, gbp, ndvas,
			    txg, bp, B_FALSE);
			if (error == 0)
				break;
			ASSERT3U(error, ==, ENOSPC);
			/* XXX - free up previous allocations? */
			/*
			 * NOTE(review): this return leaves gbh allocated and
			 * any children already issued still reference
			 * block pointers inside it -- confirm who cleans up.
			 */
			if (maxalloc == SPA_MINBLOCKSIZE) {
				zio->io_error = error;
				return (ZIO_PIPELINE_CONTINUE);
			}
			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
		}

		if (resid <= maxalloc * gbps_left) {
			/*
			 * The allocation above succeeded: fill in the member
			 * bp and write into the space just allocated.
			 */
			lsize = maxalloc;
			BP_SET_LSIZE(gbp, lsize);
			BP_SET_PSIZE(gbp, lsize);
			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
			gbp->blk_birth = txg;
			zio_nowait(zio_rewrite(zio, spa, zio->io_checksum, txg,
			    gbp, (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority,
			    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
			    &zio->io_bookmark));
		} else {
			/*
			 * Nothing was allocated here: split the remainder
			 * evenly over the remaining slots and let the child
			 * write do its own allocation.
			 */
			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
			ASSERT(lsize != SPA_MINBLOCKSIZE);
			zio_nowait(zio_write_allocate(zio, spa,
			    zio->io_checksum, txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority,
			    zio->io_flags & ZIO_FLAG_GANG_INHERIT));
		}
	}

	ASSERT(resid == 0 && loff == zio->io_size);

	/* Add the gang checksum stage to this zio's pipeline. */
	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;

	zio_push_transform(zio, gbh, gsize, gsize);

	/*
	 * As much as we'd like this to be 'ready' instead of 'done',
	 * updating our ASIZE doesn't happen until the io_done callback,
	 * so we have to wait for that to finish in order for our BP
	 * to be stable.
	 */
	return (zio_wait_for_children_done(zio));
}
       
  1705 
       
  1706 /*
       
  1707  * ==========================================================================
       
  1708  * Allocate and free blocks
       
  1709  * ==========================================================================
       
  1710  */
       
  1711 static int
       
  1712 zio_dva_allocate(zio_t *zio)
       
  1713 {
       
  1714 	spa_t *spa = zio->io_spa;
       
  1715 	metaslab_class_t *mc = spa->spa_normal_class;
       
  1716 	blkptr_t *bp = zio->io_bp;
       
  1717 	int error;
       
  1718 
       
  1719 	ASSERT(BP_IS_HOLE(bp));
       
  1720 	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
       
  1721 	ASSERT3U(zio->io_ndvas, >, 0);
       
  1722 	ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa));
       
  1723 
       
  1724 	/*
       
  1725 	 * For testing purposes, we force I/Os to retry. We don't allow
       
  1726 	 * retries beyond the first pass since those I/Os are non-allocating
       
  1727 	 * writes.
       
  1728 	 */
       
  1729 	if (zio_io_fail_shift &&
       
  1730 	    spa_sync_pass(zio->io_spa) <= zio_sync_pass.zp_rewrite &&
       
  1731 	    zio_io_should_fail(zio_io_fail_shift))
       
  1732 		zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
       
  1733 
       
  1734 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
       
  1735 
       
  1736 	error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas,
       
  1737 	    zio->io_txg, NULL, B_FALSE);
       
  1738 
       
  1739 	if (error == 0) {
       
  1740 		bp->blk_birth = zio->io_txg;
       
  1741 	} else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
       
  1742 		return (zio_write_allocate_gang_members(zio, mc));
       
  1743 	} else {
       
  1744 		zio->io_error = error;
       
  1745 	}
       
  1746 
       
  1747 	return (ZIO_PIPELINE_CONTINUE);
       
  1748 }
       
  1749 
       
  1750 static int
       
  1751 zio_dva_free(zio_t *zio)
       
  1752 {
       
  1753 	blkptr_t *bp = zio->io_bp;
       
  1754 
       
  1755 	metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE);
       
  1756 
       
  1757 	BP_ZERO(bp);
       
  1758 
       
  1759 	return (ZIO_PIPELINE_CONTINUE);
       
  1760 }
       
  1761 
       
  1762 static int
       
  1763 zio_dva_claim(zio_t *zio)
       
  1764 {
       
  1765 	zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
       
  1766 
       
  1767 	return (ZIO_PIPELINE_CONTINUE);
       
  1768 }
       
  1769 
       
  1770 /*
       
  1771  * ==========================================================================
       
  1772  * Read and write to physical devices
       
  1773  * ==========================================================================
       
  1774  */
       
  1775 
       
  1776 static int
       
  1777 zio_vdev_io_start(zio_t *zio)
       
  1778 {
       
  1779 	vdev_t *vd = zio->io_vd;
       
  1780 	vdev_t *tvd = vd ? vd->vdev_top : NULL;
       
  1781 	blkptr_t *bp = zio->io_bp;
       
  1782 	uint64_t align;
       
  1783 	spa_t *spa = zio->io_spa;
       
  1784 
       
  1785 	/*
       
  1786 	 * If the pool is already in a failure state then just suspend
       
  1787 	 * this IO until the problem is resolved. We will reissue them
       
  1788 	 * at that time.
       
  1789 	 */
       
  1790 	if (spa_state(spa) == POOL_STATE_IO_FAILURE &&
       
  1791 	    zio->io_type == ZIO_TYPE_WRITE)
       
  1792 		return (zio_vdev_suspend_io(zio));
       
  1793 
       
  1794 	/*
       
  1795 	 * The mirror_ops handle multiple DVAs in a single BP
       
  1796 	 */
       
  1797 	if (vd == NULL)
       
  1798 		return (vdev_mirror_ops.vdev_op_io_start(zio));
       
  1799 
       
  1800 	align = 1ULL << tvd->vdev_ashift;
       
  1801 
       
  1802 	if (zio->io_retries == 0 && vd == tvd)
       
  1803 		zio->io_flags |= ZIO_FLAG_FAILFAST;
       
  1804 
       
  1805 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
       
  1806 		zio->io_flags |= ZIO_FLAG_PHYSICAL;
       
  1807 		zio->io_offset += VDEV_LABEL_START_SIZE;
       
  1808 	}
       
  1809 
       
  1810 	if (P2PHASE(zio->io_size, align) != 0) {
       
  1811 		uint64_t asize = P2ROUNDUP(zio->io_size, align);
       
  1812 		char *abuf = zio_buf_alloc(asize);
       
  1813 		ASSERT(vd == tvd);
       
  1814 		if (zio->io_type == ZIO_TYPE_WRITE) {
       
  1815 			bcopy(zio->io_data, abuf, zio->io_size);
       
  1816 			bzero(abuf + zio->io_size, asize - zio->io_size);
       
  1817 		}
       
  1818 		zio_push_transform(zio, abuf, asize, asize);
       
  1819 		ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK));
       
  1820 		zio->io_flags |= ZIO_FLAG_SUBBLOCK;
       
  1821 	}
       
  1822 
       
  1823 	ASSERT(P2PHASE(zio->io_offset, align) == 0);
       
  1824 	ASSERT(P2PHASE(zio->io_size, align) == 0);
       
  1825 	ASSERT(bp == NULL ||
       
  1826 	    P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
       
  1827 	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
       
  1828 
       
  1829 	return (vd->vdev_ops->vdev_op_io_start(zio));
       
  1830 }
       
  1831 
       
  1832 static int
       
  1833 zio_vdev_io_done(zio_t *zio)
       
  1834 {
       
  1835 	if (zio->io_vd == NULL)
       
  1836 		return (vdev_mirror_ops.vdev_op_io_done(zio));
       
  1837 
       
  1838 	return (zio->io_vd->vdev_ops->vdev_op_io_done(zio));
       
  1839 }
       
  1840 
       
  1841 /* XXPOLICY */
       
  1842 boolean_t
       
  1843 zio_should_retry(zio_t *zio)
       
  1844 {
       
  1845 	vdev_t *vd = zio->io_vd;
       
  1846 
       
  1847 	if (zio->io_error == 0)
       
  1848 		return (B_FALSE);
       
  1849 	if (zio->io_delegate_list != NULL)
       
  1850 		return (B_FALSE);
       
  1851 	if (vd != NULL) {
       
  1852 		if (vd != vd->vdev_top)
       
  1853 			return (B_FALSE);
       
  1854 		if (vd->vdev_is_failing)
       
  1855 			return (B_FALSE);
       
  1856 	}
       
  1857 	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
       
  1858 		return (B_FALSE);
       
  1859 	if (zio->io_retries > 0)
       
  1860 		return (B_FALSE);
       
  1861 
       
  1862 	return (B_TRUE);
       
  1863 }
       
  1864 
       
  1865 static int
       
  1866 zio_vdev_io_assess(zio_t *zio)
       
  1867 {
       
  1868 	vdev_t *vd = zio->io_vd;
       
  1869 	vdev_t *tvd = vd ? vd->vdev_top : NULL;
       
  1870 
       
  1871 	ASSERT(zio->io_vsd == NULL);
       
  1872 
       
  1873 	if (zio->io_flags & ZIO_FLAG_SUBBLOCK) {
       
  1874 		void *abuf;
       
  1875 		uint64_t asize;
       
  1876 		ASSERT(vd == tvd);
       
  1877 		zio_pop_transform(zio, &abuf, &asize, &asize);
       
  1878 		if (zio->io_type == ZIO_TYPE_READ)
       
  1879 			bcopy(abuf, zio->io_data, zio->io_size);
       
  1880 		zio_buf_free(abuf, asize);
       
  1881 		zio->io_flags &= ~ZIO_FLAG_SUBBLOCK;
       
  1882 	}
       
  1883 
       
  1884 	if (zio_injection_enabled && !zio->io_error)
       
  1885 		zio->io_error = zio_handle_fault_injection(zio, EIO);
       
  1886 
       
  1887 	/*
       
  1888 	 * If the I/O failed, determine whether we should attempt to retry it.
       
  1889 	 */
       
  1890 	/* XXPOLICY */
       
  1891 	if (zio_should_retry(zio)) {
       
  1892 		ASSERT(tvd == vd);
       
  1893 
       
  1894 		zio->io_retries++;
       
  1895 		zio->io_error = 0;
       
  1896 		zio->io_flags &= ZIO_FLAG_RETRY_INHERIT;
       
  1897 		/* XXPOLICY */
       
  1898 		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
       
  1899 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
       
  1900 		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
       
  1901 
       
  1902 		return (ZIO_PIPELINE_CONTINUE);
       
  1903 	}
       
  1904 
       
  1905 	return (ZIO_PIPELINE_CONTINUE);
       
  1906 }
   916 }
  1907 
   917 
  1908 void
   918 void
  1909 zio_vdev_io_reissue(zio_t *zio)
   919 zio_interrupt(zio_t *zio)
  1910 {
   920 {
  1911 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
   921 	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT);
  1912 	ASSERT(zio->io_error == 0);
   922 }
  1913 
       
  1914 	zio->io_stage--;
       
  1915 }
       
  1916 
       
  1917 void
       
  1918 zio_vdev_io_redone(zio_t *zio)
       
  1919 {
       
  1920 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
       
  1921 
       
  1922 	zio->io_stage--;
       
  1923 }
       
  1924 
       
  1925 void
       
  1926 zio_vdev_io_bypass(zio_t *zio)
       
  1927 {
       
  1928 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
       
  1929 	ASSERT(zio->io_error == 0);
       
  1930 
       
  1931 	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
       
  1932 	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
       
  1933 }
       
  1934 
       
  1935 /*
       
  1936  * ==========================================================================
       
  1937  * Generate and verify checksums
       
  1938  * ==========================================================================
       
  1939  */
       
  1940 static int
       
  1941 zio_checksum_generate(zio_t *zio)
       
  1942 {
       
  1943 	int checksum = zio->io_checksum;
       
  1944 	blkptr_t *bp = zio->io_bp;
       
  1945 
       
  1946 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
       
  1947 
       
  1948 	BP_SET_CHECKSUM(bp, checksum);
       
  1949 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
       
  1950 
       
  1951 	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);
       
  1952 
       
  1953 	return (ZIO_PIPELINE_CONTINUE);
       
  1954 }
       
  1955 
       
  1956 static int
       
  1957 zio_gang_checksum_generate(zio_t *zio)
       
  1958 {
       
  1959 	zio_cksum_t zc;
       
  1960 	zio_gbh_phys_t *gbh = zio->io_data;
       
  1961 
       
  1962 	ASSERT(BP_IS_GANG(zio->io_bp));
       
  1963 	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
       
  1964 
       
  1965 	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);
       
  1966 
       
  1967 	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);
       
  1968 
       
  1969 	return (ZIO_PIPELINE_CONTINUE);
       
  1970 }
       
  1971 
       
  1972 static int
       
  1973 zio_checksum_verify(zio_t *zio)
       
  1974 {
       
  1975 	if (zio->io_bp != NULL) {
       
  1976 		zio->io_error = zio_checksum_error(zio);
       
  1977 		if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
       
  1978 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
       
  1979 			    zio->io_spa, zio->io_vd, zio, 0, 0);
       
  1980 	}
       
  1981 
       
  1982 	return (ZIO_PIPELINE_CONTINUE);
       
  1983 }
       
  1984 
       
  1985 /*
       
  1986  * Called by RAID-Z to ensure we don't compute the checksum twice.
       
  1987  */
       
  1988 void
       
  1989 zio_checksum_verified(zio_t *zio)
       
  1990 {
       
  1991 	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
       
  1992 }
       
  1993 
       
  1994 /*
       
  1995  * Set the external verifier for a gang block based on stuff in the bp
       
  1996  */
       
  1997 void
       
  1998 zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
       
  1999 {
       
  2000 	blkptr_t *bp = zio->io_bp;
       
  2001 
       
  2002 	zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
       
  2003 	zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
       
  2004 	zcp->zc_word[2] = bp->blk_birth;
       
  2005 	zcp->zc_word[3] = 0;
       
  2006 }
       
  2007 
       
  2008 /*
       
  2009  * ==========================================================================
       
  2010  * Define the pipeline
       
  2011  * ==========================================================================
       
  2012  */
       
  2013 typedef int zio_pipe_stage_t(zio_t *zio);
       
  2014 
       
  2015 zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
       
  2016 	NULL,
       
  2017 	zio_wait_for_children_ready,
       
  2018 	zio_read_init,
       
  2019 	zio_issue_async,
       
  2020 	zio_write_compress,
       
  2021 	zio_checksum_generate,
       
  2022 	zio_get_gang_header,
       
  2023 	zio_rewrite_gang_members,
       
  2024 	zio_free_gang_members,
       
  2025 	zio_claim_gang_members,
       
  2026 	zio_dva_allocate,
       
  2027 	zio_dva_free,
       
  2028 	zio_dva_claim,
       
  2029 	zio_gang_checksum_generate,
       
  2030 	zio_ready,
       
  2031 	zio_vdev_io_start,
       
  2032 	zio_vdev_io_done,
       
  2033 	zio_vdev_io_assess,
       
  2034 	zio_wait_for_children_done,
       
  2035 	zio_checksum_verify,
       
  2036 	zio_read_gang_members,
       
  2037 	zio_read_decompress,
       
  2038 	zio_assess,
       
  2039 	zio_done,
       
  2040 	NULL
       
  2041 };
       
  2042 
   923 
  2043 /*
   924 /*
  2044  * Execute the I/O pipeline until one of the following occurs:
   925  * Execute the I/O pipeline until one of the following occurs:
  2045  * (1) the I/O completes; (2) the pipeline stalls waiting for
   926  * (1) the I/O completes; (2) the pipeline stalls waiting for
  2046  * dependent child I/Os; (3) the I/O issues, so we're waiting
   927  * dependent child I/Os; (3) the I/O issues, so we're waiting
  2051  * there's no CPU work; it never burns a thread in cv_wait().
   932  * there's no CPU work; it never burns a thread in cv_wait().
  2052  *
   933  *
  2053  * There's no locking on io_stage because there's no legitimate way
   934  * There's no locking on io_stage because there's no legitimate way
  2054  * for multiple threads to be attempting to process the same I/O.
   935  * for multiple threads to be attempting to process the same I/O.
  2055  */
   936  */
       
   937 static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES];
       
   938 
  2056 void
   939 void
  2057 zio_execute(zio_t *zio)
   940 zio_execute(zio_t *zio)
  2058 {
   941 {
       
   942 	zio->io_executor = curthread;
       
   943 
  2059 	while (zio->io_stage < ZIO_STAGE_DONE) {
   944 	while (zio->io_stage < ZIO_STAGE_DONE) {
  2060 		uint32_t pipeline = zio->io_pipeline;
   945 		uint32_t pipeline = zio->io_pipeline;
       
   946 		zio_stage_t stage = zio->io_stage;
  2061 		int rv;
   947 		int rv;
  2062 
   948 
  2063 		ASSERT(!MUTEX_HELD(&zio->io_lock));
   949 		ASSERT(!MUTEX_HELD(&zio->io_lock));
  2064 
   950 
       
   951 		while (((1U << ++stage) & pipeline) == 0)
       
   952 			continue;
       
   953 
       
   954 		ASSERT(stage <= ZIO_STAGE_DONE);
       
   955 		ASSERT(zio->io_stall == NULL);
       
   956 
  2065 		/*
   957 		/*
  2066 		 * If an error occurred outside the vdev stack,
   958 		 * If we are in interrupt context and this pipeline stage
  2067 		 * just execute the interlock stages to clean up.
   959 		 * will grab a config lock that is held across I/O,
       
   960 		 * issue async to avoid deadlock.
  2068 		 */
   961 		 */
  2069 		if (zio->io_error &&
   962 		if (((1U << stage) & ZIO_CONFIG_LOCK_BLOCKING_STAGES) &&
  2070 		    ((1U << zio->io_stage) & ZIO_VDEV_IO_STAGES) == 0)
   963 		    zio->io_vd == NULL &&
  2071 			pipeline &= ZIO_ERROR_PIPELINE_MASK;
   964 		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
  2072 
   965 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
  2073 		while (((1U << ++zio->io_stage) & pipeline) == 0)
   966 			return;
  2074 			continue;
   967 		}
  2075 
   968 
  2076 		ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
   969 		zio->io_stage = stage;
  2077 		ASSERT(zio->io_stalled == 0);
   970 		rv = zio_pipeline[stage](zio);
  2078 
       
  2079 		rv = zio_pipeline[zio->io_stage](zio);
       
  2080 
   971 
  2081 		if (rv == ZIO_PIPELINE_STOP)
   972 		if (rv == ZIO_PIPELINE_STOP)
  2082 			return;
   973 			return;
  2083 
   974 
  2084 		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
   975 		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
  2085 	}
   976 	}
  2086 }
   977 }
  2087 
   978 
  2088 static boolean_t
   979 /*
  2089 zio_io_should_fail(uint16_t range)
   980  * ==========================================================================
  2090 {
   981  * Initiate I/O, either sync or async
  2091 	static uint16_t	allocs = 0;
   982  * ==========================================================================
  2092 
   983  */
  2093 	return (P2PHASE(allocs++, 1U<<range) == 0);
   984 int
       
   985 zio_wait(zio_t *zio)
       
   986 {
       
   987 	int error;
       
   988 
       
   989 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
       
   990 	ASSERT(zio->io_executor == NULL);
       
   991 
       
   992 	zio->io_waiter = curthread;
       
   993 
       
   994 	zio_execute(zio);
       
   995 
       
   996 	mutex_enter(&zio->io_lock);
       
   997 	while (zio->io_executor != NULL)
       
   998 		cv_wait(&zio->io_cv, &zio->io_lock);
       
   999 	mutex_exit(&zio->io_lock);
       
  1000 
       
  1001 	error = zio->io_error;
       
  1002 	zio_destroy(zio);
       
  1003 
       
  1004 	return (error);
       
  1005 }
       
  1006 
       
  1007 void
       
  1008 zio_nowait(zio_t *zio)
       
  1009 {
       
  1010 	ASSERT(zio->io_executor == NULL);
       
  1011 
       
  1012 	if (zio->io_parent == NULL && zio->io_child_type == ZIO_CHILD_LOGICAL) {
       
  1013 		/*
       
  1014 		 * This is a logical async I/O with no parent to wait for it.
       
  1015 		 * Attach it to the pool's global async root zio so that
       
  1016 		 * spa_unload() has a way of waiting for async I/O to finish.
       
  1017 		 */
       
  1018 		spa_t *spa = zio->io_spa;
       
  1019 		zio->io_async_root = B_TRUE;
       
  1020 		mutex_enter(&spa->spa_async_root_lock);
       
  1021 		spa->spa_async_root_count++;
       
  1022 		mutex_exit(&spa->spa_async_root_lock);
       
  1023 	}
       
  1024 
       
  1025 	zio_execute(zio);
       
  1026 }
       
  1027 
       
  1028 /*
       
  1029  * ==========================================================================
       
  1030  * Reexecute or suspend/resume failed I/O
       
  1031  * ==========================================================================
       
  1032  */
       
  1033 
       
  1034 static void
       
  1035 zio_reexecute(zio_t *pio)
       
  1036 {
       
  1037 	zio_t *zio, *zio_next;
       
  1038 
       
  1039 	pio->io_flags = pio->io_orig_flags;
       
  1040 	pio->io_stage = pio->io_orig_stage;
       
  1041 	pio->io_pipeline = pio->io_orig_pipeline;
       
  1042 	pio->io_reexecute = 0;
       
  1043 	pio->io_error = 0;
       
  1044 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
       
  1045 		pio->io_child_error[c] = 0;
       
  1046 
       
  1047 	if (IO_IS_ALLOCATING(pio)) {
       
  1048 		/*
       
  1049 		 * Remember the failed bp so that the io_ready() callback
       
  1050 		 * can update its accounting upon reexecution.  The block
       
  1051 		 * was already freed in zio_done(); we indicate this with
       
  1052 		 * a fill count of -1 so that zio_free() knows to skip it.
       
  1053 		 */
       
  1054 		blkptr_t *bp = pio->io_bp;
       
  1055 		ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg);
       
  1056 		bp->blk_fill = BLK_FILL_ALREADY_FREED;
       
  1057 		pio->io_bp_orig = *bp;
       
  1058 		BP_ZERO(bp);
       
  1059 	}
       
  1060 
       
  1061 	/*
       
  1062 	 * As we reexecute pio's children, new children could be created.
       
  1063 	 * New children go to the head of the io_child list, however,
       
  1064 	 * so we will (correctly) not reexecute them.  The key is that
       
  1065 	 * the remainder of the io_child list, from 'zio_next' onward,
       
  1066 	 * cannot be affected by any side effects of reexecuting 'zio'.
       
  1067 	 */
       
  1068 	for (zio = pio->io_child; zio != NULL; zio = zio_next) {
       
  1069 		zio_next = zio->io_sibling_next;
       
  1070 		mutex_enter(&pio->io_lock);
       
  1071 		pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++;
       
  1072 		pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++;
       
  1073 		mutex_exit(&pio->io_lock);
       
  1074 		zio_reexecute(zio);
       
  1075 	}
       
  1076 
       
  1077 	/*
       
  1078 	 * Now that all children have been reexecuted, execute the parent.
       
  1079 	 */
       
  1080 	zio_execute(pio);
       
  1081 }
       
  1082 
       
  1083 void
       
  1084 zio_suspend(spa_t *spa, zio_t *zio)
       
  1085 {
       
  1086 	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
       
  1087 		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
       
  1088 		    "failure and the failure mode property for this pool "
       
  1089 		    "is set to panic.", spa_name(spa));
       
  1090 
       
  1091 	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
       
  1092 
       
  1093 	mutex_enter(&spa->spa_suspend_lock);
       
  1094 
       
  1095 	if (spa->spa_suspend_zio_root == NULL)
       
  1096 		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 0);
       
  1097 
       
  1098 	spa->spa_suspended = B_TRUE;
       
  1099 
       
  1100 	if (zio != NULL) {
       
  1101 		ASSERT(zio != spa->spa_suspend_zio_root);
       
  1102 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
       
  1103 		ASSERT(zio->io_parent == NULL);
       
  1104 		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
       
  1105 		zio_add_child(spa->spa_suspend_zio_root, zio);
       
  1106 	}
       
  1107 
       
  1108 	mutex_exit(&spa->spa_suspend_lock);
       
  1109 }
       
  1110 
       
  1111 void
       
  1112 zio_resume(spa_t *spa)
       
  1113 {
       
  1114 	zio_t *pio, *zio;
       
  1115 
       
  1116 	/*
       
  1117 	 * Reexecute all previously suspended i/o.
       
  1118 	 */
       
  1119 	mutex_enter(&spa->spa_suspend_lock);
       
  1120 	spa->spa_suspended = B_FALSE;
       
  1121 	cv_broadcast(&spa->spa_suspend_cv);
       
  1122 	pio = spa->spa_suspend_zio_root;
       
  1123 	spa->spa_suspend_zio_root = NULL;
       
  1124 	mutex_exit(&spa->spa_suspend_lock);
       
  1125 
       
  1126 	if (pio == NULL)
       
  1127 		return;
       
  1128 
       
  1129 	while ((zio = pio->io_child) != NULL) {
       
  1130 		zio_remove_child(pio, zio);
       
  1131 		zio->io_parent = NULL;
       
  1132 		zio_reexecute(zio);
       
  1133 	}
       
  1134 
       
  1135 	ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0);
       
  1136 
       
  1137 	(void) zio_wait(pio);
       
  1138 }
       
  1139 
       
  1140 void
       
  1141 zio_resume_wait(spa_t *spa)
       
  1142 {
       
  1143 	mutex_enter(&spa->spa_suspend_lock);
       
  1144 	while (spa_suspended(spa))
       
  1145 		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
       
  1146 	mutex_exit(&spa->spa_suspend_lock);
       
  1147 }
       
  1148 
       
  1149 /*
       
  1150  * ==========================================================================
       
  1151  * Gang blocks.
       
  1152  *
       
  1153  * A gang block is a collection of small blocks that looks to the DMU
       
  1154  * like one large block.  When zio_dva_allocate() cannot find a block
       
  1155  * of the requested size, due to either severe fragmentation or the pool
       
  1156  * being nearly full, it calls zio_write_gang_block() to construct the
       
  1157  * block from smaller fragments.
       
  1158  *
       
  1159  * A gang block consists of a gang header (zio_gbh_phys_t) and up to
       
  1160  * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
       
  1161  * an indirect block: it's an array of block pointers.  It consumes
       
  1162  * only one sector and hence is allocatable regardless of fragmentation.
       
  1163  * The gang header's bps point to its gang members, which hold the data.
       
  1164  *
       
  1165  * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
       
  1166  * as the verifier to ensure uniqueness of the SHA256 checksum.
       
  1167  * Critically, the gang block bp's blk_cksum is the checksum of the data,
       
  1168  * not the gang header.  This ensures that data block signatures (needed for
       
  1169  * deduplication) are independent of how the block is physically stored.
       
  1170  *
       
  1171  * Gang blocks can be nested: a gang member may itself be a gang block.
       
  1172  * Thus every gang block is a tree in which root and all interior nodes are
       
  1173  * gang headers, and the leaves are normal blocks that contain user data.
       
  1174  * The root of the gang tree is called the gang leader.
       
  1175  *
       
  1176  * To perform any operation (read, rewrite, free, claim) on a gang block,
       
  1177  * zio_gang_assemble() first assembles the gang tree (minus data leaves)
       
  1178  * in the io_gang_tree field of the original logical i/o by recursively
       
  1179  * reading the gang leader and all gang headers below it.  This yields
       
  1180  * an in-core tree containing the contents of every gang header and the
       
  1181  * bps for every constituent of the gang block.
       
  1182  *
       
  1183  * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
       
  1184  * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
       
  1185  * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
       
  1186  * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
       
  1187  * zio_read_gang() is a wrapper around zio_read() that omits reading gang
       
  1188  * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
       
  1189  * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
       
  1190  * of the gang header plus zio_checksum_compute() of the data to update the
       
  1191  * gang header's blk_cksum as described above.
       
  1192  *
       
  1193  * The two-phase assemble/issue model solves the problem of partial failure --
       
  1194  * what if you'd freed part of a gang block but then couldn't read the
       
  1195  * gang header for another part?  Assembling the entire gang tree first
       
  1196  * ensures that all the necessary gang header I/O has succeeded before
       
  1197  * starting the actual work of free, claim, or write.  Once the gang tree
       
  1198  * is assembled, free and claim are in-memory operations that cannot fail.
       
  1199  *
       
  1200  * In the event that a gang write fails, zio_dva_unallocate() walks the
       
  1201  * gang tree to immediately free (i.e. insert back into the space map)
       
  1202  * everything we've allocated.  This ensures that we don't get ENOSPC
       
  1203  * errors during repeated suspend/resume cycles due to a flaky device.
       
  1204  *
       
  1205  * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
       
  1206  * the gang tree, we won't modify the block, so we can safely defer the free
       
  1207  * (knowing that the block is still intact).  If we *can* assemble the gang
       
  1208  * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
       
  1209  * each constituent bp and we can allocate a new block on the next sync pass.
       
  1210  *
       
  1211  * In all cases, the gang tree allows complete recovery from partial failure.
       
  1212  * ==========================================================================
       
  1213  */
       
  1214 
       
  1215 static zio_t *
       
  1216 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
       
  1217 {
       
  1218 	if (gn != NULL)
       
  1219 		return (pio);
       
  1220 
       
  1221 	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
       
  1222 	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
       
  1223 	    &pio->io_bookmark));
       
  1224 }
       
  1225 
       
  1226 zio_t *
       
  1227 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
       
  1228 {
       
  1229 	zio_t *zio;
       
  1230 
       
  1231 	if (gn != NULL) {
       
  1232 		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
       
  1233 		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
       
  1234 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
       
  1235 		/*
       
  1236 		 * As we rewrite each gang header, the pipeline will compute
       
  1237 		 * a new gang block header checksum for it; but no one will
       
  1238 		 * compute a new data checksum, so we do that here.  The one
       
  1239 		 * exception is the gang leader: the pipeline already computed
       
  1240 		 * its data checksum because that stage precedes gang assembly.
       
  1241 		 * (Presently, nothing actually uses interior data checksums;
       
  1242 		 * this is just good hygiene.)
       
  1243 		 */
       
  1244 		if (gn != pio->io_logical->io_gang_tree) {
       
  1245 			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
       
  1246 			    data, BP_GET_PSIZE(bp));
       
  1247 		}
       
  1248 	} else {
       
  1249 		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
       
  1250 		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
       
  1251 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
       
  1252 	}
       
  1253 
       
  1254 	return (zio);
       
  1255 }
       
  1256 
       
  1257 /* ARGSUSED */
       
  1258 zio_t *
       
  1259 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
       
  1260 {
       
  1261 	return (zio_free(pio, pio->io_spa, pio->io_txg, bp,
       
  1262 	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
       
  1263 }
       
  1264 
       
  1265 /* ARGSUSED */
       
  1266 zio_t *
       
  1267 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
       
  1268 {
       
  1269 	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
       
  1270 	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
       
  1271 }
       
  1272 
       
  1273 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
       
  1274 	NULL,
       
  1275 	zio_read_gang,
       
  1276 	zio_rewrite_gang,
       
  1277 	zio_free_gang,
       
  1278 	zio_claim_gang,
       
  1279 	NULL
       
  1280 };
       
  1281 
       
  1282 static void zio_gang_tree_assemble_done(zio_t *zio);
       
  1283 
       
  1284 static zio_gang_node_t *
       
  1285 zio_gang_node_alloc(zio_gang_node_t **gnpp)
       
  1286 {
       
  1287 	zio_gang_node_t *gn;
       
  1288 
       
  1289 	ASSERT(*gnpp == NULL);
       
  1290 
       
  1291 	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
       
  1292 	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
       
  1293 	*gnpp = gn;
       
  1294 
       
  1295 	return (gn);
       
  1296 }
       
  1297 
       
  1298 static void
       
  1299 zio_gang_node_free(zio_gang_node_t **gnpp)
       
  1300 {
       
  1301 	zio_gang_node_t *gn = *gnpp;
       
  1302 
       
  1303 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
       
  1304 		ASSERT(gn->gn_child[g] == NULL);
       
  1305 
       
  1306 	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
       
  1307 	kmem_free(gn, sizeof (*gn));
       
  1308 	*gnpp = NULL;
       
  1309 }
       
  1310 
       
  1311 static void
       
  1312 zio_gang_tree_free(zio_gang_node_t **gnpp)
       
  1313 {
       
  1314 	zio_gang_node_t *gn = *gnpp;
       
  1315 
       
  1316 	if (gn == NULL)
       
  1317 		return;
       
  1318 
       
  1319 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
       
  1320 		zio_gang_tree_free(&gn->gn_child[g]);
       
  1321 
       
  1322 	zio_gang_node_free(gnpp);
       
  1323 }
       
  1324 
       
  1325 static void
       
  1326 zio_gang_tree_assemble(zio_t *lio, blkptr_t *bp, zio_gang_node_t **gnpp)
       
  1327 {
       
  1328 	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
       
  1329 
       
  1330 	ASSERT(lio->io_logical == lio);
       
  1331 	ASSERT(BP_IS_GANG(bp));
       
  1332 
       
  1333 	zio_nowait(zio_read(lio, lio->io_spa, bp, gn->gn_gbh,
       
  1334 	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
       
  1335 	    lio->io_priority, ZIO_GANG_CHILD_FLAGS(lio), &lio->io_bookmark));
       
  1336 }
       
  1337 
       
  1338 static void
       
  1339 zio_gang_tree_assemble_done(zio_t *zio)
       
  1340 {
       
  1341 	zio_t *lio = zio->io_logical;
       
  1342 	zio_gang_node_t *gn = zio->io_private;
       
  1343 	blkptr_t *bp = zio->io_bp;
       
  1344 
       
  1345 	ASSERT(zio->io_parent == lio);
       
  1346 	ASSERT(zio->io_child == NULL);
       
  1347 
       
  1348 	if (zio->io_error)
       
  1349 		return;
       
  1350 
       
  1351 	if (BP_SHOULD_BYTESWAP(bp))
       
  1352 		byteswap_uint64_array(zio->io_data, zio->io_size);
       
  1353 
       
  1354 	ASSERT(zio->io_data == gn->gn_gbh);
       
  1355 	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
       
  1356 	ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
       
  1357 
       
  1358 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
       
  1359 		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
       
  1360 		if (!BP_IS_GANG(gbp))
       
  1361 			continue;
       
  1362 		zio_gang_tree_assemble(lio, gbp, &gn->gn_child[g]);
       
  1363 	}
       
  1364 }
       
  1365 
       
  1366 static void
       
  1367 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
       
  1368 {
       
  1369 	zio_t *lio = pio->io_logical;
       
  1370 	zio_t *zio;
       
  1371 
       
  1372 	ASSERT(BP_IS_GANG(bp) == !!gn);
       
  1373 	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(lio->io_bp));
       
  1374 	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == lio->io_gang_tree);
       
  1375 
       
  1376 	/*
       
  1377 	 * If you're a gang header, your data is in gn->gn_gbh.
       
  1378 	 * If you're a gang member, your data is in 'data' and gn == NULL.
       
  1379 	 */
       
  1380 	zio = zio_gang_issue_func[lio->io_type](pio, bp, gn, data);
       
  1381 
       
  1382 	if (gn != NULL) {
       
  1383 		ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
       
  1384 
       
  1385 		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
       
  1386 			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
       
  1387 			if (BP_IS_HOLE(gbp))
       
  1388 				continue;
       
  1389 			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
       
  1390 			data = (char *)data + BP_GET_PSIZE(gbp);
       
  1391 		}
       
  1392 	}
       
  1393 
       
  1394 	if (gn == lio->io_gang_tree)
       
  1395 		ASSERT3P((char *)lio->io_data + lio->io_size, ==, data);
       
  1396 
       
  1397 	if (zio != pio)
       
  1398 		zio_nowait(zio);
       
  1399 }
       
  1400 
       
  1401 static int
       
  1402 zio_gang_assemble(zio_t *zio)
       
  1403 {
       
  1404 	blkptr_t *bp = zio->io_bp;
       
  1405 
       
  1406 	ASSERT(BP_IS_GANG(bp) && zio == zio->io_logical);
       
  1407 
       
  1408 	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
       
  1409 
       
  1410 	return (ZIO_PIPELINE_CONTINUE);
       
  1411 }
       
  1412 
       
  1413 static int
       
  1414 zio_gang_issue(zio_t *zio)
       
  1415 {
       
  1416 	zio_t *lio = zio->io_logical;
       
  1417 	blkptr_t *bp = zio->io_bp;
       
  1418 
       
  1419 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
       
  1420 		return (ZIO_PIPELINE_STOP);
       
  1421 
       
  1422 	ASSERT(BP_IS_GANG(bp) && zio == lio);
       
  1423 
       
  1424 	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
       
  1425 		zio_gang_tree_issue(lio, lio->io_gang_tree, bp, lio->io_data);
       
  1426 	else
       
  1427 		zio_gang_tree_free(&lio->io_gang_tree);
       
  1428 
       
  1429 	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
       
  1430 
       
  1431 	return (ZIO_PIPELINE_CONTINUE);
       
  1432 }
       
  1433 
       
  1434 static void
       
  1435 zio_write_gang_member_ready(zio_t *zio)
       
  1436 {
       
  1437 	zio_t *pio = zio->io_parent;
       
  1438 	zio_t *lio = zio->io_logical;
       
  1439 	dva_t *cdva = zio->io_bp->blk_dva;
       
  1440 	dva_t *pdva = pio->io_bp->blk_dva;
       
  1441 	uint64_t asize;
       
  1442 
       
  1443 	if (BP_IS_HOLE(zio->io_bp))
       
  1444 		return;
       
  1445 
       
  1446 	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
       
  1447 
       
  1448 	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
       
  1449 	ASSERT3U(zio->io_prop.zp_ndvas, ==, lio->io_prop.zp_ndvas);
       
  1450 	ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
       
  1451 	ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
       
  1452 	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
       
  1453 
       
  1454 	mutex_enter(&pio->io_lock);
       
  1455 	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
       
  1456 		ASSERT(DVA_GET_GANG(&pdva[d]));
       
  1457 		asize = DVA_GET_ASIZE(&pdva[d]);
       
  1458 		asize += DVA_GET_ASIZE(&cdva[d]);
       
  1459 		DVA_SET_ASIZE(&pdva[d], asize);
       
  1460 	}
       
  1461 	mutex_exit(&pio->io_lock);
       
  1462 }
       
  1463 
       
  1464 static int
       
  1465 zio_write_gang_block(zio_t *pio)
       
  1466 {
       
  1467 	spa_t *spa = pio->io_spa;
       
  1468 	blkptr_t *bp = pio->io_bp;
       
  1469 	zio_t *lio = pio->io_logical;
       
  1470 	zio_t *zio;
       
  1471 	zio_gang_node_t *gn, **gnpp;
       
  1472 	zio_gbh_phys_t *gbh;
       
  1473 	uint64_t txg = pio->io_txg;
       
  1474 	uint64_t resid = pio->io_size;
       
  1475 	uint64_t lsize;
       
  1476 	int ndvas = lio->io_prop.zp_ndvas;
       
  1477 	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
       
  1478 	zio_prop_t zp;
       
  1479 	int error;
       
  1480 
       
  1481 	error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE,
       
  1482 	    bp, gbh_ndvas, txg, pio == lio ? NULL : lio->io_bp,
       
  1483 	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
       
  1484 	if (error) {
       
  1485 		pio->io_error = error;
       
  1486 		return (ZIO_PIPELINE_CONTINUE);
       
  1487 	}
       
  1488 
       
  1489 	if (pio == lio) {
       
  1490 		gnpp = &lio->io_gang_tree;
       
  1491 	} else {
       
  1492 		gnpp = pio->io_private;
       
  1493 		ASSERT(pio->io_ready == zio_write_gang_member_ready);
       
  1494 	}
       
  1495 
       
  1496 	gn = zio_gang_node_alloc(gnpp);
       
  1497 	gbh = gn->gn_gbh;
       
  1498 	bzero(gbh, SPA_GANGBLOCKSIZE);
       
  1499 
       
  1500 	/*
       
  1501 	 * Create the gang header.
       
  1502 	 */
       
  1503 	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
       
  1504 	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
       
  1505 
       
  1506 	/*
       
  1507 	 * Create and nowait the gang children.
       
  1508 	 */
       
  1509 	for (int g = 0; resid != 0; resid -= lsize, g++) {
       
  1510 		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
       
  1511 		    SPA_MINBLOCKSIZE);
       
  1512 		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
       
  1513 
       
  1514 		zp.zp_checksum = lio->io_prop.zp_checksum;
       
  1515 		zp.zp_compress = ZIO_COMPRESS_OFF;
       
  1516 		zp.zp_type = DMU_OT_NONE;
       
  1517 		zp.zp_level = 0;
       
  1518 		zp.zp_ndvas = lio->io_prop.zp_ndvas;
       
  1519 
       
  1520 		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
       
  1521 		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
       
  1522 		    zio_write_gang_member_ready, NULL, &gn->gn_child[g],
       
  1523 		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
       
  1524 		    &pio->io_bookmark));
       
  1525 	}
       
  1526 
       
  1527 	/*
       
  1528 	 * Set pio's pipeline to just wait for zio to finish.
       
  1529 	 */
       
  1530 	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
       
  1531 
       
  1532 	zio_nowait(zio);
       
  1533 
       
  1534 	return (ZIO_PIPELINE_CONTINUE);
       
  1535 }
       
  1536 
       
  1537 /*
       
  1538  * ==========================================================================
       
  1539  * Allocate and free blocks
       
  1540  * ==========================================================================
       
  1541  */
       
  1542 
       
  1543 static int
       
  1544 zio_dva_allocate(zio_t *zio)
       
  1545 {
       
  1546 	spa_t *spa = zio->io_spa;
       
  1547 	metaslab_class_t *mc = spa->spa_normal_class;
       
  1548 	blkptr_t *bp = zio->io_bp;
       
  1549 	int error;
       
  1550 
       
  1551 	ASSERT(BP_IS_HOLE(bp));
       
  1552 	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
       
  1553 	ASSERT3U(zio->io_prop.zp_ndvas, >, 0);
       
  1554 	ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa));
       
  1555 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
       
  1556 
       
  1557 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
       
  1558 	    zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0);
       
  1559 
       
  1560 	if (error) {
       
  1561 		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
       
  1562 			return (zio_write_gang_block(zio));
       
  1563 		zio->io_error = error;
       
  1564 	}
       
  1565 
       
  1566 	return (ZIO_PIPELINE_CONTINUE);
       
  1567 }
       
  1568 
       
  1569 static int
       
  1570 zio_dva_free(zio_t *zio)
       
  1571 {
       
  1572 	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
       
  1573 
       
  1574 	return (ZIO_PIPELINE_CONTINUE);
       
  1575 }
       
  1576 
       
  1577 static int
       
  1578 zio_dva_claim(zio_t *zio)
       
  1579 {
       
  1580 	int error;
       
  1581 
       
  1582 	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
       
  1583 	if (error)
       
  1584 		zio->io_error = error;
       
  1585 
       
  1586 	return (ZIO_PIPELINE_CONTINUE);
       
  1587 }
       
  1588 
       
  1589 /*
       
  1590  * Undo an allocation.  This is used by zio_done() when an I/O fails
       
  1591  * and we want to give back the block we just allocated.
       
  1592  * This handles both normal blocks and gang blocks.
       
  1593  */
       
  1594 static void
       
  1595 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
       
  1596 {
       
  1597 	spa_t *spa = zio->io_spa;
       
  1598 	boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE);
       
  1599 
       
  1600 	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
       
  1601 
       
  1602 	if (zio->io_bp == bp && !now) {
       
  1603 		/*
       
  1604 		 * This is a rewrite for sync-to-convergence.
       
  1605 		 * We can't do a metaslab_free(NOW) because bp wasn't allocated
       
  1606 		 * during this sync pass, which means that metaslab_sync()
       
  1607 		 * already committed the allocation.
       
  1608 		 */
       
  1609 		ASSERT(DVA_EQUAL(BP_IDENTITY(bp),
       
  1610 		    BP_IDENTITY(&zio->io_bp_orig)));
       
  1611 		ASSERT(spa_sync_pass(spa) > 1);
       
  1612 
       
  1613 		if (BP_IS_GANG(bp) && gn == NULL) {
       
  1614 			/*
       
  1615 			 * This is a gang leader whose gang header(s) we
       
  1616 			 * couldn't read now, so defer the free until later.
       
  1617 			 * The block should still be intact because without
       
  1618 			 * the headers, we'd never even start the rewrite.
       
  1619 			 */
       
  1620 			bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
       
  1621 			return;
       
  1622 		}
       
  1623 	}
       
  1624 
       
  1625 	if (!BP_IS_HOLE(bp))
       
  1626 		metaslab_free(spa, bp, bp->blk_birth, now);
       
  1627 
       
  1628 	if (gn != NULL) {
       
  1629 		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
       
  1630 			zio_dva_unallocate(zio, gn->gn_child[g],
       
  1631 			    &gn->gn_gbh->zg_blkptr[g]);
       
  1632 		}
       
  1633 	}
  2094 }
  1634 }
  2095 
  1635 
  2096 /*
  1636 /*
  2097  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
  1637  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
  2098  */
  1638  */
  2100 zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
  1640 zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
  2101     uint64_t txg)
  1641     uint64_t txg)
  2102 {
  1642 {
  2103 	int error;
  1643 	int error;
  2104 
  1644 
  2105 	spa_config_enter(spa, RW_READER, FTAG);
       
  2106 
       
  2107 	if (zio_zil_fail_shift && zio_io_should_fail(zio_zil_fail_shift)) {
       
  2108 		spa_config_exit(spa, FTAG);
       
  2109 		return (ENOSPC);
       
  2110 	}
       
  2111 
       
  2112 	/*
       
  2113 	 * We were passed the previous log block's DVA in bp->blk_dva[0].
       
  2114 	 * We use that as a hint for which vdev to allocate from next.
       
  2115 	 */
       
  2116 	error = metaslab_alloc(spa, spa->spa_log_class, size,
  1645 	error = metaslab_alloc(spa, spa->spa_log_class, size,
  2117 	    new_bp, 1, txg, old_bp, B_TRUE);
  1646 	    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
  2118 
  1647 
  2119 	if (error)
  1648 	if (error)
  2120 		error = metaslab_alloc(spa, spa->spa_normal_class, size,
  1649 		error = metaslab_alloc(spa, spa->spa_normal_class, size,
  2121 		    new_bp, 1, txg, old_bp, B_TRUE);
  1650 		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
  2122 
  1651 
  2123 	if (error == 0) {
  1652 	if (error == 0) {
  2124 		BP_SET_LSIZE(new_bp, size);
  1653 		BP_SET_LSIZE(new_bp, size);
  2125 		BP_SET_PSIZE(new_bp, size);
  1654 		BP_SET_PSIZE(new_bp, size);
  2126 		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
  1655 		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
  2127 		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
  1656 		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
  2128 		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
  1657 		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
  2129 		BP_SET_LEVEL(new_bp, 0);
  1658 		BP_SET_LEVEL(new_bp, 0);
  2130 		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
  1659 		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
  2131 		new_bp->blk_birth = txg;
  1660 	}
  2132 	}
       
  2133 
       
  2134 	spa_config_exit(spa, FTAG);
       
  2135 
  1661 
  2136 	return (error);
  1662 	return (error);
  2137 }
  1663 }
  2138 
  1664 
  2139 /*
  1665 /*
  2143 void
  1669 void
  2144 zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
  1670 zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
  2145 {
  1671 {
  2146 	ASSERT(!BP_IS_GANG(bp));
  1672 	ASSERT(!BP_IS_GANG(bp));
  2147 
  1673 
  2148 	spa_config_enter(spa, RW_READER, FTAG);
       
  2149 
       
  2150 	metaslab_free(spa, bp, txg, B_FALSE);
  1674 	metaslab_free(spa, bp, txg, B_FALSE);
  2151 
  1675 }
  2152 	spa_config_exit(spa, FTAG);
  1676 
  2153 }
  1677 /*
  2154 
  1678  * ==========================================================================
  2155 /*
  1679  * Read and write to physical devices
  2156  * start an async flush of the write cache for this vdev
  1680  * ==========================================================================
  2157  */
  1681  */
       
  1682 
       
  1683 static void
       
  1684 zio_vdev_io_probe_done(zio_t *zio)
       
  1685 {
       
  1686 	zio_t *dio;
       
  1687 	vdev_t *vd = zio->io_private;
       
  1688 
       
  1689 	mutex_enter(&vd->vdev_probe_lock);
       
  1690 	ASSERT(vd->vdev_probe_zio == zio);
       
  1691 	vd->vdev_probe_zio = NULL;
       
  1692 	mutex_exit(&vd->vdev_probe_lock);
       
  1693 
       
  1694 	while ((dio = zio->io_delegate_list) != NULL) {
       
  1695 		zio->io_delegate_list = dio->io_delegate_next;
       
  1696 		dio->io_delegate_next = NULL;
       
  1697 		if (!vdev_accessible(vd, dio))
       
  1698 			dio->io_error = ENXIO;
       
  1699 		zio_execute(dio);
       
  1700 	}
       
  1701 }
       
  1702 
       
  1703 /*
       
  1704  * Probe the device to determine whether I/O failure is specific to this
       
  1705  * zio (e.g. a bad sector) or affects the entire vdev (e.g. unplugged).
       
  1706  */
       
  1707 static int
       
  1708 zio_vdev_io_probe(zio_t *zio)
       
  1709 {
       
  1710 	vdev_t *vd = zio->io_vd;
       
  1711 	zio_t *pio = NULL;
       
  1712 	boolean_t created_pio = B_FALSE;
       
  1713 
       
  1714 	/*
       
  1715 	 * Don't probe the probe.
       
  1716 	 */
       
  1717 	if (zio->io_flags & ZIO_FLAG_PROBE)
       
  1718 		return (ZIO_PIPELINE_CONTINUE);
       
  1719 
       
  1720 	/*
       
  1721 	 * To prevent 'probe storms' when a device fails, we create
       
  1722 	 * just one probe i/o at a time.  All zios that want to probe
       
  1723 	 * this vdev will join the probe zio's io_delegate_list.
       
  1724 	 */
       
  1725 	mutex_enter(&vd->vdev_probe_lock);
       
  1726 
       
  1727 	if ((pio = vd->vdev_probe_zio) == NULL) {
       
  1728 		vd->vdev_probe_zio = pio = zio_root(zio->io_spa,
       
  1729 		    zio_vdev_io_probe_done, vd, ZIO_FLAG_CANFAIL);
       
  1730 		created_pio = B_TRUE;
       
  1731 		vd->vdev_probe_wanted = B_TRUE;
       
  1732 		spa_async_request(zio->io_spa, SPA_ASYNC_PROBE);
       
  1733 	}
       
  1734 
       
  1735 	zio->io_delegate_next = pio->io_delegate_list;
       
  1736 	pio->io_delegate_list = zio;
       
  1737 
       
  1738 	mutex_exit(&vd->vdev_probe_lock);
       
  1739 
       
  1740 	if (created_pio) {
       
  1741 		zio_nowait(vdev_probe(vd, pio));
       
  1742 		zio_nowait(pio);
       
  1743 	}
       
  1744 
       
  1745 	return (ZIO_PIPELINE_STOP);
       
  1746 }
       
  1747 
       
  1748 static int
       
  1749 zio_vdev_io_start(zio_t *zio)
       
  1750 {
       
  1751 	vdev_t *vd = zio->io_vd;
       
  1752 	uint64_t align;
       
  1753 	spa_t *spa = zio->io_spa;
       
  1754 
       
  1755 	ASSERT(zio->io_error == 0);
       
  1756 	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
       
  1757 
       
  1758 	if (vd == NULL) {
       
  1759 		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
       
  1760 			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
       
  1761 
       
  1762 		/*
       
  1763 		 * The mirror_ops handle multiple DVAs in a single BP.
       
  1764 		 */
       
  1765 		return (vdev_mirror_ops.vdev_op_io_start(zio));
       
  1766 	}
       
  1767 
       
  1768 	align = 1ULL << vd->vdev_top->vdev_ashift;
       
  1769 
       
  1770 	if (P2PHASE(zio->io_size, align) != 0) {
       
  1771 		uint64_t asize = P2ROUNDUP(zio->io_size, align);
       
  1772 		char *abuf = zio_buf_alloc(asize);
       
  1773 		ASSERT(vd == vd->vdev_top);
       
  1774 		if (zio->io_type == ZIO_TYPE_WRITE) {
       
  1775 			bcopy(zio->io_data, abuf, zio->io_size);
       
  1776 			bzero(abuf + zio->io_size, asize - zio->io_size);
       
  1777 		}
       
  1778 		zio_push_transform(zio, abuf, asize, asize, zio_subblock);
       
  1779 	}
       
  1780 
       
  1781 	ASSERT(P2PHASE(zio->io_offset, align) == 0);
       
  1782 	ASSERT(P2PHASE(zio->io_size, align) == 0);
       
  1783 	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
       
  1784 
       
  1785 	if (vd->vdev_ops->vdev_op_leaf &&
       
  1786 	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
       
  1787 
       
  1788 		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
       
  1789 			return (ZIO_PIPELINE_STOP);
       
  1790 
       
  1791 		if ((zio = vdev_queue_io(zio)) == NULL)
       
  1792 			return (ZIO_PIPELINE_STOP);
       
  1793 
       
  1794 		if (!vdev_accessible(vd, zio)) {
       
  1795 			zio->io_error = ENXIO;
       
  1796 			zio_interrupt(zio);
       
  1797 			return (ZIO_PIPELINE_STOP);
       
  1798 		}
       
  1799 
       
  1800 	}
       
  1801 
       
  1802 	return (vd->vdev_ops->vdev_op_io_start(zio));
       
  1803 }
       
  1804 
       
  1805 static int
       
  1806 zio_vdev_io_done(zio_t *zio)
       
  1807 {
       
  1808 	vdev_t *vd = zio->io_vd;
       
  1809 	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
       
  1810 	boolean_t unexpected_error = B_FALSE;
       
  1811 
       
  1812 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
       
  1813 		return (ZIO_PIPELINE_STOP);
       
  1814 
       
  1815 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
       
  1816 
       
  1817 	if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
       
  1818 
       
  1819 		vdev_queue_io_done(zio);
       
  1820 
       
  1821 		if (zio->io_type == ZIO_TYPE_WRITE)
       
  1822 			vdev_cache_write(zio);
       
  1823 
       
  1824 		if (zio_injection_enabled && zio->io_error == 0)
       
  1825 			zio->io_error = zio_handle_device_injection(vd, EIO);
       
  1826 
       
  1827 		if (zio_injection_enabled && zio->io_error == 0)
       
  1828 			zio->io_error = zio_handle_label_injection(zio, EIO);
       
  1829 
       
  1830 		if (zio->io_error) {
       
  1831 			if (!vdev_accessible(vd, zio)) {
       
  1832 				zio->io_error = ENXIO;
       
  1833 			} else {
       
  1834 				unexpected_error = B_TRUE;
       
  1835 			}
       
  1836 		}
       
  1837 	}
       
  1838 
       
  1839 	ops->vdev_op_io_done(zio);
       
  1840 
       
  1841 	if (unexpected_error)
       
  1842 		return (zio_vdev_io_probe(zio));
       
  1843 
       
  1844 	return (ZIO_PIPELINE_CONTINUE);
       
  1845 }
       
  1846 
       
  1847 static int
       
  1848 zio_vdev_io_assess(zio_t *zio)
       
  1849 {
       
  1850 	vdev_t *vd = zio->io_vd;
       
  1851 
       
  1852 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
       
  1853 		return (ZIO_PIPELINE_STOP);
       
  1854 
       
  1855 	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
       
  1856 		spa_config_exit(zio->io_spa, SCL_ZIO, zio);
       
  1857 
       
  1858 	if (zio->io_vsd != NULL) {
       
  1859 		zio->io_vsd_free(zio);
       
  1860 		zio->io_vsd = NULL;
       
  1861 	}
       
  1862 
       
  1863 	if (zio_injection_enabled && zio->io_error == 0)
       
  1864 		zio->io_error = zio_handle_fault_injection(zio, EIO);
       
  1865 
       
  1866 	/*
       
  1867 	 * If the I/O failed, determine whether we should attempt to retry it.
       
  1868 	 */
       
  1869 	if (zio->io_error && vd == NULL &&
       
  1870 	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
       
  1871 		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
       
  1872 		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
       
  1873 		zio->io_error = 0;
       
  1874 		zio->io_flags |= ZIO_FLAG_IO_RETRY |
       
  1875 		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
       
  1876 		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
       
  1877 		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
       
  1878 		return (ZIO_PIPELINE_STOP);
       
  1879 	}
       
  1880 
       
  1881 	/*
       
  1882 	 * If we got an error on a leaf device, convert it to ENXIO
       
  1883 	 * if the device is not accessible at all.
       
  1884 	 */
       
  1885 	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
       
  1886 	    !vdev_accessible(vd, zio))
       
  1887 		zio->io_error = ENXIO;
       
  1888 
       
  1889 	/*
       
  1890 	 * If we can't write to an interior vdev (mirror or RAID-Z),
       
  1891 	 * set vdev_cant_write so that we stop trying to allocate from it.
       
  1892 	 */
       
  1893 	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
       
  1894 	    vd != NULL && !vd->vdev_ops->vdev_op_leaf)
       
  1895 		vd->vdev_cant_write = B_TRUE;
       
  1896 
       
  1897 	if (zio->io_error)
       
  1898 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
       
  1899 
       
  1900 	return (ZIO_PIPELINE_CONTINUE);
       
  1901 }
       
  1902 
  2158 void
  1903 void
  2159 zio_flush(zio_t *zio, vdev_t *vd)
  1904 zio_vdev_io_reissue(zio_t *zio)
  2160 {
  1905 {
  2161 	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
  1906 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
  2162 	    NULL, NULL, ZIO_PRIORITY_NOW,
  1907 	ASSERT(zio->io_error == 0);
  2163 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
  1908 
  2164 }
  1909 	zio->io_stage--;
       
  1910 }
       
  1911 
       
  1912 void
       
  1913 zio_vdev_io_redone(zio_t *zio)
       
  1914 {
       
  1915 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
       
  1916 
       
  1917 	zio->io_stage--;
       
  1918 }
       
  1919 
       
  1920 void
       
  1921 zio_vdev_io_bypass(zio_t *zio)
       
  1922 {
       
  1923 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
       
  1924 	ASSERT(zio->io_error == 0);
       
  1925 
       
  1926 	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
       
  1927 	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
       
  1928 }
       
  1929 
       
  1930 /*
       
  1931  * ==========================================================================
       
  1932  * Generate and verify checksums
       
  1933  * ==========================================================================
       
  1934  */
       
  1935 static int
       
  1936 zio_checksum_generate(zio_t *zio)
       
  1937 {
       
  1938 	blkptr_t *bp = zio->io_bp;
       
  1939 	enum zio_checksum checksum;
       
  1940 
       
  1941 	if (bp == NULL) {
       
  1942 		/*
       
  1943 		 * This is zio_write_phys().
       
  1944 		 * We're either generating a label checksum, or none at all.
       
  1945 		 */
       
  1946 		checksum = zio->io_prop.zp_checksum;
       
  1947 
       
  1948 		if (checksum == ZIO_CHECKSUM_OFF)
       
  1949 			return (ZIO_PIPELINE_CONTINUE);
       
  1950 
       
  1951 		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
       
  1952 	} else {
       
  1953 		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
       
  1954 			ASSERT(!IO_IS_ALLOCATING(zio));
       
  1955 			checksum = ZIO_CHECKSUM_GANG_HEADER;
       
  1956 		} else {
       
  1957 			checksum = BP_GET_CHECKSUM(bp);
       
  1958 		}
       
  1959 	}
       
  1960 
       
  1961 	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
       
  1962 
       
  1963 	return (ZIO_PIPELINE_CONTINUE);
       
  1964 }
       
  1965 
       
  1966 static int
       
  1967 zio_checksum_verify(zio_t *zio)
       
  1968 {
       
  1969 	blkptr_t *bp = zio->io_bp;
       
  1970 	int error;
       
  1971 
       
  1972 	if (bp == NULL) {
       
  1973 		/*
       
  1974 		 * This is zio_read_phys().
       
  1975 		 * We're either verifying a label checksum, or nothing at all.
       
  1976 		 */
       
  1977 		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
       
  1978 			return (ZIO_PIPELINE_CONTINUE);
       
  1979 
       
  1980 		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
       
  1981 	}
       
  1982 
       
  1983 	if ((error = zio_checksum_error(zio)) != 0) {
       
  1984 		zio->io_error = error;
       
  1985 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
       
  1986 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
       
  1987 			    zio->io_spa, zio->io_vd, zio, 0, 0);
       
  1988 		}
       
  1989 	}
       
  1990 
       
  1991 	return (ZIO_PIPELINE_CONTINUE);
       
  1992 }
       
  1993 
       
  1994 /*
       
  1995  * Called by RAID-Z to ensure we don't compute the checksum twice.
       
  1996  */
       
  1997 void
       
  1998 zio_checksum_verified(zio_t *zio)
       
  1999 {
       
  2000 	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
       
  2001 }
       
  2002 
       
  2003 /*
       
  2004  * ==========================================================================
       
  2005  * Error rank.  Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
       
  2006  * An error of 0 indictes success.  ENXIO indicates whole-device failure,
       
  2007  * which may be transient (e.g. unplugged) or permament.  ECKSUM and EIO
       
  2008  * indicate errors that are specific to one I/O, and most likely permanent.
       
  2009  * Any other error is presumed to be worse because we weren't expecting it.
       
  2010  * ==========================================================================
       
  2011  */
       
  2012 int
       
  2013 zio_worst_error(int e1, int e2)
       
  2014 {
       
  2015 	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
       
  2016 	int r1, r2;
       
  2017 
       
  2018 	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
       
  2019 		if (e1 == zio_error_rank[r1])
       
  2020 			break;
       
  2021 
       
  2022 	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
       
  2023 		if (e2 == zio_error_rank[r2])
       
  2024 			break;
       
  2025 
       
  2026 	return (r1 > r2 ? e1 : e2);
       
  2027 }
       
  2028 
       
  2029 /*
       
  2030  * ==========================================================================
       
  2031  * I/O completion
       
  2032  * ==========================================================================
       
  2033  */
       
  2034 static int
       
  2035 zio_ready(zio_t *zio)
       
  2036 {
       
  2037 	blkptr_t *bp = zio->io_bp;
       
  2038 	zio_t *pio = zio->io_parent;
       
  2039 
       
  2040 	if (zio->io_ready) {
       
  2041 		if (BP_IS_GANG(bp) &&
       
  2042 		    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
       
  2043 			return (ZIO_PIPELINE_STOP);
       
  2044 
       
  2045 		ASSERT(IO_IS_ALLOCATING(zio));
       
  2046 		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
       
  2047 		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
       
  2048 
       
  2049 		zio->io_ready(zio);
       
  2050 	}
       
  2051 
       
  2052 	if (bp != NULL && bp != &zio->io_bp_copy)
       
  2053 		zio->io_bp_copy = *bp;
       
  2054 
       
  2055 	if (zio->io_error)
       
  2056 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
       
  2057 
       
  2058 	if (pio != NULL)
       
  2059 		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
       
  2060 
       
  2061 	return (ZIO_PIPELINE_CONTINUE);
       
  2062 }
       
  2063 
       
  2064 static int
       
  2065 zio_done(zio_t *zio)
       
  2066 {
       
  2067 	spa_t *spa = zio->io_spa;
       
  2068 	zio_t *pio = zio->io_parent;
       
  2069 	zio_t *lio = zio->io_logical;
       
  2070 	blkptr_t *bp = zio->io_bp;
       
  2071 	vdev_t *vd = zio->io_vd;
       
  2072 	uint64_t psize = zio->io_size;
       
  2073 
       
  2074 	/*
       
  2075 	 * If our of children haven't all completed,
       
  2076 	 * wait for them and then repeat this pipeline stage.
       
  2077 	 */
       
  2078 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
       
  2079 	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
       
  2080 	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
       
  2081 		return (ZIO_PIPELINE_STOP);
       
  2082 
       
  2083 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
       
  2084 		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
       
  2085 			ASSERT(zio->io_children[c][w] == 0);
       
  2086 
       
  2087 	if (bp != NULL) {
       
  2088 		ASSERT(bp->blk_pad[0] == 0);
       
  2089 		ASSERT(bp->blk_pad[1] == 0);
       
  2090 		ASSERT(bp->blk_pad[2] == 0);
       
  2091 		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
       
  2092 		    (pio != NULL && bp == pio->io_bp));
       
  2093 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
       
  2094 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
       
  2095 			ASSERT(!BP_SHOULD_BYTESWAP(bp));
       
  2096 			ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp));
       
  2097 			ASSERT(BP_COUNT_GANG(bp) == 0 ||
       
  2098 			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
       
  2099 		}
       
  2100 	}
       
  2101 
       
  2102 	/*
       
  2103 	 * If there were child vdev or gang errors, they apply to us now.
       
  2104 	 */
       
  2105 	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
       
  2106 	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
       
  2107 
       
  2108 	zio_pop_transforms(zio);	/* note: may set zio->io_error */
       
  2109 
       
  2110 	vdev_stat_update(zio, psize);
       
  2111 
       
  2112 	if (zio->io_error) {
       
  2113 		/*
       
  2114 		 * If this I/O is attached to a particular vdev,
       
  2115 		 * generate an error message describing the I/O failure
       
  2116 		 * at the block level.  We ignore these errors if the
       
  2117 		 * device is currently unavailable.
       
  2118 		 */
       
  2119 		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
       
  2120 			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
       
  2121 
       
  2122 		if ((zio->io_error == EIO ||
       
  2123 		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && zio == lio) {
       
  2124 			/*
       
  2125 			 * For logical I/O requests, tell the SPA to log the
       
  2126 			 * error and generate a logical data ereport.
       
  2127 			 */
       
  2128 			spa_log_error(spa, zio);
       
  2129 			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
       
  2130 			    0, 0);
       
  2131 		}
       
  2132 	}
       
  2133 
       
  2134 	if (zio->io_error && zio == lio) {
       
  2135 		/*
       
  2136 		 * Determine whether zio should be reexecuted.  This will
       
  2137 		 * propagate all the way to the root via zio_notify_parent().
       
  2138 		 */
       
  2139 		ASSERT(vd == NULL && bp != NULL);
       
  2140 
       
  2141 		if (IO_IS_ALLOCATING(zio))
       
  2142 			if (zio->io_error != ENOSPC)
       
  2143 				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
       
  2144 			else
       
  2145 				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
       
  2146 
       
  2147 		if ((zio->io_type == ZIO_TYPE_READ ||
       
  2148 		    zio->io_type == ZIO_TYPE_FREE) &&
       
  2149 		    zio->io_error == ENXIO &&
       
  2150 		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
       
  2151 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
       
  2152 
       
  2153 		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
       
  2154 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
       
  2155 	}
       
  2156 
       
  2157 	/*
       
  2158 	 * If there were logical child errors, they apply to us now.
       
  2159 	 * We defer this until now to avoid conflating logical child
       
  2160 	 * errors with errors that happened to the zio itself when
       
  2161 	 * updating vdev stats and reporting FMA events above.
       
  2162 	 */
       
  2163 	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
       
  2164 
       
  2165 	if (zio->io_reexecute) {
       
  2166 		/*
       
  2167 		 * This is a logical I/O that wants to reexecute.
       
  2168 		 *
       
  2169 		 * Reexecute is top-down.  When an i/o fails, if it's not
       
  2170 		 * the root, it simply notifies its parent and sticks around.
       
  2171 		 * The parent, seeing that it still has children in zio_done(),
       
  2172 		 * does the same.  This percolates all the way up to the root.
       
  2173 		 * The root i/o will reexecute or suspend the entire tree.
       
  2174 		 *
       
  2175 		 * This approach ensures that zio_reexecute() honors
       
  2176 		 * all the original i/o dependency relationships, e.g.
       
  2177 		 * parents not executing until children are ready.
       
  2178 		 */
       
  2179 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
       
  2180 
       
  2181 		if (IO_IS_ALLOCATING(zio))
       
  2182 			zio_dva_unallocate(zio, zio->io_gang_tree, bp);
       
  2183 
       
  2184 		zio_gang_tree_free(&zio->io_gang_tree);
       
  2185 
       
  2186 		if (pio != NULL) {
       
  2187 			/*
       
  2188 			 * We're not a root i/o, so there's nothing to do
       
  2189 			 * but notify our parent.  Don't propagate errors
       
  2190 			 * upward since we haven't permanently failed yet.
       
  2191 			 */
       
  2192 			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
       
  2193 			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
       
  2194 		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
       
  2195 			/*
       
  2196 			 * We'd fail again if we reexecuted now, so suspend
       
  2197 			 * until conditions improve (e.g. device comes online).
       
  2198 			 */
       
  2199 			zio_suspend(spa, zio);
       
  2200 		} else {
       
  2201 			/*
       
  2202 			 * Reexecution is potentially a huge amount of work.
       
  2203 			 * Hand it off to the otherwise-unused claim taskq.
       
  2204 			 */
       
  2205 			(void) taskq_dispatch(
       
  2206 			    spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
       
  2207 			    (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
       
  2208 		}
       
  2209 		return (ZIO_PIPELINE_STOP);
       
  2210 	}
       
  2211 
       
  2212 	ASSERT(zio->io_child == NULL);
       
  2213 	ASSERT(zio->io_reexecute == 0);
       
  2214 	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
       
  2215 
       
  2216 	if (zio->io_done)
       
  2217 		zio->io_done(zio);
       
  2218 
       
  2219 	zio_gang_tree_free(&zio->io_gang_tree);
       
  2220 
       
  2221 	ASSERT(zio->io_delegate_list == NULL);
       
  2222 	ASSERT(zio->io_delegate_next == NULL);
       
  2223 
       
  2224 	if (pio != NULL) {
       
  2225 		zio_remove_child(pio, zio);
       
  2226 		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
       
  2227 	}
       
  2228 
       
  2229 	if (zio->io_waiter != NULL) {
       
  2230 		mutex_enter(&zio->io_lock);
       
  2231 		zio->io_executor = NULL;
       
  2232 		cv_broadcast(&zio->io_cv);
       
  2233 		mutex_exit(&zio->io_lock);
       
  2234 	} else {
       
  2235 		zio_destroy(zio);
       
  2236 	}
       
  2237 
       
  2238 	return (ZIO_PIPELINE_STOP);
       
  2239 }
       
  2240 
       
  2241 /*
       
  2242  * ==========================================================================
       
  2243  * I/O pipeline definition
       
  2244  * ==========================================================================
       
  2245  */
       
  2246 static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES] = {
       
  2247 	NULL,
       
  2248 	zio_issue_async,
       
  2249 	zio_read_bp_init,
       
  2250 	zio_write_bp_init,
       
  2251 	zio_checksum_generate,
       
  2252 	zio_gang_assemble,
       
  2253 	zio_gang_issue,
       
  2254 	zio_dva_allocate,
       
  2255 	zio_dva_free,
       
  2256 	zio_dva_claim,
       
  2257 	zio_ready,
       
  2258 	zio_vdev_io_start,
       
  2259 	zio_vdev_io_done,
       
  2260 	zio_vdev_io_assess,
       
  2261 	zio_checksum_verify,
       
  2262 	zio_done
       
  2263 };