275 * ========================================================================== |
231 * ========================================================================== |
276 * Push and pop I/O transform buffers |
232 * Push and pop I/O transform buffers |
277 * ========================================================================== |
233 * ========================================================================== |
278 */ |
234 */ |
279 static void |
235 static void |
280 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) |
236 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, |
|
237 zio_transform_func_t *transform) |
281 { |
238 { |
282 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); |
239 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); |
283 |
240 |
284 zt->zt_data = data; |
241 zt->zt_orig_data = zio->io_data; |
285 zt->zt_size = size; |
242 zt->zt_orig_size = zio->io_size; |
286 zt->zt_bufsize = bufsize; |
243 zt->zt_bufsize = bufsize; |
|
244 zt->zt_transform = transform; |
287 |
245 |
288 zt->zt_next = zio->io_transform_stack; |
246 zt->zt_next = zio->io_transform_stack; |
289 zio->io_transform_stack = zt; |
247 zio->io_transform_stack = zt; |
290 |
248 |
291 zio->io_data = data; |
249 zio->io_data = data; |
292 zio->io_size = size; |
250 zio->io_size = size; |
293 } |
251 } |
294 |
252 |
295 static void |
253 static void |
296 zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) |
254 zio_pop_transforms(zio_t *zio) |
297 { |
255 { |
298 zio_transform_t *zt = zio->io_transform_stack; |
256 zio_transform_t *zt; |
299 |
257 |
300 *data = zt->zt_data; |
258 while ((zt = zio->io_transform_stack) != NULL) { |
301 *size = zt->zt_size; |
259 if (zt->zt_transform != NULL) |
302 *bufsize = zt->zt_bufsize; |
260 zt->zt_transform(zio, |
303 |
261 zt->zt_orig_data, zt->zt_orig_size); |
304 zio->io_transform_stack = zt->zt_next; |
262 |
305 kmem_free(zt, sizeof (zio_transform_t)); |
263 zio_buf_free(zio->io_data, zt->zt_bufsize); |
306 |
264 |
307 if ((zt = zio->io_transform_stack) != NULL) { |
265 zio->io_data = zt->zt_orig_data; |
308 zio->io_data = zt->zt_data; |
266 zio->io_size = zt->zt_orig_size; |
309 zio->io_size = zt->zt_size; |
267 zio->io_transform_stack = zt->zt_next; |
310 } |
268 |
311 } |
269 kmem_free(zt, sizeof (zio_transform_t)); |
312 |
270 } |
|
271 } |
|
272 |
|
273 /* |
|
274 * ========================================================================== |
|
275 * I/O transform callbacks for subblocks and decompression |
|
276 * ========================================================================== |
|
277 */ |
313 static void |
278 static void |
314 zio_clear_transform_stack(zio_t *zio) |
279 zio_subblock(zio_t *zio, void *data, uint64_t size) |
315 { |
280 { |
316 void *data; |
281 ASSERT(zio->io_size > size); |
317 uint64_t size, bufsize; |
282 |
318 |
283 if (zio->io_type == ZIO_TYPE_READ) |
319 ASSERT(zio->io_transform_stack != NULL); |
284 bcopy(zio->io_data, data, size); |
320 |
285 } |
321 zio_pop_transform(zio, &data, &size, &bufsize); |
286 |
322 while (zio->io_transform_stack != NULL) { |
287 static void |
323 zio_buf_free(data, bufsize); |
288 zio_decompress(zio_t *zio, void *data, uint64_t size) |
324 zio_pop_transform(zio, &data, &size, &bufsize); |
289 { |
325 } |
290 if (zio->io_error == 0 && |
326 } |
291 zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), |
327 |
292 zio->io_data, zio->io_size, data, size) != 0) |
328 /* |
293 zio->io_error = EIO; |
329 * ========================================================================== |
294 } |
330 * Create the various types of I/O (read, write, free) |
295 |
|
296 /* |
|
297 * ========================================================================== |
|
298 * I/O parent/child relationships and pipeline interlocks |
|
299 * ========================================================================== |
|
300 */ |
|
301 |
|
302 static void |
|
303 zio_add_child(zio_t *pio, zio_t *zio) |
|
304 { |
|
305 mutex_enter(&pio->io_lock); |
|
306 if (zio->io_stage < ZIO_STAGE_READY) |
|
307 pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; |
|
308 if (zio->io_stage < ZIO_STAGE_DONE) |
|
309 pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; |
|
310 zio->io_sibling_prev = NULL; |
|
311 zio->io_sibling_next = pio->io_child; |
|
312 if (pio->io_child != NULL) |
|
313 pio->io_child->io_sibling_prev = zio; |
|
314 pio->io_child = zio; |
|
315 zio->io_parent = pio; |
|
316 mutex_exit(&pio->io_lock); |
|
317 } |
|
318 |
|
319 static void |
|
320 zio_remove_child(zio_t *pio, zio_t *zio) |
|
321 { |
|
322 zio_t *next, *prev; |
|
323 |
|
324 ASSERT(zio->io_parent == pio); |
|
325 |
|
326 mutex_enter(&pio->io_lock); |
|
327 next = zio->io_sibling_next; |
|
328 prev = zio->io_sibling_prev; |
|
329 if (next != NULL) |
|
330 next->io_sibling_prev = prev; |
|
331 if (prev != NULL) |
|
332 prev->io_sibling_next = next; |
|
333 if (pio->io_child == zio) |
|
334 pio->io_child = next; |
|
335 mutex_exit(&pio->io_lock); |
|
336 } |
|
337 |
|
338 static boolean_t |
|
339 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) |
|
340 { |
|
341 uint64_t *countp = &zio->io_children[child][wait]; |
|
342 boolean_t waiting = B_FALSE; |
|
343 |
|
344 mutex_enter(&zio->io_lock); |
|
345 ASSERT(zio->io_stall == NULL); |
|
346 if (*countp != 0) { |
|
347 zio->io_stage--; |
|
348 zio->io_stall = countp; |
|
349 waiting = B_TRUE; |
|
350 } |
|
351 mutex_exit(&zio->io_lock); |
|
352 |
|
353 return (waiting); |
|
354 } |
|
355 |
|
356 static void |
|
357 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) |
|
358 { |
|
359 uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; |
|
360 int *errorp = &pio->io_child_error[zio->io_child_type]; |
|
361 |
|
362 mutex_enter(&pio->io_lock); |
|
363 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) |
|
364 *errorp = zio_worst_error(*errorp, zio->io_error); |
|
365 pio->io_reexecute |= zio->io_reexecute; |
|
366 ASSERT3U(*countp, >, 0); |
|
367 if (--*countp == 0 && pio->io_stall == countp) { |
|
368 pio->io_stall = NULL; |
|
369 mutex_exit(&pio->io_lock); |
|
370 zio_execute(pio); |
|
371 } else { |
|
372 mutex_exit(&pio->io_lock); |
|
373 } |
|
374 } |
|
375 |
|
376 static void |
|
377 zio_inherit_child_errors(zio_t *zio, enum zio_child c) |
|
378 { |
|
379 if (zio->io_child_error[c] != 0 && zio->io_error == 0) |
|
380 zio->io_error = zio->io_child_error[c]; |
|
381 } |
|
382 |
|
383 /* |
|
384 * ========================================================================== |
|
385 * Create the various types of I/O (read, write, free, etc) |
331 * ========================================================================== |
386 * ========================================================================== |
332 */ |
387 */ |
333 static zio_t * |
388 static zio_t * |
334 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, |
389 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, |
335 void *data, uint64_t size, zio_done_func_t *done, void *private, |
390 void *data, uint64_t size, zio_done_func_t *done, void *private, |
336 zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) |
391 zio_type_t type, int priority, int flags, vdev_t *vd, uint64_t offset, |
|
392 const zbookmark_t *zb, uint8_t stage, uint32_t pipeline) |
337 { |
393 { |
338 zio_t *zio; |
394 zio_t *zio; |
339 |
395 |
340 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); |
396 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); |
341 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); |
397 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); |
342 |
398 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); |
343 /* Only we should set CONFIG_GRABBED */ |
399 |
344 ASSERT(!(flags & ZIO_FLAG_CONFIG_GRABBED)); |
400 ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); |
|
401 ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); |
|
402 ASSERT(vd || stage == ZIO_STAGE_OPEN); |
345 |
403 |
346 zio = kmem_cache_alloc(zio_cache, KM_SLEEP); |
404 zio = kmem_cache_alloc(zio_cache, KM_SLEEP); |
347 bzero(zio, sizeof (zio_t)); |
405 bzero(zio, sizeof (zio_t)); |
348 zio->io_parent = pio; |
406 |
349 zio->io_spa = spa; |
407 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); |
350 zio->io_txg = txg; |
408 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); |
351 zio->io_flags = flags; |
409 |
|
410 if (vd != NULL) |
|
411 zio->io_child_type = ZIO_CHILD_VDEV; |
|
412 else if (flags & ZIO_FLAG_GANG_CHILD) |
|
413 zio->io_child_type = ZIO_CHILD_GANG; |
|
414 else |
|
415 zio->io_child_type = ZIO_CHILD_LOGICAL; |
|
416 |
352 if (bp != NULL) { |
417 if (bp != NULL) { |
353 zio->io_bp = bp; |
418 zio->io_bp = bp; |
354 zio->io_bp_copy = *bp; |
419 zio->io_bp_copy = *bp; |
355 zio->io_bp_orig = *bp; |
420 zio->io_bp_orig = *bp; |
356 } |
421 if (type != ZIO_TYPE_WRITE) |
|
422 zio->io_bp = &zio->io_bp_copy; /* so caller can free */ |
|
423 if (zio->io_child_type == ZIO_CHILD_LOGICAL) { |
|
424 if (BP_IS_GANG(bp)) |
|
425 pipeline |= ZIO_GANG_STAGES; |
|
426 zio->io_logical = zio; |
|
427 } |
|
428 } |
|
429 |
|
430 zio->io_spa = spa; |
|
431 zio->io_txg = txg; |
|
432 zio->io_data = data; |
|
433 zio->io_size = size; |
357 zio->io_done = done; |
434 zio->io_done = done; |
358 zio->io_private = private; |
435 zio->io_private = private; |
359 zio->io_type = type; |
436 zio->io_type = type; |
360 zio->io_priority = priority; |
437 zio->io_priority = priority; |
361 zio->io_stage = stage; |
438 zio->io_vd = vd; |
362 zio->io_pipeline = pipeline; |
439 zio->io_offset = offset; |
363 zio->io_timestamp = lbolt64; |
440 zio->io_orig_flags = zio->io_flags = flags; |
364 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); |
441 zio->io_orig_stage = zio->io_stage = stage; |
365 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); |
442 zio->io_orig_pipeline = zio->io_pipeline = pipeline; |
366 zio_push_transform(zio, data, size, size); |
443 |
367 |
444 if (zb != NULL) |
368 /* |
445 zio->io_bookmark = *zb; |
369 * Note on config lock: |
446 |
370 * |
447 if (pio != NULL) { |
371 * If CONFIG_HELD is set, then the caller already has the config |
448 /* |
372 * lock, so we don't need it for this io. |
449 * Logical I/Os can have logical, gang, or vdev children. |
373 * |
450 * Gang I/Os can have gang or vdev children. |
374 * We set CONFIG_GRABBED to indicate that we have grabbed the |
451 * Vdev I/Os can only have vdev children. |
375 * config lock on behalf of this io, so it should be released |
452 * The following ASSERT captures all of these constraints. |
376 * in zio_done. |
453 */ |
377 * |
454 ASSERT(zio->io_child_type <= pio->io_child_type); |
378 * Unless CONFIG_HELD is set, we will grab the config lock for |
455 if (zio->io_logical == NULL) |
379 * any top-level (parent-less) io, *except* NULL top-level ios. |
|
380 * The NULL top-level ios rarely have any children, so we delay |
|
381 * grabbing the lock until the first child is added (but it is |
|
382 * still grabbed on behalf of the top-level i/o, so additional |
|
383 * children don't need to also grab it). This greatly reduces |
|
384 * contention on the config lock. |
|
385 */ |
|
386 if (pio == NULL) { |
|
387 if (type != ZIO_TYPE_NULL && |
|
388 !(flags & ZIO_FLAG_CONFIG_HELD)) { |
|
389 spa_config_enter(spa, RW_READER, zio); |
|
390 zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; |
|
391 } |
|
392 zio->io_root = zio; |
|
393 } else { |
|
394 zio->io_root = pio->io_root; |
|
395 if (!(flags & ZIO_FLAG_NOBOOKMARK)) |
|
396 zio->io_logical = pio->io_logical; |
456 zio->io_logical = pio->io_logical; |
397 mutex_enter(&pio->io_lock); |
457 zio_add_child(pio, zio); |
398 if (pio->io_parent == NULL && |
458 } |
399 pio->io_type == ZIO_TYPE_NULL && |
|
400 !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) && |
|
401 !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) { |
|
402 pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; |
|
403 spa_config_enter(spa, RW_READER, pio); |
|
404 } |
|
405 if (stage < ZIO_STAGE_READY) |
|
406 pio->io_children_notready++; |
|
407 pio->io_children_notdone++; |
|
408 zio->io_sibling_next = pio->io_child; |
|
409 zio->io_sibling_prev = NULL; |
|
410 if (pio->io_child != NULL) |
|
411 pio->io_child->io_sibling_prev = zio; |
|
412 pio->io_child = zio; |
|
413 zio->io_ndvas = pio->io_ndvas; |
|
414 mutex_exit(&pio->io_lock); |
|
415 } |
|
416 |
|
417 /* |
|
418 * Save off the original state incase we need to retry later. |
|
419 */ |
|
420 zio->io_orig_stage = zio->io_stage; |
|
421 zio->io_orig_pipeline = zio->io_pipeline; |
|
422 zio->io_orig_flags = zio->io_flags; |
|
423 |
|
424 /* |
|
425 * If this is not a null zio, and config is not already held, |
|
426 * then the root zio should have grabbed the config lock. |
|
427 * If this is not a root zio, it should not have grabbed the |
|
428 * config lock. |
|
429 */ |
|
430 ASSERT((zio->io_root->io_flags & ZIO_FLAG_CONFIG_HELD) || |
|
431 zio->io_type == ZIO_TYPE_NULL || |
|
432 (zio->io_root->io_flags & ZIO_FLAG_CONFIG_GRABBED)); |
|
433 ASSERT(zio->io_root == zio || |
|
434 !(zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)); |
|
435 |
459 |
436 return (zio); |
460 return (zio); |
437 } |
461 } |
438 |
462 |
439 static void |
463 static void |
440 zio_reset(zio_t *zio) |
464 zio_destroy(zio_t *zio) |
441 { |
465 { |
442 zio_clear_transform_stack(zio); |
466 spa_t *spa = zio->io_spa; |
443 |
467 uint8_t async_root = zio->io_async_root; |
444 zio->io_flags = zio->io_orig_flags; |
468 |
445 zio->io_stage = zio->io_orig_stage; |
469 mutex_destroy(&zio->io_lock); |
446 zio->io_pipeline = zio->io_orig_pipeline; |
470 cv_destroy(&zio->io_cv); |
447 zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size); |
471 kmem_cache_free(zio_cache, zio); |
|
472 |
|
473 if (async_root) { |
|
474 mutex_enter(&spa->spa_async_root_lock); |
|
475 if (--spa->spa_async_root_count == 0) |
|
476 cv_broadcast(&spa->spa_async_root_cv); |
|
477 mutex_exit(&spa->spa_async_root_lock); |
|
478 } |
448 } |
479 } |
449 |
480 |
450 zio_t * |
481 zio_t * |
451 zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, |
482 zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, |
452 int flags) |
483 int flags) |
453 { |
484 { |
454 zio_t *zio; |
485 zio_t *zio; |
455 |
486 |
456 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, |
487 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, |
457 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, |
488 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, |
458 ZIO_WAIT_FOR_CHILDREN_PIPELINE); |
489 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); |
459 |
490 |
460 return (zio); |
491 return (zio); |
461 } |
492 } |
462 |
493 |
463 zio_t * |
494 zio_t * |
465 { |
496 { |
466 return (zio_null(NULL, spa, done, private, flags)); |
497 return (zio_null(NULL, spa, done, private, flags)); |
467 } |
498 } |
468 |
499 |
469 zio_t * |
500 zio_t * |
470 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, |
501 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, |
471 uint64_t size, zio_done_func_t *done, void *private, |
502 void *data, uint64_t size, zio_done_func_t *done, void *private, |
472 int priority, int flags, const zbookmark_t *zb) |
503 int priority, int flags, const zbookmark_t *zb) |
473 { |
504 { |
474 zio_t *zio; |
505 zio_t *zio; |
475 |
|
476 ASSERT3U(size, ==, BP_GET_LSIZE(bp)); |
|
477 |
|
478 /* |
|
479 * If the user has specified that we allow I/Os to continue |
|
480 * then attempt to satisfy the read. |
|
481 */ |
|
482 if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) |
|
483 ZIO_ENTER(spa); |
|
484 |
506 |
485 zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp, |
507 zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp, |
486 data, size, done, private, |
508 data, size, done, private, |
487 ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, |
509 ZIO_TYPE_READ, priority, flags, NULL, 0, zb, |
488 ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); |
510 ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); |
489 zio->io_bookmark = *zb; |
|
490 |
|
491 zio->io_logical = zio; |
|
492 |
|
493 /* |
|
494 * Work off our copy of the bp so the caller can free it. |
|
495 */ |
|
496 zio->io_bp = &zio->io_bp_copy; |
|
497 |
511 |
498 return (zio); |
512 return (zio); |
499 } |
513 } |
500 |
514 |
501 zio_t * |
515 zio_t * |
502 zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, |
516 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, |
503 uint64_t txg, blkptr_t *bp, void *data, uint64_t size, |
517 void *data, uint64_t size, zio_prop_t *zp, |
504 zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, |
518 zio_done_func_t *ready, zio_done_func_t *done, void *private, |
505 int flags, const zbookmark_t *zb) |
519 int priority, int flags, const zbookmark_t *zb) |
506 { |
520 { |
507 zio_t *zio; |
521 zio_t *zio; |
508 |
522 |
509 ASSERT(checksum >= ZIO_CHECKSUM_OFF && |
523 ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && |
510 checksum < ZIO_CHECKSUM_FUNCTIONS); |
524 zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && |
511 |
525 zp->zp_compress >= ZIO_COMPRESS_OFF && |
512 ASSERT(compress >= ZIO_COMPRESS_OFF && |
526 zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && |
513 compress < ZIO_COMPRESS_FUNCTIONS); |
527 zp->zp_type < DMU_OT_NUMTYPES && |
514 |
528 zp->zp_level < 32 && |
515 ZIO_ENTER(spa); |
529 zp->zp_ndvas > 0 && |
|
530 zp->zp_ndvas <= spa_max_replication(spa)); |
|
531 ASSERT(ready != NULL); |
516 |
532 |
517 zio = zio_create(pio, spa, txg, bp, data, size, done, private, |
533 zio = zio_create(pio, spa, txg, bp, data, size, done, private, |
518 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, |
534 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, |
519 ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); |
535 ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); |
520 |
536 |
521 zio->io_ready = ready; |
537 zio->io_ready = ready; |
522 |
538 zio->io_prop = *zp; |
523 zio->io_bookmark = *zb; |
|
524 |
|
525 zio->io_logical = zio; |
|
526 |
|
527 zio->io_checksum = checksum; |
|
528 zio->io_compress = compress; |
|
529 zio->io_ndvas = ncopies; |
|
530 |
|
531 if (bp->blk_birth != txg) { |
|
532 /* XXX the bp usually (always?) gets re-zeroed later */ |
|
533 BP_ZERO(bp); |
|
534 BP_SET_LSIZE(bp, size); |
|
535 BP_SET_PSIZE(bp, size); |
|
536 } else { |
|
537 /* Make sure someone doesn't change their mind on overwrites */ |
|
538 ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), |
|
539 spa_max_replication(spa)) == BP_GET_NDVAS(bp)); |
|
540 } |
|
541 |
539 |
542 return (zio); |
540 return (zio); |
543 } |
541 } |
544 |
542 |
545 zio_t * |
543 zio_t * |
546 zio_rewrite(zio_t *pio, spa_t *spa, int checksum, uint64_t txg, |
544 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, |
547 blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, |
545 uint64_t size, zio_done_func_t *done, void *private, int priority, |
548 void *private, int priority, int flags, zbookmark_t *zb) |
546 int flags, zbookmark_t *zb) |
549 { |
547 { |
550 zio_t *zio; |
548 zio_t *zio; |
551 |
549 |
552 zio = zio_create(pio, spa, txg, bp, data, size, done, private, |
550 zio = zio_create(pio, spa, txg, bp, data, size, done, private, |
553 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, |
551 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, |
554 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp)); |
552 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); |
555 |
|
556 zio->io_bookmark = *zb; |
|
557 zio->io_checksum = checksum; |
|
558 zio->io_compress = ZIO_COMPRESS_OFF; |
|
559 |
|
560 if (pio != NULL) |
|
561 ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); |
|
562 |
|
563 return (zio); |
|
564 } |
|
565 |
|
566 static void |
|
567 zio_write_allocate_ready(zio_t *zio) |
|
568 { |
|
569 /* Free up the previous block */ |
|
570 if (!BP_IS_HOLE(&zio->io_bp_orig)) { |
|
571 zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, |
|
572 &zio->io_bp_orig, NULL, NULL)); |
|
573 } |
|
574 } |
|
575 |
|
576 static zio_t * |
|
577 zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, |
|
578 uint64_t txg, blkptr_t *bp, void *data, uint64_t size, |
|
579 zio_done_func_t *done, void *private, int priority, int flags) |
|
580 { |
|
581 zio_t *zio; |
|
582 |
|
583 BP_ZERO(bp); |
|
584 BP_SET_LSIZE(bp, size); |
|
585 BP_SET_PSIZE(bp, size); |
|
586 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); |
|
587 |
|
588 zio = zio_create(pio, spa, txg, bp, data, size, done, private, |
|
589 ZIO_TYPE_WRITE, priority, flags, |
|
590 ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); |
|
591 |
|
592 zio->io_checksum = checksum; |
|
593 zio->io_compress = ZIO_COMPRESS_OFF; |
|
594 zio->io_ready = zio_write_allocate_ready; |
|
595 |
553 |
596 return (zio); |
554 return (zio); |
597 } |
555 } |
598 |
556 |
599 zio_t * |
557 zio_t * |
600 zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, |
558 zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, |
601 zio_done_func_t *done, void *private) |
559 zio_done_func_t *done, void *private, int flags) |
602 { |
560 { |
603 zio_t *zio; |
561 zio_t *zio; |
604 |
562 |
605 ASSERT(!BP_IS_HOLE(bp)); |
563 ASSERT(!BP_IS_HOLE(bp)); |
606 |
564 |
|
565 if (bp->blk_fill == BLK_FILL_ALREADY_FREED) |
|
566 return (zio_null(pio, spa, NULL, NULL, flags)); |
|
567 |
607 if (txg == spa->spa_syncing_txg && |
568 if (txg == spa->spa_syncing_txg && |
608 spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { |
569 spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) { |
609 bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); |
570 bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); |
610 return (zio_null(pio, spa, NULL, NULL, 0)); |
571 return (zio_null(pio, spa, NULL, NULL, flags)); |
611 } |
572 } |
612 |
573 |
613 zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, |
574 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), |
614 ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, |
575 done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, |
615 ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp)); |
576 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); |
616 |
|
617 zio->io_bp = &zio->io_bp_copy; |
|
618 |
577 |
619 return (zio); |
578 return (zio); |
620 } |
579 } |
621 |
580 |
622 zio_t * |
581 zio_t * |
623 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, |
582 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, |
624 zio_done_func_t *done, void *private) |
583 zio_done_func_t *done, void *private, int flags) |
625 { |
584 { |
626 zio_t *zio; |
585 zio_t *zio; |
627 |
586 |
628 /* |
587 /* |
629 * A claim is an allocation of a specific block. Claims are needed |
588 * A claim is an allocation of a specific block. Claims are needed |
655 zio_t *zio; |
612 zio_t *zio; |
656 int c; |
613 int c; |
657 |
614 |
658 if (vd->vdev_children == 0) { |
615 if (vd->vdev_children == 0) { |
659 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, |
616 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, |
660 ZIO_TYPE_IOCTL, priority, flags, |
617 ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, |
661 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); |
618 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); |
662 |
619 |
663 zio->io_vd = vd; |
|
664 zio->io_cmd = cmd; |
620 zio->io_cmd = cmd; |
665 } else { |
621 } else { |
666 zio = zio_null(pio, spa, NULL, NULL, flags); |
622 zio = zio_null(pio, spa, NULL, NULL, flags); |
667 |
623 |
668 for (c = 0; c < vd->vdev_children; c++) |
624 for (c = 0; c < vd->vdev_children; c++) |
669 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, |
625 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, |
670 done, private, priority, flags)); |
626 done, private, priority, flags)); |
671 } |
627 } |
672 |
628 |
673 return (zio); |
629 return (zio); |
674 } |
|
675 |
|
676 static void |
|
677 zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, |
|
678 int checksum, boolean_t labels) |
|
679 { |
|
680 ASSERT(vd->vdev_children == 0); |
|
681 |
|
682 ASSERT(size <= SPA_MAXBLOCKSIZE); |
|
683 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); |
|
684 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); |
|
685 |
|
686 #ifdef ZFS_DEBUG |
|
687 if (labels) { |
|
688 ASSERT(offset + size <= VDEV_LABEL_START_SIZE || |
|
689 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); |
|
690 } |
|
691 #endif |
|
692 ASSERT3U(offset + size, <=, vd->vdev_psize); |
|
693 |
|
694 BP_ZERO(bp); |
|
695 |
|
696 BP_SET_LSIZE(bp, size); |
|
697 BP_SET_PSIZE(bp, size); |
|
698 |
|
699 BP_SET_CHECKSUM(bp, checksum); |
|
700 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); |
|
701 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); |
|
702 |
|
703 if (checksum != ZIO_CHECKSUM_OFF) |
|
704 ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); |
|
705 } |
630 } |
706 |
631 |
707 zio_t * |
632 zio_t * |
708 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, |
633 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, |
709 void *data, int checksum, zio_done_func_t *done, void *private, |
634 void *data, int checksum, zio_done_func_t *done, void *private, |
710 int priority, int flags, boolean_t labels) |
635 int priority, int flags, boolean_t labels) |
711 { |
636 { |
712 zio_t *zio; |
637 zio_t *zio; |
713 blkptr_t blk; |
638 |
714 |
639 ASSERT(vd->vdev_children == 0); |
715 ZIO_ENTER(vd->vdev_spa); |
640 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || |
716 |
641 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); |
717 zio_phys_bp_init(vd, &blk, offset, size, checksum, labels); |
642 ASSERT3U(offset + size, <=, vd->vdev_psize); |
718 |
643 |
719 zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, |
644 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, |
720 ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, |
645 ZIO_TYPE_READ, priority, flags, vd, offset, NULL, |
721 ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); |
646 ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); |
722 |
647 |
723 zio->io_vd = vd; |
648 zio->io_prop.zp_checksum = checksum; |
724 zio->io_offset = offset; |
|
725 |
|
726 /* |
|
727 * Work off our copy of the bp so the caller can free it. |
|
728 */ |
|
729 zio->io_bp = &zio->io_bp_copy; |
|
730 |
649 |
731 return (zio); |
650 return (zio); |
732 } |
651 } |
733 |
652 |
734 zio_t * |
653 zio_t * |
735 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, |
654 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, |
736 void *data, int checksum, zio_done_func_t *done, void *private, |
655 void *data, int checksum, zio_done_func_t *done, void *private, |
737 int priority, int flags, boolean_t labels) |
656 int priority, int flags, boolean_t labels) |
738 { |
657 { |
739 zio_block_tail_t *zbt; |
|
740 void *wbuf; |
|
741 zio_t *zio; |
658 zio_t *zio; |
742 blkptr_t blk; |
659 |
743 |
660 ASSERT(vd->vdev_children == 0); |
744 ZIO_ENTER(vd->vdev_spa); |
661 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || |
745 |
662 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); |
746 zio_phys_bp_init(vd, &blk, offset, size, checksum, labels); |
663 ASSERT3U(offset + size, <=, vd->vdev_psize); |
747 |
664 |
748 zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, |
665 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, |
749 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, |
666 ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, |
750 ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); |
667 ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); |
751 |
668 |
752 zio->io_vd = vd; |
669 zio->io_prop.zp_checksum = checksum; |
753 zio->io_offset = offset; |
|
754 |
|
755 zio->io_bp = &zio->io_bp_copy; |
|
756 zio->io_checksum = checksum; |
|
757 |
670 |
758 if (zio_checksum_table[checksum].ci_zbt) { |
671 if (zio_checksum_table[checksum].ci_zbt) { |
759 /* |
672 /* |
760 * zbt checksums are necessarily destructive -- they modify |
673 * zbt checksums are necessarily destructive -- they modify |
761 * one word of the write buffer to hold the verifier/checksum. |
674 * the end of the write buffer to hold the verifier/checksum. |
762 * Therefore, we must make a local copy in case the data is |
675 * Therefore, we must make a local copy in case the data is |
763 * being written to multiple places. |
676 * being written to multiple places in parallel. |
764 */ |
677 */ |
765 wbuf = zio_buf_alloc(size); |
678 void *wbuf = zio_buf_alloc(size); |
766 bcopy(data, wbuf, size); |
679 bcopy(data, wbuf, size); |
767 zio_push_transform(zio, wbuf, size, size); |
680 zio_push_transform(zio, wbuf, size, size, NULL); |
768 |
|
769 zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; |
|
770 zbt->zbt_cksum = blk.blk_cksum; |
|
771 } |
681 } |
772 |
682 |
773 return (zio); |
683 return (zio); |
774 } |
684 } |
775 |
685 |
776 /* |
686 /* |
777 * Create a child I/O to do some work for us. It has no associated bp. |
687 * Create a child I/O to do some work for us. |
778 */ |
688 */ |
779 zio_t * |
689 zio_t * |
780 zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, |
690 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, |
781 void *data, uint64_t size, int type, int priority, int flags, |
691 void *data, uint64_t size, int type, int priority, int flags, |
782 zio_done_func_t *done, void *private) |
692 zio_done_func_t *done, void *private) |
783 { |
693 { |
784 uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; |
694 uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; |
785 zio_t *cio; |
695 zio_t *zio; |
|
696 |
|
697 ASSERT(vd->vdev_parent == |
|
698 (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); |
786 |
699 |
787 if (type == ZIO_TYPE_READ && bp != NULL) { |
700 if (type == ZIO_TYPE_READ && bp != NULL) { |
788 /* |
701 /* |
789 * If we have the bp, then the child should perform the |
702 * If we have the bp, then the child should perform the |
790 * checksum and the parent need not. This pushes error |
703 * checksum and the parent need not. This pushes error |
791 * detection as close to the leaves as possible and |
704 * detection as close to the leaves as possible and |
792 * eliminates redundant checksums in the interior nodes. |
705 * eliminates redundant checksums in the interior nodes. |
793 */ |
706 */ |
794 pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; |
707 pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; |
795 zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); |
708 pio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); |
796 } |
709 } |
797 |
710 |
798 cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, |
711 if (vd->vdev_children == 0) |
|
712 offset += VDEV_LABEL_START_SIZE; |
|
713 |
|
714 zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, |
799 done, private, type, priority, |
715 done, private, type, priority, |
800 (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, |
716 (pio->io_flags & ZIO_FLAG_VDEV_INHERIT) | |
|
717 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | flags, |
|
718 vd, offset, &pio->io_bookmark, |
801 ZIO_STAGE_VDEV_IO_START - 1, pipeline); |
719 ZIO_STAGE_VDEV_IO_START - 1, pipeline); |
802 |
720 |
803 cio->io_vd = vd; |
721 return (zio); |
804 cio->io_offset = offset; |
722 } |
805 |
723 |
806 return (cio); |
724 zio_t * |
807 } |
725 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, |
808 |
726 int type, int priority, int flags, zio_done_func_t *done, void *private) |
809 /* |
727 { |
810 * ========================================================================== |
728 zio_t *zio; |
811 * Initiate I/O, either sync or async |
729 |
812 * ========================================================================== |
730 ASSERT(vd->vdev_ops->vdev_op_leaf); |
813 */ |
731 |
814 static void |
732 zio = zio_create(NULL, vd->vdev_spa, 0, NULL, |
815 zio_destroy(zio_t *zio) |
733 data, size, done, private, type, priority, |
816 { |
734 flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, |
817 mutex_destroy(&zio->io_lock); |
735 vd, offset, NULL, |
818 cv_destroy(&zio->io_cv); |
736 ZIO_STAGE_VDEV_IO_START - 1, ZIO_VDEV_CHILD_PIPELINE); |
819 if (zio->io_failed_vds != NULL) { |
737 |
820 kmem_free(zio->io_failed_vds, |
738 return (zio); |
821 zio->io_failed_vds_count * sizeof (vdev_t *)); |
|
822 zio->io_failed_vds = NULL; |
|
823 zio->io_failed_vds_count = 0; |
|
824 } |
|
825 kmem_cache_free(zio_cache, zio); |
|
826 } |
|
827 |
|
828 int |
|
829 zio_wait(zio_t *zio) |
|
830 { |
|
831 int error; |
|
832 |
|
833 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); |
|
834 |
|
835 zio->io_waiter = curthread; |
|
836 |
|
837 zio_execute(zio); |
|
838 |
|
839 mutex_enter(&zio->io_lock); |
|
840 while (zio->io_stalled != ZIO_STAGE_DONE) |
|
841 cv_wait(&zio->io_cv, &zio->io_lock); |
|
842 mutex_exit(&zio->io_lock); |
|
843 |
|
844 error = zio->io_error; |
|
845 zio_destroy(zio); |
|
846 |
|
847 return (error); |
|
848 } |
739 } |
849 |
740 |
850 void |
741 void |
851 zio_nowait(zio_t *zio) |
742 zio_flush(zio_t *zio, vdev_t *vd) |
852 { |
743 { |
853 zio_execute(zio); |
744 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, |
854 } |
745 NULL, NULL, ZIO_PRIORITY_NOW, |
855 |
746 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); |
856 void |
747 } |
857 zio_interrupt(zio_t *zio) |
748 |
858 { |
749 /* |
859 (void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type], |
750 * ========================================================================== |
860 (task_func_t *)zio_execute, zio, TQ_SLEEP); |
751 * Prepare to read and write logical blocks |
861 } |
752 * ========================================================================== |
|
753 */ |
862 |
754 |
863 static int |
755 static int |
864 zio_issue_async(zio_t *zio) |
756 zio_read_bp_init(zio_t *zio) |
865 { |
|
866 (void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type], |
|
867 (task_func_t *)zio_execute, zio, TQ_SLEEP); |
|
868 |
|
869 return (ZIO_PIPELINE_STOP); |
|
870 } |
|
871 |
|
872 /* |
|
873 * ========================================================================== |
|
874 * I/O pipeline interlocks: parent/child dependency scoreboarding |
|
875 * ========================================================================== |
|
876 */ |
|
877 static int |
|
878 zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) |
|
879 { |
|
880 int rv = ZIO_PIPELINE_CONTINUE; |
|
881 |
|
882 mutex_enter(&zio->io_lock); |
|
883 ASSERT(zio->io_stalled == 0); |
|
884 if (*countp != 0) { |
|
885 zio->io_stalled = stage; |
|
886 rv = ZIO_PIPELINE_STOP; |
|
887 } |
|
888 mutex_exit(&zio->io_lock); |
|
889 |
|
890 return (rv); |
|
891 } |
|
892 |
|
893 static void |
|
894 zio_add_failed_vdev(zio_t *pio, zio_t *zio) |
|
895 { |
|
896 uint64_t oldcount = pio->io_failed_vds_count; |
|
897 vdev_t **new_vds; |
|
898 int i; |
|
899 |
|
900 ASSERT(MUTEX_HELD(&pio->io_lock)); |
|
901 |
|
902 if (zio->io_vd == NULL) |
|
903 return; |
|
904 |
|
905 for (i = 0; i < oldcount; i++) { |
|
906 if (pio->io_failed_vds[i] == zio->io_vd) |
|
907 return; |
|
908 } |
|
909 |
|
910 new_vds = kmem_zalloc((oldcount + 1) * sizeof (vdev_t *), KM_SLEEP); |
|
911 if (pio->io_failed_vds != NULL) { |
|
912 bcopy(pio->io_failed_vds, new_vds, |
|
913 oldcount * sizeof (vdev_t *)); |
|
914 kmem_free(pio->io_failed_vds, oldcount * sizeof (vdev_t *)); |
|
915 } |
|
916 pio->io_failed_vds = new_vds; |
|
917 pio->io_failed_vds[oldcount] = zio->io_vd; |
|
918 pio->io_failed_vds_count++; |
|
919 } |
|
920 |
|
921 static void |
|
922 zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) |
|
923 { |
|
924 zio_t *pio = zio->io_parent; |
|
925 |
|
926 mutex_enter(&pio->io_lock); |
|
927 if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) { |
|
928 pio->io_error = zio->io_error; |
|
929 if (zio->io_error && zio->io_error != ENOTSUP) |
|
930 zio_add_failed_vdev(pio, zio); |
|
931 } |
|
932 ASSERT3U(*countp, >, 0); |
|
933 if (--*countp == 0 && pio->io_stalled == stage) { |
|
934 pio->io_stalled = 0; |
|
935 mutex_exit(&pio->io_lock); |
|
936 zio_execute(pio); |
|
937 } else { |
|
938 mutex_exit(&pio->io_lock); |
|
939 } |
|
940 } |
|
941 |
|
942 int |
|
943 zio_wait_for_children_ready(zio_t *zio) |
|
944 { |
|
945 return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY, |
|
946 &zio->io_children_notready)); |
|
947 } |
|
948 |
|
949 int |
|
950 zio_wait_for_children_done(zio_t *zio) |
|
951 { |
|
952 return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE, |
|
953 &zio->io_children_notdone)); |
|
954 } |
|
955 |
|
956 static int |
|
957 zio_read_init(zio_t *zio) |
|
958 { |
757 { |
959 blkptr_t *bp = zio->io_bp; |
758 blkptr_t *bp = zio->io_bp; |
960 |
759 |
961 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { |
760 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_logical == zio) { |
962 uint64_t csize = BP_GET_PSIZE(bp); |
761 uint64_t csize = BP_GET_PSIZE(bp); |
963 void *cbuf = zio_buf_alloc(csize); |
762 void *cbuf = zio_buf_alloc(csize); |
964 |
763 |
965 zio_push_transform(zio, cbuf, csize, csize); |
764 zio_push_transform(zio, cbuf, csize, csize, zio_decompress); |
966 zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; |
|
967 } |
|
968 |
|
969 if (BP_IS_GANG(bp)) { |
|
970 uint64_t gsize = SPA_GANGBLOCKSIZE; |
|
971 void *gbuf = zio_buf_alloc(gsize); |
|
972 |
|
973 zio_push_transform(zio, gbuf, gsize, gsize); |
|
974 zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; |
|
975 } |
765 } |
976 |
766 |
977 if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) |
767 if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) |
978 zio->io_flags |= ZIO_FLAG_DONT_CACHE; |
768 zio->io_flags |= ZIO_FLAG_DONT_CACHE; |
979 |
769 |
980 return (ZIO_PIPELINE_CONTINUE); |
770 return (ZIO_PIPELINE_CONTINUE); |
981 } |
771 } |
982 |
772 |
983 static int |
773 static int |
984 zio_ready(zio_t *zio) |
774 zio_write_bp_init(zio_t *zio) |
985 { |
775 { |
986 zio_t *pio = zio->io_parent; |
776 zio_prop_t *zp = &zio->io_prop; |
987 |
777 int compress = zp->zp_compress; |
988 if (zio->io_ready) |
|
989 zio->io_ready(zio); |
|
990 |
|
991 if (pio != NULL) |
|
992 zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY, |
|
993 &pio->io_children_notready); |
|
994 |
|
995 if (zio->io_bp) |
|
996 zio->io_bp_copy = *zio->io_bp; |
|
997 |
|
998 return (ZIO_PIPELINE_CONTINUE); |
|
999 } |
|
1000 |
|
1001 static int |
|
1002 zio_vdev_retry_io(zio_t *zio) |
|
1003 { |
|
1004 zio_t *pio = zio->io_parent; |
|
1005 |
|
1006 /* |
|
1007 * Preserve the failed bp so that the io_ready() callback can |
|
1008 * update the accounting accordingly. The callback will also be |
|
1009 * responsible for freeing the previously allocated block, if one |
|
1010 * exists. |
|
1011 */ |
|
1012 zio->io_bp_orig = *zio->io_bp; |
|
1013 |
|
1014 /* |
|
1015 * We must zero out the old DVA and blk_birth before reallocating |
|
1016 * the bp. |
|
1017 */ |
|
1018 BP_ZERO_DVAS(zio->io_bp); |
|
1019 zio_reset(zio); |
|
1020 |
|
1021 if (pio) { |
|
1022 /* |
|
1023 * Let the parent know that we will |
|
1024 * re-alloc the write (=> new bp info). |
|
1025 */ |
|
1026 mutex_enter(&pio->io_lock); |
|
1027 pio->io_children_notready++; |
|
1028 |
|
1029 /* |
|
1030 * If the parent I/O is still in the open stage, then |
|
1031 * don't bother telling it to retry since it hasn't |
|
1032 * progressed far enough for it to care. |
|
1033 */ |
|
1034 if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio)) |
|
1035 pio->io_flags |= ZIO_FLAG_WRITE_RETRY; |
|
1036 |
|
1037 ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE); |
|
1038 mutex_exit(&pio->io_lock); |
|
1039 } |
|
1040 |
|
1041 /* |
|
1042 * We are getting ready to process the retry request so clear |
|
1043 * the flag and the zio's current error status. |
|
1044 */ |
|
1045 zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY; |
|
1046 zio->io_error = 0; |
|
1047 |
|
1048 return (ZIO_PIPELINE_CONTINUE); |
|
1049 } |
|
1050 |
|
1051 int |
|
1052 zio_vdev_resume_io(spa_t *spa) |
|
1053 { |
|
1054 zio_t *zio; |
|
1055 |
|
1056 mutex_enter(&spa->spa_zio_lock); |
|
1057 |
|
1058 /* |
|
1059 * Probe all of vdevs that have experienced an I/O error. |
|
1060 * If we are still unable to verify the integrity of the vdev |
|
1061 * then we prevent the resume from proceeeding. |
|
1062 */ |
|
1063 for (zio = list_head(&spa->spa_zio_list); zio != NULL; |
|
1064 zio = list_next(&spa->spa_zio_list, zio)) { |
|
1065 int error = 0; |
|
1066 |
|
1067 /* We only care about I/Os that must succeed */ |
|
1068 if (zio->io_vd == NULL || zio->io_flags & ZIO_FLAG_CANFAIL) |
|
1069 continue; |
|
1070 error = vdev_probe(zio->io_vd); |
|
1071 if (error) { |
|
1072 mutex_exit(&spa->spa_zio_lock); |
|
1073 return (error); |
|
1074 } |
|
1075 } |
|
1076 |
|
1077 /* |
|
1078 * Clear the vdev stats so that I/O can flow. |
|
1079 */ |
|
1080 vdev_clear(spa, NULL, B_FALSE); |
|
1081 |
|
1082 spa->spa_state = POOL_STATE_ACTIVE; |
|
1083 while ((zio = list_head(&spa->spa_zio_list)) != NULL) { |
|
1084 list_remove(&spa->spa_zio_list, zio); |
|
1085 zio->io_error = 0; |
|
1086 |
|
1087 /* |
|
1088 * If we are resuming an allocating I/O then we force it |
|
1089 * to retry and let it resume operation where it left off. |
|
1090 * Otherwise, go back to the ready stage and pick up from |
|
1091 * there. |
|
1092 */ |
|
1093 if (zio_write_retry && IO_IS_ALLOCATING(zio)) { |
|
1094 zio->io_flags |= ZIO_FLAG_WRITE_RETRY; |
|
1095 zio->io_stage--; |
|
1096 } else { |
|
1097 zio->io_stage = ZIO_STAGE_READY; |
|
1098 } |
|
1099 |
|
1100 (void) taskq_dispatch(zio_taskq, (task_func_t *)zio_execute, |
|
1101 zio, TQ_SLEEP); |
|
1102 } |
|
1103 mutex_exit(&spa->spa_zio_lock); |
|
1104 |
|
1105 /* |
|
1106 * Wait for the taskqs to finish and recheck the pool state since |
|
1107 * it's possible that a resumed I/O has failed again. |
|
1108 */ |
|
1109 taskq_wait(zio_taskq); |
|
1110 if (spa_state(spa) == POOL_STATE_IO_FAILURE) |
|
1111 return (EIO); |
|
1112 |
|
1113 mutex_enter(&spa->spa_zio_lock); |
|
1114 cv_broadcast(&spa->spa_zio_cv); |
|
1115 mutex_exit(&spa->spa_zio_lock); |
|
1116 |
|
1117 return (0); |
|
1118 } |
|
1119 |
|
1120 static int |
|
1121 zio_vdev_suspend_io(zio_t *zio) |
|
1122 { |
|
1123 spa_t *spa = zio->io_spa; |
|
1124 |
|
1125 /* |
|
1126 * We've experienced an unrecoverable failure so |
|
1127 * set the pool state accordingly and queue all |
|
1128 * failed IOs. |
|
1129 */ |
|
1130 spa->spa_state = POOL_STATE_IO_FAILURE; |
|
1131 |
|
1132 mutex_enter(&spa->spa_zio_lock); |
|
1133 list_insert_tail(&spa->spa_zio_list, zio); |
|
1134 |
|
1135 #ifndef _KERNEL |
|
1136 /* Used to notify ztest that the pool has suspended */ |
|
1137 cv_broadcast(&spa->spa_zio_cv); |
|
1138 #endif |
|
1139 mutex_exit(&spa->spa_zio_lock); |
|
1140 |
|
1141 return (ZIO_PIPELINE_STOP); |
|
1142 } |
|
1143 |
|
1144 static void |
|
1145 zio_handle_io_failure(zio_t *zio, vdev_t *vd) |
|
1146 { |
|
1147 spa_t *spa = zio->io_spa; |
|
1148 blkptr_t *bp = zio->io_bp; |
|
1149 char *blkbuf; |
|
1150 |
|
1151 #ifdef ZFS_DEBUG |
|
1152 blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP); |
|
1153 if (blkbuf) { |
|
1154 sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, |
|
1155 bp ? bp : &zio->io_bp_copy); |
|
1156 } |
|
1157 cmn_err(CE_WARN, "ZFS: %s (%s on %s off %llx: zio %p %s): error %d", |
|
1158 zio->io_error == ECKSUM ? "bad checksum" : "I/O failure", |
|
1159 zio_type_name[zio->io_type], vdev_description(vd), |
|
1160 (u_longlong_t)zio->io_offset, (void *)zio, |
|
1161 blkbuf ? blkbuf : "", zio->io_error); |
|
1162 if (blkbuf) |
|
1163 kmem_free(blkbuf, BP_SPRINTF_LEN); |
|
1164 #endif |
|
1165 |
|
1166 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) { |
|
1167 fm_panic("Pool '%s' has encountered an uncorrectable I/O " |
|
1168 "failure and the failure mode property for this pool " |
|
1169 "is set to panic.", spa_name(spa)); |
|
1170 } |
|
1171 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); |
|
1172 vdev_set_state(vd, vd == spa->spa_root_vdev ? B_TRUE : B_FALSE, |
|
1173 VDEV_STATE_FAULTED, VDEV_AUX_IO_FAILURE); |
|
1174 } |
|
1175 |
|
1176 static int |
|
1177 zio_assess(zio_t *zio) |
|
1178 { |
|
1179 spa_t *spa = zio->io_spa; |
|
1180 blkptr_t *bp = zio->io_bp; |
|
1181 vdev_t *vd = zio->io_vd; |
|
1182 |
|
1183 ASSERT(zio->io_children_notready == 0); |
|
1184 ASSERT(zio->io_children_notdone == 0); |
|
1185 |
|
1186 if (bp != NULL) { |
|
1187 ASSERT(bp->blk_pad[0] == 0); |
|
1188 ASSERT(bp->blk_pad[1] == 0); |
|
1189 ASSERT(bp->blk_pad[2] == 0); |
|
1190 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); |
|
1191 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && |
|
1192 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { |
|
1193 ASSERT(!BP_SHOULD_BYTESWAP(bp)); |
|
1194 if (zio->io_ndvas != 0) |
|
1195 ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); |
|
1196 ASSERT(BP_COUNT_GANG(bp) == 0 || |
|
1197 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); |
|
1198 } |
|
1199 } |
|
1200 |
|
1201 /* |
|
1202 * Some child I/O has indicated that a retry is necessary, so |
|
1203 * we set an error on the I/O and let the logic below do the |
|
1204 * rest. |
|
1205 */ |
|
1206 if (zio->io_flags & ZIO_FLAG_WRITE_RETRY) |
|
1207 zio->io_error = ERESTART; |
|
1208 |
|
1209 if (vd != NULL) |
|
1210 vdev_stat_update(zio); |
|
1211 |
|
1212 if (zio->io_error) { |
|
1213 /* |
|
1214 * If this I/O is attached to a particular vdev, |
|
1215 * generate an error message describing the I/O failure |
|
1216 * at the block level. We ignore these errors if the |
|
1217 * device is currently unavailable. |
|
1218 */ |
|
1219 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) |
|
1220 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); |
|
1221 |
|
1222 if ((zio->io_error == EIO || |
|
1223 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && |
|
1224 zio->io_logical == zio) { |
|
1225 /* |
|
1226 * For root I/O requests, tell the SPA to log the error |
|
1227 * appropriately. Also, generate a logical data |
|
1228 * ereport. |
|
1229 */ |
|
1230 spa_log_error(spa, zio); |
|
1231 |
|
1232 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, |
|
1233 0, 0); |
|
1234 } |
|
1235 |
|
1236 /* |
|
1237 * If we are an allocating I/O then we attempt to reissue |
|
1238 * the I/O on another vdev unless the pool is out of space. |
|
1239 * We handle this condition based on the spa's failmode |
|
1240 * property. |
|
1241 */ |
|
1242 if (zio_write_retry && zio->io_error != ENOSPC && |
|
1243 IO_IS_ALLOCATING(zio)) |
|
1244 return (zio_vdev_retry_io(zio)); |
|
1245 |
|
1246 ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); |
|
1247 |
|
1248 /* |
|
1249 * For I/O requests that cannot fail, we carry out |
|
1250 * the requested behavior based on the failmode pool |
|
1251 * property. |
|
1252 * |
|
1253 * XXX - Need to differentiate between an ENOSPC as |
|
1254 * a result of vdev failures vs. a full pool. |
|
1255 */ |
|
1256 if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { |
|
1257 int i; |
|
1258 |
|
1259 for (i = 0; i < zio->io_failed_vds_count; i++) { |
|
1260 zio_handle_io_failure(zio, |
|
1261 zio->io_failed_vds[i]); |
|
1262 } |
|
1263 if (zio->io_failed_vds_count == 0) { |
|
1264 zio_handle_io_failure(zio, |
|
1265 vd ? vd : spa->spa_root_vdev); |
|
1266 } |
|
1267 if (zio->io_failed_vds != NULL) { |
|
1268 kmem_free(zio->io_failed_vds, |
|
1269 zio->io_failed_vds_count * |
|
1270 sizeof (vdev_t *)); |
|
1271 zio->io_failed_vds = NULL; |
|
1272 zio->io_failed_vds_count = 0; |
|
1273 } |
|
1274 return (zio_vdev_suspend_io(zio)); |
|
1275 } |
|
1276 } |
|
1277 ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); |
|
1278 ASSERT(zio->io_children_notready == 0); |
|
1279 |
|
1280 return (ZIO_PIPELINE_CONTINUE); |
|
1281 } |
|
1282 |
|
1283 static int |
|
1284 zio_done(zio_t *zio) |
|
1285 { |
|
1286 zio_t *pio = zio->io_parent; |
|
1287 spa_t *spa = zio->io_spa; |
|
1288 |
|
1289 ASSERT(zio->io_children_notready == 0); |
|
1290 ASSERT(zio->io_children_notdone == 0); |
|
1291 |
|
1292 zio_clear_transform_stack(zio); |
|
1293 |
|
1294 if (zio->io_done) |
|
1295 zio->io_done(zio); |
|
1296 |
|
1297 ASSERT(zio->io_delegate_list == NULL); |
|
1298 ASSERT(zio->io_delegate_next == NULL); |
|
1299 |
|
1300 if (pio != NULL) { |
|
1301 zio_t *next, *prev; |
|
1302 |
|
1303 mutex_enter(&pio->io_lock); |
|
1304 next = zio->io_sibling_next; |
|
1305 prev = zio->io_sibling_prev; |
|
1306 if (next != NULL) |
|
1307 next->io_sibling_prev = prev; |
|
1308 if (prev != NULL) |
|
1309 prev->io_sibling_next = next; |
|
1310 if (pio->io_child == zio) |
|
1311 pio->io_child = next; |
|
1312 mutex_exit(&pio->io_lock); |
|
1313 |
|
1314 zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE, |
|
1315 &pio->io_children_notdone); |
|
1316 } |
|
1317 |
|
1318 /* |
|
1319 * Note: this I/O is now done, and will shortly be freed, so there is no |
|
1320 * need to clear this (or any other) flag. |
|
1321 */ |
|
1322 if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED) |
|
1323 spa_config_exit(spa, zio); |
|
1324 |
|
1325 if (zio->io_waiter != NULL) { |
|
1326 mutex_enter(&zio->io_lock); |
|
1327 ASSERT(zio->io_stage == ZIO_STAGE_DONE); |
|
1328 zio->io_stalled = zio->io_stage; |
|
1329 cv_broadcast(&zio->io_cv); |
|
1330 mutex_exit(&zio->io_lock); |
|
1331 } else { |
|
1332 zio_destroy(zio); |
|
1333 } |
|
1334 |
|
1335 return (ZIO_PIPELINE_STOP); |
|
1336 } |
|
1337 |
|
1338 /* |
|
1339 * ========================================================================== |
|
1340 * Compression support |
|
1341 * ========================================================================== |
|
1342 */ |
|
1343 static int |
|
1344 zio_write_compress(zio_t *zio) |
|
1345 { |
|
1346 int compress = zio->io_compress; |
|
1347 blkptr_t *bp = zio->io_bp; |
778 blkptr_t *bp = zio->io_bp; |
1348 void *cbuf; |
779 void *cbuf; |
1349 uint64_t lsize = zio->io_size; |
780 uint64_t lsize = zio->io_size; |
1350 uint64_t csize = lsize; |
781 uint64_t csize = lsize; |
1351 uint64_t cbufsize = 0; |
782 uint64_t cbufsize = 0; |
1352 int pass; |
783 int pass = 1; |
|
784 |
|
785 /* |
|
786 * If our children haven't all reached the ready stage, |
|
787 * wait for them and then repeat this pipeline stage. |
|
788 */ |
|
789 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || |
|
790 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) |
|
791 return (ZIO_PIPELINE_STOP); |
|
792 |
|
793 if (!IO_IS_ALLOCATING(zio)) |
|
794 return (ZIO_PIPELINE_CONTINUE); |
|
795 |
|
796 ASSERT(compress != ZIO_COMPRESS_INHERIT); |
1353 |
797 |
1354 if (bp->blk_birth == zio->io_txg) { |
798 if (bp->blk_birth == zio->io_txg) { |
1355 /* |
799 /* |
1356 * We're rewriting an existing block, which means we're |
800 * We're rewriting an existing block, which means we're |
1357 * working on behalf of spa_sync(). For spa_sync() to |
801 * working on behalf of spa_sync(). For spa_sync() to |
1360 * the blocksize, which forces a reallocate, and makes |
804 * the blocksize, which forces a reallocate, and makes |
1361 * convergence take longer. Therefore, after the first |
805 * convergence take longer. Therefore, after the first |
1362 * few passes, stop compressing to ensure convergence. |
806 * few passes, stop compressing to ensure convergence. |
1363 */ |
807 */ |
1364 pass = spa_sync_pass(zio->io_spa); |
808 pass = spa_sync_pass(zio->io_spa); |
1365 if (pass > zio_sync_pass.zp_dontcompress) |
809 ASSERT(pass > 1); |
|
810 |
|
811 if (pass > SYNC_PASS_DONT_COMPRESS) |
1366 compress = ZIO_COMPRESS_OFF; |
812 compress = ZIO_COMPRESS_OFF; |
1367 } else { |
813 |
1368 ASSERT(BP_IS_HOLE(bp)); |
814 /* |
1369 pass = 1; |
815 * Only MOS (objset 0) data should need to be rewritten. |
1370 } |
816 */ |
1371 |
817 ASSERT(zio->io_logical->io_bookmark.zb_objset == 0); |
1372 if (compress != ZIO_COMPRESS_OFF) |
818 |
|
819 /* Make sure someone doesn't change their mind on overwrites */ |
|
820 ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp), |
|
821 spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp)); |
|
822 } |
|
823 |
|
824 if (compress != ZIO_COMPRESS_OFF) { |
1373 if (!zio_compress_data(compress, zio->io_data, zio->io_size, |
825 if (!zio_compress_data(compress, zio->io_data, zio->io_size, |
1374 &cbuf, &csize, &cbufsize)) |
826 &cbuf, &csize, &cbufsize)) { |
1375 compress = ZIO_COMPRESS_OFF; |
827 compress = ZIO_COMPRESS_OFF; |
1376 |
828 } else if (csize != 0) { |
1377 if (compress != ZIO_COMPRESS_OFF && csize != 0) |
829 zio_push_transform(zio, cbuf, csize, cbufsize, NULL); |
1378 zio_push_transform(zio, cbuf, csize, cbufsize); |
830 } |
|
831 } |
1379 |
832 |
1380 /* |
833 /* |
1381 * The final pass of spa_sync() must be all rewrites, but the first |
834 * The final pass of spa_sync() must be all rewrites, but the first |
1382 * few passes offer a trade-off: allocating blocks defers convergence, |
835 * few passes offer a trade-off: allocating blocks defers convergence, |
1383 * but newly allocated blocks are sequential, so they can be written |
836 * but newly allocated blocks are sequential, so they can be written |
1384 * to disk faster. Therefore, we allow the first few passes of |
837 * to disk faster. Therefore, we allow the first few passes of |
1385 * spa_sync() to reallocate new blocks, but force rewrites after that. |
838 * spa_sync() to allocate new blocks, but force rewrites after that. |
1386 * There should only be a handful of blocks after pass 1 in any case. |
839 * There should only be a handful of blocks after pass 1 in any case. |
1387 */ |
840 */ |
1388 if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && |
841 if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && |
1389 pass > zio_sync_pass.zp_rewrite) { |
842 pass > SYNC_PASS_REWRITE) { |
1390 ASSERT(csize != 0); |
843 ASSERT(csize != 0); |
|
844 uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; |
|
845 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; |
|
846 zio->io_flags |= ZIO_FLAG_IO_REWRITE; |
|
847 } else { |
|
848 BP_ZERO(bp); |
|
849 zio->io_pipeline = ZIO_WRITE_PIPELINE; |
|
850 } |
|
851 |
|
852 if (csize == 0) { |
|
853 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; |
|
854 } else { |
|
855 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); |
1391 BP_SET_LSIZE(bp, lsize); |
856 BP_SET_LSIZE(bp, lsize); |
|
857 BP_SET_PSIZE(bp, csize); |
1392 BP_SET_COMPRESS(bp, compress); |
858 BP_SET_COMPRESS(bp, compress); |
1393 zio->io_pipeline = ZIO_REWRITE_PIPELINE(bp); |
859 BP_SET_CHECKSUM(bp, zp->zp_checksum); |
1394 } else { |
860 BP_SET_TYPE(bp, zp->zp_type); |
1395 if (bp->blk_birth == zio->io_txg) |
861 BP_SET_LEVEL(bp, zp->zp_level); |
1396 BP_ZERO(bp); |
862 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); |
1397 if (csize == 0) { |
|
1398 BP_ZERO(bp); |
|
1399 zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE; |
|
1400 } else { |
|
1401 ASSERT3U(BP_GET_NDVAS(bp), ==, 0); |
|
1402 BP_SET_LSIZE(bp, lsize); |
|
1403 BP_SET_PSIZE(bp, csize); |
|
1404 BP_SET_COMPRESS(bp, compress); |
|
1405 } |
|
1406 } |
863 } |
1407 |
864 |
1408 return (ZIO_PIPELINE_CONTINUE); |
865 return (ZIO_PIPELINE_CONTINUE); |
1409 } |
866 } |
1410 |
867 |
|
868 /* |
|
869 * ========================================================================== |
|
870 * Execute the I/O pipeline |
|
871 * ========================================================================== |
|
872 */ |
|
873 |
|
874 static void |
|
875 zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q) |
|
876 { |
|
877 zio_type_t t = zio->io_type; |
|
878 |
|
879 /* |
|
880 * If we're a config writer, the normal issue and interrupt threads |
|
881 * may all be blocked waiting for the config lock. In this case, |
|
882 * select the otherwise-unused taskq for ZIO_TYPE_NULL. |
|
883 */ |
|
884 if (zio->io_flags & ZIO_FLAG_CONFIG_WRITER) |
|
885 t = ZIO_TYPE_NULL; |
|
886 |
|
887 /* |
|
888 * A similar issue exists for the L2ARC write thread until L2ARC 2.0. |
|
889 */ |
|
890 if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) |
|
891 t = ZIO_TYPE_NULL; |
|
892 |
|
893 (void) taskq_dispatch(zio->io_spa->spa_zio_taskq[t][q], |
|
894 (task_func_t *)zio_execute, zio, TQ_SLEEP); |
|
895 } |
|
896 |
|
897 static boolean_t |
|
898 zio_taskq_member(zio_t *zio, enum zio_taskq_type q) |
|
899 { |
|
900 kthread_t *executor = zio->io_executor; |
|
901 spa_t *spa = zio->io_spa; |
|
902 |
|
903 for (zio_type_t t = 0; t < ZIO_TYPES; t++) |
|
904 if (taskq_member(spa->spa_zio_taskq[t][q], executor)) |
|
905 return (B_TRUE); |
|
906 |
|
907 return (B_FALSE); |
|
908 } |
|
909 |
1411 static int |
910 static int |
1412 zio_read_decompress(zio_t *zio) |
911 zio_issue_async(zio_t *zio) |
1413 { |
912 { |
1414 blkptr_t *bp = zio->io_bp; |
913 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); |
1415 void *data; |
914 |
1416 uint64_t size; |
915 return (ZIO_PIPELINE_STOP); |
1417 uint64_t bufsize; |
|
1418 int compress = BP_GET_COMPRESS(bp); |
|
1419 |
|
1420 ASSERT(compress != ZIO_COMPRESS_OFF); |
|
1421 |
|
1422 zio_pop_transform(zio, &data, &size, &bufsize); |
|
1423 |
|
1424 if (zio_decompress_data(compress, data, size, |
|
1425 zio->io_data, zio->io_size)) |
|
1426 zio->io_error = EIO; |
|
1427 |
|
1428 zio_buf_free(data, bufsize); |
|
1429 |
|
1430 return (ZIO_PIPELINE_CONTINUE); |
|
1431 } |
|
1432 |
|
1433 /* |
|
1434 * ========================================================================== |
|
1435 * Gang block support |
|
1436 * ========================================================================== |
|
1437 */ |
|
1438 static void |
|
1439 zio_gang_byteswap(zio_t *zio) |
|
1440 { |
|
1441 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); |
|
1442 |
|
1443 if (BP_SHOULD_BYTESWAP(zio->io_bp)) |
|
1444 byteswap_uint64_array(zio->io_data, zio->io_size); |
|
1445 } |
|
1446 |
|
1447 static int |
|
1448 zio_get_gang_header(zio_t *zio) |
|
1449 { |
|
1450 blkptr_t *bp = zio->io_bp; |
|
1451 uint64_t gsize = SPA_GANGBLOCKSIZE; |
|
1452 void *gbuf = zio_buf_alloc(gsize); |
|
1453 |
|
1454 ASSERT(BP_IS_GANG(bp)); |
|
1455 |
|
1456 zio_push_transform(zio, gbuf, gsize, gsize); |
|
1457 |
|
1458 zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, |
|
1459 NULL, NULL, ZIO_TYPE_READ, zio->io_priority, |
|
1460 zio->io_flags & ZIO_FLAG_GANG_INHERIT, |
|
1461 ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE)); |
|
1462 |
|
1463 return (zio_wait_for_children_done(zio)); |
|
1464 } |
|
1465 |
|
1466 static int |
|
1467 zio_read_gang_members(zio_t *zio) |
|
1468 { |
|
1469 zio_gbh_phys_t *gbh; |
|
1470 uint64_t gsize, gbufsize, loff, lsize; |
|
1471 int i; |
|
1472 |
|
1473 ASSERT(BP_IS_GANG(zio->io_bp)); |
|
1474 |
|
1475 zio_gang_byteswap(zio); |
|
1476 zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); |
|
1477 |
|
1478 for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { |
|
1479 blkptr_t *gbp = &gbh->zg_blkptr[i]; |
|
1480 lsize = BP_GET_PSIZE(gbp); |
|
1481 |
|
1482 ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); |
|
1483 ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); |
|
1484 ASSERT3U(loff + lsize, <=, zio->io_size); |
|
1485 ASSERT(i < SPA_GBH_NBLKPTRS); |
|
1486 ASSERT(!BP_IS_HOLE(gbp)); |
|
1487 |
|
1488 zio_nowait(zio_read(zio, zio->io_spa, gbp, |
|
1489 (char *)zio->io_data + loff, lsize, |
|
1490 NULL, NULL, zio->io_priority, |
|
1491 zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark)); |
|
1492 } |
|
1493 |
|
1494 zio_buf_free(gbh, gbufsize); |
|
1495 |
|
1496 return (zio_wait_for_children_done(zio)); |
|
1497 } |
|
1498 |
|
1499 static int |
|
1500 zio_rewrite_gang_members(zio_t *zio) |
|
1501 { |
|
1502 zio_gbh_phys_t *gbh; |
|
1503 uint64_t gsize, gbufsize, loff, lsize; |
|
1504 int i; |
|
1505 |
|
1506 ASSERT(BP_IS_GANG(zio->io_bp)); |
|
1507 ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); |
|
1508 |
|
1509 zio_gang_byteswap(zio); |
|
1510 zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); |
|
1511 |
|
1512 ASSERT(gsize == gbufsize); |
|
1513 |
|
1514 for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { |
|
1515 blkptr_t *gbp = &gbh->zg_blkptr[i]; |
|
1516 lsize = BP_GET_PSIZE(gbp); |
|
1517 |
|
1518 ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); |
|
1519 ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); |
|
1520 ASSERT3U(loff + lsize, <=, zio->io_size); |
|
1521 ASSERT(i < SPA_GBH_NBLKPTRS); |
|
1522 ASSERT(!BP_IS_HOLE(gbp)); |
|
1523 |
|
1524 zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, |
|
1525 zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, |
|
1526 NULL, NULL, zio->io_priority, |
|
1527 zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark)); |
|
1528 } |
|
1529 |
|
1530 zio_push_transform(zio, gbh, gsize, gbufsize); |
|
1531 |
|
1532 return (zio_wait_for_children_ready(zio)); |
|
1533 } |
|
1534 |
|
1535 static int |
|
1536 zio_free_gang_members(zio_t *zio) |
|
1537 { |
|
1538 zio_gbh_phys_t *gbh; |
|
1539 uint64_t gsize, gbufsize; |
|
1540 int i; |
|
1541 |
|
1542 ASSERT(BP_IS_GANG(zio->io_bp)); |
|
1543 |
|
1544 zio_gang_byteswap(zio); |
|
1545 zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); |
|
1546 |
|
1547 for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { |
|
1548 blkptr_t *gbp = &gbh->zg_blkptr[i]; |
|
1549 |
|
1550 if (BP_IS_HOLE(gbp)) |
|
1551 continue; |
|
1552 zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, |
|
1553 gbp, NULL, NULL)); |
|
1554 } |
|
1555 |
|
1556 zio_buf_free(gbh, gbufsize); |
|
1557 |
|
1558 return (ZIO_PIPELINE_CONTINUE); |
|
1559 } |
|
1560 |
|
1561 static int |
|
1562 zio_claim_gang_members(zio_t *zio) |
|
1563 { |
|
1564 zio_gbh_phys_t *gbh; |
|
1565 uint64_t gsize, gbufsize; |
|
1566 int i; |
|
1567 |
|
1568 ASSERT(BP_IS_GANG(zio->io_bp)); |
|
1569 |
|
1570 zio_gang_byteswap(zio); |
|
1571 zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); |
|
1572 |
|
1573 for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { |
|
1574 blkptr_t *gbp = &gbh->zg_blkptr[i]; |
|
1575 if (BP_IS_HOLE(gbp)) |
|
1576 continue; |
|
1577 zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg, |
|
1578 gbp, NULL, NULL)); |
|
1579 } |
|
1580 |
|
1581 zio_buf_free(gbh, gbufsize); |
|
1582 |
|
1583 return (ZIO_PIPELINE_CONTINUE); |
|
1584 } |
|
1585 |
|
1586 static void |
|
1587 zio_write_allocate_gang_member_done(zio_t *zio) |
|
1588 { |
|
1589 zio_t *pio = zio->io_parent; |
|
1590 dva_t *cdva = zio->io_bp->blk_dva; |
|
1591 dva_t *pdva = pio->io_bp->blk_dva; |
|
1592 uint64_t asize; |
|
1593 int d; |
|
1594 |
|
1595 ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas); |
|
1596 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); |
|
1597 ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); |
|
1598 ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); |
|
1599 |
|
1600 mutex_enter(&pio->io_lock); |
|
1601 for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) { |
|
1602 ASSERT(DVA_GET_GANG(&pdva[d])); |
|
1603 asize = DVA_GET_ASIZE(&pdva[d]); |
|
1604 asize += DVA_GET_ASIZE(&cdva[d]); |
|
1605 DVA_SET_ASIZE(&pdva[d], asize); |
|
1606 } |
|
1607 mutex_exit(&pio->io_lock); |
|
1608 } |
|
1609 |
|
/*
 * Write zio's data as a gang block: allocate a gang header of
 * SPA_GANGBLOCKSIZE bytes on zio's own bp, then carve zio->io_data into
 * at most SPA_GBH_NBLKPTRS pieces and issue one child write per piece.
 * Each child's bp lives inside the gang header buffer.
 *
 * zio - the write being converted into a gang block
 * mc  - metaslab class handed to every metaslab_alloc() call
 *
 * Returns ZIO_PIPELINE_CONTINUE (with zio->io_error set) when an
 * allocation fails; otherwise returns whatever
 * zio_wait_for_children_done() returns.
 */
static int
zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = bp->blk_dva;
	spa_t *spa = zio->io_spa;
	zio_gbh_phys_t *gbh;
	uint64_t txg = zio->io_txg;
	uint64_t resid = zio->io_size;	/* bytes not yet handed to a child */
	/* First attempt: half of the data, rounded up to SPA_MINBLOCKSIZE. */
	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
	uint64_t gsize, loff, lsize;
	uint32_t gbps_left;		/* unused header slots remaining */
	int ndvas = zio->io_ndvas;
	/* The header gets one more copy than the data, capped by the pool. */
	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
	int error;
	int i, d;

	gsize = SPA_GANGBLOCKSIZE;
	gbps_left = SPA_GBH_NBLKPTRS;

	/* Allocate space for the gang header itself on zio's bp. */
	error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
	    B_FALSE);
	if (error) {
		zio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	/* Flag every header DVA as a gang DVA. */
	for (d = 0; d < gbh_ndvas; d++)
		DVA_SET_GANG(&dva[d], 1);

	bp->blk_birth = txg;

	gbh = zio_buf_alloc(gsize);
	bzero(gbh, gsize);

	/*
	 * One iteration per gang member: loff is the offset of this piece
	 * within zio->io_data and lsize is its length.
	 */
	for (loff = 0, i = 0; loff != zio->io_size;
	    loff += lsize, resid -= lsize, gbps_left--, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		dva = gbp->blk_dva;

		ASSERT(gbps_left != 0);
		maxalloc = MIN(maxalloc, resid);

		/*
		 * Only try a direct allocation while the remaining data
		 * would still fit in the remaining slots at this size;
		 * on ENOSPC, halve the size and try again.
		 */
		while (resid <= maxalloc * gbps_left) {
			error = metaslab_alloc(spa, mc, maxalloc, gbp, ndvas,
			    txg, bp, B_FALSE);
			if (error == 0)
				break;
			ASSERT3U(error, ==, ENOSPC);
			/* XXX - free up previous allocations? */
			if (maxalloc == SPA_MINBLOCKSIZE) {
				/*
				 * NOTE(review): gbh was allocated above and
				 * has not been passed to zio_push_transform()
				 * yet; nothing in this function frees it on
				 * this path, and children issued by earlier
				 * iterations hold bps that point into it.
				 * Confirm how it is released.
				 */
				zio->io_error = error;
				return (ZIO_PIPELINE_CONTINUE);
			}
			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
		}

		if (resid <= maxalloc * gbps_left) {
			/*
			 * The allocation above succeeded: gbp already has
			 * DVAs, so fill in its sizes and write into them.
			 */
			lsize = maxalloc;
			BP_SET_LSIZE(gbp, lsize);
			BP_SET_PSIZE(gbp, lsize);
			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
			gbp->blk_birth = txg;
			zio_nowait(zio_rewrite(zio, spa, zio->io_checksum, txg,
			    gbp, (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority,
			    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
			    &zio->io_bookmark));
		} else {
			/*
			 * maxalloc shrank too far for the remaining slots,
			 * so no space was allocated for this member here.
			 * Hand an even share of the remainder to
			 * zio_write_allocate() instead.
			 */
			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
			ASSERT(lsize != SPA_MINBLOCKSIZE);
			zio_nowait(zio_write_allocate(zio, spa,
			    zio->io_checksum, txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority,
			    zio->io_flags & ZIO_FLAG_GANG_INHERIT));
		}
	}

	ASSERT(resid == 0 && loff == zio->io_size);

	/* Enable the gang header checksum stage for this zio. */
	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;

	/* From here on zio's data and size refer to the gang header. */
	zio_push_transform(zio, gbh, gsize, gsize);

	/*
	 * As much as we'd like this to be 'ready' instead of 'done',
	 * updating our ASIZE doesn't happen until the io_done callback,
	 * so we have to wait for that to finish in order for our BP
	 * to be stable.
	 */
	return (zio_wait_for_children_done(zio));
}
|
1705 |
|
1706 /* |
|
1707 * ========================================================================== |
|
1708 * Allocate and free blocks |
|
1709 * ========================================================================== |
|
1710 */ |
|
1711 static int |
|
1712 zio_dva_allocate(zio_t *zio) |
|
1713 { |
|
1714 spa_t *spa = zio->io_spa; |
|
1715 metaslab_class_t *mc = spa->spa_normal_class; |
|
1716 blkptr_t *bp = zio->io_bp; |
|
1717 int error; |
|
1718 |
|
1719 ASSERT(BP_IS_HOLE(bp)); |
|
1720 ASSERT3U(BP_GET_NDVAS(bp), ==, 0); |
|
1721 ASSERT3U(zio->io_ndvas, >, 0); |
|
1722 ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa)); |
|
1723 |
|
1724 /* |
|
1725 * For testing purposes, we force I/Os to retry. We don't allow |
|
1726 * retries beyond the first pass since those I/Os are non-allocating |
|
1727 * writes. |
|
1728 */ |
|
1729 if (zio_io_fail_shift && |
|
1730 spa_sync_pass(zio->io_spa) <= zio_sync_pass.zp_rewrite && |
|
1731 zio_io_should_fail(zio_io_fail_shift)) |
|
1732 zio->io_flags |= ZIO_FLAG_WRITE_RETRY; |
|
1733 |
|
1734 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); |
|
1735 |
|
1736 error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas, |
|
1737 zio->io_txg, NULL, B_FALSE); |
|
1738 |
|
1739 if (error == 0) { |
|
1740 bp->blk_birth = zio->io_txg; |
|
1741 } else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { |
|
1742 return (zio_write_allocate_gang_members(zio, mc)); |
|
1743 } else { |
|
1744 zio->io_error = error; |
|
1745 } |
|
1746 |
|
1747 return (ZIO_PIPELINE_CONTINUE); |
|
1748 } |
|
1749 |
|
1750 static int |
|
1751 zio_dva_free(zio_t *zio) |
|
1752 { |
|
1753 blkptr_t *bp = zio->io_bp; |
|
1754 |
|
1755 metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE); |
|
1756 |
|
1757 BP_ZERO(bp); |
|
1758 |
|
1759 return (ZIO_PIPELINE_CONTINUE); |
|
1760 } |
|
1761 |
|
1762 static int |
|
1763 zio_dva_claim(zio_t *zio) |
|
1764 { |
|
1765 zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); |
|
1766 |
|
1767 return (ZIO_PIPELINE_CONTINUE); |
|
1768 } |
|
1769 |
|
1770 /* |
|
1771 * ========================================================================== |
|
1772 * Read and write to physical devices |
|
1773 * ========================================================================== |
|
1774 */ |
|
1775 |
|
1776 static int |
|
1777 zio_vdev_io_start(zio_t *zio) |
|
1778 { |
|
1779 vdev_t *vd = zio->io_vd; |
|
1780 vdev_t *tvd = vd ? vd->vdev_top : NULL; |
|
1781 blkptr_t *bp = zio->io_bp; |
|
1782 uint64_t align; |
|
1783 spa_t *spa = zio->io_spa; |
|
1784 |
|
1785 /* |
|
1786 * If the pool is already in a failure state then just suspend |
|
1787 * this IO until the problem is resolved. We will reissue them |
|
1788 * at that time. |
|
1789 */ |
|
1790 if (spa_state(spa) == POOL_STATE_IO_FAILURE && |
|
1791 zio->io_type == ZIO_TYPE_WRITE) |
|
1792 return (zio_vdev_suspend_io(zio)); |
|
1793 |
|
1794 /* |
|
1795 * The mirror_ops handle multiple DVAs in a single BP |
|
1796 */ |
|
1797 if (vd == NULL) |
|
1798 return (vdev_mirror_ops.vdev_op_io_start(zio)); |
|
1799 |
|
1800 align = 1ULL << tvd->vdev_ashift; |
|
1801 |
|
1802 if (zio->io_retries == 0 && vd == tvd) |
|
1803 zio->io_flags |= ZIO_FLAG_FAILFAST; |
|
1804 |
|
1805 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) { |
|
1806 zio->io_flags |= ZIO_FLAG_PHYSICAL; |
|
1807 zio->io_offset += VDEV_LABEL_START_SIZE; |
|
1808 } |
|
1809 |
|
1810 if (P2PHASE(zio->io_size, align) != 0) { |
|
1811 uint64_t asize = P2ROUNDUP(zio->io_size, align); |
|
1812 char *abuf = zio_buf_alloc(asize); |
|
1813 ASSERT(vd == tvd); |
|
1814 if (zio->io_type == ZIO_TYPE_WRITE) { |
|
1815 bcopy(zio->io_data, abuf, zio->io_size); |
|
1816 bzero(abuf + zio->io_size, asize - zio->io_size); |
|
1817 } |
|
1818 zio_push_transform(zio, abuf, asize, asize); |
|
1819 ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK)); |
|
1820 zio->io_flags |= ZIO_FLAG_SUBBLOCK; |
|
1821 } |
|
1822 |
|
1823 ASSERT(P2PHASE(zio->io_offset, align) == 0); |
|
1824 ASSERT(P2PHASE(zio->io_size, align) == 0); |
|
1825 ASSERT(bp == NULL || |
|
1826 P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); |
|
1827 ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); |
|
1828 |
|
1829 return (vd->vdev_ops->vdev_op_io_start(zio)); |
|
1830 } |
|
1831 |
|
1832 static int |
|
1833 zio_vdev_io_done(zio_t *zio) |
|
1834 { |
|
1835 if (zio->io_vd == NULL) |
|
1836 return (vdev_mirror_ops.vdev_op_io_done(zio)); |
|
1837 |
|
1838 return (zio->io_vd->vdev_ops->vdev_op_io_done(zio)); |
|
1839 } |
|
1840 |
|
1841 /* XXPOLICY */ |
|
1842 boolean_t |
|
1843 zio_should_retry(zio_t *zio) |
|
1844 { |
|
1845 vdev_t *vd = zio->io_vd; |
|
1846 |
|
1847 if (zio->io_error == 0) |
|
1848 return (B_FALSE); |
|
1849 if (zio->io_delegate_list != NULL) |
|
1850 return (B_FALSE); |
|
1851 if (vd != NULL) { |
|
1852 if (vd != vd->vdev_top) |
|
1853 return (B_FALSE); |
|
1854 if (vd->vdev_is_failing) |
|
1855 return (B_FALSE); |
|
1856 } |
|
1857 if (zio->io_flags & ZIO_FLAG_DONT_RETRY) |
|
1858 return (B_FALSE); |
|
1859 if (zio->io_retries > 0) |
|
1860 return (B_FALSE); |
|
1861 |
|
1862 return (B_TRUE); |
|
1863 } |
|
1864 |
|
1865 static int |
|
1866 zio_vdev_io_assess(zio_t *zio) |
|
1867 { |
|
1868 vdev_t *vd = zio->io_vd; |
|
1869 vdev_t *tvd = vd ? vd->vdev_top : NULL; |
|
1870 |
|
1871 ASSERT(zio->io_vsd == NULL); |
|
1872 |
|
1873 if (zio->io_flags & ZIO_FLAG_SUBBLOCK) { |
|
1874 void *abuf; |
|
1875 uint64_t asize; |
|
1876 ASSERT(vd == tvd); |
|
1877 zio_pop_transform(zio, &abuf, &asize, &asize); |
|
1878 if (zio->io_type == ZIO_TYPE_READ) |
|
1879 bcopy(abuf, zio->io_data, zio->io_size); |
|
1880 zio_buf_free(abuf, asize); |
|
1881 zio->io_flags &= ~ZIO_FLAG_SUBBLOCK; |
|
1882 } |
|
1883 |
|
1884 if (zio_injection_enabled && !zio->io_error) |
|
1885 zio->io_error = zio_handle_fault_injection(zio, EIO); |
|
1886 |
|
1887 /* |
|
1888 * If the I/O failed, determine whether we should attempt to retry it. |
|
1889 */ |
|
1890 /* XXPOLICY */ |
|
1891 if (zio_should_retry(zio)) { |
|
1892 ASSERT(tvd == vd); |
|
1893 |
|
1894 zio->io_retries++; |
|
1895 zio->io_error = 0; |
|
1896 zio->io_flags &= ZIO_FLAG_RETRY_INHERIT; |
|
1897 /* XXPOLICY */ |
|
1898 zio->io_flags &= ~ZIO_FLAG_FAILFAST; |
|
1899 zio->io_flags |= ZIO_FLAG_DONT_CACHE; |
|
1900 zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; |
|
1901 |
|
1902 return (ZIO_PIPELINE_CONTINUE); |
|
1903 } |
|
1904 |
|
1905 return (ZIO_PIPELINE_CONTINUE); |
|
1906 } |
916 } |
1907 |
917 |
1908 void |
918 void |
1909 zio_vdev_io_reissue(zio_t *zio) |
919 zio_interrupt(zio_t *zio) |
1910 { |
920 { |
1911 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); |
921 zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT); |
1912 ASSERT(zio->io_error == 0); |
922 } |
1913 |
|
1914 zio->io_stage--; |
|
1915 } |
|
1916 |
|
1917 void |
|
1918 zio_vdev_io_redone(zio_t *zio) |
|
1919 { |
|
1920 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); |
|
1921 |
|
1922 zio->io_stage--; |
|
1923 } |
|
1924 |
|
1925 void |
|
1926 zio_vdev_io_bypass(zio_t *zio) |
|
1927 { |
|
1928 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); |
|
1929 ASSERT(zio->io_error == 0); |
|
1930 |
|
1931 zio->io_flags |= ZIO_FLAG_IO_BYPASS; |
|
1932 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; |
|
1933 } |
|
1934 |
|
1935 /* |
|
1936 * ========================================================================== |
|
1937 * Generate and verify checksums |
|
1938 * ========================================================================== |
|
1939 */ |
|
1940 static int |
|
1941 zio_checksum_generate(zio_t *zio) |
|
1942 { |
|
1943 int checksum = zio->io_checksum; |
|
1944 blkptr_t *bp = zio->io_bp; |
|
1945 |
|
1946 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); |
|
1947 |
|
1948 BP_SET_CHECKSUM(bp, checksum); |
|
1949 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); |
|
1950 |
|
1951 zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); |
|
1952 |
|
1953 return (ZIO_PIPELINE_CONTINUE); |
|
1954 } |
|
1955 |
|
1956 static int |
|
1957 zio_gang_checksum_generate(zio_t *zio) |
|
1958 { |
|
1959 zio_cksum_t zc; |
|
1960 zio_gbh_phys_t *gbh = zio->io_data; |
|
1961 |
|
1962 ASSERT(BP_IS_GANG(zio->io_bp)); |
|
1963 ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); |
|
1964 |
|
1965 zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); |
|
1966 |
|
1967 zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); |
|
1968 |
|
1969 return (ZIO_PIPELINE_CONTINUE); |
|
1970 } |
|
1971 |
|
1972 static int |
|
1973 zio_checksum_verify(zio_t *zio) |
|
1974 { |
|
1975 if (zio->io_bp != NULL) { |
|
1976 zio->io_error = zio_checksum_error(zio); |
|
1977 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) |
|
1978 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, |
|
1979 zio->io_spa, zio->io_vd, zio, 0, 0); |
|
1980 } |
|
1981 |
|
1982 return (ZIO_PIPELINE_CONTINUE); |
|
1983 } |
|
1984 |
|
1985 /* |
|
1986 * Called by RAID-Z to ensure we don't compute the checksum twice. |
|
1987 */ |
|
1988 void |
|
1989 zio_checksum_verified(zio_t *zio) |
|
1990 { |
|
1991 zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); |
|
1992 } |
|
1993 |
|
1994 /* |
|
1995 * Set the external verifier for a gang block based on stuff in the bp |
|
1996 */ |
|
1997 void |
|
1998 zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp) |
|
1999 { |
|
2000 blkptr_t *bp = zio->io_bp; |
|
2001 |
|
2002 zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp)); |
|
2003 zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp)); |
|
2004 zcp->zc_word[2] = bp->blk_birth; |
|
2005 zcp->zc_word[3] = 0; |
|
2006 } |
|
2007 |
|
2008 /* |
|
2009 * ========================================================================== |
|
2010 * Define the pipeline |
|
2011 * ========================================================================== |
|
2012 */ |
|
/*
 * Signature shared by every pipeline stage handler: takes the zio and
 * returns an int (the handlers in this file return
 * ZIO_PIPELINE_CONTINUE, or pass through another handler's result).
 */
typedef int zio_pipe_stage_t(zio_t *zio);

/*
 * Stage dispatch table.  zio_execute() calls
 * zio_pipeline[zio->io_stage], so each handler's position is its stage
 * number.  The first and last slots are NULL.
 * NOTE(review): the order presumably has to match the ZIO_STAGE_*
 * numbering declared elsewhere -- confirm before reordering entries.
 */
zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
	NULL,
	zio_wait_for_children_ready,
	zio_read_init,
	zio_issue_async,
	zio_write_compress,
	zio_checksum_generate,
	zio_get_gang_header,
	zio_rewrite_gang_members,
	zio_free_gang_members,
	zio_claim_gang_members,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_gang_checksum_generate,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_wait_for_children_done,
	zio_checksum_verify,
	zio_read_gang_members,
	zio_read_decompress,
	zio_assess,
	zio_done,
	NULL
};
|
2042 |
923 |
2043 /* |
924 /* |
2044 * Execute the I/O pipeline until one of the following occurs: |
925 * Execute the I/O pipeline until one of the following occurs: |
2045 * (1) the I/O completes; (2) the pipeline stalls waiting for |
926 * (1) the I/O completes; (2) the pipeline stalls waiting for |
2046 * dependent child I/Os; (3) the I/O issues, so we're waiting |
927 * dependent child I/Os; (3) the I/O issues, so we're waiting |
2051 * there's no CPU work; it never burns a thread in cv_wait(). |
932 * there's no CPU work; it never burns a thread in cv_wait(). |
2052 * |
933 * |
2053 * There's no locking on io_stage because there's no legitimate way |
934 * There's no locking on io_stage because there's no legitimate way |
2054 * for multiple threads to be attempting to process the same I/O. |
935 * for multiple threads to be attempting to process the same I/O. |
2055 */ |
936 */ |
|
937 static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES]; |
|
938 |
2056 void |
939 void |
2057 zio_execute(zio_t *zio) |
940 zio_execute(zio_t *zio) |
2058 { |
941 { |
|
942 zio->io_executor = curthread; |
|
943 |
2059 while (zio->io_stage < ZIO_STAGE_DONE) { |
944 while (zio->io_stage < ZIO_STAGE_DONE) { |
2060 uint32_t pipeline = zio->io_pipeline; |
945 uint32_t pipeline = zio->io_pipeline; |
|
946 zio_stage_t stage = zio->io_stage; |
2061 int rv; |
947 int rv; |
2062 |
948 |
2063 ASSERT(!MUTEX_HELD(&zio->io_lock)); |
949 ASSERT(!MUTEX_HELD(&zio->io_lock)); |
2064 |
950 |
|
951 while (((1U << ++stage) & pipeline) == 0) |
|
952 continue; |
|
953 |
|
954 ASSERT(stage <= ZIO_STAGE_DONE); |
|
955 ASSERT(zio->io_stall == NULL); |
|
956 |
2065 /* |
957 /* |
2066 * If an error occurred outside the vdev stack, |
958 * If we are in interrupt context and this pipeline stage |
2067 * just execute the interlock stages to clean up. |
959 * will grab a config lock that is held across I/O, |
|
960 * issue async to avoid deadlock. |
2068 */ |
961 */ |
2069 if (zio->io_error && |
962 if (((1U << stage) & ZIO_CONFIG_LOCK_BLOCKING_STAGES) && |
2070 ((1U << zio->io_stage) & ZIO_VDEV_IO_STAGES) == 0) |
963 zio->io_vd == NULL && |
2071 pipeline &= ZIO_ERROR_PIPELINE_MASK; |
964 zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { |
2072 |
965 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); |
2073 while (((1U << ++zio->io_stage) & pipeline) == 0) |
966 return; |
2074 continue; |
967 } |
2075 |
968 |
2076 ASSERT(zio->io_stage <= ZIO_STAGE_DONE); |
969 zio->io_stage = stage; |
2077 ASSERT(zio->io_stalled == 0); |
970 rv = zio_pipeline[stage](zio); |
2078 |
|
2079 rv = zio_pipeline[zio->io_stage](zio); |
|
2080 |
971 |
2081 if (rv == ZIO_PIPELINE_STOP) |
972 if (rv == ZIO_PIPELINE_STOP) |
2082 return; |
973 return; |
2083 |
974 |
2084 ASSERT(rv == ZIO_PIPELINE_CONTINUE); |
975 ASSERT(rv == ZIO_PIPELINE_CONTINUE); |
2085 } |
976 } |
2086 } |
977 } |
2087 |
978 |
2088 static boolean_t |
979 /* |
2089 zio_io_should_fail(uint16_t range) |
980 * ========================================================================== |
2090 { |
981 * Initiate I/O, either sync or async |
2091 static uint16_t allocs = 0; |
982 * ========================================================================== |
2092 |
983 */ |
2093 return (P2PHASE(allocs++, 1U<<range) == 0); |
984 int |
|
985 zio_wait(zio_t *zio) |
|
986 { |
|
987 int error; |
|
988 |
|
989 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); |
|
990 ASSERT(zio->io_executor == NULL); |
|
991 |
|
992 zio->io_waiter = curthread; |
|
993 |
|
994 zio_execute(zio); |
|
995 |
|
996 mutex_enter(&zio->io_lock); |
|
997 while (zio->io_executor != NULL) |
|
998 cv_wait(&zio->io_cv, &zio->io_lock); |
|
999 mutex_exit(&zio->io_lock); |
|
1000 |
|
1001 error = zio->io_error; |
|
1002 zio_destroy(zio); |
|
1003 |
|
1004 return (error); |
|
1005 } |
|
1006 |
|
1007 void |
|
1008 zio_nowait(zio_t *zio) |
|
1009 { |
|
1010 ASSERT(zio->io_executor == NULL); |
|
1011 |
|
1012 if (zio->io_parent == NULL && zio->io_child_type == ZIO_CHILD_LOGICAL) { |
|
1013 /* |
|
1014 * This is a logical async I/O with no parent to wait for it. |
|
1015 * Attach it to the pool's global async root zio so that |
|
1016 * spa_unload() has a way of waiting for async I/O to finish. |
|
1017 */ |
|
1018 spa_t *spa = zio->io_spa; |
|
1019 zio->io_async_root = B_TRUE; |
|
1020 mutex_enter(&spa->spa_async_root_lock); |
|
1021 spa->spa_async_root_count++; |
|
1022 mutex_exit(&spa->spa_async_root_lock); |
|
1023 } |
|
1024 |
|
1025 zio_execute(zio); |
|
1026 } |
|
1027 |
|
1028 /* |
|
1029 * ========================================================================== |
|
1030 * Reexecute or suspend/resume failed I/O |
|
1031 * ========================================================================== |
|
1032 */ |
|
1033 |
|
1034 static void |
|
1035 zio_reexecute(zio_t *pio) |
|
1036 { |
|
1037 zio_t *zio, *zio_next; |
|
1038 |
|
1039 pio->io_flags = pio->io_orig_flags; |
|
1040 pio->io_stage = pio->io_orig_stage; |
|
1041 pio->io_pipeline = pio->io_orig_pipeline; |
|
1042 pio->io_reexecute = 0; |
|
1043 pio->io_error = 0; |
|
1044 for (int c = 0; c < ZIO_CHILD_TYPES; c++) |
|
1045 pio->io_child_error[c] = 0; |
|
1046 |
|
1047 if (IO_IS_ALLOCATING(pio)) { |
|
1048 /* |
|
1049 * Remember the failed bp so that the io_ready() callback |
|
1050 * can update its accounting upon reexecution. The block |
|
1051 * was already freed in zio_done(); we indicate this with |
|
1052 * a fill count of -1 so that zio_free() knows to skip it. |
|
1053 */ |
|
1054 blkptr_t *bp = pio->io_bp; |
|
1055 ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg); |
|
1056 bp->blk_fill = BLK_FILL_ALREADY_FREED; |
|
1057 pio->io_bp_orig = *bp; |
|
1058 BP_ZERO(bp); |
|
1059 } |
|
1060 |
|
1061 /* |
|
1062 * As we reexecute pio's children, new children could be created. |
|
1063 * New children go to the head of the io_child list, however, |
|
1064 * so we will (correctly) not reexecute them. The key is that |
|
1065 * the remainder of the io_child list, from 'zio_next' onward, |
|
1066 * cannot be affected by any side effects of reexecuting 'zio'. |
|
1067 */ |
|
1068 for (zio = pio->io_child; zio != NULL; zio = zio_next) { |
|
1069 zio_next = zio->io_sibling_next; |
|
1070 mutex_enter(&pio->io_lock); |
|
1071 pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; |
|
1072 pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; |
|
1073 mutex_exit(&pio->io_lock); |
|
1074 zio_reexecute(zio); |
|
1075 } |
|
1076 |
|
1077 /* |
|
1078 * Now that all children have been reexecuted, execute the parent. |
|
1079 */ |
|
1080 zio_execute(pio); |
|
1081 } |
|
1082 |
|
1083 void |
|
1084 zio_suspend(spa_t *spa, zio_t *zio) |
|
1085 { |
|
1086 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) |
|
1087 fm_panic("Pool '%s' has encountered an uncorrectable I/O " |
|
1088 "failure and the failure mode property for this pool " |
|
1089 "is set to panic.", spa_name(spa)); |
|
1090 |
|
1091 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); |
|
1092 |
|
1093 mutex_enter(&spa->spa_suspend_lock); |
|
1094 |
|
1095 if (spa->spa_suspend_zio_root == NULL) |
|
1096 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 0); |
|
1097 |
|
1098 spa->spa_suspended = B_TRUE; |
|
1099 |
|
1100 if (zio != NULL) { |
|
1101 ASSERT(zio != spa->spa_suspend_zio_root); |
|
1102 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); |
|
1103 ASSERT(zio->io_parent == NULL); |
|
1104 ASSERT(zio->io_stage == ZIO_STAGE_DONE); |
|
1105 zio_add_child(spa->spa_suspend_zio_root, zio); |
|
1106 } |
|
1107 |
|
1108 mutex_exit(&spa->spa_suspend_lock); |
|
1109 } |
|
1110 |
|
1111 void |
|
1112 zio_resume(spa_t *spa) |
|
1113 { |
|
1114 zio_t *pio, *zio; |
|
1115 |
|
1116 /* |
|
1117 * Reexecute all previously suspended i/o. |
|
1118 */ |
|
1119 mutex_enter(&spa->spa_suspend_lock); |
|
1120 spa->spa_suspended = B_FALSE; |
|
1121 cv_broadcast(&spa->spa_suspend_cv); |
|
1122 pio = spa->spa_suspend_zio_root; |
|
1123 spa->spa_suspend_zio_root = NULL; |
|
1124 mutex_exit(&spa->spa_suspend_lock); |
|
1125 |
|
1126 if (pio == NULL) |
|
1127 return; |
|
1128 |
|
1129 while ((zio = pio->io_child) != NULL) { |
|
1130 zio_remove_child(pio, zio); |
|
1131 zio->io_parent = NULL; |
|
1132 zio_reexecute(zio); |
|
1133 } |
|
1134 |
|
1135 ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0); |
|
1136 |
|
1137 (void) zio_wait(pio); |
|
1138 } |
|
1139 |
|
1140 void |
|
1141 zio_resume_wait(spa_t *spa) |
|
1142 { |
|
1143 mutex_enter(&spa->spa_suspend_lock); |
|
1144 while (spa_suspended(spa)) |
|
1145 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); |
|
1146 mutex_exit(&spa->spa_suspend_lock); |
|
1147 } |
|
1148 |
|
1149 /* |
|
1150 * ========================================================================== |
|
1151 * Gang blocks. |
|
1152 * |
|
1153 * A gang block is a collection of small blocks that looks to the DMU |
|
1154 * like one large block. When zio_dva_allocate() cannot find a block |
|
1155 * of the requested size, due to either severe fragmentation or the pool |
|
1156 * being nearly full, it calls zio_write_gang_block() to construct the |
|
1157 * block from smaller fragments. |
|
1158 * |
|
1159 * A gang block consists of a gang header (zio_gbh_phys_t) and up to |
|
1160 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like |
|
1161 * an indirect block: it's an array of block pointers. It consumes |
|
1162 * only one sector and hence is allocatable regardless of fragmentation. |
|
1163 * The gang header's bps point to its gang members, which hold the data. |
|
1164 * |
|
1165 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> |
|
1166 * as the verifier to ensure uniqueness of the SHA256 checksum. |
|
1167 * Critically, the gang block bp's blk_cksum is the checksum of the data, |
|
1168 * not the gang header. This ensures that data block signatures (needed for |
|
1169 * deduplication) are independent of how the block is physically stored. |
|
1170 * |
|
1171 * Gang blocks can be nested: a gang member may itself be a gang block. |
|
1172 * Thus every gang block is a tree in which root and all interior nodes are |
|
1173 * gang headers, and the leaves are normal blocks that contain user data. |
|
1174 * The root of the gang tree is called the gang leader. |
|
1175 * |
|
1176 * To perform any operation (read, rewrite, free, claim) on a gang block, |
|
1177 * zio_gang_assemble() first assembles the gang tree (minus data leaves) |
|
1178 * in the io_gang_tree field of the original logical i/o by recursively |
|
1179 * reading the gang leader and all gang headers below it. This yields |
|
1180 * an in-core tree containing the contents of every gang header and the |
|
1181 * bps for every constituent of the gang block. |
|
1182 * |
|
1183 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree |
|
1184 * and invokes a callback on each bp. To free a gang block, zio_gang_issue() |
|
1185 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. |
|
1186 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). |
|
1187 * zio_read_gang() is a wrapper around zio_read() that omits reading gang |
|
1188 * headers, since we already have those in io_gang_tree. zio_rewrite_gang() |
|
1189 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() |
|
1190 * of the gang header plus zio_checksum_compute() of the data to update the |
|
1191 * gang header's blk_cksum as described above. |
|
1192 * |
|
1193 * The two-phase assemble/issue model solves the problem of partial failure -- |
|
1194 * what if you'd freed part of a gang block but then couldn't read the |
|
1195 * gang header for another part? Assembling the entire gang tree first |
|
1196 * ensures that all the necessary gang header I/O has succeeded before |
|
1197 * starting the actual work of free, claim, or write. Once the gang tree |
|
1198 * is assembled, free and claim are in-memory operations that cannot fail. |
|
1199 * |
|
1200 * In the event that a gang write fails, zio_dva_unallocate() walks the |
|
1201 * gang tree to immediately free (i.e. insert back into the space map) |
|
1202 * everything we've allocated. This ensures that we don't get ENOSPC |
|
1203 * errors during repeated suspend/resume cycles due to a flaky device. |
|
1204 * |
|
1205 * Gang rewrites only happen during sync-to-convergence. If we can't assemble |
|
1206 * the gang tree, we won't modify the block, so we can safely defer the free |
|
1207 * (knowing that the block is still intact). If we *can* assemble the gang |
|
1208 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free |
|
1209 * each constituent bp and we can allocate a new block on the next sync pass. |
|
1210 * |
|
1211 * In all cases, the gang tree allows complete recovery from partial failure. |
|
1212 * ========================================================================== |
|
1213 */ |
|
1214 |
|
1215 static zio_t * |
|
1216 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) |
|
1217 { |
|
1218 if (gn != NULL) |
|
1219 return (pio); |
|
1220 |
|
1221 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), |
|
1222 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), |
|
1223 &pio->io_bookmark)); |
|
1224 } |
|
1225 |
|
1226 zio_t * |
|
1227 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) |
|
1228 { |
|
1229 zio_t *zio; |
|
1230 |
|
1231 if (gn != NULL) { |
|
1232 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, |
|
1233 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, |
|
1234 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); |
|
1235 /* |
|
1236 * As we rewrite each gang header, the pipeline will compute |
|
1237 * a new gang block header checksum for it; but no one will |
|
1238 * compute a new data checksum, so we do that here. The one |
|
1239 * exception is the gang leader: the pipeline already computed |
|
1240 * its data checksum because that stage precedes gang assembly. |
|
1241 * (Presently, nothing actually uses interior data checksums; |
|
1242 * this is just good hygiene.) |
|
1243 */ |
|
1244 if (gn != pio->io_logical->io_gang_tree) { |
|
1245 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), |
|
1246 data, BP_GET_PSIZE(bp)); |
|
1247 } |
|
1248 } else { |
|
1249 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, |
|
1250 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, |
|
1251 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); |
|
1252 } |
|
1253 |
|
1254 return (zio); |
|
1255 } |
|
1256 |
|
1257 /* ARGSUSED */ |
|
1258 zio_t * |
|
1259 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) |
|
1260 { |
|
1261 return (zio_free(pio, pio->io_spa, pio->io_txg, bp, |
|
1262 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); |
|
1263 } |
|
1264 |
|
1265 /* ARGSUSED */ |
|
1266 zio_t * |
|
1267 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) |
|
1268 { |
|
1269 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, |
|
1270 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); |
|
1271 } |
|
1272 |
|
/*
 * Per-I/O-type callback table used by zio_gang_tree_issue(), which
 * indexes it with the logical zio's io_type.  The first and last slots
 * are NULL.
 * NOTE(review): slot order presumably follows the ZIO_TYPE_* numbering
 * declared elsewhere -- confirm before reordering.
 */
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};
|
1281 |
|
1282 static void zio_gang_tree_assemble_done(zio_t *zio); |
|
1283 |
|
1284 static zio_gang_node_t * |
|
1285 zio_gang_node_alloc(zio_gang_node_t **gnpp) |
|
1286 { |
|
1287 zio_gang_node_t *gn; |
|
1288 |
|
1289 ASSERT(*gnpp == NULL); |
|
1290 |
|
1291 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); |
|
1292 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); |
|
1293 *gnpp = gn; |
|
1294 |
|
1295 return (gn); |
|
1296 } |
|
1297 |
|
1298 static void |
|
1299 zio_gang_node_free(zio_gang_node_t **gnpp) |
|
1300 { |
|
1301 zio_gang_node_t *gn = *gnpp; |
|
1302 |
|
1303 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) |
|
1304 ASSERT(gn->gn_child[g] == NULL); |
|
1305 |
|
1306 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); |
|
1307 kmem_free(gn, sizeof (*gn)); |
|
1308 *gnpp = NULL; |
|
1309 } |
|
1310 |
|
1311 static void |
|
1312 zio_gang_tree_free(zio_gang_node_t **gnpp) |
|
1313 { |
|
1314 zio_gang_node_t *gn = *gnpp; |
|
1315 |
|
1316 if (gn == NULL) |
|
1317 return; |
|
1318 |
|
1319 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) |
|
1320 zio_gang_tree_free(&gn->gn_child[g]); |
|
1321 |
|
1322 zio_gang_node_free(gnpp); |
|
1323 } |
|
1324 |
|
1325 static void |
|
1326 zio_gang_tree_assemble(zio_t *lio, blkptr_t *bp, zio_gang_node_t **gnpp) |
|
1327 { |
|
1328 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); |
|
1329 |
|
1330 ASSERT(lio->io_logical == lio); |
|
1331 ASSERT(BP_IS_GANG(bp)); |
|
1332 |
|
1333 zio_nowait(zio_read(lio, lio->io_spa, bp, gn->gn_gbh, |
|
1334 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, |
|
1335 lio->io_priority, ZIO_GANG_CHILD_FLAGS(lio), &lio->io_bookmark)); |
|
1336 } |
|
1337 |
|
1338 static void |
|
1339 zio_gang_tree_assemble_done(zio_t *zio) |
|
1340 { |
|
1341 zio_t *lio = zio->io_logical; |
|
1342 zio_gang_node_t *gn = zio->io_private; |
|
1343 blkptr_t *bp = zio->io_bp; |
|
1344 |
|
1345 ASSERT(zio->io_parent == lio); |
|
1346 ASSERT(zio->io_child == NULL); |
|
1347 |
|
1348 if (zio->io_error) |
|
1349 return; |
|
1350 |
|
1351 if (BP_SHOULD_BYTESWAP(bp)) |
|
1352 byteswap_uint64_array(zio->io_data, zio->io_size); |
|
1353 |
|
1354 ASSERT(zio->io_data == gn->gn_gbh); |
|
1355 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); |
|
1356 ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); |
|
1357 |
|
1358 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { |
|
1359 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; |
|
1360 if (!BP_IS_GANG(gbp)) |
|
1361 continue; |
|
1362 zio_gang_tree_assemble(lio, gbp, &gn->gn_child[g]); |
|
1363 } |
|
1364 } |
|
1365 |
|
1366 static void |
|
1367 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) |
|
1368 { |
|
1369 zio_t *lio = pio->io_logical; |
|
1370 zio_t *zio; |
|
1371 |
|
1372 ASSERT(BP_IS_GANG(bp) == !!gn); |
|
1373 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(lio->io_bp)); |
|
1374 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == lio->io_gang_tree); |
|
1375 |
|
1376 /* |
|
1377 * If you're a gang header, your data is in gn->gn_gbh. |
|
1378 * If you're a gang member, your data is in 'data' and gn == NULL. |
|
1379 */ |
|
1380 zio = zio_gang_issue_func[lio->io_type](pio, bp, gn, data); |
|
1381 |
|
1382 if (gn != NULL) { |
|
1383 ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); |
|
1384 |
|
1385 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { |
|
1386 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; |
|
1387 if (BP_IS_HOLE(gbp)) |
|
1388 continue; |
|
1389 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); |
|
1390 data = (char *)data + BP_GET_PSIZE(gbp); |
|
1391 } |
|
1392 } |
|
1393 |
|
1394 if (gn == lio->io_gang_tree) |
|
1395 ASSERT3P((char *)lio->io_data + lio->io_size, ==, data); |
|
1396 |
|
1397 if (zio != pio) |
|
1398 zio_nowait(zio); |
|
1399 } |
|
1400 |
|
1401 static int |
|
1402 zio_gang_assemble(zio_t *zio) |
|
1403 { |
|
1404 blkptr_t *bp = zio->io_bp; |
|
1405 |
|
1406 ASSERT(BP_IS_GANG(bp) && zio == zio->io_logical); |
|
1407 |
|
1408 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); |
|
1409 |
|
1410 return (ZIO_PIPELINE_CONTINUE); |
|
1411 } |
|
1412 |
|
1413 static int |
|
1414 zio_gang_issue(zio_t *zio) |
|
1415 { |
|
1416 zio_t *lio = zio->io_logical; |
|
1417 blkptr_t *bp = zio->io_bp; |
|
1418 |
|
1419 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) |
|
1420 return (ZIO_PIPELINE_STOP); |
|
1421 |
|
1422 ASSERT(BP_IS_GANG(bp) && zio == lio); |
|
1423 |
|
1424 if (zio->io_child_error[ZIO_CHILD_GANG] == 0) |
|
1425 zio_gang_tree_issue(lio, lio->io_gang_tree, bp, lio->io_data); |
|
1426 else |
|
1427 zio_gang_tree_free(&lio->io_gang_tree); |
|
1428 |
|
1429 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; |
|
1430 |
|
1431 return (ZIO_PIPELINE_CONTINUE); |
|
1432 } |
|
1433 |
|
1434 static void |
|
1435 zio_write_gang_member_ready(zio_t *zio) |
|
1436 { |
|
1437 zio_t *pio = zio->io_parent; |
|
1438 zio_t *lio = zio->io_logical; |
|
1439 dva_t *cdva = zio->io_bp->blk_dva; |
|
1440 dva_t *pdva = pio->io_bp->blk_dva; |
|
1441 uint64_t asize; |
|
1442 |
|
1443 if (BP_IS_HOLE(zio->io_bp)) |
|
1444 return; |
|
1445 |
|
1446 ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); |
|
1447 |
|
1448 ASSERT(zio->io_child_type == ZIO_CHILD_GANG); |
|
1449 ASSERT3U(zio->io_prop.zp_ndvas, ==, lio->io_prop.zp_ndvas); |
|
1450 ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); |
|
1451 ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); |
|
1452 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); |
|
1453 |
|
1454 mutex_enter(&pio->io_lock); |
|
1455 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { |
|
1456 ASSERT(DVA_GET_GANG(&pdva[d])); |
|
1457 asize = DVA_GET_ASIZE(&pdva[d]); |
|
1458 asize += DVA_GET_ASIZE(&cdva[d]); |
|
1459 DVA_SET_ASIZE(&pdva[d], asize); |
|
1460 } |
|
1461 mutex_exit(&pio->io_lock); |
|
1462 } |
|
1463 |
|
1464 static int |
|
1465 zio_write_gang_block(zio_t *pio) |
|
1466 { |
|
1467 spa_t *spa = pio->io_spa; |
|
1468 blkptr_t *bp = pio->io_bp; |
|
1469 zio_t *lio = pio->io_logical; |
|
1470 zio_t *zio; |
|
1471 zio_gang_node_t *gn, **gnpp; |
|
1472 zio_gbh_phys_t *gbh; |
|
1473 uint64_t txg = pio->io_txg; |
|
1474 uint64_t resid = pio->io_size; |
|
1475 uint64_t lsize; |
|
1476 int ndvas = lio->io_prop.zp_ndvas; |
|
1477 int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); |
|
1478 zio_prop_t zp; |
|
1479 int error; |
|
1480 |
|
1481 error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE, |
|
1482 bp, gbh_ndvas, txg, pio == lio ? NULL : lio->io_bp, |
|
1483 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); |
|
1484 if (error) { |
|
1485 pio->io_error = error; |
|
1486 return (ZIO_PIPELINE_CONTINUE); |
|
1487 } |
|
1488 |
|
1489 if (pio == lio) { |
|
1490 gnpp = &lio->io_gang_tree; |
|
1491 } else { |
|
1492 gnpp = pio->io_private; |
|
1493 ASSERT(pio->io_ready == zio_write_gang_member_ready); |
|
1494 } |
|
1495 |
|
1496 gn = zio_gang_node_alloc(gnpp); |
|
1497 gbh = gn->gn_gbh; |
|
1498 bzero(gbh, SPA_GANGBLOCKSIZE); |
|
1499 |
|
1500 /* |
|
1501 * Create the gang header. |
|
1502 */ |
|
1503 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, |
|
1504 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); |
|
1505 |
|
1506 /* |
|
1507 * Create and nowait the gang children. |
|
1508 */ |
|
1509 for (int g = 0; resid != 0; resid -= lsize, g++) { |
|
1510 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), |
|
1511 SPA_MINBLOCKSIZE); |
|
1512 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); |
|
1513 |
|
1514 zp.zp_checksum = lio->io_prop.zp_checksum; |
|
1515 zp.zp_compress = ZIO_COMPRESS_OFF; |
|
1516 zp.zp_type = DMU_OT_NONE; |
|
1517 zp.zp_level = 0; |
|
1518 zp.zp_ndvas = lio->io_prop.zp_ndvas; |
|
1519 |
|
1520 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], |
|
1521 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, |
|
1522 zio_write_gang_member_ready, NULL, &gn->gn_child[g], |
|
1523 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), |
|
1524 &pio->io_bookmark)); |
|
1525 } |
|
1526 |
|
1527 /* |
|
1528 * Set pio's pipeline to just wait for zio to finish. |
|
1529 */ |
|
1530 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; |
|
1531 |
|
1532 zio_nowait(zio); |
|
1533 |
|
1534 return (ZIO_PIPELINE_CONTINUE); |
|
1535 } |
|
1536 |
|
1537 /* |
|
1538 * ========================================================================== |
|
1539 * Allocate and free blocks |
|
1540 * ========================================================================== |
|
1541 */ |
|
1542 |
|
1543 static int |
|
1544 zio_dva_allocate(zio_t *zio) |
|
1545 { |
|
1546 spa_t *spa = zio->io_spa; |
|
1547 metaslab_class_t *mc = spa->spa_normal_class; |
|
1548 blkptr_t *bp = zio->io_bp; |
|
1549 int error; |
|
1550 |
|
1551 ASSERT(BP_IS_HOLE(bp)); |
|
1552 ASSERT3U(BP_GET_NDVAS(bp), ==, 0); |
|
1553 ASSERT3U(zio->io_prop.zp_ndvas, >, 0); |
|
1554 ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa)); |
|
1555 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); |
|
1556 |
|
1557 error = metaslab_alloc(spa, mc, zio->io_size, bp, |
|
1558 zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0); |
|
1559 |
|
1560 if (error) { |
|
1561 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) |
|
1562 return (zio_write_gang_block(zio)); |
|
1563 zio->io_error = error; |
|
1564 } |
|
1565 |
|
1566 return (ZIO_PIPELINE_CONTINUE); |
|
1567 } |
|
1568 |
|
1569 static int |
|
1570 zio_dva_free(zio_t *zio) |
|
1571 { |
|
1572 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); |
|
1573 |
|
1574 return (ZIO_PIPELINE_CONTINUE); |
|
1575 } |
|
1576 |
|
1577 static int |
|
1578 zio_dva_claim(zio_t *zio) |
|
1579 { |
|
1580 int error; |
|
1581 |
|
1582 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); |
|
1583 if (error) |
|
1584 zio->io_error = error; |
|
1585 |
|
1586 return (ZIO_PIPELINE_CONTINUE); |
|
1587 } |
|
1588 |
|
1589 /* |
|
1590 * Undo an allocation. This is used by zio_done() when an I/O fails |
|
1591 * and we want to give back the block we just allocated. |
|
1592 * This handles both normal blocks and gang blocks. |
|
1593 */ |
|
1594 static void |
|
1595 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) |
|
1596 { |
|
1597 spa_t *spa = zio->io_spa; |
|
1598 boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE); |
|
1599 |
|
1600 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); |
|
1601 |
|
1602 if (zio->io_bp == bp && !now) { |
|
1603 /* |
|
1604 * This is a rewrite for sync-to-convergence. |
|
1605 * We can't do a metaslab_free(NOW) because bp wasn't allocated |
|
1606 * during this sync pass, which means that metaslab_sync() |
|
1607 * already committed the allocation. |
|
1608 */ |
|
1609 ASSERT(DVA_EQUAL(BP_IDENTITY(bp), |
|
1610 BP_IDENTITY(&zio->io_bp_orig))); |
|
1611 ASSERT(spa_sync_pass(spa) > 1); |
|
1612 |
|
1613 if (BP_IS_GANG(bp) && gn == NULL) { |
|
1614 /* |
|
1615 * This is a gang leader whose gang header(s) we |
|
1616 * couldn't read now, so defer the free until later. |
|
1617 * The block should still be intact because without |
|
1618 * the headers, we'd never even start the rewrite. |
|
1619 */ |
|
1620 bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); |
|
1621 return; |
|
1622 } |
|
1623 } |
|
1624 |
|
1625 if (!BP_IS_HOLE(bp)) |
|
1626 metaslab_free(spa, bp, bp->blk_birth, now); |
|
1627 |
|
1628 if (gn != NULL) { |
|
1629 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { |
|
1630 zio_dva_unallocate(zio, gn->gn_child[g], |
|
1631 &gn->gn_gbh->zg_blkptr[g]); |
|
1632 } |
|
1633 } |
2094 } |
1634 } |
2095 |
1635 |
2096 /* |
1636 /* |
2097 * Try to allocate an intent log block. Return 0 on success, errno on failure. |
1637 * Try to allocate an intent log block. Return 0 on success, errno on failure. |
2098 */ |
1638 */ |
2143 void |
1669 void |
2144 zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) |
1670 zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) |
2145 { |
1671 { |
2146 ASSERT(!BP_IS_GANG(bp)); |
1672 ASSERT(!BP_IS_GANG(bp)); |
2147 |
1673 |
2148 spa_config_enter(spa, RW_READER, FTAG); |
|
2149 |
|
2150 metaslab_free(spa, bp, txg, B_FALSE); |
1674 metaslab_free(spa, bp, txg, B_FALSE); |
2151 |
1675 } |
2152 spa_config_exit(spa, FTAG); |
1676 |
2153 } |
1677 /* |
2154 |
1678 * ========================================================================== |
2155 /* |
1679 * Read and write to physical devices |
2156 * start an async flush of the write cache for this vdev |
1680 * ========================================================================== |
2157 */ |
1681 */ |
|
1682 |
|
1683 static void |
|
1684 zio_vdev_io_probe_done(zio_t *zio) |
|
1685 { |
|
1686 zio_t *dio; |
|
1687 vdev_t *vd = zio->io_private; |
|
1688 |
|
1689 mutex_enter(&vd->vdev_probe_lock); |
|
1690 ASSERT(vd->vdev_probe_zio == zio); |
|
1691 vd->vdev_probe_zio = NULL; |
|
1692 mutex_exit(&vd->vdev_probe_lock); |
|
1693 |
|
1694 while ((dio = zio->io_delegate_list) != NULL) { |
|
1695 zio->io_delegate_list = dio->io_delegate_next; |
|
1696 dio->io_delegate_next = NULL; |
|
1697 if (!vdev_accessible(vd, dio)) |
|
1698 dio->io_error = ENXIO; |
|
1699 zio_execute(dio); |
|
1700 } |
|
1701 } |
|
1702 |
|
1703 /* |
|
1704 * Probe the device to determine whether I/O failure is specific to this |
|
1705 * zio (e.g. a bad sector) or affects the entire vdev (e.g. unplugged). |
|
1706 */ |
|
1707 static int |
|
1708 zio_vdev_io_probe(zio_t *zio) |
|
1709 { |
|
1710 vdev_t *vd = zio->io_vd; |
|
1711 zio_t *pio = NULL; |
|
1712 boolean_t created_pio = B_FALSE; |
|
1713 |
|
1714 /* |
|
1715 * Don't probe the probe. |
|
1716 */ |
|
1717 if (zio->io_flags & ZIO_FLAG_PROBE) |
|
1718 return (ZIO_PIPELINE_CONTINUE); |
|
1719 |
|
1720 /* |
|
1721 * To prevent 'probe storms' when a device fails, we create |
|
1722 * just one probe i/o at a time. All zios that want to probe |
|
1723 * this vdev will join the probe zio's io_delegate_list. |
|
1724 */ |
|
1725 mutex_enter(&vd->vdev_probe_lock); |
|
1726 |
|
1727 if ((pio = vd->vdev_probe_zio) == NULL) { |
|
1728 vd->vdev_probe_zio = pio = zio_root(zio->io_spa, |
|
1729 zio_vdev_io_probe_done, vd, ZIO_FLAG_CANFAIL); |
|
1730 created_pio = B_TRUE; |
|
1731 vd->vdev_probe_wanted = B_TRUE; |
|
1732 spa_async_request(zio->io_spa, SPA_ASYNC_PROBE); |
|
1733 } |
|
1734 |
|
1735 zio->io_delegate_next = pio->io_delegate_list; |
|
1736 pio->io_delegate_list = zio; |
|
1737 |
|
1738 mutex_exit(&vd->vdev_probe_lock); |
|
1739 |
|
1740 if (created_pio) { |
|
1741 zio_nowait(vdev_probe(vd, pio)); |
|
1742 zio_nowait(pio); |
|
1743 } |
|
1744 |
|
1745 return (ZIO_PIPELINE_STOP); |
|
1746 } |
|
1747 |
|
1748 static int |
|
1749 zio_vdev_io_start(zio_t *zio) |
|
1750 { |
|
1751 vdev_t *vd = zio->io_vd; |
|
1752 uint64_t align; |
|
1753 spa_t *spa = zio->io_spa; |
|
1754 |
|
1755 ASSERT(zio->io_error == 0); |
|
1756 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); |
|
1757 |
|
1758 if (vd == NULL) { |
|
1759 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) |
|
1760 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); |
|
1761 |
|
1762 /* |
|
1763 * The mirror_ops handle multiple DVAs in a single BP. |
|
1764 */ |
|
1765 return (vdev_mirror_ops.vdev_op_io_start(zio)); |
|
1766 } |
|
1767 |
|
1768 align = 1ULL << vd->vdev_top->vdev_ashift; |
|
1769 |
|
1770 if (P2PHASE(zio->io_size, align) != 0) { |
|
1771 uint64_t asize = P2ROUNDUP(zio->io_size, align); |
|
1772 char *abuf = zio_buf_alloc(asize); |
|
1773 ASSERT(vd == vd->vdev_top); |
|
1774 if (zio->io_type == ZIO_TYPE_WRITE) { |
|
1775 bcopy(zio->io_data, abuf, zio->io_size); |
|
1776 bzero(abuf + zio->io_size, asize - zio->io_size); |
|
1777 } |
|
1778 zio_push_transform(zio, abuf, asize, asize, zio_subblock); |
|
1779 } |
|
1780 |
|
1781 ASSERT(P2PHASE(zio->io_offset, align) == 0); |
|
1782 ASSERT(P2PHASE(zio->io_size, align) == 0); |
|
1783 ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); |
|
1784 |
|
1785 if (vd->vdev_ops->vdev_op_leaf && |
|
1786 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { |
|
1787 |
|
1788 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) |
|
1789 return (ZIO_PIPELINE_STOP); |
|
1790 |
|
1791 if ((zio = vdev_queue_io(zio)) == NULL) |
|
1792 return (ZIO_PIPELINE_STOP); |
|
1793 |
|
1794 if (!vdev_accessible(vd, zio)) { |
|
1795 zio->io_error = ENXIO; |
|
1796 zio_interrupt(zio); |
|
1797 return (ZIO_PIPELINE_STOP); |
|
1798 } |
|
1799 |
|
1800 } |
|
1801 |
|
1802 return (vd->vdev_ops->vdev_op_io_start(zio)); |
|
1803 } |
|
1804 |
|
1805 static int |
|
1806 zio_vdev_io_done(zio_t *zio) |
|
1807 { |
|
1808 vdev_t *vd = zio->io_vd; |
|
1809 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; |
|
1810 boolean_t unexpected_error = B_FALSE; |
|
1811 |
|
1812 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) |
|
1813 return (ZIO_PIPELINE_STOP); |
|
1814 |
|
1815 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); |
|
1816 |
|
1817 if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { |
|
1818 |
|
1819 vdev_queue_io_done(zio); |
|
1820 |
|
1821 if (zio->io_type == ZIO_TYPE_WRITE) |
|
1822 vdev_cache_write(zio); |
|
1823 |
|
1824 if (zio_injection_enabled && zio->io_error == 0) |
|
1825 zio->io_error = zio_handle_device_injection(vd, EIO); |
|
1826 |
|
1827 if (zio_injection_enabled && zio->io_error == 0) |
|
1828 zio->io_error = zio_handle_label_injection(zio, EIO); |
|
1829 |
|
1830 if (zio->io_error) { |
|
1831 if (!vdev_accessible(vd, zio)) { |
|
1832 zio->io_error = ENXIO; |
|
1833 } else { |
|
1834 unexpected_error = B_TRUE; |
|
1835 } |
|
1836 } |
|
1837 } |
|
1838 |
|
1839 ops->vdev_op_io_done(zio); |
|
1840 |
|
1841 if (unexpected_error) |
|
1842 return (zio_vdev_io_probe(zio)); |
|
1843 |
|
1844 return (ZIO_PIPELINE_CONTINUE); |
|
1845 } |
|
1846 |
|
1847 static int |
|
1848 zio_vdev_io_assess(zio_t *zio) |
|
1849 { |
|
1850 vdev_t *vd = zio->io_vd; |
|
1851 |
|
1852 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) |
|
1853 return (ZIO_PIPELINE_STOP); |
|
1854 |
|
1855 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) |
|
1856 spa_config_exit(zio->io_spa, SCL_ZIO, zio); |
|
1857 |
|
1858 if (zio->io_vsd != NULL) { |
|
1859 zio->io_vsd_free(zio); |
|
1860 zio->io_vsd = NULL; |
|
1861 } |
|
1862 |
|
1863 if (zio_injection_enabled && zio->io_error == 0) |
|
1864 zio->io_error = zio_handle_fault_injection(zio, EIO); |
|
1865 |
|
1866 /* |
|
1867 * If the I/O failed, determine whether we should attempt to retry it. |
|
1868 */ |
|
1869 if (zio->io_error && vd == NULL && |
|
1870 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { |
|
1871 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ |
|
1872 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ |
|
1873 zio->io_error = 0; |
|
1874 zio->io_flags |= ZIO_FLAG_IO_RETRY | |
|
1875 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; |
|
1876 zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; |
|
1877 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); |
|
1878 return (ZIO_PIPELINE_STOP); |
|
1879 } |
|
1880 |
|
1881 /* |
|
1882 * If we got an error on a leaf device, convert it to ENXIO |
|
1883 * if the device is not accessible at all. |
|
1884 */ |
|
1885 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && |
|
1886 !vdev_accessible(vd, zio)) |
|
1887 zio->io_error = ENXIO; |
|
1888 |
|
1889 /* |
|
1890 * If we can't write to an interior vdev (mirror or RAID-Z), |
|
1891 * set vdev_cant_write so that we stop trying to allocate from it. |
|
1892 */ |
|
1893 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && |
|
1894 vd != NULL && !vd->vdev_ops->vdev_op_leaf) |
|
1895 vd->vdev_cant_write = B_TRUE; |
|
1896 |
|
1897 if (zio->io_error) |
|
1898 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; |
|
1899 |
|
1900 return (ZIO_PIPELINE_CONTINUE); |
|
1901 } |
|
1902 |
2158 void |
1903 void |
2159 zio_flush(zio_t *zio, vdev_t *vd) |
1904 zio_vdev_io_reissue(zio_t *zio) |
2160 { |
1905 { |
2161 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, |
1906 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); |
2162 NULL, NULL, ZIO_PRIORITY_NOW, |
1907 ASSERT(zio->io_error == 0); |
2163 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); |
1908 |
2164 } |
1909 zio->io_stage--; |
|
1910 } |
|
1911 |
|
1912 void |
|
1913 zio_vdev_io_redone(zio_t *zio) |
|
1914 { |
|
1915 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); |
|
1916 |
|
1917 zio->io_stage--; |
|
1918 } |
|
1919 |
|
1920 void |
|
1921 zio_vdev_io_bypass(zio_t *zio) |
|
1922 { |
|
1923 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); |
|
1924 ASSERT(zio->io_error == 0); |
|
1925 |
|
1926 zio->io_flags |= ZIO_FLAG_IO_BYPASS; |
|
1927 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; |
|
1928 } |
|
1929 |
|
1930 /* |
|
1931 * ========================================================================== |
|
1932 * Generate and verify checksums |
|
1933 * ========================================================================== |
|
1934 */ |
|
1935 static int |
|
1936 zio_checksum_generate(zio_t *zio) |
|
1937 { |
|
1938 blkptr_t *bp = zio->io_bp; |
|
1939 enum zio_checksum checksum; |
|
1940 |
|
1941 if (bp == NULL) { |
|
1942 /* |
|
1943 * This is zio_write_phys(). |
|
1944 * We're either generating a label checksum, or none at all. |
|
1945 */ |
|
1946 checksum = zio->io_prop.zp_checksum; |
|
1947 |
|
1948 if (checksum == ZIO_CHECKSUM_OFF) |
|
1949 return (ZIO_PIPELINE_CONTINUE); |
|
1950 |
|
1951 ASSERT(checksum == ZIO_CHECKSUM_LABEL); |
|
1952 } else { |
|
1953 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { |
|
1954 ASSERT(!IO_IS_ALLOCATING(zio)); |
|
1955 checksum = ZIO_CHECKSUM_GANG_HEADER; |
|
1956 } else { |
|
1957 checksum = BP_GET_CHECKSUM(bp); |
|
1958 } |
|
1959 } |
|
1960 |
|
1961 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); |
|
1962 |
|
1963 return (ZIO_PIPELINE_CONTINUE); |
|
1964 } |
|
1965 |
|
1966 static int |
|
1967 zio_checksum_verify(zio_t *zio) |
|
1968 { |
|
1969 blkptr_t *bp = zio->io_bp; |
|
1970 int error; |
|
1971 |
|
1972 if (bp == NULL) { |
|
1973 /* |
|
1974 * This is zio_read_phys(). |
|
1975 * We're either verifying a label checksum, or nothing at all. |
|
1976 */ |
|
1977 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) |
|
1978 return (ZIO_PIPELINE_CONTINUE); |
|
1979 |
|
1980 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); |
|
1981 } |
|
1982 |
|
1983 if ((error = zio_checksum_error(zio)) != 0) { |
|
1984 zio->io_error = error; |
|
1985 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { |
|
1986 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, |
|
1987 zio->io_spa, zio->io_vd, zio, 0, 0); |
|
1988 } |
|
1989 } |
|
1990 |
|
1991 return (ZIO_PIPELINE_CONTINUE); |
|
1992 } |
|
1993 |
|
1994 /* |
|
1995 * Called by RAID-Z to ensure we don't compute the checksum twice. |
|
1996 */ |
|
1997 void |
|
1998 zio_checksum_verified(zio_t *zio) |
|
1999 { |
|
2000 zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); |
|
2001 } |
|
2002 |
|
2003 /* |
|
2004 * ========================================================================== |
|
2005 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. |

2006 * An error of 0 indicates success. ENXIO indicates whole-device failure, |

2007 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO |
|
2008 * indicate errors that are specific to one I/O, and most likely permanent. |
|
2009 * Any other error is presumed to be worse because we weren't expecting it. |
|
2010 * ========================================================================== |
|
2011 */ |
|
2012 int |
|
2013 zio_worst_error(int e1, int e2) |
|
2014 { |
|
2015 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; |
|
2016 int r1, r2; |
|
2017 |
|
2018 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) |
|
2019 if (e1 == zio_error_rank[r1]) |
|
2020 break; |
|
2021 |
|
2022 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) |
|
2023 if (e2 == zio_error_rank[r2]) |
|
2024 break; |
|
2025 |
|
2026 return (r1 > r2 ? e1 : e2); |
|
2027 } |
|
2028 |
|
2029 /* |
|
2030 * ========================================================================== |
|
2031 * I/O completion |
|
2032 * ========================================================================== |
|
2033 */ |
|
2034 static int |
|
2035 zio_ready(zio_t *zio) |
|
2036 { |
|
2037 blkptr_t *bp = zio->io_bp; |
|
2038 zio_t *pio = zio->io_parent; |
|
2039 |
|
2040 if (zio->io_ready) { |
|
2041 if (BP_IS_GANG(bp) && |
|
2042 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY)) |
|
2043 return (ZIO_PIPELINE_STOP); |
|
2044 |
|
2045 ASSERT(IO_IS_ALLOCATING(zio)); |
|
2046 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); |
|
2047 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); |
|
2048 |
|
2049 zio->io_ready(zio); |
|
2050 } |
|
2051 |
|
2052 if (bp != NULL && bp != &zio->io_bp_copy) |
|
2053 zio->io_bp_copy = *bp; |
|
2054 |
|
2055 if (zio->io_error) |
|
2056 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; |
|
2057 |
|
2058 if (pio != NULL) |
|
2059 zio_notify_parent(pio, zio, ZIO_WAIT_READY); |
|
2060 |
|
2061 return (ZIO_PIPELINE_CONTINUE); |
|
2062 } |
|
2063 |
|
2064 static int |
|
2065 zio_done(zio_t *zio) |
|
2066 { |
|
2067 spa_t *spa = zio->io_spa; |
|
2068 zio_t *pio = zio->io_parent; |
|
2069 zio_t *lio = zio->io_logical; |
|
2070 blkptr_t *bp = zio->io_bp; |
|
2071 vdev_t *vd = zio->io_vd; |
|
2072 uint64_t psize = zio->io_size; |
|
2073 |
|
2074 /* |
|
2075 * If our of children haven't all completed, |
|
2076 * wait for them and then repeat this pipeline stage. |
|
2077 */ |
|
2078 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || |
|
2079 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || |
|
2080 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) |
|
2081 return (ZIO_PIPELINE_STOP); |
|
2082 |
|
2083 for (int c = 0; c < ZIO_CHILD_TYPES; c++) |
|
2084 for (int w = 0; w < ZIO_WAIT_TYPES; w++) |
|
2085 ASSERT(zio->io_children[c][w] == 0); |
|
2086 |
|
2087 if (bp != NULL) { |
|
2088 ASSERT(bp->blk_pad[0] == 0); |
|
2089 ASSERT(bp->blk_pad[1] == 0); |
|
2090 ASSERT(bp->blk_pad[2] == 0); |
|
2091 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || |
|
2092 (pio != NULL && bp == pio->io_bp)); |
|
2093 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && |
|
2094 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { |
|
2095 ASSERT(!BP_SHOULD_BYTESWAP(bp)); |
|
2096 ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp)); |
|
2097 ASSERT(BP_COUNT_GANG(bp) == 0 || |
|
2098 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); |
|
2099 } |
|
2100 } |
|
2101 |
|
2102 /* |
|
2103 * If there were child vdev or gang errors, they apply to us now. |
|
2104 */ |
|
2105 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); |
|
2106 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); |
|
2107 |
|
2108 zio_pop_transforms(zio); /* note: may set zio->io_error */ |
|
2109 |
|
2110 vdev_stat_update(zio, psize); |
|
2111 |
|
2112 if (zio->io_error) { |
|
2113 /* |
|
2114 * If this I/O is attached to a particular vdev, |
|
2115 * generate an error message describing the I/O failure |
|
2116 * at the block level. We ignore these errors if the |
|
2117 * device is currently unavailable. |
|
2118 */ |
|
2119 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) |
|
2120 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); |
|
2121 |
|
2122 if ((zio->io_error == EIO || |
|
2123 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && zio == lio) { |
|
2124 /* |
|
2125 * For logical I/O requests, tell the SPA to log the |
|
2126 * error and generate a logical data ereport. |
|
2127 */ |
|
2128 spa_log_error(spa, zio); |
|
2129 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, |
|
2130 0, 0); |
|
2131 } |
|
2132 } |
|
2133 |
|
2134 if (zio->io_error && zio == lio) { |
|
2135 /* |
|
2136 * Determine whether zio should be reexecuted. This will |
|
2137 * propagate all the way to the root via zio_notify_parent(). |
|
2138 */ |
|
2139 ASSERT(vd == NULL && bp != NULL); |
|
2140 |
|
2141 if (IO_IS_ALLOCATING(zio)) |
|
2142 if (zio->io_error != ENOSPC) |
|
2143 zio->io_reexecute |= ZIO_REEXECUTE_NOW; |
|
2144 else |
|
2145 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; |
|
2146 |
|
2147 if ((zio->io_type == ZIO_TYPE_READ || |
|
2148 zio->io_type == ZIO_TYPE_FREE) && |
|
2149 zio->io_error == ENXIO && |
|
2150 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) |
|
2151 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; |
|
2152 |
|
2153 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) |
|
2154 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; |
|
2155 } |
|
2156 |
|
2157 /* |
|
2158 * If there were logical child errors, they apply to us now. |
|
2159 * We defer this until now to avoid conflating logical child |
|
2160 * errors with errors that happened to the zio itself when |
|
2161 * updating vdev stats and reporting FMA events above. |
|
2162 */ |
|
2163 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); |
|
2164 |
|
2165 if (zio->io_reexecute) { |
|
2166 /* |
|
2167 * This is a logical I/O that wants to reexecute. |
|
2168 * |
|
2169 * Reexecute is top-down. When an i/o fails, if it's not |
|
2170 * the root, it simply notifies its parent and sticks around. |
|
2171 * The parent, seeing that it still has children in zio_done(), |
|
2172 * does the same. This percolates all the way up to the root. |
|
2173 * The root i/o will reexecute or suspend the entire tree. |
|
2174 * |
|
2175 * This approach ensures that zio_reexecute() honors |
|
2176 * all the original i/o dependency relationships, e.g. |
|
2177 * parents not executing until children are ready. |
|
2178 */ |
|
2179 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); |
|
2180 |
|
2181 if (IO_IS_ALLOCATING(zio)) |
|
2182 zio_dva_unallocate(zio, zio->io_gang_tree, bp); |
|
2183 |
|
2184 zio_gang_tree_free(&zio->io_gang_tree); |
|
2185 |
|
2186 if (pio != NULL) { |
|
2187 /* |
|
2188 * We're not a root i/o, so there's nothing to do |
|
2189 * but notify our parent. Don't propagate errors |
|
2190 * upward since we haven't permanently failed yet. |
|
2191 */ |
|
2192 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; |
|
2193 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); |
|
2194 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { |
|
2195 /* |
|
2196 * We'd fail again if we reexecuted now, so suspend |
|
2197 * until conditions improve (e.g. device comes online). |
|
2198 */ |
|
2199 zio_suspend(spa, zio); |
|
2200 } else { |
|
2201 /* |
|
2202 * Reexecution is potentially a huge amount of work. |
|
2203 * Hand it off to the otherwise-unused claim taskq. |
|
2204 */ |
|
2205 (void) taskq_dispatch( |
|
2206 spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], |
|
2207 (task_func_t *)zio_reexecute, zio, TQ_SLEEP); |
|
2208 } |
|
2209 return (ZIO_PIPELINE_STOP); |
|
2210 } |
|
2211 |
|
2212 ASSERT(zio->io_child == NULL); |
|
2213 ASSERT(zio->io_reexecute == 0); |
|
2214 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); |
|
2215 |
|
2216 if (zio->io_done) |
|
2217 zio->io_done(zio); |
|
2218 |
|
2219 zio_gang_tree_free(&zio->io_gang_tree); |
|
2220 |
|
2221 ASSERT(zio->io_delegate_list == NULL); |
|
2222 ASSERT(zio->io_delegate_next == NULL); |
|
2223 |
|
2224 if (pio != NULL) { |
|
2225 zio_remove_child(pio, zio); |
|
2226 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); |
|
2227 } |
|
2228 |
|
2229 if (zio->io_waiter != NULL) { |
|
2230 mutex_enter(&zio->io_lock); |
|
2231 zio->io_executor = NULL; |
|
2232 cv_broadcast(&zio->io_cv); |
|
2233 mutex_exit(&zio->io_lock); |
|
2234 } else { |
|
2235 zio_destroy(zio); |
|
2236 } |
|
2237 |
|
2238 return (ZIO_PIPELINE_STOP); |
|
2239 } |
|
2240 |
|
2241 /* |
|
2242 * ========================================================================== |
|
2243 * I/O pipeline definition |
|
2244 * ========================================================================== |
|
2245 */ |
|
2246 static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES] = { |
|
2247 NULL, |
|
2248 zio_issue_async, |
|
2249 zio_read_bp_init, |
|
2250 zio_write_bp_init, |
|
2251 zio_checksum_generate, |
|
2252 zio_gang_assemble, |
|
2253 zio_gang_issue, |
|
2254 zio_dva_allocate, |
|
2255 zio_dva_free, |
|
2256 zio_dva_claim, |
|
2257 zio_ready, |
|
2258 zio_vdev_io_start, |
|
2259 zio_vdev_io_done, |
|
2260 zio_vdev_io_assess, |
|
2261 zio_checksum_verify, |
|
2262 zio_done |
|
2263 }; |